<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/_Advanced_Research_and_Observations_Step_9_Distributed_Computing_and_Parallel_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install numpy pandas dask scikit-learn matplotlib seaborn torch jinja2 weasyprint

In [None]:
pip install dask[dataframe]

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from jinja2 import Environment, FileSystemLoader
from weasyprint import HTML

# Pipeline configuration: a single nested mapping read by the rest of the script.
CONFIG = dict(
    # Settings describing the synthetic dataset.
    data_params=dict(mean=0, std=1, size=1000),
    # Network dimensions and training hyperparameters.
    nn_params=dict(
        input_size=5,
        hidden_size=10,
        output_size=1,
        batch_size=32,
        epochs=10,
        learning_rate=0.001,
    ),
    # Directory that receives the generated report files.
    output_dir="reports",
    # File that log records are appended to.
    log_file="pipeline.log",
)

# Logging setup: configure the root logger to write timestamped records at
# INFO level and above to CONFIG["log_file"]. Note that basicConfig() does
# nothing when the root logger already has handlers attached.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename=CONFIG["log_file"],
)
logger = logging.getLogger()

# 1. Data Generation and Preprocessing
def generate_data(n_samples=None):
    """Generate a synthetic linear-regression dataset.

    Features are drawn uniformly from [0, 1) with ``np.random.rand``; targets
    are a fixed linear combination of the features plus Gaussian noise with
    standard deviation 0.1.

    Args:
        n_samples: Number of rows to generate. When omitted, falls back to
            ``CONFIG["data_params"]["size"]``.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n_samples, input_size)`` and
        ``y`` has shape ``(n_samples,)``.

    Raises:
        ValueError: If ``CONFIG["nn_params"]["input_size"]`` differs from the
            number of ground-truth coefficients.
    """
    logger.info("Generating synthetic data.")
    if n_samples is None:
        n_samples = CONFIG["data_params"]["size"]
    n_features = CONFIG["nn_params"]["input_size"]

    # Ground-truth coefficients of the linear relationship being simulated.
    true_weights = np.array([1.5, -2.0, 0.5, 3.0, 2.5])
    if n_features != true_weights.size:
        # Fail with a readable message instead of an opaque matmul shape error.
        raise ValueError(
            f"input_size is {n_features}, but the synthetic target is defined "
            f"for exactly {true_weights.size} features."
        )

    # Features are drawn before the noise so results under a fixed global
    # NumPy seed stay reproducible.
    X = np.random.rand(n_samples, n_features)
    y = X @ true_weights + np.random.normal(0, 0.1, n_samples)
    return X, y

def preprocess_data(X, y):
    """Split features and targets into training and held-out test subsets.

    20% of the rows go to the test subset; the shuffle is seeded with 42 so
    the partition is repeatable.

    Returns:
        The sequence produced by ``train_test_split``, in the order
        ``X_train, X_test, y_train, y_test``.
    """
    logger.info("Splitting data into training and testing sets.")
    split_options = {"test_size": 0.2, "random_state": 42}
    partitions = train_test_split(X, y, **split_options)
    return partitions

# 2. Statistical Analysis
class StatisticalAnalysis:
    """Static helpers for basic inferential statistics on a data sample."""

    @staticmethod
    def hypothesis_test(data, pop_mean=0, alpha=0.05):
        """Run a one-sample t-test of ``data`` against ``pop_mean``.

        Args:
            data: Sample values passed to ``scipy.stats.ttest_1samp``.
            pop_mean: Population mean under the null hypothesis.
            alpha: Significance level used only for the logged reject /
                fail-to-reject decision; it does not affect the return value.

        Returns:
            Tuple ``(t_stat, p_value)`` as computed by ``ttest_1samp``.
        """
        t_stat, p_value = stats.ttest_1samp(data, pop_mean)
        logger.info(f"T-statistic: {t_stat}, P-value: {p_value}")
        # Record the decision implied by alpha next to the raw statistics.
        # np.asarray keeps the comparison valid for scalar and array p-values.
        reject_null = np.asarray(p_value) < alpha
        logger.info(f"Reject null hypothesis at alpha={alpha}: {reject_null}")
        return t_stat, p_value

    @staticmethod
    def confidence_interval(data, confidence=0.95):
        """Compute a t-based confidence interval for the mean of ``data``.

        Args:
            data: Sample values.
            confidence: Confidence level, e.g. 0.95 for a 95% interval.

        Returns:
            Tuple ``(lower, upper)`` centred on the sample mean.
        """
        mean = np.mean(data)
        sem = stats.sem(data)
        # Two-sided critical value of Student's t with n - 1 degrees of freedom.
        margin = sem * stats.t.ppf((1 + confidence) / 2., len(data) - 1)
        ci = (mean - margin, mean + margin)
        logger.info(f"Confidence Interval: {ci}")
        return ci

# 3. Visualization
class Visualization:
    """Static plotting helpers that draw a figure and save it to disk."""

    @staticmethod
    def plot_distribution(data, filename="distribution_plot.png"):
        """Save a histogram with a KDE overlay of ``data`` to ``filename``."""
        # histplot draws on the current axes and returns it; label through it.
        axes = sns.histplot(data, kde=True)
        axes.set_title("Data Distribution")
        axes.set_xlabel("Value")
        axes.set_ylabel("Frequency")
        plt.savefig(filename)
        plt.close()
        logger.info(f"Distribution plot saved as {filename}.")

    @staticmethod
    def plot_confidence_intervals(data, ci, filename="confidence_interval_plot.png"):
        """Save a histogram of ``data`` marked with its mean and CI bounds.

        Args:
            data: Sample values to plot.
            ci: Pair ``(lower, upper)`` giving the interval bounds.
            filename: Path the figure is written to.
        """
        centre = np.mean(data)
        low, high = ci
        axes = sns.histplot(data, kde=True)
        # One dashed vertical marker per statistic: (position, colour, label).
        markers = (
            (centre, 'blue', 'Mean'),
            (low, 'red', 'Lower CI'),
            (high, 'green', 'Upper CI'),
        )
        for position, colour, caption in markers:
            axes.axvline(position, color=colour, linestyle='--', label=caption)
        axes.set_title("Confidence Intervals")
        axes.legend()
        plt.savefig(filename)
        plt.close()
        logger.info(f"Confidence interval plot saved as {filename}.")

# 4. Hyperparameter Optimization
class HyperparameterOptimizer:
    """Grid-search tuning of a random-forest regressor."""

    @staticmethod
    def perform_grid_search(X, y):
        """Fit a 3-fold grid search over a fixed grid and return the best model.

        Candidates are scored by negative mean squared error. The winning
        parameter set is logged.

        Returns:
            The ``best_estimator_`` of the fitted ``GridSearchCV``.
        """
        search_space = dict(
            n_estimators=[100, 200],
            max_depth=[10, 20],
            min_samples_split=[2, 5],
        )
        searcher = GridSearchCV(
            RandomForestRegressor(),
            search_space,
            cv=3,
            scoring="neg_mean_squared_error",
        )
        searcher.fit(X, y)
        logger.info(f"Best Params: {searcher.best_params_}")
        return searcher.best_estimator_

# 5. Neural Network Model and Training
class NeuralNetworkModel(nn.Module):
    """Feed-forward network with one hidden layer: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the two linear layers and the ReLU between them.

        Args:
            input_size: Number of input features.
            hidden_size: Width of the hidden layer.
            output_size: Number of output values per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map a batch ``x`` through fc1, ReLU and fc2 and return the result."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

class ModelTrainer:
    """Bundle a model, loss function and optimizer and run a training loop."""

    def __init__(self, model, criterion, optimizer):
        """Store the model, the loss callable and the optimizer."""
        self.model, self.criterion, self.optimizer = model, criterion, optimizer

    def _run_epoch(self, train_loader):
        """Do one optimisation pass over ``train_loader``.

        Returns:
            The sum of the per-batch loss values seen during the pass.
        """
        loss_total = 0.0
        for batch_inputs, batch_targets in train_loader:
            self.optimizer.zero_grad()
            predictions = self.model(batch_inputs)
            batch_loss = self.criterion(predictions, batch_targets)
            batch_loss.backward()
            self.optimizer.step()
            loss_total += batch_loss.item()
        return loss_total

    def train(self, train_loader, epochs):
        """Train the model for ``epochs`` passes, logging each epoch's summed loss.

        Args:
            train_loader: Iterable yielding ``(inputs, targets)`` batches.
            epochs: Number of passes over ``train_loader``.
        """
        self.model.train()
        for epoch_number in range(1, epochs + 1):
            epoch_loss = self._run_epoch(train_loader)
            logger.info(f"Epoch {epoch_number}/{epochs}, Loss: {epoch_loss:.4f}")

# 6. Report Generation
class ReportGenerator:
    """Render the pipeline summary as an HTML file and convert it to PDF."""

    def __init__(self, output_dir):
        """Remember ``output_dir`` and create it if it does not exist yet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def generate_report(self, context):
        """Write ``report.html`` and ``report.pdf`` into the output directory.

        Args:
            context: Mapping supplying the template variables ``title``,
                ``data_mean``, ``confidence_interval``, ``distribution_plot``
                and ``ci_plot``. The two plot values are inserted verbatim as
                ``<img src>`` attributes, so relative paths should be given
                relative to the output directory.
        """
        template = """<!DOCTYPE html>
<html>
<head><title>{{ title }}</title></head>
<body>
    <h1>{{ title }}</h1>
    <h2>Data Summary</h2>
    <p>Mean: {{ data_mean }}</p>
    <p>Confidence Interval: {{ confidence_interval }}</p>
    <h2>Plots</h2>
    <img src="{{ distribution_plot }}" alt="Distribution Plot">
    <img src="{{ ci_plot }}" alt="Confidence Interval Plot">
</body>
</html>"""
        env = Environment(loader=FileSystemLoader("."))
        # Keep the rendered output: it, not the raw template, is the report.
        rendered_html = env.from_string(template).render(context)
        report_path = os.path.join(self.output_dir, "report.html")
        # Write and read back with one explicit encoding so non-ASCII context
        # values survive regardless of the platform's default encoding.
        with open(report_path, "w", encoding="utf-8") as f:
            f.write(rendered_html)
        HTML(filename=report_path, encoding="utf-8").write_pdf(
            os.path.join(self.output_dir, "report.pdf")
        )
        logger.info("Report generated successfully.")

# Main Execution Pipeline
if __name__ == "__main__":
    # End-to-end run: generate data, analyse and plot it, tune a random
    # forest, train the neural network, evaluate both on the held-out split,
    # then write the HTML/PDF report.
    logger.info("Pipeline started.")
    nn_params = CONFIG["nn_params"]
    output_dir = CONFIG["output_dir"]

    # The report references its plots by bare filename, so the images must sit
    # in the same directory as report.html. Create that directory up front.
    os.makedirs(output_dir, exist_ok=True)
    distribution_plot = "distribution_plot.png"
    ci_plot = "confidence_interval_plot.png"

    # Data Preparation
    X, y = generate_data()
    X_train, X_test, y_train, y_test = preprocess_data(X, y)

    # Statistical Analysis
    ci = StatisticalAnalysis.confidence_interval(y_train)

    # Visualization
    Visualization.plot_distribution(
        y_train, filename=os.path.join(output_dir, distribution_plot)
    )
    Visualization.plot_confidence_intervals(
        y_train, ci, filename=os.path.join(output_dir, ci_plot)
    )

    # Hyperparameter Optimization (static method; no instance is needed, which
    # also avoids reusing the name `optimizer` for two unrelated objects).
    best_model = HyperparameterOptimizer.perform_grid_search(X_train, y_train)
    rf_test_mse = mean_squared_error(y_test, best_model.predict(X_test))
    logger.info(f"Random forest test MSE: {rf_test_mse:.4f}")

    # Neural Network Training
    nn_model = NeuralNetworkModel(
        nn_params["input_size"], nn_params["hidden_size"], nn_params["output_size"]
    )
    criterion = nn.MSELoss()
    optimizer = optim.Adam(nn_model.parameters(), lr=nn_params["learning_rate"])
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        # Targets become a column vector to match the network's (N, 1) output.
        torch.tensor(y_train, dtype=torch.float32).view(-1, 1),
    )
    train_loader = DataLoader(train_dataset, batch_size=nn_params["batch_size"], shuffle=True)
    trainer = ModelTrainer(nn_model, criterion, optimizer)
    trainer.train(train_loader, nn_params["epochs"])

    # Evaluate the trained network on the held-out split.
    nn_model.eval()
    with torch.no_grad():
        nn_predictions = nn_model(torch.tensor(X_test, dtype=torch.float32)).squeeze(1).numpy()
    nn_test_mse = mean_squared_error(y_test, nn_predictions)
    logger.info(f"Neural network test MSE: {nn_test_mse:.4f}")

    # Report Generation
    report_gen = ReportGenerator(output_dir)
    context = {
        "title": "Pipeline Report",
        # Pre-format numbers so the report shows plain decimals rather than
        # the repr of NumPy scalar objects.
        "data_mean": f"{np.mean(y_train):.4f}",
        "confidence_interval": f"({ci[0]:.4f}, {ci[1]:.4f})",
        "distribution_plot": distribution_plot,
        "ci_plot": ci_plot,
    }
    report_gen.generate_report(context)
    logger.info("Pipeline completed successfully.")