<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/_Advanced_Research_and_Observations_Step_8_Integrate_Domain_Specific_Computational_Tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install weasyprint

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.stats import bayes_mvs
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
import multiprocessing as mp
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import logging
from jinja2 import Environment, FileSystemLoader
import os
from weasyprint import HTML

# 1. Data Collection Class
class DataCollection:
    """Collect a one-dimensional sample and standardize it.

    ``collect_data`` currently simulates a source by drawing 1000 values
    from a standard normal distribution; ``data_source`` is stored on the
    instance but is not read by any method in this class.
    """

    def __init__(self, data_source: str):
        self.data_source = data_source
        # Filled in by collect_data(); stays None until then.
        self.data = None

    def collect_data(self) -> np.ndarray:
        """Draw 1000 standard-normal samples, store them and return them."""
        self.data = np.random.normal(0, 1, 1000)
        print("Data collected from source.")
        return self.data

    def preprocess_data(self) -> np.ndarray:
        """Standardize the stored sample to zero mean and unit variance.

        Returns:
            The standardized array, which is also stored on ``self.data``.

        Raises:
            ValueError: If no data has been collected yet or it is empty.
        """
        if self.data is None:
            raise ValueError("No data to preprocess; call collect_data() first.")

        values = np.asarray(self.data, dtype=float)
        if values.size == 0:
            raise ValueError("Cannot preprocess an empty data set.")

        if values.max() == values.min():
            # A constant sample has zero spread; dividing would yield NaNs
            # (0/0), so return the centered values, which are all zero.
            self.data = np.zeros_like(values)
        else:
            self.data = (values - np.mean(values)) / np.std(values)

        print("Data preprocessed.")
        return self.data

# 2. Error Analysis Class
class ErrorAnalysis:
    """Uncertainty estimates for the mean of a one-dimensional sample."""

    @staticmethod
    def calculate_standard_error(data: np.ndarray) -> float:
        """Return the standard error of the mean of ``data``.

        Uses the sample standard deviation (``ddof=1``) so the value agrees
        with ``stats.sem`` as used by ``confidence_interval``.

        Raises:
            ValueError: If ``data`` holds fewer than two observations.
        """
        n = len(data)
        if n < 2:
            raise ValueError("Standard error needs at least two observations.")
        standard_error = np.std(data, ddof=1) / np.sqrt(n)
        print(f"Standard Error: {standard_error}")
        return standard_error

    @staticmethod
    def confidence_interval(data: np.ndarray, confidence: float = 0.95) -> tuple:
        """Return the two-sided Student-t confidence interval for the mean.

        Args:
            data: Sample values.
            confidence: Coverage level, strictly between 0 and 1.

        Returns:
            ``(lower, upper)`` bounds centered on the sample mean.

        Raises:
            ValueError: If ``data`` holds fewer than two observations or
                ``confidence`` is outside the open interval (0, 1).
        """
        n = len(data)
        if n < 2:
            raise ValueError("Confidence interval needs at least two observations.")
        if not 0 < confidence < 1:
            raise ValueError("confidence must be strictly between 0 and 1.")

        mean = np.mean(data)
        sem = stats.sem(data)
        # Two-sided interval: put half of the excluded mass in each tail.
        margin = sem * stats.t.ppf((1 + confidence) / 2., n - 1)
        interval = (mean - margin, mean + margin)
        print(f"Confidence Interval ({confidence*100}%): {interval}")
        return interval

# 3. Model Validation Class
class ModelValidation:
    """Hold-out and k-fold validation for an estimator with fit/predict.

    The constructor splits ``X``/``y`` once; ``validate_model`` scores on the
    held-out part and ``k_fold_validation`` cross-validates on the training
    part only.
    """

    def __init__(self, model, X: np.ndarray, y: np.ndarray,
                 test_size: float = 0.2, random_state=None):
        """Split the data into training and test partitions.

        Args:
            model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
            X: Feature array.
            y: Target array.
            test_size: Forwarded to ``train_test_split`` (default 0.2).
            random_state: Forwarded to ``train_test_split``; pass a value to
                make the split reproducible.
        """
        self.model = model
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

    def validate_model(self) -> float:
        """Fit on the training split and return the MSE on the test split."""
        self.model.fit(self.X_train, self.y_train)
        predictions = self.model.predict(self.X_test)
        mse = mean_squared_error(self.y_test, predictions)
        print(f"Model Validation - MSE: {mse}")
        return mse

    def k_fold_validation(self, k: int = 5) -> float:
        """Cross-validate on the training split and return the mean fold MSE.

        NOTE: every fold refits ``self.model`` in place, so afterwards the
        model holds the fit from the last fold rather than a fit on the full
        training split.
        """
        kf = KFold(n_splits=k)
        mse_scores = []
        for train_index, test_index in kf.split(self.X_train):
            X_train_kf, X_test_kf = self.X_train[train_index], self.X_train[test_index]
            y_train_kf, y_test_kf = self.y_train[train_index], self.y_train[test_index]
            self.model.fit(X_train_kf, y_train_kf)
            predictions = self.model.predict(X_test_kf)
            mse_scores.append(mean_squared_error(y_test_kf, predictions))
        mean_mse = np.mean(mse_scores)
        print(f"{k}-Fold Cross-Validation Mean MSE: {mean_mse}")
        # Hand the score back so callers can use it, as validate_model does.
        return mean_mse

# 4. Scalable Computing Class for Parallel Processing
class ScalableComputing:
    """Helpers for spreading work across worker processes."""

    @staticmethod
    def parallel_computation(func, data: list, num_processes: int = 4) -> list:
        """Map ``func`` over ``data`` using a multiprocessing pool.

        Args:
            func: Callable applied to each item; it is sent to worker
                processes through ``Pool.map``.
            data: Items to process.
            num_processes: Number of worker processes in the pool.

        Returns:
            The list produced by ``Pool.map``.
        """
        worker_pool = mp.Pool(processes=num_processes)
        # The context manager tears the pool down once mapping is finished.
        with worker_pool:
            mapped = worker_pool.map(func, data)
        print("Parallel computation completed.")
        return mapped

# 5. Statistical Analysis Class
class StatisticalAnalysis:
    """Frequentist, Bayesian and simulation-based summaries of one sample."""

    def __init__(self, data):
        self.data = data

    def hypothesis_test(self, pop_mean, alpha=0.05):
        """Run a one-sample t-test of the data against ``pop_mean``.

        Prints the statistic, the p-value and the decision at level
        ``alpha``; returns ``(t_stat, p_value)``.
        """
        outcome = stats.ttest_1samp(self.data, pop_mean)
        t_stat, p_value = outcome
        print(f"T-statistic: {t_stat}, P-value: {p_value}")
        verdict = (
            "Reject the null hypothesis"
            if p_value < alpha
            else "Fail to reject the null hypothesis"
        )
        print(verdict)
        return t_stat, p_value

    def bayesian_inference(self):
        """Return ``bayes_mvs`` estimates for mean, variance and std (alpha=0.95).

        Only the mean and variance results are printed.
        """
        estimates = bayes_mvs(self.data, alpha=0.95)
        mean_ci, var_ci, std_ci = estimates
        print(f"Bayesian Mean CI: {mean_ci}")
        print(f"Bayesian Variance CI: {var_ci}")
        return mean_ci, var_ci, std_ci

    def monte_carlo_simulation(self, func, num_simulations=1000):
        """Call ``func(self.data)`` ``num_simulations`` times.

        Prints the mean of the outcomes and returns them as a list.
        """
        results = []
        for _ in range(num_simulations):
            results.append(func(self.data))
        mean_result = np.mean(results)
        print(f"Monte Carlo Simulation Mean Result: {mean_result}")
        return results

# 6. Visualization Class
class Visualization:
    """Histogram-based plots of a one-dimensional sample."""

    def __init__(self, data):
        self.data = data

    def plot_distribution(self, save_path="distribution_plot.png"):
        """Plot a histogram with KDE, save it to ``save_path`` and show it.

        Args:
            save_path: Output image file; defaults to the original
                hard-coded ``distribution_plot.png``.
        """
        fig = plt.figure(figsize=(10, 6))
        try:
            sns.histplot(self.data, kde=True)
            plt.title("Data Distribution with KDE")
            plt.xlabel("Value")
            plt.ylabel("Frequency")
            plt.savefig(save_path)
            plt.show()
        finally:
            # Release the figure so repeated calls do not accumulate open
            # figures on backends where show() does not close them.
            plt.close(fig)

    def plot_confidence_intervals(self, ci, save_path="confidence_interval_plot.png"):
        """Plot the histogram with the mean and confidence bounds marked.

        Args:
            ci: Two-element sequence ``(lower, upper)``.
            save_path: Output image file; defaults to the original
                hard-coded ``confidence_interval_plot.png``.
        """
        lower, upper = ci  # ci should be a tuple or list containing two values
        mean = np.mean(self.data)
        fig = plt.figure(figsize=(10, 6))
        try:
            sns.histplot(self.data, kde=True)
            plt.axvline(mean, color='blue', linestyle='--', label='Mean')
            plt.axvline(lower, color='red', linestyle='--', label='Lower CI')
            plt.axvline(upper, color='green', linestyle='--', label='Upper CI')
            plt.title("Confidence Intervals")
            plt.legend()
            plt.savefig(save_path)
            plt.show()
        finally:
            plt.close(fig)

# 7. Logger Class
class Logger:
    """Write INFO messages to a log file and echo them to stdout.

    A dedicated logger with its own ``FileHandler`` is used instead of
    ``logging.basicConfig``: ``basicConfig`` does nothing when the root
    logger already has handlers (as in notebook environments), which would
    leave the log file unwritten.
    """

    _FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

    def __init__(self, log_file='research_log.log'):
        """Attach a file handler for ``log_file`` (appending to the file)."""
        log_path = os.path.abspath(log_file)
        # One logger per target file; dots are replaced so the path does not
        # create a logger hierarchy.
        self.logger = logging.getLogger(
            "research_pipeline:" + log_path.replace(".", "_")
        )
        self.logger.setLevel(logging.INFO)
        # The file is the only sink; do not forward to root handlers as well.
        self.logger.propagate = False

        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename == log_path
            for handler in self.logger.handlers
        )
        if not already_attached:
            # Avoid duplicate lines when several Logger objects share a file.
            file_handler = logging.FileHandler(log_path)
            file_handler.setFormatter(logging.Formatter(self._FORMAT))
            self.logger.addHandler(file_handler)

    def log(self, message):
        """Record ``message`` at INFO level and print it with a LOG prefix."""
        self.logger.info(message)
        print(f"LOG: {message}")

# 8. Report Generation Class
class ReportGenerator:
    """Render a context dict into an HTML report and a PDF copy.

    Templates are looked up in ``<cwd>/templates``; if the requested template
    file is missing, a default one is written there first. Output goes to
    ``output_dir`` as ``research_report.html`` and ``research_report.pdf``.
    """

    # Default template written to disk when the requested one does not exist.
    _DEFAULT_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{{ title }}</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
        }
        h1 {
            color: #333;
        }
        h2 {
            color: #555;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
        }
        table, th, td {
            border: 1px solid #ccc;
        }
        th, td {
            padding: 10px;
            text-align: left;
        }
        .plot {
            text-align: center;
            margin: 20px 0;
        }
    </style>
</head>
<body>
    <h1>{{ title }}</h1>

    <h2>Data Summary</h2>
    <table>
        <tr>
            <th>Metric</th>
            <th>Value</th>
        </tr>
        <tr>
            <td>Standard Error</td>
            <td>{{ data_summary.standard_error }}</td>
        </tr>
        <tr>
            <td>Confidence Interval</td>
            <td>{{ data_summary.confidence_interval }}</td>
        </tr>
        <tr>
            <td>Mean Squared Error (MSE)</td>
            <td>{{ data_summary.mse }}</td>
        </tr>
        <tr>
            <td>T-statistic</td>
            <td>{{ data_summary.t_stat }}</td>
        </tr>
        <tr>
            <td>P-value</td>
            <td>{{ data_summary.p_value }}</td>
        </tr>
    </table>

    <h2>Plots</h2>
    <div class="plot">
        <h3>Distribution Plot</h3>
        <img src="{{ plots.distribution_plot }}" alt="Distribution Plot">
    </div>
    <div class="plot">
        <h3>Confidence Interval Plot</h3>
        <img src="{{ plots.confidence_interval_plot }}" alt="Confidence Interval Plot">
    </div>

    <h2>Conclusion</h2>
    <p>{{ conclusion }}</p>
</body>
</html>"""

    def __init__(self, output_dir="reports"):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def _ensure_template(self, template_dir, template_name):
        """Write the default template into ``template_dir`` if it is missing."""
        template_file = os.path.join(template_dir, template_name)
        if not os.path.exists(template_file):
            # The template declares charset UTF-8, so write it as UTF-8.
            with open(template_file, "w", encoding="utf-8") as file:
                file.write(self._DEFAULT_TEMPLATE)

    def _path_from_output_dir(self, src):
        """Return ``src`` rewritten so it resolves from the report's folder."""
        if not isinstance(src, str) or os.path.isabs(src) or "://" in src:
            return src
        if os.path.exists(os.path.join(self.output_dir, src)):
            # Already reachable from where the report is written.
            return src
        if not os.path.exists(src):
            # Not found relative to the working directory either; leave as is.
            return src
        try:
            relative = os.path.relpath(src, self.output_dir)
        except ValueError:
            # No relative path exists (e.g. different drives on Windows).
            return src
        return relative.replace(os.sep, "/")

    def _relocate_plots(self, context):
        """Return a context whose ``plots`` paths resolve from ``output_dir``.

        The report is written inside ``output_dir``, so an image path that is
        relative to the working directory would not resolve from the HTML
        file or from the PDF built from it. The caller's dict is not mutated.
        """
        plots = context.get("plots") if isinstance(context, dict) else None
        if not isinstance(plots, dict):
            return context
        relocated = {
            key: self._path_from_output_dir(value) for key, value in plots.items()
        }
        return {**context, "plots": relocated}

    def generate_report(self, context, template_name="report_template.html"):
        """Render ``context`` to HTML and PDF inside ``output_dir``.

        Args:
            context: Mapping passed to the template. If it has a ``plots``
                dict, relative image paths that exist under the working
                directory are rewritten relative to ``output_dir``.
            template_name: Template file name inside ``<cwd>/templates``.

        Returns:
            ``(report_path, pdf_path)`` of the generated files.
        """
        template_path = os.path.join(os.getcwd(), "templates")
        os.makedirs(template_path, exist_ok=True)
        self._ensure_template(template_path, template_name)

        # NOTE(review): autoescaping is not enabled, so context values are
        # inserted as raw HTML; enable it if any value can be untrusted.
        env = Environment(loader=FileSystemLoader(template_path))
        template = env.get_template(template_name)

        html_content = template.render(self._relocate_plots(context))
        report_path = os.path.join(self.output_dir, "research_report.html")
        with open(report_path, "w", encoding="utf-8") as file:
            file.write(html_content)

        pdf_path = os.path.join(self.output_dir, "research_report.pdf")

        # Using WeasyPrint to generate PDF
        HTML(report_path).write_pdf(pdf_path)

        print("Report generated successfully.")
        return report_path, pdf_path

# 9. Hyperparameter Optimization Class
class HyperparameterOptimizer:
    """Grid-search wrapper that tunes an estimator for lowest MSE."""

    def __init__(self, model, param_grid):
        self.param_grid = param_grid
        self.model = model

    def perform_grid_search(self, X, y):
        """Run a 5-fold grid search and return the best fitted estimator.

        Scoring is negative mean squared error, so the best score is negated
        before it is printed as an MSE.
        """
        search = GridSearchCV(
            estimator=self.model,
            param_grid=self.param_grid,
            scoring="neg_mean_squared_error",
            cv=5,
        )
        search.fit(X, y)
        best_mse = -search.best_score_
        print(f"Best Parameters: {search.best_params_}")
        print(f"Best Score (MSE): {best_mse}")
        return search.best_estimator_

# 10. Neural Network Model Class (Example in PyTorch)
class NeuralNetworkModel(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are created in forward order: fc1, relu, fc2.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Apply the hidden layer, the ReLU, then the output layer to ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# 11. Training and Evaluation of Deep Learning Model
class ModelTrainer:
    """Train a model with a criterion/optimizer pair and score its accuracy."""

    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def train(self, train_loader, epochs=10):
        """Run ``epochs`` passes over ``train_loader``.

        Args:
            train_loader: Iterable yielding ``(data, target)`` batches.
            epochs: Number of passes over the loader.

        Returns:
            List with the mean batch loss of each epoch (the same value that
            is printed for that epoch).

        Raises:
            ValueError: If ``train_loader`` yields no batches.
        """
        if isinstance(self.model, nn.Module):
            # Make sure train-only layers (dropout, batch norm) are active.
            self.model.train()

        epoch_losses = []
        for epoch in range(epochs):
            loss_sum, batch_count = 0.0, 0
            for data, target in train_loader:
                self.optimizer.zero_grad()
                outputs = self.model(data)
                loss = self.criterion(outputs, target)
                loss.backward()
                self.optimizer.step()
                loss_sum += loss.item()
                batch_count += 1
            if batch_count == 0:
                raise ValueError("train_loader yielded no batches.")
            # Report the epoch average rather than only the last batch.
            epoch_loss = loss_sum / batch_count
            epoch_losses.append(epoch_loss)
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')
        return epoch_losses

    def evaluate(self, test_loader):
        """Return classification accuracy (percent) over ``test_loader``.

        The predicted class is the index of the largest output along
        dimension 1. The model is put in eval mode for the pass and its
        previous mode is restored afterwards.

        Raises:
            ValueError: If ``test_loader`` yields no samples.
        """
        is_module = isinstance(self.model, nn.Module)
        was_training = self.model.training if is_module else False
        if is_module:
            self.model.eval()

        total, correct = 0, 0
        try:
            with torch.no_grad():
                for data, target in test_loader:
                    outputs = self.model(data)
                    _, predicted = torch.max(outputs, 1)
                    total += target.size(0)
                    correct += (predicted == target).sum().item()
        finally:
            if is_module:
                self.model.train(was_training)

        if total == 0:
            raise ValueError("test_loader yielded no samples.")
        accuracy = 100 * correct / total
        print(f'Test Accuracy: {accuracy:.2f}%')
        return accuracy

# Example Usage
if __name__ == "__main__":
    # Example pipeline: collect data, quantify uncertainty, validate a model,
    # tune hyperparameters, plot, and write a report.

    # Logger (set up first so the whole run is recorded)
    logger = Logger()
    logger.log("Starting pipeline.")

    # Data Collection
    data_collector = DataCollection(data_source="sensor")
    data = data_collector.collect_data()
    data = data_collector.preprocess_data()

    # Error Analysis
    error_analysis = ErrorAnalysis()
    std_error = error_analysis.calculate_standard_error(data)
    ci = error_analysis.confidence_interval(data)

    # Statistical Analysis
    stats_analysis = StatisticalAnalysis(data)
    t_stat, p_value = stats_analysis.hypothesis_test(pop_mean=0)

    # Model Validation
    X = np.random.rand(1000, 1)
    y = 3.5 * X.flatten() + np.random.normal(0, 0.1, 1000)
    model = LinearRegression()
    validator = ModelValidation(model, X, y)
    mse = validator.validate_model()
    logger.log(f"Model validation completed with MSE: {mse}")

    # Hyperparameter Optimization Example
    # The grid search runs once; an earlier identical search whose result was
    # overwritten before use has been dropped.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    rf_model = RandomForestRegressor()
    optimizer = HyperparameterOptimizer(rf_model, param_grid)
    best_model = optimizer.perform_grid_search(X, y)
    logger.log(f"Hyperparameter search selected: {best_model}")

    # Visualization
    vis = Visualization(data)
    vis.plot_distribution()
    vis.plot_confidence_intervals(ci)

    # Report
    context = {
        "title": "Research Report",
        "data_summary": {
            "standard_error": std_error,
            "confidence_interval": ci,
            "mse": mse,
            "t_stat": t_stat,
            "p_value": p_value,
        },
        "plots": {
            "distribution_plot": "distribution_plot.png",
            "confidence_interval_plot": "confidence_interval_plot.png"
        },
        "conclusion": "The pipeline was executed successfully.",
    }
    report_gen = ReportGenerator()

    # Capture paths from generate_report
    report_path, pdf_path = report_gen.generate_report(context)
    logger.log(f"Report written to {report_path} and {pdf_path}")

    print("PDF generated using WeasyPrint.")