In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class TorchNN(nn.Module):
    """Fully connected feed-forward network ending in a sigmoid.

    Every hidden layer is a ``Linear`` followed by ``ReLU``; the output layer
    is a ``Linear`` followed by ``Sigmoid`` so each output lies in (0, 1).

    Args:
        input_size: Number of input features.
        hidden_sizes: Iterable with the width of each hidden layer, in order.
        output_size: Number of output units (defaults to 1).
    """

    def __init__(self, input_size, hidden_sizes, output_size=1):
        super().__init__()
        # Consecutive pairs of this list are the (in, out) sizes of the hidden layers.
        widths = [input_size, *hidden_sizes]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Output head: sigmoid squashes the logits for binary classification.
        stack += [nn.Linear(widths[-1], output_size), nn.Sigmoid()]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid outputs."""
        return self.network(x)

In [2]:
class NeuralNetworkClassifier:
    """Minimal scikit-learn-style wrapper around ``TorchNN`` for binary classification.

    Args:
        input_size: Number of input features.
        hidden_sizes: Widths of the hidden layers.
        batch_size: Mini-batch size used by ``fit``.
        epochs: Number of passes over the training data.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, input_size, hidden_sizes=(64, 32), batch_size=32, epochs=10, lr=0.001):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.model = TorchNN(input_size, hidden_sizes)
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    @staticmethod
    def _as_float_tensor(data):
        """Convert array-like input (list, ndarray, pandas object, tensor) to float32."""
        if isinstance(data, torch.Tensor):
            return data.detach().to(dtype=torch.float32)
        # pandas objects are converted positionally so a non-contiguous index
        # cannot break element lookup during tensor construction.
        if hasattr(data, 'to_numpy'):
            data = data.to_numpy()
        return torch.as_tensor(data, dtype=torch.float32)

    def _positive_probabilities(self, X):
        """Return a 1-D tensor with one positive-class probability per row of ``X``."""
        self.model.eval()
        with torch.no_grad():
            # squeeze(-1) drops only the output-unit axis, so a single-row
            # input still yields shape (1,) instead of a 0-dim scalar.
            return self.model(self._as_float_tensor(X)).squeeze(-1)

    def fit(self, X_train, y_train):
        """Train the network with mini-batch Adam on binary cross-entropy.

        Args:
            X_train: Array-like of shape (n_samples, n_features).
            y_train: Array-like of 0/1 labels, one per sample.
        """
        features = self._as_float_tensor(X_train)
        targets = self._as_float_tensor(y_train).reshape(-1)
        dataset = TensorDataset(features, targets)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for epoch in range(self.epochs):
            for X_batch, y_batch in dataloader:
                self.optimizer.zero_grad()
                # A bare squeeze() would collapse a 1-sample batch to a scalar
                # and make BCELoss reject the (1,)-shaped target.
                outputs = self.model(X_batch).squeeze(-1)
                loss = self.criterion(outputs, y_batch)
                loss.backward()
                self.optimizer.step()

    def predict(self, X):
        """Return 0/1 predictions as a 1-D integer array (threshold 0.5)."""
        return (self._positive_probabilities(X) > 0.5).numpy().astype(int)

    def predict_proba(self, X):
        """Return the positive-class probability per sample as a 1-D array.

        Unlike scikit-learn classifiers this does not return an
        (n_samples, 2) matrix; only the probability of class 1 is returned.
        """
        return self._positive_probabilities(X).numpy()

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, 
                           roc_curve, auc, precision_recall_curve, average_precision_score)
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from imblearn.combine import SMOTETomek
import warnings
import os
import time
from datetime import datetime
import json
import torch
from tqdm import tqdm
from itertools import product

# Suppress every warning for the whole session (this also hides library deprecation notices).
warnings.filterwarnings('ignore')

# Define the MLAnalysis class
class MLAnalysis:
    def __init__(self):
        self.start_time = time.time()
        self.results_text = []
        self.feature_importance = None
        # self.hyperparameters = {
        #     'random_forest': {'n_estimators': [100], 'max_depth': [10, 20], 'class_weight': ['balanced']},
        #     'xgboost': {'learning_rate': [0.1], 'max_depth': [3, 6], 'n_estimators': [100]},
        #     'lightgbm': {'learning_rate': [0.1], 'max_depth': [3, 6], 'n_estimators': [100]},
        # }
        self.hyperparameters = {
            'random_forest': {
                'params': {
                    # 'n_estimators': [100, 200, 300],      # Number of trees
                    # 'max_depth': [10, 20, None],          # Maximum depth of trees
                    # 'min_samples_split': [2, 5, 10],      # Minimum samples required to split
                    # 'min_samples_leaf': [1, 2, 4],        # Minimum samples in leaf nodes
                    # 'max_features': ['sqrt', 'log2'],     # Feature selection method
                    # 'bootstrap': [True, False],           # Bootstrap samples
                    # 'class_weight': ['balanced', None]    # Class weight consideration
                    'n_estimators': [100,],      # Number of trees
                    'max_depth': [10,20],          # Maximum depth of trees
                    'class_weight': ['balanced']    # Class weight consideration
                },
                'param_explanations': {
                    'n_estimators': 'Controls the number of trees in the forest. More trees provide better accuracy but increase computation time.',
                    'max_depth': 'Maximum depth of each tree. None allows unlimited growth, while specific values prevent overfitting.',
                    'min_samples_split': 'Minimum samples required to split a node. Higher values prevent overfitting but might underfit.',
                    'min_samples_leaf': 'Minimum samples required in a leaf node. Higher values create more conservative trees.',
                    'max_features': 'Method for selecting features for splits. sqrt and log2 are common choices for classification.',
                    'bootstrap': 'Whether to use bootstrap samples. False means use whole dataset for each tree.',
                    'class_weight': 'Handling class imbalance. balanced adjusts weights inversely proportional to frequencies.'
                }
            },
            'xgboost': {
                'params': {
                    # 'learning_rate': [0.01, 0.1],         # Learning rate
                    # 'max_depth': [3, 5, 7],               # Maximum tree depth
                    # 'n_estimators': [100, 200],           # Number of boosting rounds
                    # 'subsample': [0.8, 1.0],              # Subsample ratio of training instances
                    # 'colsample_bytree': [0.8, 1.0],       # Subsample ratio of columns
                    # 'min_child_weight': [1, 3, 5],        # Minimum sum of instance weight in child
                    # 'gamma': [0, 0.1, 0.2],               # Minimum loss reduction for split
                    # 'reg_alpha': [0, 0.1, 0.5],           # L1 regularization
                    # 'reg_lambda': [0.1, 1.0]              # L2 regularization
                    'learning_rate': [0.1],         # Learning rate
                    'max_depth': [3, 6],               # Maximum tree depth
                    'n_estimators': [100],           # Number of boosting rounds
                  
                },
                'param_explanations': {
                    'learning_rate': 'Controls the contribution of each tree. Lower values mean more conservative boosting.',
                    'max_depth': 'Maximum depth of trees. Deeper trees can model more complex patterns but may overfit.',
                    'n_estimators': 'Number of boosting rounds. More rounds might improve performance but may overfit.',
                    'subsample': 'Fraction of samples used for training each tree. Helps prevent overfitting.',
                    'colsample_bytree': 'Fraction of features used for training each tree. Controls feature selection.',
                    'min_child_weight': 'Minimum sum of instance weight in child. Controls tree splitting behavior.',
                    'gamma': 'Minimum loss reduction required for split. Higher values make algorithm more conservative.',
                    'reg_alpha': 'L1 regularization term. Helps create sparse trees.',
                    'reg_lambda': 'L2 regularization term. Helps stabilize the model.'
                }
            },
            'lightgbm': {
                'params': {
                    # 'learning_rate': [0.01, 0.1],          # Learning rate
                    # 'num_leaves': [31, 63, 127],           # Maximum number of leaves
                    # 'max_depth': [3, 5, 7],                # Maximum tree depth
                    # 'n_estimators': [100, 200],            # Number of boosting iterations
                    # 'min_child_samples': [20, 50],         # Minimum samples in leaf
                    # 'min_child_weight': [0.001, 0.1],      # Minimum sum of instance weight
                    # 'min_split_gain': [0.0, 0.1],          # Minimum gain for split
                    # 'subsample': [0.8, 1.0],               # Sample ratio of training instances
                    # 'colsample_bytree': [0.8, 1.0],        # Feature selection ratio
                    # 'reg_alpha': [0.0, 0.1, 0.5],          # L1 regularization
                    # 'reg_lambda': [0.0, 0.1, 0.5],         # L2 regularization
                    # 'boosting_type': ['gbdt', 'dart']      # Boosting type
                    'learning_rate': [0.1],          # Learning rate
                    'max_depth': [3,6],                # Maximum tree depth
                    'n_estimators': [100],            # Number of boosting iterations
                    

                    'min_child_samples': [50],         # 增加最小樣本數要求
                    'min_child_weight': [0.01],        # 調整最小權重要求
                    'subsample': [0.8],                # 使用子採樣防止過擬合
                    'colsample_bytree': [0.8],         # 特徵採樣
                    'reg_alpha': [0.1],                # 增加一點 L1 正則化
                    'reg_lambda': [0.1],               # 增加一點 L2 正則化
                    'min_split_gain': [0.1]            # 設置最小分割增益
                },
                'param_explanations': {
                    'learning_rate': 'Step size shrinkage to prevent overfitting. Lower values need more iterations.',
                    'num_leaves': 'Maximum number of leaves in one tree. Controls model complexity.',
                    'max_depth': 'Maximum depth of the tree. -1 means no limit.',
                    'n_estimators': 'Number of boosting iterations. More iterations might improve performance.',
                    'min_child_samples': 'Minimum number of data needed in a leaf. Controls overfitting.',
                    'min_child_weight': 'Minimum sum of instance weight in leaf. Similar to min_child_samples.',
                    'min_split_gain': 'Minimum gain to make a split. Controls tree growth.',
                    'subsample': 'Training instance sampling ratio. Helps prevent overfitting.',
                    'colsample_bytree': 'Feature sampling ratio for each tree. Controls feature selection.',
                    'reg_alpha': 'L1 regularization. Helps create sparse trees.',
                    'reg_lambda': 'L2 regularization. Helps create more conservative trees.',
                    'boosting_type': 'Algorithm type. DART often provides better accuracy but might be unstable.'
                }
            },
            'neural_network': {
                'params': {
                    'hidden_sizes': [(50, 25), (128, 64, 32)],
                    'batch_size': [64],
                    'epochs': [100],
                    'lr': [ 0.0001]
                    
                    
                    # 'hidden_layer_sizes': [(50, 25)],  # Layer architecture
                    # 'activation': ['relu', 'tanh'],                        # Activation function
                    # 'alpha': [0.0001, 0.001, 0.01],                       # L2 penalty parameter
                    # 'learning_rate': ['constant', 'adaptive'],             # Learning rate schedule
                    # 'max_iter': [100],                               # Maximum iterations
                    # 'early_stopping': [True],                             # Early stopping usage
                    # 'validation_fraction': [0.1],                         # Validation set size
                    # 'batch_size': [ 64,128,256]                        # Batch size for training
                    # 'hidden_layer_sizes': [ (50, 25)],  # Layer architecture
                    # 'activation': ['relu'],                        # Activation function
                    # 'alpha': [0.0001],                       # L2 penalty parameter
                    # 'learning_rate': [ 'adaptive'],             # Learning rate schedule
                    # 'max_iter': [200],                               # Maximum iterations
                    # 'early_stopping': [True],                             # Early stopping usage
                    # 'validation_fraction': [0.1],                         # Validation set size
                    # 'batch_size': ['auto']                        # Batch size for training
                },
                'param_explanations': {
                    'hidden_layer_sizes': 'Architecture of hidden layers. More complex architectures can model more complex patterns.',
                    'activation': 'Activation function for hidden layers. ReLU is often default, tanh can work better for some cases.',
                    'alpha': 'L2 regularization term. Higher values mean stronger regularization.',
                    'learning_rate': 'Learning rate schedule for weight updates. Adaptive can be better for complex problems.',
                    'max_iter': 'Maximum number of iterations. Should be increased if model doesnt converge.',
                    'early_stopping': 'Whether to use early stopping to prevent overfitting.',
                    'validation_fraction': 'Fraction of training data to use for validation.',
                    'batch_size': 'Size of minibatches for training. Auto lets algorithm decide best size.'
                }
            }
        }
        # self.experiment_results = {}
        self.experiment_results = {
            'data_analysis': {},
            'feature_engineering': {},
            'model_training': {},
            'model_evaluation': {},
            'statistical_tests': {},
            'parameter_analysis': {}  # New section for parameter analysis
        }

    def load_data(self):
        columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
                   'marital-status', 'occupation', 'relationship', 'race', 'sex',
                   'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
        self.train_data = pd.read_csv('dataset/adult.data', names=columns, skipinitialspace=True)
        self.test_data = pd.read_csv('dataset/adult.test', names=columns, skipinitialspace=True, skiprows=1)
        self.test_data['income'] = self.test_data['income'].str.replace('.', '')
        print(f"Training set shape: {self.train_data.shape}")
        print(f"Test set shape: {self.test_data.shape}")

    def feature_engineering(self):
        for data in [self.train_data, self.test_data]:
            data.replace('?', np.nan, inplace=True)
            for col in ['workclass', 'occupation', 'native-country']:
                data[col].fillna(data[col].mode()[0], inplace=True)
            data['capital_total'] = data['capital-gain'] - data['capital-loss']
            data['has_capital'] = (data['capital_total'] != 0).astype(int)
            data['capital_per_hour'] = data['capital_total'] / (data['hours-per-week'] + 1)
            data['work_intensity'] = data['hours-per-week'] / data['age']
            education_map = {
                'Preschool': 1, '1st-4th': 1, '5th-6th': 1, '7th-8th': 2, '9th': 2,
                '10th': 2, '11th': 2, '12th': 2, 'HS-grad': 3, 'Some-college': 3,
                'Assoc-voc': 4, 'Assoc-acdm': 4, 'Bachelors': 5, 'Masters': 6,
                'Prof-school': 7, 'Doctorate': 7
            }
            data['education_level'] = data['education'].map(education_map)
        print("Feature engineering completed.")

    def prepare_data(self):
        """Encode, scale and rebalance the data for model training.

        Label-encodes the categorical columns (encoders fitted on the training
        frame and kept in ``self.encoders``), standardises the selected feature
        columns with a scaler fitted on the training set (``self.scaler``), and
        resamples the training set with SMOTETomek. The results are stored in
        ``self.X_train_balanced`` / ``self.y_train_balanced`` and
        ``self.X_test`` / ``self.y_test``.
        """
        categorical_cols = ['workclass', 'education', 'marital-status', 'occupation',
                            'relationship', 'race', 'sex', 'native-country']
        self.encoders = {}
        for col in categorical_cols:
            encoder = self.encoders[col] = LabelEncoder()
            # Fit on the training column only, then reuse the mapping for the test column.
            self.train_data[col] = encoder.fit_transform(self.train_data[col])
            self.test_data[col] = encoder.transform(self.test_data[col])

        self.feature_cols = ['age', 'workclass', 'education-num', 'education_level',
                             'marital-status', 'occupation', 'relationship', 'race', 'sex',
                             'capital_total', 'has_capital', 'work_intensity',
                             'capital_per_hour', 'hours-per-week']

        # Binary target: 1 when the income label is exactly '>50K', otherwise 0.
        train_features = self.train_data[self.feature_cols]
        train_target = self.train_data['income'].eq('>50K').astype(int)
        test_features = self.test_data[self.feature_cols]
        test_target = self.test_data['income'].eq('>50K').astype(int)

        # Scaling statistics come from the training set and are applied unchanged to the test set.
        self.scaler = StandardScaler()
        train_scaled = self.scaler.fit_transform(train_features)
        test_scaled = self.scaler.transform(test_features)

        # Only the training split is resampled; the test split is left as-is.
        resampler = SMOTETomek(random_state=42)
        self.X_train_balanced, self.y_train_balanced = resampler.fit_resample(train_scaled, train_target)
        self.X_test = test_scaled
        self.y_test = test_target
        print("Data preparation completed.")

    def create_model_performance_plots(self, model_name, model, y_pred):
        """Save diagnostic plots for a fitted model under ``plots/``.

        Args:
            model_name: Display name; used in plot titles and, lower-cased, in file names.
            model: Fitted estimator. A feature-importance plot is produced when it
                exposes ``feature_importances_`` and a ROC curve when it exposes
                ``predict_proba``.
            y_pred: Predicted labels for ``self.X_test`` (used for the confusion matrix).
        """
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs('plots', exist_ok=True)
        file_stem = model_name.lower()

        # Feature importance (top 10), only for models that expose it.
        if hasattr(model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': self.feature_cols,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            plt.figure(figsize=(10, 6))
            sns.barplot(data=importance_df.head(10), x='importance', y='feature')
            plt.title(f'{model_name} - Feature Importance')
            plt.tight_layout()
            plt.savefig(f'plots/{file_stem}_feature_importance.png')
            plt.close()

        # Confusion matrix on the test set.
        cm = confusion_matrix(self.y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'{model_name} - Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'plots/{file_stem}_confusion_matrix.png')
        plt.close()

        # ROC curve for models that can output probabilities.
        if hasattr(model, 'predict_proba'):
            proba = np.asarray(model.predict_proba(self.X_test))
            # scikit-learn style models return an (n_samples, n_classes) matrix, while
            # NeuralNetworkClassifier returns a 1-D vector of positive-class
            # probabilities; indexing [:, 1] unconditionally fails on the latter.
            y_proba = proba[:, 1] if proba.ndim == 2 else proba.ravel()
            fpr, tpr, _ = roc_curve(self.y_test, y_proba)
            plt.figure(figsize=(8, 6))
            plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc(fpr, tpr):.2f})')
            plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
            plt.title(f'{model_name} - ROC Curve')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.legend(loc='lower right')
            plt.tight_layout()
            plt.savefig(f'plots/{file_stem}_roc_curve.png')
            plt.close()
    
    def train_random_forest(self):
        """Grid-search, evaluate and record a random forest classifier."""
        self._train_grid_search_model(
            RandomForestClassifier(random_state=42), 'random_forest', 'Random Forest')

    def train_xgboost(self):
        """Grid-search, evaluate and record an XGBoost classifier."""
        self._train_grid_search_model(
            xgb.XGBClassifier(random_state=42), 'xgboost', 'XGBoost')

    def train_lightgbm(self):
        """Grid-search, evaluate and record a LightGBM classifier."""
        self._train_grid_search_model(
            lgb.LGBMClassifier(random_state=42), 'lightgbm', 'LightGBM')

    def _train_grid_search_model(self, estimator, config_key, display_name):
        """Shared training routine for the scikit-learn compatible models.

        Runs a 3-fold, F1-scored grid search over
        ``self.hyperparameters[config_key]['params']`` on the balanced training
        set, evaluates the best estimator on the test set, stores the outcome in
        ``self.experiment_results['model_training']['model_results'][config_key]``,
        writes the performance plots and prints a short summary.

        Args:
            estimator: Unfitted estimator to tune.
            config_key: Key into ``self.hyperparameters`` and into the results dict.
            display_name: Human-readable name used for plots and console output.
        """
        params = self.hyperparameters[config_key]['params']
        grid_search = GridSearchCV(estimator, param_grid=params, cv=3, scoring='f1', n_jobs=-1, verbose=1)
        grid_search.fit(self.X_train_balanced, self.y_train_balanced)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)

        # Record results
        model_results = self.experiment_results.setdefault('model_training', {}).setdefault('model_results', {})
        model_results[config_key] = {
            'test_performance': {
                'accuracy': accuracy,
                'classification_report': classification_report(self.y_test, y_pred, output_dict=True)
            },
            'cross_validation': {
                # Mean and spread of the CV F1 score for the winning parameter set.
                'mean': grid_search.best_score_,
                'std': grid_search.cv_results_['std_test_score'][grid_search.best_index_]
            },
            'best_parameters': grid_search.best_params_
        }

        # Create performance plots
        self.create_model_performance_plots(display_name, best_model, y_pred)

        print(f"{display_name} Test Accuracy:", accuracy)
        print(classification_report(self.y_test, y_pred))

    def train_neural_network(self):
        """Tune, evaluate and record the PyTorch neural network.

        Every combination in ``self.hyperparameters['neural_network']['params']``
        is trained on 80% of the balanced training data and scored on the
        remaining 20%. The combination with the highest validation accuracy is
        then evaluated once on the test set, so the test set plays no part in
        model selection.

        Raises:
            ValueError: If the hyperparameter grid contains no combination.
        """
        # Extract hyperparameters
        nn_params = self.hyperparameters['neural_network']['params']
        grid = list(product(nn_params['hidden_sizes'], nn_params['batch_size'],
                            nn_params['epochs'], nn_params['lr']))
        if not grid:
            raise ValueError("Neural network hyperparameter grid is empty.")

        # Hold out a stratified validation split for model selection. Labels are
        # converted to a plain array so the split parts can be indexed positionally.
        y_all = np.asarray(self.y_train_balanced)
        X_fit, X_val, y_fit, y_val = train_test_split(
            self.X_train_balanced, y_all,
            test_size=0.2, stratify=y_all, random_state=42)

        best_accuracy = None
        best_model = None
        best_params = None

        # Iterate over all combinations of hyperparameters
        for hidden_sizes, batch_size, epochs, lr in grid:
            # Same seed for every candidate: identical initialisation and shuffling
            # make the comparison between candidates reproducible.
            torch.manual_seed(42)
            model = NeuralNetworkClassifier(input_size=X_fit.shape[1],
                                            hidden_sizes=hidden_sizes,
                                            batch_size=batch_size,
                                            epochs=epochs,
                                            lr=lr)
            model.fit(X_fit, y_fit)

            # Evaluate on the validation split (never on the test set)
            accuracy = accuracy_score(y_val, model.predict(X_val))

            # ``is None`` guarantees the first candidate is always kept, even at accuracy 0.
            if best_accuracy is None or accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model
                best_params = {
                    'hidden_sizes': hidden_sizes,
                    'batch_size': batch_size,
                    'epochs': epochs,
                    'lr': lr
                }

        # Record results: the selected model is scored once on the untouched test set.
        y_pred = best_model.predict(self.X_test)
        test_accuracy = accuracy_score(self.y_test, y_pred)
        self.experiment_results.setdefault('model_training', {}).setdefault('model_results', {})['neural_network'] = {
            'test_performance': {
                'accuracy': test_accuracy,
                'classification_report': classification_report(self.y_test, y_pred, output_dict=True)
            },
            'validation': {'accuracy': best_accuracy},
            'best_parameters': best_params
        }

        # Create performance plots
        self.create_model_performance_plots('Neural Network', best_model, y_pred)

        print("Neural Network Test Accuracy:", test_accuracy)
        print(classification_report(self.y_test, y_pred))

    def save_experiment_state(self):
        experiment_data = {
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'duration': time.time() - self.start_time,
            'configuration': {'hyperparameters': self.hyperparameters}
        }
        with open('results/experiment_state.json', 'w') as f:
            json.dump(experiment_data, f, indent=4)
        print("Experiment state saved.")

    def save_analysis_reports(self):
        # Ensure the results directory exists
        if not os.path.exists('results'):
            os.makedirs('results')
    
        # Create main report file
        report_path = "results/model_analysis_report.txt"
        
        with open(report_path, 'w') as f:
            # Write overall summary
            f.write("MACHINE LEARNING MODEL ANALYSIS REPORT\n")
            f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("="*80 + "\n\n")
            
            # Write model comparison summary
            f.write("MODEL COMPARISON SUMMARY\n")
            f.write("="*80 + "\n")
            
            comparison_data = {}
            if 'model_training' in self.experiment_results:
                for model_name, results in self.experiment_results['model_training']['model_results'].items():
                    model_data = {
                        'Test Accuracy': results['test_performance']['accuracy'],
                        'CV Mean Score': results['cross_validation']['mean'],
                        'CV Std': results['cross_validation']['std']
                    }
                    comparison_data[model_name] = model_data
    
                # Convert comparison data to DataFrame for better formatting
                comparison_df = pd.DataFrame(comparison_data).T
                f.write(comparison_df.to_string())
                f.write("\n\n")
    
            # Write detailed reports for each model
            for model_name, results in self.experiment_results['model_training']['model_results'].items():
                f.write(f"\n{'='*80}\n")
                f.write(f"DETAILED REPORT FOR {model_name.upper()}\n")
                f.write(f"{'='*80}\n\n")
                
                # Overall performance
                f.write("1. OVERALL PERFORMANCE\n")
                f.write("-" * 50 + "\n")
                if 'cross_validation' in results:
                    f.write(f"Mean CV Score: {results['cross_validation']['mean']:.4f} "
                            f"(±{results['cross_validation']['std']*2:.4f})\n")
                f.write(f"Test Accuracy: {results['test_performance']['accuracy']:.4f}\n")
                f.write("\nClassification Report:\n")
                f.write(pd.DataFrame(results['test_performance']['classification_report']).to_string())
                f.write("\n\n")
                
                # Parameter analysis
                if 'best_parameters' in results:
                    f.write("2. PARAMETER ANALYSIS\n")
                    f.write("-" * 50 + "\n")
                    f.write("\nBest Parameters Selected:\n")
                    for param, value in results['best_parameters'].items():
                        f.write(f"{param}: {value}\n")
                f.write("\n")
        
        print("Analysis reports saved.")


In [4]:
# Instantiate the analysis class; this starts the run timer and builds the hyperparameter grids.
analysis = MLAnalysis()

In [5]:
# Data pipeline, in order: read the CSV files, derive features, then encode/scale/rebalance.
analysis.load_data()
analysis.feature_engineering()
analysis.prepare_data()

Training set shape: (32561, 15)
Test set shape: (16281, 15)
Feature engineering completed.
Data preparation completed.


In [None]:
# Grid-search (3-fold CV, F1 scoring) and evaluate the random forest on the test set.
analysis.train_random_forest()

Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [7]:
# Grid-search (3-fold CV, F1 scoring) and evaluate XGBoost on the test set.
analysis.train_xgboost()

Fitting 3 folds for each of 2 candidates, totalling 6 fits
XGBoost Test Accuracy: 0.8485351022664456
              precision    recall  f1-score   support

           0       0.93      0.87      0.90     12435
           1       0.65      0.79      0.71      3846

    accuracy                           0.85     16281
   macro avg       0.79      0.83      0.80     16281
weighted avg       0.86      0.85      0.85     16281



In [6]:
# Grid-search (3-fold CV, F1 scoring) and evaluate LightGBM on the test set.
analysis.train_lightgbm()

Fitting 3 folds for each of 2 candidates, totalling 6 fits
LightGBM Test Accuracy: 0.8457097229899884
              precision    recall  f1-score   support

           0       0.93      0.86      0.90     12435
           1       0.64      0.79      0.71      3846

    accuracy                           0.85     16281
   macro avg       0.79      0.83      0.80     16281
weighted avg       0.86      0.85      0.85     16281



In [None]:
# analysis.train_neural_network()

In [41]:
# Persist the run: results/experiment_state.json (config + timing) and
# results/model_analysis_report.txt (per-model text report).
analysis.save_experiment_state()
analysis.save_analysis_reports()

Experiment state saved.
Analysis reports saved.
