Program to do analysis faster: model comparison

In [None]:
# E3.ipynb

import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet

from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Data preparation: load the 'train' and 'test' sheets of the workbook, split
# off the target column, and expand the features with PolynomialFeatures.
# ---------------------------------------------------------------------------
file_path = 'E3-MLR3.xlsx'
frames = {
    sheet: pd.read_excel(file_path, sheet_name=sheet)
    for sheet in ('train', 'test')
}
train_data = frames['train']
test_data = frames['test']

# Everything except the target column is treated as a feature.
target_column = 'y'
X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

# The expansion is fitted on the training features only and then re-applied,
# unchanged, to the test features.
deg = 6
poly = PolynomialFeatures(degree=deg)
X_train_poly = poly.fit(X_train).transform(X_train)
X_test_poly = poly.transform(X_test)

####################################
# Augmented test set (expanded features plus the target), kept for review.
feature_names = poly.get_feature_names_out(X_train.columns)
augmented_data = pd.DataFrame(X_test_poly, columns=feature_names).assign(
    y=test_data['y']
)

# Uncomment to write the augmented data to CSV:
#augmented_data.to_csv(f'aug_deg_{deg}.csv', index=False)
####################################

# Algorithms to compare; every model is fitted on the same polynomial features.
# NOTE: the 'XGBoost' entry is scikit-learn's GradientBoostingRegressor, not
# the separate xgboost package. The label is kept so the metric tables and
# plot titles keep their existing row names.
algorithms = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(kernel='rbf'),  # Adjust kernel as needed
    'RandomForest': RandomForestRegressor(),
    'XGBoost': GradientBoostingRegressor(),
    'knn': KNeighborsRegressor(),
    'Neural Network': MLPRegressor(hidden_layer_sizes=[20,20], max_iter=4000),
   # 'Elastic Net' : ElasticNet()
}


def _evaluate_fit(y_true, y_pred):
    """Return ``(metrics, residuals)`` for one set of predictions.

    ``metrics`` maps the metric-table column names to their values:
    R-squared, MSE, the Durbin-Watson statistic of the residuals, and the
    Jarque-Bera statistic with its p-value. ``residuals`` is
    ``y_true - y_pred`` and is returned so the caller can plot it.
    """
    residuals = y_true - y_pred
    jb_stat, jb_p_value, _, _ = sm.stats.jarque_bera(residuals)
    metrics = {
        # r2_score on the existing predictions gives the value that
        # regressor.score() reports, without predicting a second time.
        'R-squared': r2_score(y_true, y_pred),
        'MSE': mean_squared_error(y_true, y_pred),
        'Durbin-Watson': sm.stats.durbin_watson(residuals),
        'Jarque-Bera': jb_stat,
        'JB P-value': jb_p_value,
    }
    return metrics, residuals


# Per-algorithm metric rows; turned into the metric tables after the loop.
metric_rows_train = {}
metric_rows_test = {}

# One row of four plots per algorithm. squeeze=False keeps `axs` 2-D even when
# only a single algorithm is enabled, so axs[row, col] indexing always works.
fig, axs = plt.subplots(len(algorithms), 4, figsize=(20, 20), squeeze=False)

# Run the algorithms ... create metrics and plots
# NOTE(review): the plots assume both sheets have a feature column named 'x1'.
for fig_row, (algorithm_name, algorithm) in enumerate(algorithms.items()):

    # Train model
    algorithm.fit(X_train_poly, y_train)

    # Train / test predictions
    y_train_pred = algorithm.predict(X_train_poly)
    y_test_pred = algorithm.predict(X_test_poly)

    # Metrics and residuals for both splits
    metric_rows_train[algorithm_name], residuals_train = _evaluate_fit(y_train, y_train_pred)
    metric_rows_test[algorithm_name], residuals_test = _evaluate_fit(y_test, y_test_pred)

    # Column 0: actual vs. predicted on the training data
    axs[fig_row, 0].scatter(train_data['x1'], y_train, label='Actual')
    axs[fig_row, 0].scatter(train_data['x1'], y_train_pred, label='Predicted')
    axs[fig_row, 0].set_title(algorithm_name + " - Train")
    axs[fig_row, 0].legend()

    # Column 1: training residuals
    axs[fig_row, 1].scatter(train_data['x1'], residuals_train)
    axs[fig_row, 1].set_title(algorithm_name + " Residuals - Train")

    # Column 2: actual vs. predicted on the test data
    axs[fig_row, 2].scatter(test_data['x1'], y_test, label='Actual')
    axs[fig_row, 2].scatter(test_data['x1'], y_test_pred, label='Predicted')
    axs[fig_row, 2].set_title(algorithm_name + " - Test")
    axs[fig_row, 2].legend()

    # Column 3: test residuals
    axs[fig_row, 3].scatter(test_data['x1'], residuals_test)
    axs[fig_row, 3].set_title(algorithm_name + " Residuals - Test")
############################

# Metric tables: one row per algorithm, one column per metric.
metric_table_train = pd.DataFrame.from_dict(metric_rows_train, orient='index')
metric_table_test = pd.DataFrame.from_dict(metric_rows_test, orient='index')

plt.tight_layout()
plt.show()

In [None]:
# Print the train and test metric tables, separated by a rule.
report_sections = (
    "Metrics - Train Data:\n",
    metric_table_train.to_string(),
    "-------------------------------------------------",
    "Metrics - Test Data:\n",
    metric_table_test.to_string(),
)
print(*report_sections, sep="\n")

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Read the datasets
datasets = ['Clusters-4-v0.csv', 'Clusters-4-v1.csv', 'Clusters-4-v2.csv']
data = [pd.read_csv(dataset) for dataset in datasets]

# Step of the mesh used to draw the decision regions (in feature units).
GRID_STEP = 0.01
# One colour per class; reused cyclically if there are more classes than colours.
CLASS_COLORS = ('red', 'green', 'blue', 'orange', 'purple',
                'brown', 'pink', 'gray', 'olive', 'cyan')


def _split_features_target(frame):
    """Split one dataset into a 2-D feature array and a 1-D label array.

    NOTE(review): assumes the class label is the LAST column and that every
    other column is a numeric feature -- confirm against the CSV schema.
    NumPy arrays are returned because the plotting code indexes the features
    positionally (``X[:, 0]``), which a DataFrame does not support.
    """
    features = frame.iloc[:, :-1].to_numpy()
    labels = frame.iloc[:, -1].to_numpy()
    return features, labels


def _auc_score(classifier, X, y):
    """Return the ROC AUC of a fitted classifier from its class probabilities.

    Two classes: the probability of the second class is scored directly.
    More than two classes: one-vs-rest AUC over all classes the classifier
    was fitted on.
    """
    proba = classifier.predict_proba(X)
    if proba.shape[1] == 2:
        return roc_auc_score(y, proba[:, 1])
    return roc_auc_score(y, proba, multi_class='ovr', labels=classifier.classes_)


def _plot_decision_regions(classifier, X_set, y_set, step=GRID_STEP):
    """Plot a fitted classifier's decision regions over two features.

    ``X_set`` must be a 2-column array; ``y_set`` holds the matching labels.
    """
    classes = np.unique(y_set)
    colors = [CLASS_COLORS[k % len(CLASS_COLORS)] for k in range(len(classes))]

    X1, X2 = np.meshgrid(
        np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=step),
        np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=step),
    )
    grid = np.column_stack([X1.ravel(), X2.ravel()])
    # Map predicted labels to 0..n-1 so contourf receives numeric input and
    # each class gets its own band (and therefore its own colour).
    regions = np.searchsorted(classes, classifier.predict(grid)).reshape(X1.shape)
    plt.contourf(X1, X2, regions,
                 levels=np.arange(len(classes) + 1) - 0.5,
                 alpha=0.75, cmap=ListedColormap(colors))
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for class_index, class_label in enumerate(classes):
        members = y_set == class_label
        plt.scatter(X_set[members, 0], X_set[members, 1],
                    color=colors[class_index], label=class_label)
    plt.title('Classifier (Training set)')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend()
    plt.show()


# Split each dataset into training and testing sets.
# Each entry is (X_train, X_test, y_train, y_test).
train_test_data = [
    train_test_split(*_split_features_target(dataset), test_size=0.2, random_state=42)
    for dataset in data
]

# Define the machine learning algorithms
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'SVC Linear': SVC(kernel='linear', probability=True),
    'SVC RBF': SVC(kernel='rbf', probability=True),
    'Random Forest 1': RandomForestClassifier(min_samples_leaf=1),
    'Random Forest 3': RandomForestClassifier(min_samples_leaf=3),
    'Random Forest 5': RandomForestClassifier(min_samples_leaf=5),
    'Neural Network 5': MLPClassifier(hidden_layer_sizes=(5,)),
    'Neural Network 5,5': MLPClassifier(hidden_layer_sizes=(5,5)),
    'Neural Network 5,5,5': MLPClassifier(hidden_layer_sizes=(5,5,5)),
    'Neural Network 10': MLPClassifier(hidden_layer_sizes=(10,))
}

# Metric rows are collected in a list and turned into a DataFrame once at the
# end (DataFrame.append no longer exists in pandas 2.x).
metric_columns = ['Algorithm', 'Dataset', 'Data', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'AUC']
metric_rows = []

# For each algorithm
for name, classifier in classifiers.items():
    # For each dataset
    for dataset_index, (X_train, X_test, y_train, y_test) in enumerate(train_test_data):
        # Train the model
        classifier.fit(X_train, y_train)

        # Make predictions
        y_train_pred = classifier.predict(X_train)
        y_test_pred = classifier.predict(X_test)

        # Calculate metrics for both splits; precision/recall/F1 are per-class arrays
        splits = [('Train', X_train, y_train, y_train_pred),
                  ('Test', X_test, y_test, y_test_pred)]
        for split_name, X, y, y_pred in splits:
            metric_rows.append({
                'Algorithm': name,
                'Dataset': datasets[dataset_index],
                'Data': split_name,
                'Accuracy': accuracy_score(y, y_pred),
                'Precision': precision_score(y, y_pred, average=None),
                'Recall': recall_score(y, y_pred, average=None),
                'F1-score': f1_score(y, y_pred, average=None),
                'AUC': _auc_score(classifier, X, y),
            })

        # Plot the classification boundaries (only possible with exactly two features)
        if X_train.shape[1] == 2:
            _plot_decision_regions(classifier, X_train, y_train)
        else:
            print(f"Skipping decision-boundary plot for {datasets[dataset_index]}: "
                  f"needs exactly 2 features, got {X_train.shape[1]}")

# Save the metrics into a CSV file
metrics = pd.DataFrame(metric_rows, columns=metric_columns)
metrics.to_csv('metrics.csv', index=False)

The metrics generated for classification algorithms provide a way to evaluate the performance of the models. Here's a brief explanation of each metric:

Accuracy: This is the ratio of the total number of correct predictions to the total number of predictions. It's a good measure when the target classes are well balanced. However, it can be misleading if the classes are imbalanced.

Precision (per class and average): Precision is the ratio of true positives (correctly predicted positives) to the total predicted positives. It's a measure of a classifier's exactness. Low precision indicates a high number of false positives.

Recall (per class and average): Recall (also known as sensitivity) is the ratio of true positives to the total actual positives. It's a measure of a classifier's completeness. Low recall indicates a high number of false negatives.

F1-score (per class and average): The F1 score is the harmonic mean of precision and recall, so it is high only when both are high. It balances the two in a single number and is usually more informative than accuracy when the classes are imbalanced.

AUC (per class and average): AUC stands for "Area Under the ROC Curve". The ROC curve plots the true positive rate (recall) against the false positive rate (1 - specificity) as the decision threshold varies. AUC is the total area underneath that curve and ranges from 0 to 1: a model whose predictions are 100% wrong has an AUC of 0, a model that ranks no better than random guessing scores about 0.5, and a model whose predictions are 100% correct has an AUC of 1.

By comparing these metrics across different models, you can determine which model performs best for your specific task. Remember, the choice of metric depends on your business objective. For example, in a fraud detection scenario, you might want to prioritize recall (to catch as many fraud cases as possible) over precision (to avoid false alarms).

