In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
# Load the preprocessed dataset into a single DataFrame.
# NOTE(review): the path is hard-coded and absolute ('/content/...'), which
# looks like a Colab session directory -- adjust it when running elsewhere.
df = pd.read_csv('/content/PCA_Training_Set_Preprocessed_Final.csv')

In [3]:
# Two-stage split with a fixed seed so the partition is reproducible:
#   1) hold out 30% of the rows as a temporary pool,
#   2) cut that pool in half -> validation and test.
# Net result is a 70 / 15 / 15 train / validation / test partition.
df_train, df_temp = train_test_split(
    df,
    random_state=42,
    test_size=0.3,
)
df_val, df_test = train_test_split(
    df_temp,
    random_state=42,
    test_size=0.5,
)

In [4]:
# Sanity check: report the row count of the full frame and of every split.
for label, frame in (
    ("Original dataframe", df),
    ("Training set", df_train),
    ("Validation set", df_val),
    ("Testing set", df_test),
):
    print(f"{label} size: {len(frame)}")

Original dataframe size: 50000
Training set size: 35000
Validation set size: 7500
Testing set size: 7500


In [5]:
# Hyper-parameter grid searched for LogisticRegression.
#   C       -- inverse of regularization strength (smaller C = stronger penalty)
#   penalty -- only 'l2' is searched; 'l1' would need solver 'liblinear' or 'saga'
#   solver  -- 'lbfgs', which is compatible with the 'l2' penalty
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l2'],
    solver=['lbfgs'],
)

In [6]:
def _features_and_target(frame, target='Test Results'):
    """Split ``frame`` into ``(X, y)``: all columns except ``target``, and ``target`` itself."""
    return frame.drop(columns=target), frame[target]


# === Estimator: logistic regression with a raised iteration cap
logistic_regression = LogisticRegression(max_iter=1000)

# === Exhaustive search over `param_grid_lr`
# 5-fold cross-validation, ranked by accuracy, using all available cores;
# verbose=2 makes the search log its progress.
grid_search_lr = GridSearchCV(
    logistic_regression,
    param_grid_lr,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=2,
)

# === Fit on the training split (PCA-reduced features)
X_train_pca, y_train = _features_and_target(df_train)
grid_search_lr.fit(X_train_pca, y_train)

# === Winning configuration and its mean cross-validated accuracy
print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)
print("Best cross-validation accuracy for Logistic Regression:", grid_search_lr.best_score_)

# === Validation split
# `grid_search_lr.predict` delegates to the best estimator found by the search.
X_val_pca, y_val = _features_and_target(df_val)
y_val_pred = grid_search_lr.predict(X_val_pca)

print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred))

# === Test split
X_test_pca, y_test = _features_and_target(df_test)
y_test_pred = grid_search_lr.predict(X_test_pca)

print("\nTest Classification Report:")
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters for Logistic Regression: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best cross-validation accuracy for Logistic Regression: 0.9625142857142859

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      2464
           1       0.93      0.95      0.94      2509
           2       0.97      0.97      0.97      2527

    accuracy                           0.96      7500
   macro avg       0.96      0.96      0.96      7500
weighted avg       0.96      0.96      0.96      7500


Test Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2547
           1       0.93      0.95      0.94      2440
           2       0.97      0.97      0.97      2513

    accuracy                           0.96      7500
   macro avg       0.96      0.96      0.96      7500
weighted avg       0.96      0.96      0.96      7500

In [7]:
# Summarise test-set performance of the tuned logistic regression as a table.
# (pandas and the sklearn.metrics helpers are already imported in the
# notebook's first cell, so they are not re-imported here.)

# Best estimator selected by the grid search.
best_lr_model = grid_search_lr.best_estimator_

# Ground truth and predictions for the held-out test split.
y_true_test_lr = df_test['Test Results']
y_pred_test_lr = best_lr_model.predict(df_test.drop('Test Results', axis=1))

# Classification report as a nested dict so individual numbers can be pulled out.
report = classification_report(y_true_test_lr, y_pred_test_lr, output_dict=True)

# Use the support-weighted averages across classes.
weighted_avg = report['weighted avg']
precision = weighted_avg['precision']
sensitivity = weighted_avg['recall']  # recall is sensitivity
f1_score = weighted_avg['f1-score']
accuracy = accuracy_score(y_true_test_lr, y_pred_test_lr)
# NOTE: support-weighted recall is mathematically identical to overall
# accuracy, so the 'Sensitivity' and 'Accuracy' rows always show the same value.

# Two-column table: metric name -> value.
metrics_data = {
    'Metric': ['Precision', 'Sensitivity', 'F1 Score', 'Accuracy'],
    'Value': [precision, sensitivity, f1_score, accuracy]
}
metrics_df = pd.DataFrame(metrics_data)

# `display` is provided by the IPython/Jupyter kernel and renders the frame richly.
print("Performance Metrics for Logistic Regression on Test Set:")
display(metrics_df)

Performance Metrics for Logistic Regression on Test Set:


Unnamed: 0,Metric,Value
0,Precision,0.960447
1,Sensitivity,0.960267
2,F1 Score,0.960332
3,Accuracy,0.960267


In [8]:
# Print the test-set accuracy of the tuned logistic regression
# (`accuracy` is computed in the metrics cell above).
print("Accuracy:", accuracy)

Accuracy: 0.9602666666666667
