In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [None]:
model_metrics = {}

In [2]:
df = pd.read_csv('data_Atom.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,Activity
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
'Smiles' in df.columns

False

In [4]:
df.shape

(23791, 2049)

In [5]:
# Get columns with null values
null_columns = df.columns[df.isnull().any()].tolist()

# Print the columns with null values
print("Columns with null values:", null_columns)

for i in null_columns:
    print(df[i].isnull().mean()*100)

Columns with null values: []


In [6]:
df.dropna(inplace = True)

In [7]:
df.shape

(23791, 2049)

In [8]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import warnings
warnings.filterwarnings('ignore')


# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)

# Split data into training and testing sets (one-time split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the number of folds for cross-validation
n_splits = 10

# Define the parameter grid for hyperparameter tuning (smaller range for faster results)
param_dist = {
    'n_estimators': np.arange(50, 201, 10),  # Reduced range for faster computation
    'max_depth': np.arange(5, 21, 1),        # Reduced range for faster computation
    'random_state': [42]                    # Fixed for reproducibility
}

# Create the Random Forest model
rf = RandomForestClassifier()

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, 
                                   n_iter=20, cv=n_splits, scoring='roc_auc', n_jobs=-1, random_state=42, verbose=2)

# Perform RandomizedSearchCV to find the best parameters
random_search.fit(X_train, y_train)

# Get the best parameters and train the final Random Forest model
best_params = random_search.best_params_

# Train the Random Forest model using the best parameters
best_rf = RandomForestClassifier(**best_params)
best_rf.fit(X_train, y_train)

# Make predictions on the test set using the best model
y_pred = best_rf.predict(X_test)
y_pred_prob = best_rf.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics with the final model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Store the metrics for the final model in the global dictionary
model_metrics['RandomForest'] = [
    accuracy, roc_auc, precision, recall, f1, specificity, balanced_accuracy, fpr, mcc, [TN, FP, FN, TP]
]

# Beautified printing of metrics
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")


Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best parameters: {'random_state': 42, 'n_estimators': 190, 'max_depth': 18}


        Model Evaluation Metrics        
Metric                            Value
----------------------------------------
Accuracy                         0.8661
ROC AUC                          0.8568
Precision                        0.7500
Recall                           0.2444
F1 Score                         0.3687
Specificity                      0.9845
Balanced Accuracy                0.6145
FPR                              0.0155
Matthews CC                      0.3775


            Confusion Matrix            
TN         FP         FN         TP        
----------------------------------------
3936       62         575        186       




In [9]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'n_estimators': randint(50, 301),  # Random integer range for n_estimators
    'max_depth': randint(1, 31),       # Random integer range for max_depth
    'scale_pos_weight': uniform(0.1, 10.0)  # Uniform distribution for scale_pos_weight
}

# Create the XGBClassifier model
xgb = XGBClassifier(objective='binary:logistic', random_state=42, use_label_encoder=False, eval_metric='logloss')

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist, n_iter=20, random_state=42, cv=5, n_jobs=-1, verbose=2, scoring='roc_auc')

# Perform hyperparameter tuning
random_search.fit(X_train, y_train)

# Get the best parameters from RandomizedSearchCV
best_params = random_search.best_params_

# Train the XGBClassifier model on the entire training set using the best parameters
best_xgb = XGBClassifier(**best_params, objective='binary:logistic', random_state=42, use_label_encoder=False, eval_metric='logloss')
best_xgb.fit(X_train, y_train)

# Make predictions with the best model
y_pred = best_xgb.predict(X_test)
y_pred_prob = best_xgb.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics with the final model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Add final model metrics to the global dictionary
model_metrics['XGBoost_final'] = [
    accuracy, roc_auc, precision, recall, f1, specificity, balanced_accuracy, fpr, mcc, [TN, FP, FN, TP]
]

# Beautified printing of metrics
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")


Fitting 10 folds for each of 20 candidates, totalling 200 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END .....max_depth=19, n_estimators=90, random_state=42; total time=  19.3s
[CV] END .....max_depth=19, n_estimators=90, random_state=42; total time=  18.1s
[CV] END .....max_depth=5, n_estimators=110, random_state=42; total time=   4.2s
[CV] END .....max_depth=5, n_estimators=110, random_state=42; total time=   4.2s
[CV] END .....max_depth=5, n_estimators=110, random_state=42; total time=   4.4s
[CV] END .....max_depth=5, n_estimators=110, random_state=42; total time=   4.4s
[CV] END .....max_depth=9, n_estimators=200, random_state=42; total time=  13.3s
[CV] END .....max_depth=9, n_estimators=200, random_state=42; total time=  13.5s
[CV] END .....max_depth=9, n_estimators=200, random_state=42; total time=  13.5s
[CV] END ....max_depth=17, n_estimators=190, random_state=42; total time=  31.9s
[CV] END ....max_depth=17, n_estimators=190, random_state=42; total time=  32.7s
[CV] END ....max_depth=12, n_estimators=100, random_state=42; total time=  10.2s
[CV] END ....max_depth=12, n

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END .....max_depth=19, n_estimators=90, random_state=42; total time=  19.5s
[CV] END .....max_depth=19, n_estimators=90, random_state=42; total time=  17.5s
[CV] END .....max_depth=19, n_estimators=90, random_state=42; total time=  17.4s
[CV] END .....max_depth=5, n_estimators=110, random_state=42; total time=   4.7s
[CV] END .....max_depth=9, n_estimators=200, random_state=42; total time=  13.8s
[CV] END .....max_depth=9, n_estimators=200, random_state=42; total time=  13.9s
[CV] END ....max_depth=17, n_estimators=190, random_state=42; total time=  31.2s
[CV] END ....max_depth=17, n_estimators=190, random_state=42; total time=  30.8s
[CV] END ....max_depth=17, n_estimators=190, random_state=42; total time=  32.0s
[CV] END ....max_depth=12, n_estimators=100, random_state=42; total time=  10.4s
[CV] END ....max_depth=16, n_estimators=140, random_state=42; total time=  21.1s
[CV] END ....max_depth=16, n_estimators=140, random_state=42; total time=  20.4s
[CV] END ....max_depth=16, n

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters: {'max_depth': 26, 'n_estimators': 71, 'scale_pos_weight': 0.17066305219717406}


        Model Evaluation Metrics        
Metric                            Value
----------------------------------------
Accuracy                         0.8773
ROC AUC                          0.8667
Precision                        0.7690
Recall                           0.3325
F1 Score                         0.4642
Specificity                      0.9810
Balanced Accuracy                0.6567
FPR                              0.0190
Matthews CC                      0.4529


            Confusion Matrix            
TN         FP         FN         TP        
----------------------------------------
3922       76         508        253       




In [10]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
import warnings
warnings.filterwarnings('ignore')

# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'max_depth': randint(1, 31),  # Random integer range for max_depth
    'min_samples_split': randint(2, 20),  # Random integer range for min_samples_split
    'min_samples_leaf': randint(1, 20)  # Random integer range for min_samples_leaf
}

# Create the DecisionTreeClassifier model
dt = DecisionTreeClassifier(random_state=69)

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=dt, param_distributions=param_dist, n_iter=20, cv=10, scoring='roc_auc', random_state=42, n_jobs=-1)

# Fit the RandomizedSearchCV to find the best parameters
random_search.fit(X_train, y_train)

# Get the best parameters from RandomizedSearchCV
best_params = random_search.best_params_

# Train the DecisionTreeClassifier model on the entire training set using the best parameters
best_dt = DecisionTreeClassifier(**best_params, random_state=69)
best_dt.fit(X_train, y_train)

# Make predictions with the best model
y_pred = cross_val_predict(best_dt, X_train, y_train, cv=10)
y_pred_prob = best_dt.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics with the final model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Add final model metrics to the global dictionary
model_metrics['DecisionTree'] = [
    accuracy, roc_auc, precision, recall, f1, specificity, balanced_accuracy, fpr, mcc, [TN, FP, FN, TP]
]

# Beautified printing of metrics
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")


Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END max_depth=7, n_estimators=229, scale_pos_weight=9.60714306409916; total time=  42.7s
[CV] END max_depth=7, n_estimators=229, scale_pos_weight=9.60714306409916; total time=  41.9s
[CV] END max_depth=11, n_estimators=121, scale_pos_weight=6.086584841970366; total time=  34.9s
[CV] END max_depth=11, n_estimators=121, scale_pos_weight=6.086584841970366; total time=  34.5s
[CV] END max_depth=7, n_estimators=171, scale_pos_weight=1.6599452033620266; total time=  32.5s
[CV] END max_depth=7, n_estimators=171, scale_pos_weight=1.6599452033620266; total time=  33.1s
[CV] END max_depth=7, n_estimators=171, scale_pos_weight=1.6599452033620266; total time=  32.2s
[CV] END max_depth=11, n_estimators=252, scale_pos_weight=8.761761457749351; total time=  53.3s
[CV] END max_depth=11, n_estimators=252, scale_pos_weight=8.761761457749351; total time=  55.3s
[CV] END max_depth=4, n_estimators=153, scale_pos_weight=7.180725777960454; t

In [11]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint
import warnings
warnings.filterwarnings('ignore')

# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'n_neighbors': randint(1, 31),  # Random integer range for n_neighbors
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    'p': [1, 2]  # Power parameter for the Minkowski metric
}

# Create the KNeighborsClassifier model
knn = KNeighborsClassifier()

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=20, cv=10, scoring='roc_auc', random_state=42, n_jobs=-1)

# Fit the RandomizedSearchCV to find the best parameters
random_search.fit(X_train, y_train)

# Get the best parameters from RandomizedSearchCV
best_params = random_search.best_params_

# Train the KNeighborsClassifier model on the entire training set using the best parameters
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train, y_train)

# Make predictions with the best model
y_pred = best_knn.predict(X_test)
y_pred_prob = best_knn.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics with the final model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Store the metrics in a list for visualization
model_metrics = {}
model_metrics['KNeighbors'] = [
    accuracy, roc_auc, precision, recall, f1, specificity, balanced_accuracy, fpr, mcc, [TN, FP, FN, TP]
]

# Beautified printing of metrics
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")


Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best parameters: {'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}


        Model Evaluation Metrics        
Metric                            Value
----------------------------------------
Accuracy                         0.8670
ROC AUC                          0.8447
Precision                        0.6265
Recall                           0.4166
F1 Score                         0.5004
Specificity                      0.9527
Balanced Accuracy                0.6846
FPR                              0.0473
Matthews CC                      0.4391


            Confusion Matrix            
TN         FP         FN         TP        
----------------------------------------
3809       189        444        317       




In [12]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.naive_bayes import GaussianNB
from scipy.stats import uniform
import warnings
warnings.filterwarnings('ignore')

# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'var_smoothing': uniform(1e-10, 1e-9)  # Uniform distribution for var_smoothing
}

# Create the GaussianNB model
gnb = GaussianNB()

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=gnb, param_distributions=param_dist, n_iter=200, scoring='roc_auc', cv=5, random_state=42, n_jobs=-1)

# Fit the model with the best parameters
random_search.fit(X_train, y_train)

# Get the best model from RandomizedSearchCV
best_gnb = random_search.best_estimator_

# Make predictions with the best model
y_pred = best_gnb.predict(X_test)
y_pred_prob = best_gnb.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics with the final model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Add final model metrics to the global dictionary
model_metrics['GaussianNB'] = [
    accuracy,
    roc_auc,
    precision,
    recall,
    f1,
    specificity,
    balanced_accuracy,
    fpr,
    mcc,
    [TN, FP, FN, TP]
]

# Beautified printing of metrics
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")


Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[CV] END max_depth=3, min_samples_leaf=14, min_samples_split=18; total time=   1.0s
[CV] END max_depth=3, min_samples_leaf=14, min_samples_split=18; total time=   1.1s
[CV] END max_depth=4, min_samples_leaf=18, min_samples_split=9; total time=   1.1s
[CV] END max_depth=4, min_samples_leaf=18, min_samples_split=9; total time=   1.1s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=7; total time=   1.1s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=7; total time=   1.1s
[CV] END max_depth=4, min_samples_leaf=2, min_samples_split=7; total time=   1.2s
[CV] END max_depth=22, min_samples_leaf=10, min_samples_split=5; total time=   2.5s
[CV] END max_depth=22, min_samples_leaf=10, min_samples_split=5; total time=   2.3s
[CV] END ................n_neighbors=7, p=2, weights=uniform; total time=  11.1s
[CV] END ................n_neighbors=7, p=2, weights=uniform; total time=   9.9s
[CV] END ..............n_

In [13]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform
import warnings
warnings.filterwarnings('ignore')

# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'C': uniform(0.01, 10.0),  # Uniform distribution for regularization strength
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization penalty
    'solver': ['lbfgs', 'liblinear', 'saga']  # Algorithms for optimization
}

# Create the LogisticRegression model
logreg = LogisticRegression(random_state=69, max_iter=200)

# Use RandomizedSearchCV to search for the best hyperparameters
random_search = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_dist,
    n_iter=20,  # Number of iterations
    random_state=42,
    scoring='roc_auc',  # Use ROC AUC as scoring metric
    cv=5,  # 5-fold cross-validation
    n_jobs=-1
)

# Fit the model using RandomizedSearchCV
random_search.fit(X_train, y_train)

# Extract the best parameters and metrics
best_params = random_search.best_params_

# Train the final LogisticRegression model with the best hyperparameters
best_logreg = LogisticRegression(**best_params, random_state=69, max_iter=200)
best_logreg.fit(X_train, y_train)

# Make predictions with the best model
y_pred = best_logreg.predict(X_test)
y_pred_prob = best_logreg.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute the metrics for the final model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from the confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Add final model metrics to the global dictionary as a list
model_metrics['LogisticRegression'] = [
    accuracy, roc_auc, precision, recall, f1, specificity, balanced_accuracy, fpr, mcc, [TN, FP, FN, TP]
]

# Beautified printing of metrics
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")


Fitting 10 folds for each of 20 candidates, totalling 200 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV] END ...............var_smoothing=2.3752094414599324e-10; total time=   2.3s
[CV] END ...............var_smoothing=2.3752094414599324e-10; total time=   2.5s
[CV] END ...............var_smoothing=2.3752094414599324e-10; total time=   2.4s
[CV] END ...............var_smoothing=4.4106635105025854e-10; total time=   2.5s
[CV] END ...............var_smoothing=4.4106635105025854e-10; total time=   2.6s
[CV] END ................var_smoothing=2.134735212405891e-10; total time=   2.5s
[CV] END ................var_smoothing=2.134735212405891e-10; total time=   2.8s
[CV] END ...............var_smoothing=1.0246936182785628e-09; total time=   2.6s
[CV] END ...............var_smoothing=1.0246936182785628e-09; total time=   2.4s
[CV] END ...............var_smoothing=1.0246936182785628e-09; total time=   2.6s
[CV] END .................var_smoothing=9.77339353380981e-10; total time=   2.4s
[CV] END .................var_smoothing=9.77339353380981e-10; total time=   2.4s
[CV] END ...............var_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters: {'C': 0.21584494295802448, 'penalty': 'l2', 'solver': 'liblinear'}


        Model Evaluation Metrics        
Metric                            Value
----------------------------------------
Accuracy                         0.8504
ROC AUC                          0.7413
Precision                        0.6207
Recall                           0.1656
F1 Score                         0.2614
Specificity                      0.9807
Balanced Accuracy                0.5732
FPR                              0.0193
Matthews CC                      0.2654


            Confusion Matrix            
TN         FP         FN         TP        
----------------------------------------
3921       77         635        126       




In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import ParameterSampler
import numpy as np


In [None]:
class ANNModel(nn.Module):
    def __init__(self, input_size, hidden_layer_sizes):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_layer_sizes[0])
        self.fc2 = nn.Linear(hidden_layer_sizes[0], hidden_layer_sizes[1])
        self.fc3 = nn.Linear(hidden_layer_sizes[1], hidden_layer_sizes[2])
        self.fc4 = nn.Linear(hidden_layer_sizes[2], 1)  # Output layer for binary classification
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Define the hyperparameter grid
param_grid = {
    'hidden_layer_sizes': [(128, 256, 12)],
    'optimizer': ['sgd', 'adam', 'lbfgs'],
    'learning_rate': [0.001, 0.01, 0.1],
    'batch_size': [32, 64, 128],
    'epochs': [10, 20, 30]
}

# Generate parameter combinations using ParameterSampler
param_list = list(ParameterSampler(param_grid, n_iter=50, random_state=42))


In [None]:
def train_model(model, criterion, optimizer, train_loader, device):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate_model(model, test_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            y_pred.extend(outputs.cpu().numpy())
            y_true.extend(y_batch.cpu().numpy())
    return y_true, np.round(y_pred)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = df_morgan.drop('Activity', axis = 1)
y = df_morgan['Activity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.25, random_state=69)

# Convert pandas DataFrame/Series to NumPy arrays
X_train_array = X_train.to_numpy()
y_train_array = y_train.to_numpy()
X_test_array = X_test.to_numpy()
y_test_array = y_test.to_numpy()

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_array, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_array, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_array, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_array, dtype=torch.float32).unsqueeze(1)

# Best accuracy and parameters tracker
best_acc = 0
best_params = None

for params in param_list:
    print(f"Testing parameters: {params}")

    model = ANNModel(input_size=X_train_tensor.shape[1], hidden_layer_sizes=params['hidden_layer_sizes'])

    # Optimizer selection
    if params['optimizer'] == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=params['learning_rate'])
    elif params['optimizer'] == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
    elif params['optimizer'] == 'lbfgs':
        optimizer = optim.LBFGS(model.parameters(), lr=params['learning_rate'])

    # Define the loss function
    criterion = nn.BCELoss()

    # Create DataLoader for training and testing
    train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=params['batch_size'], shuffle=True)
    test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=params['batch_size'], shuffle=False)

    # Training loop
    for epoch in range(params['epochs']):
        model.train()

        if params['optimizer'] == 'lbfgs':
                # Define closure for LBFGS
            def closure():
                optimizer.zero_grad()
                outputs = model(X_train_tensor)
                    
                    # Calculate loss
                loss = criterion(outputs, y_train_tensor)
                loss.backward()
                return loss

                # Perform LBFGS optimization
            optimizer.step(closure)

        else:
            running_loss = 0.0
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            print(f'Epoch [{epoch+1}/{params["epochs"]}], Loss: {running_loss/len(train_loader):.4f}')
    # Evaluate the model
    y_true, y_pred = evaluate_model(model, test_loader, 'cpu')
    acc = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {acc}")

    if acc > best_acc:
        best_acc = acc
        best_params = params

print(f"Best Accuracy: {best_acc}")
print(f"Best Parameters: {best_params}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Assuming model_metrics is your global dictionary with metrics for all models
model_names = list(model_metrics.keys())
roc_auc_scores = [model_metrics[model]['roc_auc'] for model in model_names]
accuracies = [model_metrics[model]['accuracy'] for model in model_names]
conf_matrices = [model_metrics[model]['conf_matrix'] for model in model_names]

# 1. Bar Plot of ROC AUC with the top 4 in green, rest in yellow
sorted_indices = np.argsort(roc_auc_scores)[::-1]
top4_indices = sorted_indices[:4]

plt.figure(figsize=(10, 6))
bars = plt.bar([model_names[i] for i in sorted_indices], [roc_auc_scores[i] for i in sorted_indices], color='yellow')
for i in top4_indices:
    bars[i].set_color('green')
plt.title('ROC AUC Scores of Models')
plt.xlabel('Models')
plt.ylabel('ROC AUC Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# 2. Grouped Bar Plot of Accuracy and ROC AUC
x = np.arange(len(model_names))  # The label locations
width = 0.35  # Width of the bars

fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width/2, accuracies, width, label='Accuracy')
rects2 = ax.bar(x + width/2, roc_auc_scores, width, label='ROC AUC')

# Add some text for labels, title, and custom x-axis tick labels, etc.
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Accuracy and ROC AUC by Model')
ax.set_xticks(x)
ax.set_xticklabels(model_names, rotation=45)
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# 3. Confusion Matrices of All Models
fig, axs = plt.subplots(2, 3, figsize=(15, 10))  # Adjust rows/columns as per number of models
fig.suptitle('Confusion Matrices for All Models')

for i, (ax, model) in enumerate(zip(axs.flat, model_names)):
    conf_matrix = conf_matrices[i]
    sns.heatmap(np.array(conf_matrix).reshape(2, 2), annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix: {model}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()

In [None]:
# 4. (Optional) Precision, Recall, F1 Score comparison
precision_scores = [model_metrics[model]['precision'] for model in model_names]
recall_scores = [model_metrics[model]['recall'] for model in model_names]
f1_scores = [model_metrics[model]['f1_score'] for model in model_names]

plt.figure(figsize=(10, 6))
plt.bar(model_names, precision_scores, label='Precision', alpha=0.6)
plt.bar(model_names, recall_scores, label='Recall', alpha=0.6, bottom=precision_scores)
plt.bar(model_names, f1_scores, label='F1 Score', alpha=0.6, bottom=[i+j for i,j in zip(precision_scores, recall_scores)])

plt.title('Precision, Recall, and F1 Score of Models')
plt.xlabel('Models')
plt.ylabel('Score')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()