In [50]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [51]:
df = pd.read_csv('data_MACCS.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,158,159,160,161,162,163,164,165,166,Activity
0,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,0,0


In [52]:
'Smiles' in df.columns

False

In [53]:
# Get columns with null values
null_columns = df.columns[df.isnull().any()].tolist()

# Print the columns with null values
print("Columns with null values:", null_columns)

for i in null_columns:
    print(df[i].isnull().mean()*100)

Columns with null values: []


In [54]:
df.shape

(23791, 168)

In [55]:
df.dropna(inplace = True)
df.shape

(23791, 168)

In [56]:
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.ensemble import RandomForestClassifier

# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)


# Split data into training and testing sets (one-time split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the number of folds for cross-validation (common choices are 5 or 10)
n_splits = 10

# Create a KFold object
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Define the parameter grid for hyperparameter tuning (smaller range for faster results)
param_dist = {
    'n_estimators': np.arange(50, 201, 10),  # Reduced range for faster computation
    'max_depth': np.arange(5, 21, 1),        # Reduced range for faster computation
    'random_state': [42]                    # Fixed for reproducibility
}

# Create the Random Forest model
rf = RandomForestClassifier()

# Create the RandomizedSearchCV object (samples a fixed number of parameter combinations)
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                    n_iter=20, cv=kf, scoring='roc_auc',
                                    n_jobs=-1, verbose=2, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

# Train the Random Forest model on the entire training set using the best parameters
best_rf = RandomForestClassifier(**best_params)
best_rf.fit(X_train, y_train)

# Make predictions
y_pred = best_rf.predict(X_test)
y_pred_prob = best_rf.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Beautified printing
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")





Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best parameters: {'random_state': 42, 'n_estimators': 140, 'max_depth': 16}


        Model Evaluation Metrics        
Metric                            Value
----------------------------------------
Accuracy                         0.8798
ROC AUC                          0.8955
Precision                        0.7193
Recall                           0.4074
F1 Score                         0.5201
Specificity                      0.9697
Balanced Accuracy                0.6885
FPR                              0.0303
Matthews CC                      0.4816


            Confusion Matrix            
TN         FP         FN         TP        
----------------------------------------
3877       121        451        310       




In [57]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from xgboost import XGBClassifier
from scipy.stats import uniform, randint

# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'n_estimators': randint(50, 301),  # Random integer range for n_estimators
    'max_depth': randint(1, 31),       # Random integer range for max_depth
    'scale_pos_weight': uniform(0.1, 10.0)  # Uniform distribution for scale_pos_weight
}

# Create the XGBClassifier model
xgb = XGBClassifier(objective='binary:logistic', random_state=69, use_label_encoder=False, eval_metric='logloss')

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist,
                                    n_iter=20, cv=10, scoring='roc_auc',
                                    n_jobs=-1, verbose=2, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

# Train the XGBClassifier model on the entire training set using the best parameters
best_xgb = XGBClassifier(**best_params, objective='binary:logistic', random_state=69, use_label_encoder=False, eval_metric='logloss')
best_xgb.fit(X_train, y_train)

# Make predictions
y_pred = best_xgb.predict(X_test)
y_pred_prob = best_xgb.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Beautified printing
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")




Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END .....max_depth=19, n_estimators=90, random_state=42; total time=   2.3s
[CV] END .....max_depth=19, n_estimators=90, random_state=42; total time=   2.3s
[CV] END .....max_depth=19, n_estimators=90, random_state=42; total time=   2.2s
[CV] END .....max_depth=5, n_estimators=110, random_state=42; total time=   1.0s
[CV] END .....max_depth=9, n_estimators=200, random_state=42; total time=   2.8s
[CV] END .....max_depth=9, n_estimators=200, random_state=42; total time=   2.9s
[CV] END .....max_depth=9, n_estimators=200, random_state=42; total time=   2.9s
[CV] END ....max_depth=17, n_estimators=190, random_state=42; total time=   4.7s
[CV] END ....max_depth=17, n_estimators=190, random_state=42; total time=   4.7s
[CV] END ....max_depth=12, n_estimators=100, random_state=42; total time=   1.9s
[CV] END ....max_depth=12, n_estimators=100, random_state=42; total time=   1.8s
[CV] END ....max_depth=12, n_estimators=100, r

In [58]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint

# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'max_depth': randint(1, 31),       # Random integer range for max_depth
    'min_samples_split': randint(2, 20), # Random integer range for min_samples_split
    'min_samples_leaf': randint(1, 20)  # Random integer range for min_samples_leaf
}

# Create the DecisionTreeClassifier model
dt = DecisionTreeClassifier(random_state=69)

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=dt, param_distributions=param_dist,
                                   n_iter=20, cv=10, scoring='roc_auc',
                                   n_jobs=-1, verbose=2, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters for Decision Tree: {best_params}")

# Train the DecisionTreeClassifier model on the entire training set using the best parameters
best_dt = DecisionTreeClassifier(**best_params, random_state=69)
best_dt.fit(X_train, y_train)

# Make predictions
y_pred = best_dt.predict(X_test)
y_pred_prob = best_dt.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Beautified printing
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")



Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best parameters for Decision Tree: {'max_depth': 26, 'min_samples_leaf': 9, 'min_samples_split': 3}


        Model Evaluation Metrics        
Metric                            Value
----------------------------------------
Accuracy                         0.8569
ROC AUC                          0.8037
Precision                        0.5766
Recall                           0.3955
F1 Score                         0.4692
Specificity                      0.9447
Balanced Accuracy                0.6701
FPR                              0.0553
Matthews CC                      0.3991


            Confusion Matrix            
TN         FP         FN         TP        
----------------------------------------
3777       221        460        301       




In [59]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint

# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'n_neighbors': randint(1, 31),  # Random integer range for n_neighbors
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    'p': [1, 2]  # Power parameter for the Minkowski metric
}

# Create the KNeighborsClassifier model
knn = KNeighborsClassifier()

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist,
                                   n_iter=20, cv=10, scoring='roc_auc',
                                   n_jobs=-1, verbose=2, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

# Train the KNeighborsClassifier model on the entire training set using the best parameters
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train, y_train)

# Make predictions
y_pred = best_knn.predict(X_test)
y_pred_prob = best_knn.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Beautified printing
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")



Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END max_depth=26, n_estimators=58, scale_pos_weight=0.2596625222021419; total time=   1.3s
[CV] END max_depth=2, n_estimators=133, scale_pos_weight=2.510254660260117; total time=   0.8s
[CV] END max_depth=2, n_estimators=133, scale_pos_weight=2.510254660260117; total time=   0.8s
[CV] END max_depth=28, n_estimators=248, scale_pos_weight=6.199966577826209; total time=   4.9s
[CV] END max_depth=28, n_estimators=248, scale_pos_weight=6.199966577826209; total time=   5.0s
[CV] END max_depth=28, n_estimators=248, scale_pos_weight=6.199966577826209; total time=   3.6s
[CV] END max_depth=7, min_samples_leaf=15, min_samples_split=12; total time=   0.2s
[CV] END max_depth=7, min_samples_leaf=15, min_samples_split=12; total time=   0.2s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=12; total time=   0.2s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=12; total time=   0.2s
[CV] END max_depth=8, min_sam

In [60]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.naive_bayes import GaussianNB
from scipy.stats import uniform
import warnings
warnings.filterwarnings('ignore')


# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'var_smoothing': uniform(1e-10, 1e-9)  # Uniform distribution for var_smoothing
}

# Create the GaussianNB model
gnb = GaussianNB()

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=gnb, param_distributions=param_dist,
                                   n_iter=200, cv=10, scoring='roc_auc',
                                   n_jobs=-1, verbose=2, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

# Train the GaussianNB model on the entire training set using the best parameters
best_gnb = GaussianNB(**best_params)
best_gnb.fit(X_train, y_train)

# Make predictions
y_pred = best_gnb.predict(X_test)
y_pred_prob = best_gnb.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Beautified printing
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")




Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[CV] END max_depth=28, n_estimators=248, scale_pos_weight=6.199966577826209; total time=   4.6s
[CV] END max_depth=7, min_samples_leaf=15, min_samples_split=12; total time=   0.2s
[CV] END max_depth=7, min_samples_leaf=15, min_samples_split=12; total time=   0.2s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=12; total time=   0.2s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=12; total time=   0.2s
[CV] END max_depth=11, min_samples_leaf=4, min_samples_split=9; total time=   0.2s
[CV] END max_depth=11, min_samples_leaf=4, min_samples_split=9; total time=   0.2s
[CV] END max_depth=24, min_samples_leaf=3, min_samples_split=3; total time=   0.3s
[CV] END max_depth=24, min_samples_leaf=3, min_samples_split=3; total time=   0.3s
[CV] END max_depth=24, min_samples_leaf=3, min_samples_split=3; total time=   0.3s
[CV] END max_depth=24, min_samples_leaf=3, min_samples_split=3; total time=   0.3s
[CV] EN

In [61]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform
import warnings
warnings.filterwarnings('ignore')


# Separate features and target variable
X = df.drop('Activity', axis=1)
y = df['Activity']

X = X.clip(lower=-1e6, upper=1e6)


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'C': uniform(0.01, 10.0),  # Uniform distribution for regularization strength
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization penalty
    'solver': ['lbfgs', 'liblinear', 'saga']  # Algorithm to use in the optimization problem
}

# Create the LogisticRegression model with max_iter=1000
logreg = LogisticRegression(random_state=69, max_iter=200)

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=logreg, param_distributions=param_dist,
                                   n_iter=20, cv=10, scoring='roc_auc',
                                   n_jobs=-1, verbose=2, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

# Train the LogisticRegression model on the entire training set using the best parameters
best_logreg = LogisticRegression(**best_params, random_state=69, max_iter=200)
best_logreg.fit(X_train, y_train)

# Make predictions
y_pred = best_logreg.predict(X_test)
y_pred_prob = best_logreg.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Additional metrics
specificity = TN / (TN + FP)
balanced_accuracy = (recall + specificity) / 2
fpr = FP / (FP + TN)  # False Positive Rate

# Beautified printing
print("\n")
print("="*40)
print("Model Evaluation Metrics".center(40))
print("="*40)
print(f"{'Metric':<20} {'Value':>18}")
print("-"*40)
print(f"{'Accuracy':<20} {accuracy:>18.4f}")
print(f"{'ROC AUC':<20} {roc_auc:>18.4f}")
print(f"{'Precision':<20} {precision:>18.4f}")
print(f"{'Recall':<20} {recall:>18.4f}")
print(f"{'F1 Score':<20} {f1:>18.4f}")
print(f"{'Specificity':<20} {specificity:>18.4f}")
print(f"{'Balanced Accuracy':<20} {balanced_accuracy:>18.4f}")
print(f"{'FPR':<20} {fpr:>18.4f}")
print(f"{'Matthews CC':<20} {mcc:>18.4f}")
print("\n")
print("="*40)
print("Confusion Matrix".center(40))
print("="*40)
print(f"{'TN':<10} {'FP':<10} {'FN':<10} {'TP':<10}")
print("-"*40)
print(f"{TN:<10} {FP:<10} {FN:<10} {TP:<10}")
print("="*40)
print("\n")




Fitting 10 folds for each of 20 candidates, totalling 200 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters: {'C': 0.5908361216819946, 'penalty': 'none', 'solver': 'lbfgs'}


        Model Evaluation Metrics        
Metric                            Value
----------------------------------------
Accuracy                         0.8477
ROC AUC                          0.7489
Precision                        0.6169
Recall                           0.1248
F1 Score                         0.2077
Specificity                      0.9852
Balanced Accuracy                0.5550
FPR                              0.0148
Matthews CC                      0.2280


            Confusion Matrix            
TN         FP         FN         TP        
----------------------------------------
3939       59         666        95        




In [62]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterSampler

# Define the neural network model
class ANNModel(nn.Module):
    def __init__(self, input_size, hidden_layer_sizes):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_layer_sizes[0])
        self.fc2 = nn.Linear(hidden_layer_sizes[0], hidden_layer_sizes[1])
        self.fc3 = nn.Linear(hidden_layer_sizes[1], hidden_layer_sizes[2])
        self.fc4 = nn.Linear(hidden_layer_sizes[2], 1)  # Output layer for binary classification
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

# Function to train the model
def train_model(model, criterion, optimizer, train_loader, device):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(train_loader)

# Function to evaluate the model
def evaluate_model(model, test_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            predicted = (outputs >= 0.5).float()  # Apply threshold for binary classification
            y_true.extend(y_batch.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    return y_true, y_pred

# Assume df_maccs is your DataFrame
# Replace with the actual DataFrame loading code
# df_maccs = ...

X = df.drop('Activity', axis=1)
y = df['Activity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.25, random_state=69)

# Convert pandas DataFrame/Series to NumPy arrays
X_train_array = X_train.to_numpy()
y_train_array = y_train.to_numpy()
X_test_array = X_test.to_numpy()
y_test_array = y_test.to_numpy()

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_array, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_array, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_array, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_array, dtype=torch.float32).unsqueeze(1)

# Define the hyperparameter grid
param_grid = {
    'hidden_layer_sizes': [(128, 256, 128)],
    'optimizer': ['sgd', 'adam', 'lbfgs'],
    'learning_rate': [0.001, 0.01, 0.1],
    'batch_size': [32, 64, 128],
    'epochs': [10, 20, 30]
}

# Generate parameter combinations using ParameterSampler
param_list = list(ParameterSampler(param_grid, n_iter=50, random_state=42))

# Best accuracy and parameters tracker
best_acc = 0
best_params = None

# Loop through each parameter set
for params in param_list:
    print(f"Testing parameters: {params}")

    model = ANNModel(input_size=X_train_tensor.shape[1], hidden_layer_sizes=params['hidden_layer_sizes']).to('cpu')

    # Optimizer selection
    if params['optimizer'] == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=params['learning_rate'])
    elif params['optimizer'] == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
    elif params['optimizer'] == 'lbfgs':
        optimizer = optim.LBFGS(model.parameters(), lr=params['learning_rate'])

    # Define the loss function
    criterion = nn.BCELoss()

    # Create DataLoader for training and testing
    train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=params['batch_size'], shuffle=True)
    test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=params['batch_size'], shuffle=False)

    # Training loop
    for epoch in range(params['epochs']):
        if params['optimizer'] == 'lbfgs':
            # Define closure for LBFGS
            def closure():
                optimizer.zero_grad()
                outputs = model(X_train_tensor)
                loss = criterion(outputs, y_train_tensor)
                loss.backward()
                return loss

            optimizer.step(closure)

        else:
            running_loss = train_model(model, criterion, optimizer, train_loader, 'cpu')
            print(f'Epoch [{epoch+1}/{params["epochs"]}], Loss: {running_loss:.4f}')

    # Evaluate the model
    y_true, y_pred = evaluate_model(model, test_loader, 'cpu')
    acc = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {acc:.4f}")

    if acc > best_acc:
        best_acc = acc
        best_params = params

print(f"Best Accuracy: {best_acc}")
print(f"Best Parameters: {best_params}")


Testing parameters: {'optimizer': 'sgd', 'learning_rate': 0.01, 'hidden_layer_sizes': (128, 256, 128), 'epochs': 10, 'batch_size': 64}
Epoch [1/10], Loss: 0.5912
Epoch [2/10], Loss: 0.4978
Epoch [3/10], Loss: 0.4614
Epoch [4/10], Loss: 0.4542
Epoch [5/10], Loss: 0.4521
Epoch [6/10], Loss: 0.4504
Epoch [7/10], Loss: 0.4488
Epoch [8/10], Loss: 0.4476
Epoch [9/10], Loss: 0.4463
Epoch [10/10], Loss: 0.4450
Accuracy: 0.8397
Testing parameters: {'optimizer': 'sgd', 'learning_rate': 0.001, 'hidden_layer_sizes': (128, 256, 128), 'epochs': 10, 'batch_size': 32}
Epoch [1/10], Loss: 0.6891
Epoch [2/10], Loss: 0.6521
Epoch [3/10], Loss: 0.6204
Epoch [4/10], Loss: 0.5923
Epoch [5/10], Loss: 0.5670
Epoch [6/10], Loss: 0.5444
Epoch [7/10], Loss: 0.5243
Epoch [8/10], Loss: 0.5070
Epoch [9/10], Loss: 0.4927
Epoch [10/10], Loss: 0.4814
Accuracy: 0.8397
Testing parameters: {'optimizer': 'adam', 'learning_rate': 0.01, 'hidden_layer_sizes': (128, 256, 128), 'epochs': 30, 'batch_size': 32}
Epoch [1/30], Los