In [98]:
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score

In [99]:
data = pd.read_csv("../Dataset/explored_dataset.csv")
X = data.drop(columns=['label', 'Unnamed: 0'])
Y = data[['label']]

In [100]:
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB()
}

In [101]:
# Function to evaluate with cross-validation
def evaluate_with_cv(model, X, Y, cv_splits=5):
    accuracy_scores = cross_val_score(model, X, Y, cv=cv_splits, scoring='accuracy')
    precision_scores = cross_val_score(model, X, Y, cv=cv_splits, scoring='precision')
    recall_scores = cross_val_score(model, X, Y, cv=cv_splits, scoring='recall')
    f1_scores = cross_val_score(model, X, Y, cv=cv_splits, scoring='f1')
    return accuracy_scores.mean(), precision_scores.mean(), recall_scores.mean(), f1_scores.mean()

# Function to train, evaluate and compute kappa scores
def evaluate_model(model, X_train, Y_train, X_val, Y_val):
    model.fit(X_train, Y_train)
    val_predictions = model.predict(X_val)
    accuracy = accuracy_score(Y_val, val_predictions)
    precision = precision_score(Y_val, val_predictions)
    recall = recall_score(Y_val, val_predictions)
    f1 = f1_score(Y_val, val_predictions)
    return accuracy, precision, recall, f1

In [113]:
results_per_seed = []
for random_state in range(51):  # Change 51 to your desired upper limit
    # Split data with current random_state
    X_train_full, X_test, Y_train_full, Y_test = train_test_split(X, Y, test_size=0.2, random_state=random_state)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train_full, Y_train_full, test_size=0.2, random_state=random_state)
    
    # Store metrics for each model
    metrics_dict = {
        "random_state": random_state,
        "model_metrics": {}
    }
    
    for name, clf in classifiers.items():
        # Cross-validation evaluation
        accuracy_cv, precision_cv, recall_cv, f1_cv = evaluate_with_cv(clf, X_train, Y_train)
        
        # Model evaluation on validation set
        accuracy_val, precision_val, recall_val, f1_val = evaluate_model(clf, X_train, Y_train, X_val, Y_val)
        
        # Store the results for the classifier
        metrics_dict["model_metrics"][name] = {
            "accuracy_cv": accuracy_cv,
            "precision_cv": precision_cv,
            "recall_cv": recall_cv,
            "f1_cv": f1_cv,
            "accuracy_val": accuracy_val,
            "precision_val": precision_val,
            "recall_val": recall_val,
            "f1_val": f1_val
        }
    
    # Store the results for this random state
    results_per_seed.append(metrics_dict)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


ZeroDivisionError: division by zero

In [None]:
results_per_seed

[{'random_state': 0,
  'model_metrics': {'Logistic Regression': {'accuracy_cv': 0.6931034482758621,
    'precision_cv': 0.6832234432234433,
    'recall_cv': 0.5423076923076924,
    'f1_cv': 0.5975885167464113,
    'accuracy_val': 0.6111111111111112,
    'precision_val': 0.55,
    'recall_val': 0.6875,
    'f1_val': 0.6111111111111112},
   'Random Forest': {'accuracy_cv': 0.6923645320197045,
    'precision_cv': 0.6733333333333333,
    'recall_cv': 0.6217948717948718,
    'f1_cv': 0.6504933391889913,
    'accuracy_val': 0.6666666666666666,
    'precision_val': 0.625,
    'recall_val': 0.625,
    'f1_val': 0.625},
   'Support Vector Machine': {'accuracy_cv': 0.6364532019704433,
    'precision_cv': 0.7666666666666666,
    'recall_cv': 0.21538461538461537,
    'f1_cv': 0.3288888888888889,
    'accuracy_val': 0.6944444444444444,
    'precision_val': 0.8571428571428571,
    'recall_val': 0.375,
    'f1_val': 0.5217391304347826},
   'K-Nearest Neighbors': {'accuracy_cv': 0.6711822660098521,
  

In [90]:
avg_results = {
    "Classifier": [],
    "Accuracy CV Avg": [],
    "Precision CV Avg": [],
    "Recall CV Avg": [],
    "F1 Score CV Avg": [],
    "Accuracy Val Avg": [],
    "Precision Val Avg": [],
    "Recall Val Avg": [],
    "F1 Score Val Avg": []
}

In [None]:
for name in classifiers.keys():
    accuracy_cv_avg = np.mean([result["model_metrics"][name]["accuracy_cv"] for result in results_per_seed])
    precision_cv_avg = np.mean([result["model_metrics"][name]["precision_cv"] for result in results_per_seed])
    recall_cv_avg = np.mean([result["model_metrics"][name]["recall_cv"] for result in results_per_seed])
    f1_cv_avg = np.mean([result["model_metrics"][name]["f1_cv"] for result in results_per_seed])
    
    accuracy_val_avg = np.mean([result["model_metrics"][name]["accuracy_val"] for result in results_per_seed])
    precision_val_avg = np.mean([result["model_metrics"][name]["precision_val"] for result in results_per_seed])
    recall_val_avg = np.mean([result["model_metrics"][name]["recall_val"] for result in results_per_seed])
    f1_val_avg = np.mean([result["model_metrics"][name]["f1_val"] for result in results_per_seed])
    
    overall_kappa_avg = np.mean([result["model_metrics"][name]["Overall Average Kappa"] for result in results_per_seed])
    peer_kappa_avg = np.mean([result["model_metrics"][name]["Peer Average Kappa"] for result in results_per_seed])
    
    avg_results["Classifier"].append(name)
    avg_results["Accuracy CV Avg"].append(accuracy_cv_avg)
    avg_results["Precision CV Avg"].append(precision_cv_avg)
    avg_results["Recall CV Avg"].append(recall_cv_avg)
    avg_results["F1 Score CV Avg"].append(f1_cv_avg)
    avg_results["Accuracy Val Avg"].append(accuracy_val_avg)
    avg_results["Precision Val Avg"].append(precision_val_avg)
    avg_results["Recall Val Avg"].append(recall_val_avg)
    avg_results["F1 Score Val Avg"].append(f1_val_avg)
    avg_results["Overall Average Kappa Avg"].append(overall_kappa_avg)
    avg_results["Peer Average Kappa Avg"].append(peer_kappa_avg)

# Create DataFrame to display the results
avg_results_df = pd.DataFrame(avg_results)

# Display the results
avg_results_df

Unnamed: 0,Classifier,Accuracy CV Avg,Precision CV Avg,Recall CV Avg,F1 Score CV Avg,Accuracy Val Avg,Precision Val Avg,Recall Val Avg,F1 Score Val Avg
0,Logistic Regression,0.649454,0.643465,0.57583,0.594062,0.633987,0.602128,0.572706,0.573589
1,Random Forest,0.699015,0.689784,0.664865,0.665733,0.712963,0.672782,0.687649,0.67185
2,Support Vector Machine,0.606718,0.684134,0.251986,0.345394,0.614924,0.681783,0.252392,0.35052
3,K-Nearest Neighbors,0.649222,0.630347,0.609866,0.612152,0.641068,0.597065,0.587608,0.58246
4,Decision Tree,0.689781,0.682893,0.641683,0.652849,0.706427,0.674885,0.645964,0.650871
5,Naive Bayes,0.635009,0.670636,0.418126,0.500317,0.62037,0.629285,0.370551,0.45147
