In [1]:
import pandas

In [2]:
import pandas

data = pandas.read_csv("../Dataset/explored_dataset.csv")
X = data.drop(columns = ['label','Unnamed: 0'])
Y = data[['label']]

In [3]:
X.head()

Unnamed: 0,$ne,planningTimeMicros,$,query_length_keywords_only
0,False,83.0,False,12
1,False,83.0,False,12
2,False,71.0,True,17
3,False,71.0,True,17
4,True,42.0,True,13


In [4]:
Y.head()

Unnamed: 0,label
0,False
1,False
2,True
3,True
4,True


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB()
}

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score

def evaluate_model(model, X_test, Y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)
    f1 = f1_score(Y_test, predictions)
    kappa = cohen_kappa_score(Y_test, predictions)
    return accuracy, precision, recall, f1, kappa

# Train and evaluate each classifier
results = []
predictions_dict = {}

for name, clf in classifiers.items():
    clf.fit(X_train, Y_train.values.ravel())
    accuracy, precision, recall, f1, kappa = evaluate_model(clf, X_test, Y_test)
    results.append([name, accuracy, precision, recall, f1, kappa])
    predictions_dict[name] = clf.predict(X_test)

results_df = pandas.DataFrame(results, columns=["Classifier", "Accuracy", "Precision", "Recall", "F1 Score", "Kappa Score"])

# Calculate kappa score matrix
kappa_matrix = pandas.DataFrame(index=classifiers.keys(), columns=classifiers.keys())

for name1, preds1 in predictions_dict.items():
    for name2, preds2 in predictions_dict.items():
        kappa_matrix.loc[name1, name2] = cohen_kappa_score(preds1, preds2)

average_kappa__without_own_series = {}
average_kappa_series = {}
for name1, preds1 in predictions_dict.items():
    average_kappa_without_itself = 0
    average_kappa = 0
    count = 0
    
    for name2, preds2 in predictions_dict.items():
        kappa_matrix.loc[name1, name2] = cohen_kappa_score(preds1, preds2)
        average_kappa += kappa_matrix.loc[name1, name2]
        if name1 != name2:
            average_kappa_without_itself += kappa_matrix.loc[name1, name2]
            count += 1
    average_kappa__without_own_series[name1] = average_kappa_without_itself / count
    average_kappa_series[name1] = average_kappa/count

kappa_matrix['Overall Average Kappa'] = kappa_matrix.mean(axis=1)
kappa_matrix['Peer Average Kappa'] = pandas.Series(average_kappa__without_own_series)

results_df = results_df.merge(kappa_matrix[['Overall Average Kappa', 'Peer Average Kappa']], left_on='Classifier', right_index=True)



In [8]:
results_df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score,Kappa Score,Overall Average Kappa,Peer Average Kappa
0,Logistic Regression,0.622222,0.578947,0.55,0.564103,0.231156,0.531233,0.43748
1,Random Forest,0.711111,0.652174,0.75,0.697674,0.423645,0.52897,0.434764
2,Support Vector Machine,0.644444,0.7,0.35,0.466667,0.242105,0.446394,0.335673
3,K-Nearest Neighbors,0.6,0.545455,0.6,0.571429,0.19802,0.46349,0.356188
4,Decision Tree,0.666667,0.619048,0.65,0.634146,0.328358,0.411813,0.294175
5,Naive Bayes,0.666667,0.666667,0.5,0.571429,0.307692,0.542193,0.450632


In [9]:
kappa_matrix

Unnamed: 0,Logistic Regression,Random Forest,Support Vector Machine,K-Nearest Neighbors,Decision Tree,Naive Bayes,Overall Average Kappa,Peer Average Kappa
Logistic Regression,1.0,0.379921,0.562162,0.331021,0.101796,0.8125,0.531233,0.43748
Random Forest,0.379921,1.0,0.165854,0.600197,0.733728,0.294118,0.52897,0.434764
Support Vector Machine,0.562162,0.165854,1.0,0.1,0.123077,0.727273,0.446394,0.335673
K-Nearest Neighbors,0.331021,0.600197,0.1,1.0,0.421365,0.328358,0.46349,0.356188
Decision Tree,0.101796,0.733728,0.123077,0.421365,1.0,0.090909,0.411813,0.294175
Naive Bayes,0.8125,0.294118,0.727273,0.328358,0.090909,1.0,0.542193,0.450632


In [10]:
best_scores = {
    "Classifier": [],
    "Metric": [],
    "Score": []
}
for metric in ["Accuracy", "Precision", "Recall", "F1 Score", "Overall Average Kappa",'Peer Average Kappa']:
    best_classifier = results_df.iloc[results_df[metric].idxmax()]["Classifier"]
    best_score = results_df.iloc[results_df[metric].idxmax()][metric]
    best_scores["Classifier"].append(best_classifier)
    best_scores["Metric"].append(metric)
    best_scores["Score"].append(best_score)

best_scores_df = pandas.DataFrame(best_scores)
best_scores_df

Unnamed: 0,Classifier,Metric,Score
0,Random Forest,Accuracy,0.711111
1,Support Vector Machine,Precision,0.7
2,Random Forest,Recall,0.75
3,Random Forest,F1 Score,0.697674
4,Naive Bayes,Overall Average Kappa,0.542193
5,Naive Bayes,Peer Average Kappa,0.450632


In [11]:
results_df.to_csv("../Dataset/model_results.csv")
kappa_matrix.to_csv("../Dataset/model_kappa_matrix.csv")
best_scores_df.to_csv("../Dataset/model_best_scores.csv")