In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import utils
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 30)
plt.style.use('ggplot')
from sklearn.utils import shuffle

In [207]:
X_train, y_train = utils.load_dataset("../dataset/train.csv")
X_test, y_test = utils.load_dataset("../dataset/test.csv")
# X_val, y_val = utils.load_dataset("../dataset/validation.csv")
X_train, X_test = utils.scale_data(X_train, X_test)
# X_val, _ = utils.scale_data(X_val, X_val)

combined_X = pd.concat([X_train, X_test])
combined_y = pd.concat([y_train, y_test])

In [208]:
def fit_pred_report(X_train, y_train, X_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred

In [209]:
reports = []

for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(combined_X, combined_y, test_size=0.1, shuffle=True, random_state=None)
    y_pred = fit_pred_report(X_train, y_train, X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

In [213]:
data = []
iter_num = 0
for item in reports:
    print(item)
    iter_num = iter_num + 1 if iter_num is not None else 1
    name = "baseline #" + str(iter_num)
    accuracy = item['accuracy'] * 100
    f1_0 = item['0']['f1-score'] * 100
    f1_1 = item['1']['f1-score'] * 100
    f1_2 = item['2']['f1-score'] * 100
    precision_0 = item['0']['precision'] * 100
    precision_1 = item['1']['precision'] * 100
    precision_2 = item['2']['precision'] * 100
    recall_0 = item['0']['recall'] * 100
    recall_1 = item['1']['recall'] * 100
    recall_2 = item['2']['recall'] * 100
    data.append([name, accuracy, f1_0, f1_1, f1_2, precision_0, precision_1, precision_2, recall_0, recall_1, recall_2])

# Create DataFrame
df = pd.DataFrame(data, columns=["Algorithm", "Accuracy", "F1 Score (0)", "F1 Score (1)", "F1 Score (2)", "Precision (0)", "Precision (1)", "Precision (2)", "Recall (0)", "Recall (1)", "Recall (2)"])

df

{'0': {'precision': 0.637648388920294, 'recall': 0.4305343511450382, 'f1-score': 0.5140123034859877, 'support': 2620}, '1': {'precision': 0.6172344689378757, 'recall': 0.7727367760819569, 'f1-score': 0.6862872528084671, 'support': 4783}, '2': {'precision': 0.5221238938053098, 'recall': 0.406386975579211, 'f1-score': 0.45704225352112676, 'support': 1597}, 'accuracy': 0.6081111111111112, 'macro avg': {'precision': 0.5923355838878265, 'recall': 0.5365527009354021, 'f1-score': 0.5524472699385271, 'support': 9000}, 'weighted avg': {'precision': 0.606300344700901, 'recall': 0.6081111111111112, 'f1-score': 0.5954578493543805, 'support': 9000}}
{'0': {'precision': 0.6216676120249575, 'recall': 0.4172059383326989, 'f1-score': 0.49931662870159454, 'support': 2627}, '1': {'precision': 0.6102965678107297, 'recall': 0.7599585062240664, 'f1-score': 0.6769543522454259, 'support': 4820}, '2': {'precision': 0.491497975708502, 'recall': 0.39085640695428203, 'f1-score': 0.43543758967001434, 'support': 15

Unnamed: 0,Algorithm,Accuracy,F1 Score (0),F1 Score (1),F1 Score (2),Precision (0),Precision (1),Precision (2),Recall (0),Recall (1),Recall (2)
0,baseline #1,60.811111,51.40123,68.628725,45.704225,63.764839,61.723447,52.212389,43.053435,77.273678,40.638698
1,baseline #2,59.622222,49.931663,67.695435,43.543759,62.166761,61.029657,49.149798,41.720594,75.995851,39.085641
2,baseline #3,61.311111,52.315338,68.920555,46.544429,64.620536,61.831551,53.921569,43.946869,77.845571,40.942928
3,baseline #4,61.3,51.909091,69.099393,46.655113,64.739229,61.844303,53.84,43.323217,78.282935,41.16208
4,baseline #5,60.6,51.444292,68.413716,45.255474,64.371773,61.370457,51.707705,42.840779,77.283225,40.234858
5,baseline #6,61.222222,52.739726,68.882682,45.555556,66.227064,61.722009,51.939826,43.816388,77.922899,40.568955
6,baseline #7,60.844444,51.895843,68.494166,45.467422,64.399093,61.023102,54.591837,43.458301,78.04981,38.956311
7,baseline #8,59.777778,49.849153,67.962603,43.975069,62.880562,60.547584,51.668023,41.291811,77.447257,38.27607
8,baseline #9,60.511111,51.452475,68.520906,43.606904,63.547646,61.538462,51.030503,43.225303,77.290671,38.068881
9,baseline #10,60.077778,49.282184,68.328256,45.167923,61.192285,61.498837,51.848938,41.252955,76.863967,40.012143


In [212]:
df.to_excel("../logs/data/baseline_result.xlsx", index=False)