In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import utils
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 30)
plt.style.use('ggplot')
from sklearn.utils import shuffle

In [5]:
X_train, y_train = utils.load_dataset("../dataset/train.csv")
X_test, y_test = utils.load_dataset("../dataset/test.csv")
# X_val, y_val = utils.load_dataset("../dataset/validation.csv")
X_train, X_test = utils.scale_data(X_train, X_test)
# X_val, _ = utils.scale_data(X_val, X_val)

combined_X = pd.concat([X_train, X_test])
combined_y = pd.concat([y_train, y_test])

In [6]:
def fit_pred_report(X_train, y_train, X_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred

In [7]:
reports = []

for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(combined_X, combined_y, test_size=0.1, shuffle=True, random_state=None)
    y_pred = fit_pred_report(X_train, y_train, X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

In [8]:
data = []
iter_num = 0
for item in reports:
    print(item)
    iter_num = iter_num + 1 if iter_num is not None else 1
    name = "baseline #" + str(iter_num)
    accuracy = item['accuracy'] * 100
    f1_0 = item['0']['f1-score'] * 100
    f1_1 = item['1']['f1-score'] * 100
    f1_2 = item['2']['f1-score'] * 100
    precision_0 = item['0']['precision'] * 100
    precision_1 = item['1']['precision'] * 100
    precision_2 = item['2']['precision'] * 100
    recall_0 = item['0']['recall'] * 100
    recall_1 = item['1']['recall'] * 100
    recall_2 = item['2']['recall'] * 100
    data.append([name, accuracy, f1_0, f1_1, f1_2, precision_0, precision_1, precision_2, recall_0, recall_1, recall_2])

# Create DataFrame
df = pd.DataFrame(data, columns=["Algorithm", "Accuracy", "F1 Score (0)", "F1 Score (1)", "F1 Score (2)", "Precision (0)", "Precision (1)", "Precision (2)", "Recall (0)", "Recall (1)", "Recall (2)"])

df

{'0': {'precision': 0.6455840455840456, 'recall': 0.4349328214971209, 'f1-score': 0.5197247706422019, 'support': 2605}, '1': {'precision': 0.6183798053126547, 'recall': 0.7750206782464847, 'f1-score': 0.6878957511241626, 'support': 4836}, '2': {'precision': 0.48817567567567566, 'recall': 0.3707504810776139, 'f1-score': 0.4214363835216916, 'support': 1559}, 'accuracy': 0.6065555555555555, 'macro avg': {'precision': 0.5840465088574586, 'recall': 0.5269013269404065, 'f1-score': 0.543018968429352, 'support': 9000}, 'weighted avg': {'precision': 0.6036996728463128, 'recall': 0.6065555555555555, 'f1-score': 0.593062911318856, 'support': 9000}}
{'0': {'precision': 0.6404365307294658, 'recall': 0.4315015479876161, 'f1-score': 0.5156069364161849, 'support': 2584}, '1': {'precision': 0.6102598267821452, 'recall': 0.7686175791902664, 'f1-score': 0.6803453718317705, 'support': 4767}, '2': {'precision': 0.501195219123506, 'recall': 0.38144329896907214, 'f1-score': 0.43319559228650134, 'support': 16

Unnamed: 0,Algorithm,Accuracy,F1 Score (0),F1 Score (1),F1 Score (2),Precision (0),Precision (1),Precision (2),Recall (0),Recall (1),Recall (2)
0,baseline #1,60.655556,51.972477,68.789575,42.143638,64.558405,61.837981,48.817568,43.493282,77.502068,37.075048
1,baseline #2,60.088889,51.560694,68.034537,43.319559,64.043653,61.025983,50.119522,43.150155,76.861758,38.14433
2,baseline #3,60.188889,51.477945,68.173816,43.147027,63.346391,61.031851,51.139241,43.355036,77.208814,37.315271
3,baseline #4,59.711111,49.64376,67.924528,43.849136,61.468412,61.466354,49.014972,41.634541,75.899132,39.668367
4,baseline #5,60.622222,50.503631,68.692449,45.141066,64.705882,61.429513,51.347068,41.413753,77.903091,40.273462
5,baseline #6,60.555556,51.401011,68.652587,43.453443,63.942857,61.765682,49.754902,42.97235,77.268009,38.568714
6,baseline #7,60.566667,50.830798,68.343581,46.575342,63.807286,61.896726,50.801749,42.240373,76.289517,42.998149
7,baseline #8,61.133333,51.665125,69.206642,44.71086,64.716107,62.092369,51.419303,42.994611,78.162117,39.550842
8,baseline #9,60.911111,51.804767,68.483943,46.687478,64.829545,61.123146,54.317998,43.137996,77.860327,40.93674
9,baseline #10,60.411111,50.588235,68.601437,44.129834,63.049853,61.802789,50.275374,42.239686,77.080745,39.323077


In [9]:
df.to_excel("../logs/data/baseline_result.xlsx", index=False)