In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import Series,DataFrame
from pathlib import Path
import os
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('once')

In [3]:
# Notebook-wide settings.
# The project root is taken to be the parent of the current working directory.
current_notebook = "Classification"
project_root_dir = Path.cwd().resolve().parents[0]

# Folder receiving the images produced by this notebook (created when missing)
image_folder_path = os.path.join(project_root_dir, "notebooks", current_notebook)
os.makedirs(image_folder_path, exist_ok=True)

# Folder holding the interim data
data_folder_path = os.path.join(project_root_dir, "data", "interim")

In [4]:
# Matplotlib defaults: font sizes of the axis labels and of the tick labels
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [5]:
# Pandas display options: lift every limit so a dataset is shown in full
for _display_option in ('display.max_rows', 'display.max_columns',
                        'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

## Data

In [8]:
# Build the dataset with the project's own helper.
# NOTE(review): presumably create_dataset reads the raw files from input_path
# and writes the processed ones to output_path — confirm against
# src/data/make_dataset.py.
from src.data.make_dataset import create_dataset

input_path = os.path.join(project_root_dir, 'data', 'raw')
output_path = os.path.join(project_root_dir, 'data', 'processed')

create_dataset(input_path, output_path)

In [None]:
# Load the processed training data and the matching labels
data_path = os.path.join(project_root_dir, 'data', 'processed')

data, labels = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('training_data.csv', 'training_labels.csv')
)

In [None]:
# Verify the data: display summary statistics of the training data
data.describe()

In [None]:
# Display summary statistics of the training labels
labels.describe()

In [None]:
# The dataset is small, so the models are evaluated with K-fold cross-validation.
# StratifiedKFold preserves the class proportions in each fold, which matters
# because the dataset is small.
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_score

# 3 shuffled folds; the fixed random_state keeps the splits identical across runs
s_k_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

## Classification

In [None]:
from sklearn.metrics import classification_report
from sklearn.base import clone

In [None]:
# Define the metrics used in cross validation
# Metrics computed during cross-validation: short name -> scikit-learn scorer name
scoring = dict(
    acc='accuracy',
    prec='precision_macro',
    rec='recall_macro',
    f1='f1_macro',
    n_l_l='neg_log_loss',
)


In [None]:
def mean_scoring(score):
    """Average every entry of a score mapping.

    Parameters
    ----------
    score : dict
        Mapping of a name to a sequence of numbers.

    Returns
    -------
    dict
        The same keys, each mapped to ``np.mean`` of its values.
    """
    return {key: np.mean(score[key]) for key in score}

In [None]:
def display_score(score):
    """Print every entry of a score mapping, one ``key : value`` pair per line.

    Parameters
    ----------
    score : dict
        Mapping of a metric name to its value, e.g. the output of
        ``mean_scoring``.
    """
    # Iterate over the argument itself rather than the notebook-level
    # `mean_score` global, so the function prints exactly what it is given.
    for key in score:
        print(key, ':', score[key])

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default hyperparameters
lr_clf = LogisticRegression()

In [None]:
# Pass the labels as a 1-D array (np.ravel), as is done for every other
# classifier in this notebook, instead of handing scikit-learn a DataFrame.
lr_clf_score = cross_validate(lr_clf, data, np.ravel(labels), cv=s_k_fold, scoring=scoring, n_jobs=-1)

In [None]:
# Average each cross-validation result and print the averages
mean_score = mean_scoring(lr_clf_score)
display_score(mean_score)

### Perceptron

In [None]:
from sklearn.linear_model import SGDClassifier

# Perceptron expressed as an SGDClassifier: perceptron loss, constant learning
# rate of 1 and no regularisation penalty.
# random_state is fixed because SGD shuffles the training data; without it the
# scores change from one run to the next.
per_clf = SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None, random_state=42)

In [None]:
# The perceptron loss gives no probability estimates (no predict_proba), so the
# probability-based neg_log_loss metric cannot be computed for this model.
per_scoring = {name: metric for name, metric in scoring.items() if metric != 'neg_log_loss'}

per_clf_score = cross_validate(per_clf, data, np.ravel(labels), cv=s_k_fold, scoring=per_scoring)
# The per-fold macro F1 is already part of the cross_validate output; reuse it
# instead of fitting every fold a second time with cross_val_score.
test_score = per_clf_score['test_f1']
mean_score = mean_scoring(per_clf_score)
display_score(mean_score)
print(test_score)

### SVM

In [None]:
from sklearn.svm import SVC

# probability=True enables predict_proba, which the neg_log_loss metric needs.
# Those probability estimates rely on an internal, randomised cross-validation,
# so random_state is fixed to keep the scores reproducible.
svm_clf = SVC(probability=True, random_state=42)

In [None]:
# Cross-validate the SVM on the stratified folds with every metric in `scoring`
svm_clf_score = cross_validate(svm_clf, data, np.ravel(labels), cv=s_k_fold, scoring=scoring)

# Average each result and print the averages
mean_score = mean_scoring(svm_clf_score)
display_score(mean_score)

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters; random_state is fixed because the
# trees are built from random samples, so unseeded runs give different scores.
rf_clf = RandomForestClassifier(random_state=42)

In [None]:
# Cross-validate the random forest on the stratified folds
rf_clf_score = cross_validate(rf_clf, data, np.ravel(labels), cv=s_k_fold, scoring=scoring)
mean_score = mean_scoring(rf_clf_score)
# Show the averaged metrics (not the raw per-fold arrays), as for the other models
display_score(mean_score)

### Multi-layer perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with default hyperparameters; random_state is fixed
# because the weights are initialised randomly, so unseeded runs differ.
mlp_clf = MLPClassifier(random_state=42)

In [None]:
# Keep the scores in their own variable: rebinding `mlp_clf` to the result would
# replace the estimator and make this cell fail when it is run a second time.
mlp_clf_score = cross_validate(mlp_clf, data, np.ravel(labels), cv=s_k_fold, scoring=scoring)

# Average each result and show the averages (not the raw per-fold arrays)
mean_score = mean_scoring(mlp_clf_score)
display_score(mean_score)

### KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with scikit-learn's default hyperparameters
kn_clf = KNeighborsClassifier()

In [None]:
# Cross-validate the k-nearest-neighbours classifier on the stratified folds,
# then average each result and print the averages
kn_clf_score = cross_validate(kn_clf, data, np.ravel(labels), cv=s_k_fold, scoring=scoring)
mean_score = mean_scoring(kn_clf_score)
display_score(mean_score)