# Classification Template

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold

# Global configuration shared by the cells below.
randomState = 42   # single seed reused by the split, the CV folds and the estimators
train_size = 0.67  # fraction of the samples used for training
np.random.seed(randomState)

# Stratified folds keep the class proportions in every split.
# StratifiedGroupKFold splits by group labels, and this notebook never passes a
# `groups` array to GridSearchCV.fit, so plain StratifiedKFold is used instead.
# The splitter is seeded explicitly so the shuffled folds are reproducible.
cv_splits = StratifiedKFold(n_splits=3, shuffle=True, random_state=randomState)

## Explore Data

In [None]:
# Name of the label column and location of the CSV file (fill in `url` before running)
target = 'target'
url = ''
df = pd.read_csv(url)  # optional arguments: delimiter=, index_col=, names=

# Report the dataset size: one column is the target, the others are counted as features
n_rows, n_cols = df.shape
print(f' Data frame has {n_rows} samples, and {n_cols-1} features ')

In [None]:
# Preview the first five rows to check that the file was parsed as expected
df.head(n=5)

In [None]:
# Summary statistics of the columns.
# A `count` below the number of rows means that column has missing values.
summary_stats = df.describe()
summary_stats

In [None]:
# Number of rows that contain at least one missing value
# (total rows minus the rows that survive dropna)
len(df) - len(df.dropna())

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Bar chart of the number of samples per class, to spot class imbalance
class_counts = df[target].value_counts().sort_index()
class_counts.plot.bar(rot=0)

In [None]:
# Box plot of the columns: shows each feature's spread and makes outliers visible
box_figsize = (15, 10)
df.boxplot(figsize=box_figsize)
plt.show()

In [None]:
# Pairwise plots coloured by class: shows how the features relate to the target
# and whether pairs of features are linearly related to each other
sns.pairplot(data=df, hue=target)
plt.show()

## Preprocessing

In [None]:
# Drop every row that contains at least one missing value.
# The counts are taken per row (`any(axis=1)`), so the printed numbers match the
# "rows with null values" wording; summing isna() over the whole frame would
# count missing cells instead.
rows_with_nulls = int(df.isna().any(axis=1).sum())
print(f'there are {rows_with_nulls} rows with null values')
# .copy() makes df1 an independent frame, so later cells can overwrite its columns
# without pandas warning about writing to a slice of df.
df1 = df.dropna().copy()
print(f'there are {int(df1.isna().any(axis=1).sum())} rows with null values')
print(f'Data frame has {df1.shape[0]} samples, and {df1.shape[1]-1} features ')

In [None]:
# Separate the label column (y) from the predictor columns (X)
y = df1[target]
X = df1.drop(columns=[target])

print(f'X shape {X.shape}')
print(f'y shape {y.shape}')

In [None]:
# (OPTIONAL) If there is a string variable, encode it as integer codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Follow the configured target name instead of a hard-coded 'target' literal
column_to_transform = target
transformed_column = le.fit_transform(df1[column_to_transform])
# Work on a copy so the frame produced by dropna() is not modified in place
df1 = df1.copy()
df1[column_to_transform] = transformed_column

# X and y were extracted from df1 before this cell ran; rebuild them so the
# encoded values are the ones used for training.
X = df1.drop([target], axis=1)
y = df1[target]

In [None]:
# (OPTIONAL) use this to convert a nominal (unordered) column into one 0/1 column per category
from sklearn.preprocessing import OneHotEncoder
one = OneHotEncoder()
column_to_transform = 'exemple_column'
# The encoder expects 2-D input, so select the column with a list (one-column frame).
# fit_transform returns a sparse matrix here; convert it to a dense array exactly once.
enc_data = one.fit_transform(df[[column_to_transform]]).toarray()
category_labels = list(one.categories_[0])
# Reuse df's index so that join() aligns row by row even when the index is not 0..n-1
enc_df = pd.DataFrame(enc_data, columns=category_labels, index=df.index)
# NOTE: this replaces df, so the cell cannot be run twice on the same frame
# (the source column no longer exists after the first run).
df = df.join(enc_df).drop([column_to_transform], axis=1)
df.head()

In [None]:
# (OPTIONAL) use this to map ordered labels to integers that follow the given order
from sklearn.preprocessing import OrdinalEncoder
column_to_transform = 'col_name'
categories = ['bad','good','very good']  # example of ordinal categories, lowest first
oe = OrdinalEncoder(categories=[categories], dtype=int)
# the encoder works on 2-D input, hence the single-column reshape
raw_column = df[column_to_transform].values.reshape(-1, 1)
df[column_to_transform] = oe.fit_transform(raw_column)

In [None]:
# Rescale the features to the [0, 1] range
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Only the feature columns are scaled: rescaling the target would turn the class
# labels into arbitrary floats.
feature_columns = [col for col in df.columns if col != target]
# Start from a copy of df so the row index, the column order and the untouched
# target column are all preserved in the result.
df_processed = df.copy()
df_processed[feature_columns] = scaler.fit_transform(df[feature_columns])

In [None]:
# Data standardization: PowerTransformer followed by StandardScaler
from sklearn.preprocessing import PowerTransformer,StandardScaler
from sklearn.pipeline import make_pipeline
preprocessor = make_pipeline(PowerTransformer(),StandardScaler())
# Only the feature columns are transformed: transforming the target would turn
# the class labels into arbitrary floats.
feature_columns = [col for col in df.columns if col != target]
# Start from a copy of df so the row index, the column order and the untouched
# target column are all preserved in the result.
df_processed = df.copy()
df_processed[feature_columns] = preprocessor.fit_transform(df[feature_columns])

In [None]:
# (OPTIONAL) ONLY WORTH DOING WHEN THE DATASET HAS A LARGE NUMBER OF FEATURES (E.G. MORE THAN 20)
# Project X onto its principal components and keep the leading components whose
# cumulative explained variance exceeds `min_variance`.

from sklearn.decomposition import PCA

min_variance = 0.90  # or 0.80
pca = PCA()
X_traformed = pca.fit_transform(X)
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')
variance_cumsum = pca.explained_variance_ratio_.cumsum()
# argmax over a boolean array gives the position of the first True
cutoff_index = np.argmax(variance_cumsum > min_variance)
n_components_kept = cutoff_index + 1
X = X_traformed[:, :n_components_kept]
print(f'X shape after PCA {X.shape}')

## Training

In [None]:
from sklearn.model_selection import train_test_split

# Keep `train_size` of the samples for training and hold the rest out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    random_state=randomState,
)
print(f"Training on {len(X_train)} examples")
print(f"Testing on {len(X_test)} examples")

In [None]:
# Import the classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [None]:
# Labels of the classifiers to run through the grid search.
# Each label is used as a key into the `models` dictionary.
model_lbls = [
    'dt',   # decision tree
    'nb',   # Gaussian naive Bayes
    'lp',   # linear perceptron
    # 'svc',  # support vector classifier: left out because it is too slow to train and test, and during the exam the pc could crash
    'knn',  # k-nearest neighbours
    'adb',  # AdaBoost
    'rf',   # random forest
]


# Hyper-parameter grids, one per classifier.
param_dt = [{
    'max_depth': list(range(1, 20)),
    'class_weight': [None, 'balanced'],
}]
param_nb = [{
    'var_smoothing': [10**exp for exp in range(-3, -12, -1)],
}]
param_lp = [{
    'early_stopping': [True, False],
    'class_weight': [None, 'balanced'],
}]
param_knn = [{
    'n_neighbors': list(range(2, 7)),
}]
param_adb = [{
    'n_estimators': [10, 20, 30, 40, 50],
    'learning_rate': [0.5, 0.75, 1, 1.25, 1.5],
}]
param_rf = [{
    'n_estimators': list(range(10, 30, 4)),
    'max_depth': list(range(4, 30, 4)),
    'class_weight': [None, 'balanced'],
}]

# SVC is very computationally expensive: consider skipping it during the exam to avoid a computer crash.
# C=1 prioritises the margin, C=100 prioritises lowering the training error.
param_svc = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]},
    {'kernel': ['linear'], 'C': [1, 10, 100]},
]

# One entry per classifier: (label, display name, estimator instance, parameter grid).
# The display names keep their trailing padding.
_model_specs = [
    ('dt', 'Decision Tree       ', DecisionTreeClassifier(random_state=randomState), param_dt),
    ('nb', 'Gaussian Naive Bayes', GaussianNB(), param_nb),
    ('lp', 'Linear Perceptron   ', Perceptron(random_state=randomState), param_lp),
    ('svc', 'Support Vector', SVC(random_state=randomState), param_svc),
    ('knn', 'K Nearest Neighbor ', KNeighborsClassifier(), param_knn),
    ('adb', 'AdaBoost           ', AdaBoostClassifier(random_state=randomState), param_adb),
    ('rf', 'Random forest       ', RandomForestClassifier(random_state=randomState), param_rf),
]

# label -> {'name', 'estimator', 'param'}, in the order listed above
models = {
    label: {'name': display_name, 'estimator': estimator, 'param': grid}
    for label, display_name, estimator, grid in _model_specs
}

In [None]:
# Metrics to optimise: one full round of grid searches is run for each of them
scorings = [
    'accuracy',
    'precision_macro',
    'recall_macro',
    'f1_macro',
]

In [None]:
# Create the empty containers that will hold the results:
#   clfs    - list of fitted estimators
#   results - one row per evaluated model, with the columns listed below
clfs = list()
result_columns = [
    'scoring',
    'model',
    'best_params',
    'accuracy',
    'precision_macro',
    'recall_macro',
    'f1_macro',
]
results = pd.DataFrame(columns=result_columns)

In [None]:
# Run one grid search per (scoring metric, classifier) pair and evaluate the
# best estimator of each search on the held-out test set.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Start from empty containers so that re-running this cell replaces the previous
# results instead of appending duplicate rows and estimators.
clfs = []
result_rows = []

for scoring in scorings:
    for m in model_lbls:
        clf = GridSearchCV(
            models[m]['estimator'],
            models[m]['param'],
            scoring=scoring,
            cv=cv_splits
        )
        clf.fit(X_train, y_train)
        # Position i in clfs corresponds to row i of results; the confusion-matrix
        # cell relies on this to look up the fitted model of a given row.
        clfs.append(clf.best_estimator_)
        y_pred = clf.predict(X_test)
        cr = classification_report(y_test, y_pred, output_dict=True, zero_division=1)
        result_rows.append({
            'scoring': scoring,
            'model': models[m]['name'],
            'best_params': clf.best_params_,
            'accuracy': cr['accuracy'],
            'precision_macro': cr['macro avg']['precision'],
            'recall_macro': cr['macro avg']['recall'],
            'f1_macro': cr['macro avg']['f1-score'],
        })

# Build the frame once from the collected rows (default 0..n-1 index) rather than
# growing it one row at a time.
results = pd.DataFrame(
    result_rows,
    columns=['scoring','model','best_params','accuracy','precision_macro','recall_macro','f1_macro'],
)

## Result Evaluation

In [None]:
# For every scoring metric, show the models tuned for it, best test score first
for score in scorings:
    ranked = (
        results[results['scoring'] == score]
        .sort_values(by=score, ascending=False)
        .drop('scoring', axis=1)
    )
    styled = ranked.style.format(precision=3).set_caption(f'Best Models for:{score}')
    display(styled)

In [None]:
# Plot the test-set confusion matrix of the top-scoring model for each scoring metric
from sklearn.metrics import ConfusionMatrixDisplay

for score in scorings:
    # values of this metric for the rows tuned on it; idxmax returns the row label of the best
    metric_values = results.loc[results['scoring'] == score, score]
    best_row = metric_values.idxmax(axis=0)
    best_model = clfs[best_row]  # the row label is used as the position in clfs
    disp = ConfusionMatrixDisplay.from_estimator(estimator=best_model, X=X_test, y=y_test)
    title = f"Best Model for {score}: {results.at[best_row, 'model']}"
    disp.ax_.set_title(title)