In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

df_train = pd.read_csv('data/train_data.csv')
df_test = pd.read_csv('data/test_data.csv')
X_train = df_train.drop('y', axis=1)
y_train = df_train['y']
X_test = df_test.drop('y', axis=1)
y_test = df_test['y']


# `1 - Logistic regression`

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

model = LogisticRegression(n_jobs=-1)
param_grid = {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'max_iter': [50000]}

# create the grid search object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
# fit the grid search object to the data
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
best_LR = grid_search.best_estimator_

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/miniconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.84490771]


Best parameters: {'C': 10, 'max_iter': 50000, 'penalty': 'l2'}
Best score: 0.8455691727336511


# `2 - KNN`

## a. Select the best K value

In [54]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
param_grid = {'n_neighbors': [1, 2, 3, 5, 7, 9, 11], 'weights': [
    'uniform', 'distance'], 'p': [1, 2, 5, 10, 20]}
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
best_knn = grid_search.best_estimator_


Best parameters: {'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
Best cross-validation score: 0.9379426620621368


# ` 3 - XgBoost `

Grid search is infeasible, I will use random serach

In [55]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
xgb_model = XGBClassifier()
# define the parameter grid for the randomized search
param_grid = {
    'learning_rate': [0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 7, 10, 15, 20, 25, 30],
    'n_estimators': [100, 200, 300, 400, 500],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05],
    'reg_lambda': [0, 0.001, 0.005, 0.01, 0.05]
}

random_search = RandomizedSearchCV(xgb_model, param_grid, cv=5, n_iter=50, random_state=42)
random_search.fit(X_train, y_train)
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)
best_xgb = random_search.best_estimator_

Best parameters: {'subsample': 0.7, 'reg_lambda': 0, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 25, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.9}
Best cross-validation score: 0.9635149328303584


# `4 - Test results`

In [64]:
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from imblearn.metrics import geometric_mean_score as gmean
models = [best_LR, best_knn, best_xgb]
metrics = ['F1 (macro)', 'F1 (weighted)', 'MCC', 'G-mean', 'Accuracy']
results = {}

for model in models:
    y_pred = model.predict(X_test)
    
    f1_macro = f1_score(y_test, y_pred, average='macro')
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    mcc = matthews_corrcoef(y_test, y_pred)
    geo_mean = gmean(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    
    results[model] = {
        'F1 (macro)': f1_macro,
        'F1 (weighted)': f1_weighted,
        'MCC': mcc,
        'G-mean': geo_mean,
        'Accuracy': accuracy
    }

In [80]:
import matplotlib.pyplot as plt
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default='notebook'
pd.options.plotting.backend = "plotly"

plt.figure(figsize=(10, 6))

pattern = r'^[^(]*'
regex = re.compile(pattern)
model_names = [regex.match(str(model)).group() for model in models]


traces = []
for metric in metrics:
    values = [results[model][metric] for model in models]
    trace = go.Bar(x=model_names, y=values, name=metric, text=values, textposition='auto')
    traces.append(trace)

layout = go.Layout(
    title='Metrics by model',
    xaxis=dict(title='Model'),
    yaxis=dict(title='Value'),
    barmode='group'
)

fig = go.Figure(data=traces, layout=layout)
fig.show()





<Figure size 1000x600 with 0 Axes>

# ` 5 - Investigate source of error `

ARI is a metric that test the agreement between clusters and actual labels.

In [82]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

KMeans = KMeans(n_clusters=len(np.unique(y_test)), random_state=42)
KMeans.fit(X_train)
y_pred = KMeans.predict(X_test)

ari = adjusted_rand_score(y_test, y_pred)
print(f'ARI: {ari:.3f}')


ARI: 0.244


## Low ARI score imply that the boundaries are not simple to learn. 

### Let's check what samples were misclassified with respect to the decision boundaries

In [168]:
y_pred = best_xgb.predict(X_test)

misclassified = []
i = 0
for pred, true in zip(y_pred, y_test):
    misclassified.append((i, X_test.iloc[i].to_numpy(), pred != true, pred, true))
    i += 1

misclassified = pd.DataFrame(misclassified, columns=['index', 'features', 'misclassified', 'prediction', 'true'])
misclassified = misclassified[misclassified['misclassified'] == True]
x_mis = np.asanyarray(misclassified['features'].to_list())
y_probs = best_xgb.predict_proba(x_mis)
predicts_probs = y_probs.max(axis=1)
true_probs = y_probs[np.arange(len(y_probs)), misclassified['true'].to_numpy()]
misclassified['predicts_probs'] = predicts_probs
misclassified['true_probs'] = true_probs
misclassified

Unnamed: 0,index,features,misclassified,prediction,true,predicts_probs,true_probs
45,45,"[0.7406188830890889, -1.4582014925495337, -0.5...",True,3,2,0.967799,0.030476
46,46,"[0.7635083471503371, -0.7641063978347392, 1.65...",True,0,3,0.635423,0.363955
60,60,"[-0.1825895007145923, -1.4590155324216276, -0....",True,4,0,0.484305,0.369893
93,93,"[-1.0015236593503647, 1.470834494730949, -0.11...",True,4,1,0.512120,0.472426
138,138,"[-1.5941064511582372, -1.3727711515547565, -0....",True,5,1,0.676700,0.290302
...,...,...,...,...,...,...,...
1203,1203,"[0.7762247160732528, -0.8116583317191155, 0.65...",True,0,3,0.663424,0.333835
1204,1204,"[0.8601527509644966, 1.8607603525828351, -0.44...",True,3,0,0.979358,0.019528
1243,1243,"[-1.449139845436998, -0.8021479449422403, 1.43...",True,5,1,0.896205,0.008008
1248,1248,"[-1.0498458612574444, -1.4507847832877128, 0.2...",True,1,4,0.818373,0.145544


### Gently misslcassified

In [169]:
#gently misclassified
misclassified.query('true_probs > .4')

Unnamed: 0,index,features,misclassified,prediction,true,predicts_probs,true_probs
93,93,"[-1.0015236593503647, 1.470834494730949, -0.11...",True,4,1,0.51212,0.472426
278,278,"[0.6236282889982642, -0.7641063978347392, -0.4...",True,3,0,0.504519,0.489071
394,394,"[1.6689138144619362, -1.1635426424635005, 0.21...",True,6,3,0.563177,0.435971
538,538,"[0.6388879317057631, -0.8496998788266166, -0.3...",True,3,0,0.518236,0.479115
597,597,"[-0.7624559235995492, -0.4692844077516058, 0.8...",True,1,4,0.573239,0.42565
708,708,"[1.5086875660331982, 1.565938362499702, -0.779...",True,6,3,0.587497,0.411656
789,789,"[0.8245469179803325, -0.6119402094047349, -0.1...",True,2,3,0.515156,0.483293
1132,1132,"[-1.169379729132852, 1.537407202169076, 1.4330...",True,4,1,0.512918,0.479991


### Is it a problem of the model or data is noisy?


More than half of the misslassified samples are missclassified across all models. (` 38 samples !`)<br>
This could indicate that some of the misclassified data are just noisy, especially, that many of them are classified as the same class across the 3 models.

In [170]:
import warnings
warnings.filterwarnings('ignore')
X_test_numpy = X_test.to_numpy()
troublesome = []
for i in range(len(X_test_numpy)):
    true_label  = y_test[i]
    a = best_knn.predict(X_test_numpy[i].reshape(1, -1))
    b = best_LR.predict(X_test_numpy[i].reshape(1, -1))
    c = best_xgb.predict(X_test_numpy[i].reshape(1, -1))
    if a != true_label and b != true_label and c != true_label:
        troublesome.append((i, X_test_numpy[i], true_label, a, b, c))

print(f'Number of troublesome samples: {len(troublesome)}')
troublesome = pd.DataFrame(troublesome, columns=['index','features', 'true', 'knn', 'lr', 'xgb'])
troublesome

Number of troublesome samples: 38


Unnamed: 0,index,features,true,knn,lr,xgb
0,60,"[-0.1825895007145923, -1.4590155324216276, -0....",0,[4],[5],[4]
1,138,"[-1.5941064511582372, -1.3727711515547565, -0....",1,[5],[5],[5]
2,146,"[0.1887284718345466, -1.4572065702152692, 0.10...",3,[0],[0],[0]
3,186,"[-1.545784249251157, -1.2776672837860037, 0.10...",1,[5],[4],[5]
4,206,"[0.9415375120711572, 0.4151815624977942, -0.22...",2,[3],[3],[3]
5,254,"[-0.1902193220683418, -0.7260648507272381, -1....",4,[0],[5],[0]
6,324,"[0.4049067435241138, -1.1730530292403758, -0.5...",3,[0],[0],[0]
7,342,"[0.3616710891862004, 1.118950183986564, -0.779...",3,[0],[0],[0]
8,377,"[-0.2029356909912575, 0.2439946005140393, -1.1...",0,[4],[4],[4]
9,383,"[-0.1953058696375081, -0.7070440771734876, -0....",4,[5],[5],[5]


## Let's check the probs of those troublsome samples

In [181]:
# missclassified entries that have the same features in troublesome
trouble_probs= misclassified.merge(troublesome, on='index')[['true_probs']].query('true_probs <.25')
print(f'Number of heavily misclassified: {len(trouble_probs)}')
trouble_probs


Number of heavily misclassified: 27


Unnamed: 0,true_probs
3,0.064893
4,0.000116
5,0.03407
7,0.032525
10,0.053394
11,0.005019
12,0.014388
13,0.01094
14,0.0131
16,0.017444


# ` 6 - Conclusion `

I chose Logistic Regression, KNN, and  Xgboost Classifeir to perform on the preprocessed data.
At first, they were tuned using 5-fold cross validation. Furthermore, they all were exposed to the same task and performed relatively well on the preprocessed data, however XgBoost classifier showed superiourity with accuracy of 94.43%.
<br>
<br>
To investiage the source of the error, misclassified samples were analyzed and observed to be slightly noisy. This conclusion was reached by counting common misclassified samples across all models. The common misclassified samples across all models were over 50% from the total misclassified by the xgBoost model alone, which shows that some samples were to complex or noisy.