In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RepeatedStratifiedKFold,
    cross_val_score,
    cross_val_predict,
)

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

## Data loading and cleaning

We load the .csv and start by looking at the data, so that we know what we are working with.

In [2]:
# Load the raw measurements; columns that use a decimal comma are read as
# strings (dtype object) and are converted in a later cell.
df = pd.read_csv('task_data.csv')

display(df)
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() would only append a stray "None" line to the output.
df.info()

Unnamed: 0,ID,Cardiomegaly,Heart width,Lung width,CTR - Cardiothoracic Ratio,xx,yy,xy,normalized_diff,Inscribed circle radius,Polygon Area Ratio,Heart perimeter,Heart area,Lung area
0,1,0,172,405,424691358,1682.360871,3153.67188,-638.531109,-0.304239,688186,0.213446,6794873689,24898,75419
1,2,1,159,391,4066496164,1526.66096,5102.159054,-889.678405,-0.539387,7392564,0.203652,7886589419,29851,94494
2,5,0,208,400,52,2465.903392,5376.834707,-1755.344699,-0.371163,6933974,0.320787,8623229369,33653,66666
3,7,1,226,435,5195402299,2509.063593,6129.82127,-1025.079806,-0.419123,8414868,0.317545,906724959,42018,82596
4,8,1,211,420,5023809524,2368.770135,5441.767075,-1493.040062,-0.393442,7378347,0.263542,8642396777,35346,85631
5,9,1,222,405,5481481481,2351.057355,8378.677729,-812.061371,-0.561768,8386298,0.328101,1001068103,46381,92755
6,10,1,202,498,40562249,2251.87946,4467.406612,-511.955541,-0.329727,8472308,0.209396,8082985504,35417,116542
7,11,1,228,474,4810126582,2971.936804,3971.943088,-885.822012,-0.144013,8373769,0.242418,8219554045,36716,97325
8,12,0,176,449,3919821826,1833.841218,3709.72454,-831.472858,-0.338389,7340981,0.181388,7279726499,27939,99656
9,13,1,223,473,4714587738,2412.707474,6169.769803,-880.450924,-0.43776,7641989,0.261665,9229259679,38162,105442


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          37 non-null     int64  
 1   Cardiomegaly                37 non-null     int64  
 2   Heart width                 37 non-null     int64  
 3   Lung width                  37 non-null     int64  
 4   CTR - Cardiothoracic Ratio  37 non-null     object 
 5   xx                          37 non-null     float64
 6   yy                          37 non-null     float64
 7   xy                          37 non-null     float64
 8   normalized_diff             37 non-null     float64
 9   Inscribed circle radius     37 non-null     object 
 10  Polygon Area Ratio          37 non-null     float64
 11  Heart perimeter             37 non-null     object 
 12  Heart area                  37 non-null     int64  
 13  Lung area                   37 non-nu

Our check reveals three key insights. First, there is not much data: only 37 rows, which we need to keep in mind. Second, luckily there are no nulls. Finally, a few columns need fixing because they use commas instead of dots as the decimal separator.

In [3]:
# Columns that were read as text hold numbers written with a decimal comma.
# Swap the comma for a dot, then parse each of those columns as numeric.
object_columns = df.select_dtypes(include='object').columns
for column in object_columns:
    df[column] = pd.to_numeric(df[column].str.replace(',', '.'))

After cleaning the data we separate the features `X` and the target `y`. We then split the data into a training set and a test set. We use `random_state=42` to ensure repeatability and `stratify` because, with so little data, we want the training and test sets to contain similar proportions of positive and negative cases.

In [4]:
# The label is 'Cardiomegaly'; every column except it and 'ID' is a feature.
y = df['Cardiomegaly']
X = df.drop(['ID', 'Cardiomegaly'], axis=1)

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Model training

Disclaimer: To make this notebook run quickly, I intentionally commented out the hyperparameter searches and left only the final models, with the optimal parameters hard-coded below.
Nevertheless, the searches are kept as "proof of work" and can be uncommented and run if you wish.

### K-Nearest Neighbors (KNN) Classifier

We start off by defining the `param_grid` that is going to be used to find the best hyperparameters. Then we create a `Pipeline` that first applies a `StandardScaler` before running the classifier. To evaluate the model reliably we also define a `RepeatedStratifiedKFold` that runs 5 splits, repeated 100 times, so that our accuracy score is stable and not just the result of a single lucky data split.

These components are then passed to `GridSearchCV`. Its job is to test all possible combinations from the `param_grid` and find the single best one.



In [5]:
# Hyperparameter search for the KNN pipeline (StandardScaler -> KNeighborsClassifier).
# Disabled on purpose so the notebook runs quickly; uncomment the whole cell
# to re-run it. It scores every combination in param_grid by accuracy using
# 5-fold stratified CV repeated 100 times on the training split only.
#
# param_grid = {
#     "model__n_neighbors": [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15],
#     "model__weights": ["uniform", "distance"],
#     "model__metric": ["minkowski", "manhattan", "euclidean", "chebyshev"],
# }
#
# rskf = RepeatedStratifiedKFold(
#     n_splits=5,
#     n_repeats=100,
#     random_state=42
# )
#
# pipe_knn = Pipeline(steps=[
#     ("scaler", StandardScaler()),
#     ("model", KNeighborsClassifier())
# ])
#
# grid_search = GridSearchCV(
#     estimator=pipe_knn,
#     param_grid=param_grid,
#     scoring="accuracy",
#     cv=rskf,
#     verbose=1,
#     n_jobs=-1
# )
#
# grid_search.fit(X_train, y_train)
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best accuracy (averaged CV): {grid_search.best_score_:.4f}")

In the next cell, we take the results from the search above and hard-code them.

We use these specific parameters to create our final pipeline, `pipe_knn_final`. This allows us to instantly create our best-performing model without re-running the search. Finally, we train this optimized pipeline on the entire `X_train` and `y_train` dataset, making it ready for predictions.

We will repeat the same process for all classifiers.

In [7]:
# Winning configuration and its CV score, copied from the commented-out
# grid-search cell above.
knn_best_params = {
    'model__metric': 'minkowski',
    'model__n_neighbors': 2,
    'model__weights': 'distance',
}
knn_best_score = 0.8152

# The keys already follow the pipeline's "<step>__<parameter>" naming, so they
# are applied in one go with set_params rather than unpacked one by one.
pipe_knn_final = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier()),
])
pipe_knn_final.set_params(**knn_best_params)

# Train the tuned pipeline on the full training split.
pipe_knn_final.fit(X_train, y_train)

print(f"Best CV Score (from GridSearch): {knn_best_score:.4f}")
print(f"Best Params (from GridSearch): {knn_best_params}")

Best CV Score (from GridSearch): 0.8152
Best Params (from GridSearch): {'model__metric': 'minkowski', 'model__n_neighbors': 2, 'model__weights': 'distance'}


### Decision Tree

In [14]:
# Hyperparameter search for the decision-tree pipeline
# (StandardScaler -> DecisionTreeClassifier). Disabled on purpose so the
# notebook runs quickly; uncomment the whole cell to re-run it. It scores every
# combination in param_grid by accuracy using 5-fold stratified CV repeated
# 100 times on the training split only.
#
# param_grid = {
#     "model__criterion": ["gini", "entropy", "log_loss"],
#     "model__max_depth": [5, 6, 7, 8, None],
#     "model__min_samples_split" : [3, 4, 5, 6, 7, 10],
#     "model__class_weight" : [None, "balanced"],
#     "model__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9]
# }
#
# rsfk = RepeatedStratifiedKFold(
#     n_splits=5,
#     n_repeats=100,
#     random_state=42
# )
#
# pipe_tree = Pipeline(steps=[
#     ("scaler", StandardScaler()),
#     ("model", DecisionTreeClassifier())
# ])
#
# grid = GridSearchCV(
#     estimator=pipe_tree,
#     param_grid=param_grid,
#     scoring="accuracy",
#     cv=rsfk,
#     n_jobs=-1,
#     verbose=1
# )
#
#
# grid.fit(X_train, y_train)
# print(f"\nBest parameters: {grid.best_params_}")
# print(f"Best mean CV accuracy: {grid.best_score_:.4f}")


Fitting 500 folds for each of 1620 candidates, totalling 810000 fits

Best parameters: {'model__class_weight': None, 'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_leaf': 8, 'model__min_samples_split': 6}
Best mean CV accuracy: 0.7381


In [26]:
# Winning configuration and its CV score, copied from the commented-out
# grid-search cell above.
tree_best_params = {'model__class_weight': None, 'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_leaf': 8, 'model__min_samples_split': 6}
tree_best_score = 0.7381

# Final decision-tree pipeline built from the hard-coded best parameters.
pipe_tree_final = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", DecisionTreeClassifier(
        max_depth=tree_best_params["model__max_depth"],
        criterion=tree_best_params["model__criterion"],
        min_samples_split=tree_best_params["model__min_samples_split"],
        min_samples_leaf=tree_best_params["model__min_samples_leaf"],
        class_weight=tree_best_params["model__class_weight"],
        # scikit-learn permutes the features at every split, so equally good
        # splits can be picked differently between runs. Pin the seed so the
        # fitted tree is reproducible, matching the seed used for the data
        # split and the random forest.
        random_state=42,
    )),
])

# Train the tuned pipeline on the full training split.
pipe_tree_final.fit(X_train, y_train)

print(f"Best CV Score (from GridSearch): {tree_best_score:.4f}")
print(f"Best Params (from GridSearch): {tree_best_params}")

Best CV Score (from GridSearch): 0.7381
Best Params (from GridSearch): {'model__class_weight': None, 'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_leaf': 8, 'model__min_samples_split': 6}


### Random Forest

In [36]:
# Hyperparameter search for the random-forest pipeline
# (StandardScaler -> RandomForestClassifier). Disabled on purpose so the
# notebook runs quickly; uncomment the whole cell to re-run it.
# NOTE: this search uses 3 splits x 5 repeats, unlike the 5 splits x 100
# repeats used for the KNN and decision-tree searches.
#
# param_grid = {
#     "model__n_estimators": [25, 50, 75, 100, 150],
#     "model__max_depth": [1, 2, 3, None],
#     "model__min_samples_split": [2, 3, 4],
#     "model__min_samples_leaf": [2, 3, 4, 5, 6, 7],
#     "model__max_features": ["sqrt", "log2"],
#     "model__criterion": ["entropy", "gini"]
# }
#
# rsfk = RepeatedStratifiedKFold(
#     n_splits=3,
#     n_repeats=5,
#     random_state=42
# )
#
# pipe_rf = Pipeline(steps=[
#     ("scaler", StandardScaler()),
#     ("model", RandomForestClassifier(random_state=42))
# ])
#
# grid = GridSearchCV(
#     estimator=pipe_rf,
#     param_grid=param_grid,
#     scoring="accuracy",
#     cv=rsfk,
#     n_jobs=-1,
#     verbose=1
# )
#
# grid.fit(X_train, y_train)
#
# print(f"Best parameters: {grid.best_params_}")
# print(f"Best mean CV accuracy: {grid.best_score_:.4f}")
#
# # 7. Test on the test data
# # NOTE(review): y_pred is assigned here but not used anywhere in this cell.
# best_rf = grid.best_estimator_
# y_pred = best_rf.predict(X_test)


Fitting 15 folds for each of 1440 candidates, totalling 21600 fits
Best parameters: {'model__criterion': 'entropy', 'model__max_depth': 1, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 2, 'model__n_estimators': 50}
Best mean CV accuracy: 0.7593


In [38]:
# Winning configuration and its CV score, copied from the commented-out
# grid-search cell above.
rf_best_params = {
    'model__criterion': 'entropy',
    'model__max_depth': 1,
    'model__max_features': 'sqrt',
    'model__min_samples_leaf': 5,
    'model__min_samples_split': 2,
    'model__n_estimators': 50,
}
rf_best_score = 0.7593

# The keys already follow the pipeline's "<step>__<parameter>" naming, so they
# are applied in one go with set_params; only the seed is passed directly.
pipe_rf_final = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(random_state=42)),
])
pipe_rf_final.set_params(**rf_best_params)

# Train the tuned pipeline on the full training split.
pipe_rf_final.fit(X_train, y_train)

print(f"Best CV Score (from GridSearch): {rf_best_score:.4f}")
print(f"Best Params (from GridSearch): {rf_best_params}")

Best CV Score (from GridSearch): 0.7593
Best Params (from GridSearch): {'model__criterion': 'entropy', 'model__max_depth': 1, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 2, 'model__n_estimators': 50}
