1. Import the data
  - use mounting and pd.read_excel
2. Clean the data
  - Do not forget to do imputation
  - check df.info()
    - be sure that there is no problem with the types
2. Split the data
2. Is it a claasification or regression task?
3. Use different ML models to build a predictive model
4. Use Grid Search for hyperparameter tunning

In [2]:
# Import the data

import pandas as pd
# mount
# connection between colab notebook and drive
from google.colab import drive
drive.mount('/content/drive')

# import covid file
df_covid = pd.read_excel('/content/drive/My Drive/QCC-Bootcamp/Machine Learning/LAB/covid.xlsx')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Clean the data

# Check the first few rows of the dataframe
print(df_covid.head())

# Check the info of the dataframe
print(df_covid.info())

# Check for missing values
print(df_covid.isnull().sum())

# Impute missing values with median for numerical columns
for column in df_covid.select_dtypes(include=['float64', 'int64']).columns:
    df_covid[column].fillna(df_covid[column].median(), inplace=True)

# Check again for missing values
print(df_covid.isnull().sum())

  SESSO  AGE   WBC  Piastrine  Neutrofili Linfociti  Monociti  Eosinofili  \
0     M   56   2.9      128.0    1.900000       0.8  0.200000    0.000000   
1     M   56   3.5      151.0    2.100000       0.9  0.400000    0.000000   
2     M   72   4.6      206.0    6.200478         1  0.605742    0.055024   
3     M   72  16.5      316.0   14.000000       1.2  0.300000    0.000000   
4     M   77   4.9      198.0    6.200478         1  0.605742    0.055024   

   Basofili         PCR        AST        ALT       ALP        GGT  \
0  0.000000   29.000000  36.000000  18.000000  43.00000  21.000000   
1  0.000000   16.500000  25.000000  14.000000  50.00000  17.000000   
2  0.014423  193.700000  31.000000  22.000000  89.89313  82.477941   
3  0.000000  318.700000  96.000000  33.000000  80.00000  42.000000   
4  0.014423   90.889011  54.202166  44.917293  89.89313  82.477941   

          LDH  TARGET  
0  257.000000       2  
1  207.000000       2  
2  380.448454       2  
3  651.000000       

In [9]:
# Split the Data

from sklearn.model_selection import train_test_split

X = df_covid.drop('TARGET', axis=1)
y = df_covid['TARGET']

# Convert categorical data to numerical data
X = pd.get_dummies(X, drop_first=True)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
# Use different ML models

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# kNN Classifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("kNN Classifier Report:")
print(classification_report(y_test, y_pred))

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print("Decision Tree Classifier Report:")
print(classification_report(y_test, y_pred))

# Random Forest Classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Random Forest Classifier Report:")
print(classification_report(y_test, y_pred))

kNN Classifier Report:
              precision    recall  f1-score   support

           0       0.52      0.60      0.56        20
           1       0.45      0.39      0.42        23
           2       0.38      0.38      0.38        13

    accuracy                           0.46        56
   macro avg       0.45      0.46      0.45        56
weighted avg       0.46      0.46      0.46        56

Decision Tree Classifier Report:
              precision    recall  f1-score   support

           0       0.62      0.50      0.56        20
           1       0.73      0.70      0.71        23
           2       0.39      0.54      0.45        13

    accuracy                           0.59        56
   macro avg       0.58      0.58      0.57        56
weighted avg       0.61      0.59      0.60        56

Random Forest Classifier Report:
              precision    recall  f1-score   support

           0       0.75      0.75      0.75        20
           1       0.75      0.78      0

In [13]:
# Grid Search for hyperparameter tuning

from sklearn.model_selection import GridSearchCV

# Grid search for kNN
knn_params = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_grid.fit(X_train, y_train)
print(f"Best Parameters for kNN: {knn_grid.best_params_}")

# Grid search for Decision Tree
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(1, 11)),
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3]
}
dt_grid = GridSearchCV(DecisionTreeClassifier(), dt_params, cv=5)
dt_grid.fit(X_train, y_train)
print(f"Best Parameters for Decision Tree: {dt_grid.best_params_}")

# Grid search for Random Forest
rf_params = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 3]
}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=5)
rf_grid.fit(X_train, y_train)
print(f"Best Parameters for Random Forest: {rf_grid.best_params_}")

Best Parameters for kNN: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
Best Parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 4}
Best Parameters for Random Forest: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
