<a href="https://colab.research.google.com/github/Simarjit1303/Data-Science/blob/main/exercises/machine-learning/supervised-learning/model_training_and_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Training and Evaluation
You should build a machine learning pipeline with a complete model training and evaluation step. In particular, you should do the following:
- Load the `mnist` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
- Conduct data exploration, data preprocessing, and feature engineering if necessary.
- Choose a few machine learning algorithms, such as [KNN](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html), [decision tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html), and [gradient boosting](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html).
- Define a grid of hyperparameters for every selected model.
- Conduct [grid search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) or [random search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) using k-fold cross-validation on the training set to find out the best model (i.e., the best algorithm and its hyperparameters).
- Train the best model on the whole training set.
- Test the trained model on the test set and report various [evaluation metrics](https://scikit-learn.org/0.15/modules/model_evaluation.html).  
- Check the documentation to identify the most important hyperparameters, attributes, and methods. Use them in practice.

In [61]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# Classifiers
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [62]:
dataset = pd.read_csv('https://raw.githubusercontent.com/m-mahdavi/teaching/refs/heads/main/datasets/mnist.csv')
dataset.head(3)

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,31953,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34452,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60897,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
dataset.describe()

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,34415.17925,4.4395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07675,0.01525,0.013,0.0015,0.0,0.0,0.0,0.0,0.0,0.0
std,20508.890104,2.879655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.616022,0.964495,0.822192,0.094868,0.0,0.0,0.0,0.0,0.0,0.0
min,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16575.75,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,34435.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,52111.5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,69998.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,125.0,61.0,52.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
dataset.drop(columns = ['id'], inplace=True)
dataset.isnull().sum().value_counts()

Unnamed: 0,count
0,785


In [65]:
train_data, test_data = train_test_split(dataset)
print(f"dataset_size: {dataset.shape}")
print(f"datset_trained_size: {train_data.shape}")
print(f"dataset_test_size: {test_data.shape}")

dataset_size: (4000, 785)
datset_trained_size: (3000, 785)
dataset_test_size: (1000, 785)


In [66]:
x_train = train_data.drop(columns=['class'], axis=1)
y_train = train_data['class']
x_test = test_data.drop(columns=['class'], axis=1)
y_test = test_data['class']

print(f"x_train_size: {x_train.shape}")
print(f"y_train_size: {y_train.shape}")
print(f"x_test_size: {x_test.shape}")
print(f"y_test_size: {y_test.shape}")

x_train_size: (3000, 784)
y_train_size: (3000,)
x_test_size: (1000, 784)
y_test_size: (1000,)


In [67]:
# Scaling
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Defining hyperparameter

In [71]:
# Defining hyperparameter grids for KNN, Decision Tree and Gradient Boosting

svc_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

knn_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

dt_params = {
    'max_depth': [None, 10, 20],
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 2, 4]
}

gb_params = {
    'n_estimators': [50, 100, 150],
    'learning_rate' : [0.01, 0.1, 0.2],
    'max_depth' : [3, 4, 5]
}
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Declaring common hyperparametrs

In [72]:
# Creating an dusing dictionary of models and their hyperparameter grids
models = {
    'SVC': (SVC(), svc_params),
    'KNN' : (KNeighborsClassifier(), knn_params),
    'Decision Tree': (DecisionTreeClassifier(), dt_params),
    'Gradient Boosting': (GradientBoostingClassifier(), gb_params),
    'Random Forest': (RandomForestClassifier(), rf_params)
}

# Conducting Grid Search with Cross_Validation


In [None]:
# performing Grid Search for each model
best_model_1 = None
best_score_1 = 0

# Using 5-fold cross-validation
for model_name, (model, params) in models.items():
  grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
  grid_search.fit(x_train_scaled, y_train)

  print(f"Best parameters for {model_name}: {grid_search.best_params_}")
  print(f"Best score for {model_name}: {grid_search.best_score_}")

  if grid_search.best_score_ > best_score_1:
    best_score_1 = grid_search.best_score_
    best_model_1 = grid_search.best_estimator_

print(f"The best model is: {best_model_1}")

Best parameters for SVC: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Best score for SVC: 0.9186666666666667
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best score for KNN: 0.8940000000000001
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best score for Decision Tree: 0.745


# Training the models on whole tarining set

In [None]:
# Train the best models on the whole training set
best_model_1.fit(x_train_scaled, y_train)

# Testing the accuracy of prediction

In [None]:
# Make predictions on the test set
y_pred_1 = best_model_1.predict(x_test_scaled)
# Evaluate the model
accuracy_1 = accuracy_score(y_test, y_pred_1)
# Printing the accuarcy
print(f"Accuracy by using GridSearch on {best_model_1}: {accuracy_1 * 100:.2f}%")