<a href="https://colab.research.google.com/github/Simarjit1303/Data-Science/blob/main/exercises/machine-learning/supervised-learning/decision_trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decision Trees
You should build a machine learning pipeline using a decision tree model. In particular, you should do the following:
- Load the `mnist` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
- Conduct data exploration, data preprocessing, and feature engineering if necessary.
- Train and test a decision tree model using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html).
- Check the documentation to identify the most important hyperparameters, attributes, and methods of the model. Use them in practice.

In [69]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# Classifiers
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Data Collection and Exploration

In [70]:
dataset = pd.read_csv('https://raw.githubusercontent.com/m-mahdavi/teaching/refs/heads/main/datasets/mnist.csv')
dataset.head(3)

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,31953,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34452,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60897,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
dataset.describe()

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,34415.17925,4.4395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07675,0.01525,0.013,0.0015,0.0,0.0,0.0,0.0,0.0,0.0
std,20508.890104,2.879655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.616022,0.964495,0.822192,0.094868,0.0,0.0,0.0,0.0,0.0,0.0
min,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16575.75,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,34435.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,52111.5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,69998.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,125.0,61.0,52.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# removing ID columns as it is no use
dataset.drop(columns = ['id'], inplace = True)
# checiking null values
dataset.isnull().sum().value_counts()

Unnamed: 0,count
0,785


# Data Preprocessing

In [73]:
# splitting dataset into 2 sets training set(75%) and test set(25%)
train_data, test_data = train_test_split(dataset)
print(f"dataset_size: {dataset.shape}")
print(f"datset_trained_size: {train_data.shape}")
print(f"dataset_test_size: {test_data.shape}")

dataset_size: (4000, 785)
datset_trained_size: (3000, 785)
dataset_test_size: (1000, 785)


In [74]:
# Declaring feature vector and target variable
x_train = train_data.drop('class', axis=1)
y_train = train_data['class']
x_test = test_data.drop('class', axis=1)
y_test = test_data['class']

print(f"x_train_size: {x_train.shape}")
print(f"y_train_size: {y_train.shape}")
print(f"x_test_size: {x_test.shape}")
print(f"y_test_size: {y_test.shape}")

x_train_size: (3000, 784)
y_train_size: (3000,)
x_test_size: (1000, 784)
y_test_size: (1000,)


# Feature Scalling

In [75]:
# Scaling
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Dimensionality Reduction (optional)
pca = PCA(n_components=0.95)
x_train_pca = pca.fit_transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

# # [Method 1] Data Modelling and prediction with SVM.SVC(default parameter)

In [76]:
# creating modell for prediction
# Default hyperparameter means C=1.0, kernel=rbf and gamma=auto among other parameters

# initialisinfg classifier with default parameter
svc1 = SVC()
svc2 = SVC()
# fitting classifier to training dataset
svc1.fit(x_train, y_train)
svc2.fit(x_train_pca, y_train)
# makimg prediction
y_predict1 = svc1.predict(x_test)
y_predict2 = svc2.predict(x_test_pca)

# checking the accuracy of our model before pca
accuracy = accuracy_score(y_test, y_predict1)
print(f"Accuracy of our model before PCA: {accuracy*100:.2f}%")

# checking the accuracy of our model after pca
accuracy = accuracy_score(y_test, y_predict2)
print(f"Accuracy of our model after PCA: {accuracy*100:.2f}%")


Accuracy of our model before PCA: 94.80%
Accuracy of our model after PCA: 91.50%


# [Method 2] Data Modelling and prediction with KNN

In [77]:
# using knn model before pca
knn1 = KNeighborsClassifier(n_neighbors=5)
knn1.fit(x_train, y_train)
y_pred_knn1 = knn1.predict(x_test)
accuracy_knn1 = accuracy_score(y_test, y_pred_knn1)
print(f"Accuracy of KNN before pca: {accuracy_knn1 * 100:.2f}%")


# using knn model before pca
knn2 = KNeighborsClassifier(n_neighbors=5)
knn2.fit(x_train_pca, y_train)
y_pred_knn2 = knn2.predict(x_test_pca)
accuracy_knn2 = accuracy_score(y_test, y_pred_knn2)
print(f"Accuracy of KNN after pca : {accuracy_knn2 * 100:.2f}%")

Accuracy of KNN before pca: 90.60%
Accuracy of KNN after pca : 88.40%


# [Method 3] Data Modelling and prediction with naive_baye with GaussianNB

In [78]:
# using naive_baye with GaussianNB model before pca and scalling
nb1 = GaussianNB()
nb1.fit(x_train, y_train)
y_pred1 = nb1.predict(x_test)
accuracy1 = accuracy_score(y_test, y_pred1)
print(f"Accuracy with naive_baye using GaussianNB before pca: {accuracy1 * 100:.2f}%")

# using naive_baye with GaussianNB model before pca
nb2 = GaussianNB()
nb2.fit(x_train_pca, y_train)
y_pred2 = nb2.predict(x_test_pca)
accuracy2 = accuracy_score(y_test, y_pred2)
print(f"Accuracy with naive_baye using GaussianNB after pca: {accuracy2 * 100:.2f}%")


Accuracy with naive_baye using GaussianNB before pca: 59.60%
Accuracy with naive_baye using GaussianNB after pca: 52.90%


# [Method 4] Data Modelling and prediction with decision tree

In [79]:
# Model with 'gini' before feature scalling
model_gini_1 = DecisionTreeClassifier()
model_gini_1.fit(x_train, y_train)
y_pred_gini_1 = model_gini_1.predict(x_test)
accuracy_gini_1 = accuracy_score(y_test, y_pred_gini_1)
print(f"Accuracy with decision tree before feature scalling': {accuracy_gini_1 * 100:.2f}%")

# Model with 'gini' after feature scalling
model_gini_2 = DecisionTreeClassifier()
model_gini_2.fit(x_train_pca, y_train)
y_pred_gini_2 = model_gini_2.predict(x_test_pca)
accuracy_gini_2 = accuracy_score(y_test, y_pred_gini_2)
print(f"Accuracy with  decision tree after feature scalling : {accuracy_gini_2 * 100:.2f}%")

Accuracy with decision tree before feature scalling': 77.20%
Accuracy with  decision tree after feature scalling : 70.00%


## Using 'Gini' decision tree

In [80]:
# Model with 'gini' before feature scalling
model_gini_1 = DecisionTreeClassifier(criterion='gini')  # Specify 'gini'
model_gini_1.fit(x_train, y_train)
y_pred_gini_1 = model_gini_1.predict(x_test)
accuracy_gini_1 = accuracy_score(y_test, y_pred_gini_1)
print(f"Accuracy with 'gini before feature scalling': {accuracy_gini_1 * 100:.2f}%")

# Model with 'gini' after feature scalling
model_gini_2 = DecisionTreeClassifier(criterion='gini')  # Specify 'gini'
model_gini_2.fit(x_train_pca, y_train)
y_pred_gini_2 = model_gini_2.predict(x_test_pca)
accuracy_gini_2 = accuracy_score(y_test, y_pred_gini_2)
print(f"Accuracy with 'gini' after feature scalling : {accuracy_gini_2 * 100:.2f}%")

Accuracy with 'gini before feature scalling': 77.50%
Accuracy with 'gini' after feature scalling : 70.70%


## Using 'Entropy' decision tree

In [81]:
# Model with 'entropy' before pca and scalling
model_entropy_1 = DecisionTreeClassifier(criterion='entropy')  # Specify 'entropy'
model_entropy_1.fit(x_train, y_train)
y_pred_entropy_1 = model_entropy_1.predict(x_test)
accuracy_entropy_1 = accuracy_score(y_test, y_pred_entropy_1)
print(f"Accuracy with 'entropy' before feature scalling: {accuracy_entropy_1 * 100:.2f}%")

# Model with 'entropy' before pca and scalling after pca and scalling
model_entropy_2 = DecisionTreeClassifier(criterion='entropy')  # Specify 'entropy'
model_entropy_2.fit(x_train_pca, y_train)
y_pred_entropy_2 = model_entropy_2.predict(x_test_pca)
accuracy_entropy_2 = accuracy_score(y_test, y_pred_entropy_1)
print(f"Accuracy with 'entropy' after feature scalling: {accuracy_entropy_2 * 100:.2f}%")

Accuracy with 'entropy' before feature scalling: 78.80%
Accuracy with 'entropy' after feature scalling: 78.80%
