## Decision Tree Classifier

In [0]:
import warnings

# Silence every warning for the rest of the session so cell output stays
# uncluttered.
# NOTE(review): a blanket "ignore" also hides pandas / scikit-learn
# deprecation and chained-assignment warnings — consider narrowing it.
warnings.filterwarnings("ignore")

import pandas as pd

In [0]:
# Remote CSV file that backs every cell of this notebook.
DATA_URL = 'https://raw.githubusercontent.com/SurathuKumarSai/PythonML/refs/heads/main/diabetes_data.csv'

# Download and parse the file into a DataFrame (needs network access).
diabates_data = pd.read_csv(DATA_URL)

In [0]:
# Preview the first rows (head() defaults to five) to sanity-check the load.
diabates_data.head()

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
diabates_data.describe()

In [0]:
# Show the column labels; the feature list and target are picked from these.
diabates_data.columns

In [0]:
# Predictor columns fed to the classifier.
feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Age']

# Take an explicit copy: the 'Pregnancies' column of X is overwritten later in
# the notebook, and assigning into a bare column selection of diabates_data is
# a chained assignment whose effect on the source frame is ambiguous.
X = diabates_data[feature_cols].copy()
# Target column the classifier learns to predict.
y = diabates_data['Outcome']


In [0]:
# Report the dimensions of the feature matrix, then of the target vector.
for part in (X, y):
    print(part.shape)


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed seed keeps the partition identical from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=7)
X_train, X_test, y_train, y_test = split_parts

In [0]:
# Dimensions of the training features and training labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

In [0]:
# Dimensions of the held-out features and held-out labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")


In [0]:
from sklearn.tree import DecisionTreeClassifier

# Create the Decision Tree classifier object.
# The estimator permutes candidate features at every split, so without a seed
# tied splits can resolve differently between runs; fixing random_state (same
# seed as the train/test split) makes the fitted tree reproducible.
clf = DecisionTreeClassifier(random_state=7)

In [0]:
# Train the Decision Tree classifier - check the output of this cell.
# NOTE(review): at this point 'Pregnancies' presumably still holds text labels
# (the Encoding section further down maps them to integers), so this fit looks
# like a deliberate demonstration of the resulting error — confirm.
clf = clf.fit(X_train,y_train)

## Encoding

In [0]:
# Frequency of each distinct 'Pregnancies' value, to see which labels need encoding.
X['Pregnancies'].value_counts()

In [0]:
## Manual Encoding

# Ordinal code for each text label expected in the 'Pregnancies' column.
category_mapping = {
    'Zero':0,
    'One': 1,
    'Two': 2,
    'More Than 2': 3
}

# Rebind X to its own copy so the column assignment below never writes into a
# selection of the source DataFrame (chained assignment).
X = X.copy()

# Vectorised dictionary lookup; any value missing from the mapping becomes NaN.
encoded = X['Pregnancies'].map(category_mapping)

# Unknown labels stay an error (KeyError, as before), but the message now names
# the offending values instead of surfacing from inside a lambda.
unmapped = X.loc[encoded.isna(), 'Pregnancies'].unique().tolist()
if unmapped:
    raise KeyError(
        f"Unexpected 'Pregnancies' values {unmapped!r}; "
        "has this cell already been run on encoded data?"
    )

X['Pregnancies'] = encoded.astype(int)

# Confirm the result: the counts should now be keyed by the integer codes.
X['Pregnancies'].value_counts()

In [0]:
# Redo the 80/20 split with the same seed so both partitions are drawn from
# the current (encoded) contents of X.
split_parts = train_test_split(X, y, test_size=0.2, random_state=7)
X_train, X_test, y_train, y_test = split_parts

# Fit the Decision Tree classifier on the fresh training partition.
clf = clf.fit(X=X_train, y=y_train)

In [0]:
# Class predictions for the same rows the model was trained on.
y_train_pred = clf.predict(X=X_train)

In [0]:
# Display the array of predicted training labels.
y_train_pred

In [0]:
from sklearn.metrics import accuracy_score

# Fraction of training rows whose predicted class equals the true label.
accuracy_score(y_true=y_train, y_pred=y_train_pred)


In [0]:
# Class predictions for the held-out rows the model has not seen.
y_test_pred = clf.predict(X=X_test)

In [0]:

# Fraction of held-out rows classified correctly; compare with the training
# accuracy to judge how well the tree generalises.
accuracy_score(y_true=y_test, y_pred=y_test_pred)


In [0]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Open a wide figure first; plot_tree then draws the fitted tree into it.
plt.figure(figsize=(25,10))
# Binding the return value keeps the notebook from echoing it as cell output.
a = plot_tree(clf)

### Pruning

In [0]:
# Limit the tree to three levels so it cannot grow until it memorises the
# training data. random_state is fixed (same seed as the train/test split) so
# that tied splits resolve identically on every run.
clf = DecisionTreeClassifier(max_depth=3, random_state=7)

# Train the depth-limited Decision Tree classifier.
clf = clf.fit(X_train,y_train)

In [0]:
# Training-set predictions from the depth-limited tree.
y_train_pred = clf.predict(X=X_train)

In [0]:
# Training accuracy of the depth-limited tree.
accuracy_score(y_true=y_train, y_pred=y_train_pred)


In [0]:
# Held-out predictions from the depth-limited tree.
y_test_pred = clf.predict(X=X_test)

In [0]:

# Held-out accuracy of the depth-limited tree.
accuracy_score(y_true=y_test, y_pred=y_test_pred)


In [0]:
# Draw the depth-limited tree in a new wide figure (default node labels).
plt.figure(figsize=(25,10))
# Binding the return value keeps the notebook from echoing it as cell output.
a = plot_tree(clf)

In [0]:
# Same tree again, now with readable column names on each split and nodes
# coloured by majority class.
plt.figure(figsize=(25,10))
# feature_names is given as a plain list: a pandas Index is rejected by the
# parameter validation of some scikit-learn releases.
a = plot_tree(clf,feature_names=list(X.columns),filled=True)

In [0]:
# NOTE(review): 252 and 87 look like per-node sample counts read off the tree
# plot — confirm. Their sum is printed next to the training-set shape so the
# two can be compared by eye.
print(252 + 87, X_train.shape, sep="\n")

### Hyperparameter Tuning

In [0]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to compare: 2, 3 and 4.
parameters = {'max_depth':range(2,5)}

# 4-fold cross-validated search over the grid, run on 3 parallel workers.
# The base estimator is seeded so the trees fitted in each fold — and with
# them the cross-validation scores — are reproducible between runs.
clf = GridSearchCV(DecisionTreeClassifier(random_state=7), param_grid=parameters, cv = 4, n_jobs=3)

clf.fit(X_train,y_train)


In [0]:
# Best mean cross-validation score, then the parameter setting that achieved it.
print(clf.best_score_, clf.best_params_, sep="\n")

In [0]:
# Winning estimator exposed by the grid search.
model = clf.best_estimator_

# Predict the training partition first, then the held-out partition.
y_pred_train, y_pred_test = (model.predict(part) for part in (X_train, X_test))

In [0]:

# Accuracy of the tuned model: training partition first, held-out second.
for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test)):
    print(accuracy_score(actual, predicted))


### Confusion Matrix

In [0]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate true labels (rows) against predicted labels (columns) for the
# held-out partition.
confusion_mat = confusion_matrix(y_test, y_pred_test)

confusion_mat

In [0]:
from sklearn.metrics import ConfusionMatrixDisplay

# Wrap the raw count matrix in scikit-learn's display helper.
cm_display = ConfusionMatrixDisplay(confusion_mat)

# Render the matrix as an annotated heat map.
cm_display.plot()