## LECTURE 10: Decision tree

## course: Awfera Machine Learning

## Instructor: Dr. Shazia Saqib

## Student: Muhammad Shafiq

____________

# Decision Tree Classification for diabetes dataset

### Step 1: import the libraries


In [12]:
# Step 1: import the libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Load the dataset:

In [13]:
# Load the dataset
file_path = './dataset/diabetic_dataset_full.csv'
df = pd.read_csv(file_path)

In [19]:
# display basic information
print("Dataset Infromation: ")
print(df.head(10))
print(df.info())
print(df.shape)
print(df.dtypes)

Dataset Infromation: 
   PatientID  Age  Gender   BMI  ...  SmokingStatus  CholesterolLevel  HbA1c  Outcome
0          1   59       1  29.9  ...         Former            Normal    4.9        0
1          2   72       1  27.2  ...          Never              High    5.0        0
2          3   49       1  22.0  ...        Current              High    7.7        0
3          4   35       1  32.8  ...          Never              High    7.8        0
4          5   63       0  26.0  ...          Never              High    6.8        0
5          6   28       0  29.0  ...         Former            Normal    6.9        1
6          7   41       1  31.4  ...          Never            Normal    4.8        1
7          8   59       1  19.6  ...        Current              High    7.3        0
8          9   78       1  20.7  ...          Never              High    7.3        1
9         10   39       0  28.5  ...        Current              High    5.3        1

[10 rows x 14 columns]
<class '

In [25]:
print(df.columns)
object_colums = df.select_dtypes(include='object').columns
print("Object colums:", object_colums.tolist())

Index(['PatientID', 'Age', 'Gender', 'BMI', 'BloodPressure', 'Glucose',
       'Insulin', 'SkinThickness', 'DiabetesPedigree', 'PhysicalActivityLevel',
       'SmokingStatus', 'CholesterolLevel', 'HbA1c', 'Outcome'],
      dtype='object')
Object colums: ['PhysicalActivityLevel', 'SmokingStatus', 'CholesterolLevel']


In [28]:
print("PhysicalActivityLevel: ", df['PhysicalActivityLevel'].unique())
print("SmokingStatus: ", df['SmokingStatus'].unique())
print("CholesterolLevel: ", df['CholesterolLevel'].unique())



PhysicalActivityLevel:  ['Low' 'Moderate' 'High']
SmokingStatus:  ['Former' 'Never' 'Current']
CholesterolLevel:  ['Normal' 'High']


## preprocessing

In [9]:
# step 2: Handle Missing Values
print("\nChecking for missing Values")
print(df.isnull().sum())

# Fill missing numerical values with the median
df.fillna(df.median(numeric_only=True), inplace=True)

# Fill missing catagorical values with the mode(if any)
for col in df.select_dtypes(include=['object']):
    df[col].fillna(df[col].mode()[0], inplace=True)


Checking for missing Values
PatientID                0
Age                      0
Gender                   0
BMI                      0
BloodPressure            0
Glucose                  0
Insulin                  0
SkinThickness            0
DiabetesPedigree         0
PhysicalActivityLevel    0
SmokingStatus            0
CholesterolLevel         0
HbA1c                    0
Outcome                  0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [29]:
# Encode Gender Columm
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df['PhysicalActivityLevel'] = le.fit_transform(df['PhysicalActivityLevel'])
df['CholesterolLevel'] = le.fit_transform(df['CholesterolLevel'])
df = pd.get_dummies(df, columns=['SmokingStatus'])

# step 3: Prepare Data
# Separate features and target varaible
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Apply standard scalling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# train a decision tree classifier
print("\nTraining Decision Tree Classifier...")
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Model Evaluation
# predict on test dat
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Classification report
print("\nClassification Report: ")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix: ")
print(confusion_matrix(y_test, y_pred))



Training Decision Tree Classifier...
Model Accuracy: 0.53

Classification Report: 
              precision    recall  f1-score   support

           0       0.64      0.59      0.61       127
           1       0.37      0.42      0.40        73

    accuracy                           0.53       200
   macro avg       0.51      0.51      0.51       200
weighted avg       0.54      0.53      0.54       200


Confusion Matrix: 
[[75 52]
 [42 31]]


In [31]:
pip install graphviz

Collecting graphviz
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading graphviz-0.21-py3-none-any.whl (47 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.21
Note: you may need to restart the kernel to use updated packages.


In [32]:
# step 8: Visualize the decision tree

from sklearn.tree import export_graphviz
import graphviz 

    # Export the decision tree to a dot file
feature_names= X.columns
dot_data = export_graphviz(model, out_file=None,feature_names=feature_names, class_names=['No Daiabetes','Diabetes'], filled=True, rounded=True, special_characters=True)


    # create and save the graph
graph = graphviz.Source(dot_data)
graph.render("./dataset/diabetic_dataset_full.csv")

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH