<a href="https://colab.research.google.com/github/WMinerva292/WMinerva292/blob/main/DecisionTreeQuiz2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Read the diabetes dataset from the Colab session's /content directory
DIABETES_CSV_PATH = '/content/diabetes.csv'
df = pd.read_csv(DIABETES_CSV_PATH)

# Convert Blood Pressure into categorical column
def categorize_bp(bp):
    """Return the blood-pressure band for a single reading.

    Readings below 80 are "Low", readings from 80 through 120 (inclusive)
    are "Normal", and everything else is "High". A value that fails both
    comparisons (for example NaN) therefore also ends up as "High".
    """
    if bp < 80:
        return "Low"
    # Not below 80 at this point, so only the upper bound needs checking.
    if bp <= 120:
        return "Normal"
    return "High"

# Label every row with its blood-pressure band
df["BP_Category"] = df["BloodPressure"].map(categorize_bp)

# Relative frequency of each band across all rows
category_counts = df["BP_Category"].value_counts()
total_count = df.shape[0]
probabilities = category_counts.div(total_count)

# Shannon entropy (base 2) of the band distribution
entropy = -(probabilities * np.log2(probabilities)).sum()

print(f"Entropy of Blood Pressure: {entropy:.2f}")


Entropy of Blood Pressure: 0.85


In [None]:
# Find the column with the highest entropy in the original data,
# leaving out the target column Outcome
entropy_values = dict()
target_column = "Outcome"

def cal_entropy(column):
    """Return the Shannon entropy (base 2) of the values in a pandas Series.

    Parameters
    ----------
    column : pandas.Series
        The values whose distribution is measured.

    Returns
    -------
    float
        Entropy in bits. Missing values are ignored: probabilities are
        normalized over the observed (non-null) values so that they sum to 1.
        A column with a single distinct value yields 0.0.
    """
    # normalize=True divides by the number of counted (non-null) values.
    # Dividing by len(column) instead would count NaN rows in the total
    # even though value_counts() leaves them out of the counts.
    probabilities = column.value_counts(normalize=True)
    entropy = -(probabilities * np.log2(probabilities)).sum()
    # Adding 0.0 turns the -0.0 produced for a constant column into 0.0,
    # so it does not print as "-0.00".
    return entropy + 0.0

# Score every non-target column, then report the one with the largest entropy
for column in df.columns:
    if column == target_column:
        continue
    entropy = cal_entropy(df[column])
    entropy_values[column] = entropy
max_entropy_column = max(entropy_values.items(), key=lambda item: item[1])[0]
print(f"Column with the highest entropy: {max_entropy_column}")

# Find the column considered as the root node of the decision tree in the
# original data: the feature whose best binary split gives the highest
# information gain with respect to the target column Outcome.
# (Ranking columns by their own entropy, as before, is not information gain:
# it ignores the target entirely.)
target_column = "Outcome"
# BP_Category is derived from BloodPressure by another cell, so it is not part
# of the original data and is left out of the comparison.
derived_columns = ["BP_Category"]
information_gain_values = {}


def best_split_information_gain(feature, target):
    """Return the largest information gain of any `feature <= threshold` split.

    Candidate thresholds are the midpoints between consecutive distinct
    (non-null) feature values. The gain of a split is the entropy of `target`
    minus the size-weighted entropy of `target` in the two resulting groups.
    Returns 0.0 when the feature has fewer than two distinct values.
    """
    parent_entropy = cal_entropy(target)
    distinct_values = np.sort(feature.dropna().unique())
    thresholds = (distinct_values[:-1] + distinct_values[1:]) / 2
    best_gain = 0.0
    for threshold in thresholds:
        goes_left = feature <= threshold
        left_weight = goes_left.mean()
        children_entropy = (
            left_weight * cal_entropy(target[goes_left])
            + (1 - left_weight) * cal_entropy(target[~goes_left])
        )
        best_gain = max(best_gain, parent_entropy - children_entropy)
    return best_gain


for column in df.columns:
    if column == target_column or column in derived_columns:
        continue
    information_gain = best_split_information_gain(df[column], df[target_column])
    information_gain_values[column] = information_gain
    print(f"Information gain of {column}: {information_gain:.3f}")
    print()
root_node_column = max(information_gain_values, key=information_gain_values.get)
print(f"Column considered as the root node (highest information gain): {root_node_column}")


Column with the highest entropy: DiabetesPedigreeFunction


In [None]:
!pip install scikit-learn
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.preprocessing import LabelEncoder

# Convert Blood Pressure into categorical column
def categorize_bp(bp):
    """Classify one blood-pressure reading as "Low", "Normal" or "High".

    Below 80 is "Low"; 80 up to and including 120 is "Normal"; any other
    value (including one that fails both comparisons, such as NaN) is "High".
    """
    if bp < 80:
        return "Low"
    return "Normal" if bp <= 120 else "High"

# Convert Cholesterol into categorical column
def categorize_chol(chol):
    """Classify one cholesterol reading: below 200 is "Normal", otherwise "High"."""
    return "Normal" if chol < 200 else "High"

# Rebuild the blood-pressure band column from the raw readings
df["BP_Category"] = df["BloodPressure"].map(categorize_bp)

# Replace the band labels with the integer codes assigned by LabelEncoder
label_encoder = LabelEncoder()
df["BP_Category"] = label_encoder.fit_transform(df["BP_Category"])

# Outcome is the target; every other column is used as a feature
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Fit a decision tree that splits on entropy (information gain) instead of
# sklearn's default Gini criterion
model = DecisionTreeClassifier(criterion="entropy", random_state=1).fit(X, y)

# Print the learned splits as indented text rules
tree_rules = export_text(model, feature_names=X.columns.tolist())
print(tree_rules)


|--- Glucose <= 127.50
|   |--- Age <= 28.50
|   |   |--- BMI <= 30.95
|   |   |   |--- Pregnancies <= 7.50
|   |   |   |   |--- DiabetesPedigreeFunction <= 0.67
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- DiabetesPedigreeFunction >  0.67
|   |   |   |   |   |--- DiabetesPedigreeFunction <= 0.69
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- DiabetesPedigreeFunction >  0.69
|   |   |   |   |   |   |--- class: 0
|   |   |   |--- Pregnancies >  7.50
|   |   |   |   |--- class: 1
|   |   |--- BMI >  30.95
|   |   |   |--- BloodPressure <= 37.00
|   |   |   |   |--- class: 1
|   |   |   |--- BloodPressure >  37.00
|   |   |   |   |--- DiabetesPedigreeFunction <= 0.50
|   |   |   |   |   |--- BMI <= 45.35
|   |   |   |   |   |   |--- DiabetesPedigreeFunction <= 0.28
|   |   |   |   |   |   |   |--- Insulin <= 89.00
|   |   |   |   |   |   |   |   |--- BloodPressure <= 55.00
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- Bloo

In [None]:
# Fit a decision tree that splits on Gini impurity (sklearn's default criterion)
model = DecisionTreeClassifier(criterion="gini", random_state=1)
model.fit(X, y)

# feature_importances_ holds the Gini importance of each feature: the
# normalized total reduction of impurity that the feature contributes across
# the fitted tree. It is not a Gini index: a LARGER value means a more useful
# feature, so the smallest value marks the least informative feature.
feature_importances = model.feature_importances_

# Pair each feature with its Gini importance
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Gini Importance': feature_importances
})

# Most important feature first
feature_importance_df = feature_importance_df.sort_values(by="Gini Importance", ascending=False)

# Report the feature that contributes the most impurity reduction
print(f"The feature with the highest Gini importance: {feature_importance_df.iloc[0]['Feature']}")


The feature with the lowest Gini index: BP_Category


In [None]:
# Find the column with the lowest Gini index in the original data,
# leaving out the target column Outcome
gini_values = dict()
target_column = "Outcome"

def cal_gini(column):
    """Return the Gini impurity of the values in a pandas Series.

    Parameters
    ----------
    column : pandas.Series
        The values whose distribution is measured.

    Returns
    -------
    float
        1 minus the sum of squared value probabilities. Missing values are
        ignored: probabilities are normalized over the observed (non-null)
        values so that they sum to 1.
    """
    # normalize=True divides by the number of counted (non-null) values.
    # Dividing by len(column) instead would count NaN rows in the total
    # even though value_counts() leaves them out of the counts.
    probabilities = column.value_counts(normalize=True)
    gini = 1 - np.sum(probabilities ** 2)
    return gini
# Score only the original feature columns. The target is skipped, and so is
# BP_Category: it is derived from BloodPressure by another cell, and with only
# a few distinct values it would otherwise always win the "lowest Gini"
# comparison even though it is not part of the original data.
derived_columns = ["BP_Category"]
for column in df.columns:
    if column == target_column or column in derived_columns:
        continue
    gini = cal_gini(df[column])
    gini_values[column] = gini
    print(f"Gini of {column}: {gini:.2f}")
    print()
min_gini_column = min(gini_values, key=gini_values.get)
print(f"Column with the lowest gini index: {min_gini_column}")

Gini of Pregnancies: 0.89

Gini of Glucose: 0.99

Gini of BloodPressure: 0.96

Gini of SkinThickness: 0.90

Gini of Insulin: 0.76

Gini of BMI: 0.99

Gini of DiabetesPedigreeFunction: 1.00

Gini of Age: 0.96

Gini of BP_Category: 0.39

Column with the lowest gini index: BP_Category


In [None]:
# Discard, in place, every record whose Glucose reading is 0
zero_glucose_index = df.index[df['Glucose'] == 0]
df.drop(index=zero_glucose_index, inplace=True)

In [None]:
# Keep only the rows whose Glucose value is at most 75, then show them
is_low_glucose = df['Glucose'].le(75)
df = df.loc[is_low_glucose]
print(df)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
47             2       71             70             27        0  28.0   
55             1       73             50             10        0  23.0   
62             5       44             62              0        0  25.0   
76             7       62             78              0        0  32.6   
81             2       74              0              0        0   0.0   
97             1       71             48             18       76  20.4   
146            9       57             80             37        0  32.8   
174            2       75             64             24       55  29.7   
183            5       73             60              0        0  26.8   
234            3       74             68             28       45  29.7   
273            1       71             78             50       45  33.2   
352            3       61             82             28        0  34.4   
403            9       72             

In [None]:
# Now check whether the Glucose column with the filtered values is considered a leaf node
