In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv("diabetes.csv")

In [3]:
data.head(30)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
data.isnull()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False,False,False
764,False,False,False,False,False,False,False,False,False
765,False,False,False,False,False,False,False,False,False
766,False,False,False,False,False,False,False,False,False


In [6]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
import pandas as pd

# Assuming 'data' is your DataFrame and 'target_column' is the name of your target column
target_labels = data['Outcome'].value_counts()

print("Number of unique labels in the target column:")
print(len(target_labels))

print("\nCounts of each label:")
print(target_labels)


Number of unique labels in the target column:
3

Counts of each label:
0    362
1    256
2    150
Name: Outcome, dtype: int64


In [10]:
data['Outcome'] = data['Outcome'].map({0: 0, 1: 1, 2: 1})

# Check the unique values in the target column after mapping
print("Unique values in the target column after mapping:")
print(data['Outcome'].unique())

Unique values in the target column after mapping:
[1 0]


In [11]:
X = data.drop(columns=['Outcome'])
y = data['Outcome']

In [12]:
def entropy(y):
    value_counts = y.value_counts()
    probabilities = value_counts / len(y)
    entropy_value = -np.sum(probabilities * np.log2(probabilities.replace(0, 1)))
    return entropy_value
entropy(y)

0.997630998766336

In [13]:
def information_gain(y, feature):

    total_entropy = entropy(y)
    
    unique_values = feature.unique()
    weighted_entropies = 0

    for value in unique_values:
        subset_y = y[feature == value]
        weighted_entropies += (len(subset_y) / len(y)) * entropy(subset_y)

    return total_entropy - weighted_entropies

for column in data.columns:
    if column != 'Outcome':
        feature = data[column]
        ig = information_gain(y, feature)
        print(f"Feature: {column}, Information Gain: {ig:.4f}")


Feature: Pregnancies, Information Gain: 0.0368
Feature: Glucose, Information Gain: 0.1876
Feature: BloodPressure, Information Gain: 0.0681
Feature: SkinThickness, Information Gain: 0.0655
Feature: Insulin, Information Gain: 0.2526
Feature: BMI, Information Gain: 0.3379
Feature: DiabetesPedigreeFunction, Information Gain: 0.6713
Feature: Age, Information Gain: 0.0837


In [14]:
class Node:
    def __init__(self, feature=None, value=None, entropy=None, information_gain=None, left=None, right=None):
        self.feature = feature
        self.value = value
        self.entropy = entropy
        self.information_gain = information_gain
        self.left = left
        self.right = right

def build_decision_tree(X, y):
    if entropy(y) == 0:
        # If all instances have the same class, create a leaf node
        return Node(value=y.iloc[0])

    if X.empty:
        # If no features left, create a leaf node with the majority class
        return Node(value=y.value_counts().idxmax())

    # Find the best feature to split on
    best_feature = None
    max_info_gain = 0

    for feature_name in X.columns:
        current_info_gain = information_gain(y, X[feature_name])
        if current_info_gain > max_info_gain:
            max_info_gain = current_info_gain
            best_feature = feature_name

    # Create a node with the best feature
    node = Node(feature=best_feature, entropy=entropy(y), information_gain=max_info_gain, value={})

    # Recursively build the left and right subtrees
    unique_values = X[best_feature].unique()
    for value in unique_values:
        subset_X = X[X[best_feature] == value].drop(columns=[best_feature])
        subset_y = y[X[best_feature] == value]
        child_node = build_decision_tree(subset_X, subset_y)

        if node.value is None:
            node.value = {value: child_node}
        else:
            node.value[value] = child_node

    return node

decision_tree = build_decision_tree(X, y)


In [17]:
def predict(node, instance):
    if node.feature is None:
        return node.value
    else:
        value = instance[node.feature]
        if value in node.value:
            return predict(node.value[value], instance)
        else:
            # If the value is not present in the node's children, return the majority class
            return max(node.value.values(), key=lambda x: isinstance(x, Node)).value


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

decision_tree = build_decision_tree(X_train, y_train)
y_pred = [predict(decision_tree, instance) for _, instance in X_test.iterrows()]

accuracy_score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_score:.2f}")

Accuracy: 0.49


In [19]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
tp=cm[0][0]
fp=cm[0][1]
fn=cm[1][0]
tn=cm[1][1]
print (tp,fp,fn,tn)




55 20 58 21


In [20]:
acc=(tp+tn)/(tp+tn+fp+fn)

In [21]:
print("Accuracy:",acc)

Accuracy: 0.4935064935064935
