In [None]:
import pandas as pd, pprint, numpy as np, math

In [None]:
# Read the weather/play dataset into a DataFrame and display it.
# NOTE(review): this is an absolute '/content/...' path, while the later reload
# of the same file name uses a relative path — confirm both resolve to the same file.
df= pd.read_csv('/content/DecisionTreeDataset.csv')
df

Unnamed: 0,Temperature,Outlook,Humidity,Windy,Play?
0,hot,sunny,60,False,no
1,hot,sunny,70,True,no
2,hot,overcast,80,False,yes
3,cool,rain,40,False,yes
4,cool,overcast,30,True,yes
5,mild,sunny,100,False,no
6,cool,sunny,20,False,yes
7,mild,rain,45,False,yes
8,mild,sunny,25,True,yes
9,mild,overcast,85,True,yes


# Preprocessing for the Built-in Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:

# Bin the numeric humidity reading into two categories: above 50 is 'High',
# everything else is 'Low'.
df['Humidity'] = (df['Humidity'] > 50).map({True: 'High', False: 'Low'})
print(df)

# A single encoder is re-fitted on each column in turn, so after this cell it
# only remembers the classes of the last column ('Play?').
le = LabelEncoder()
for column in ['Humidity', 'Temperature', 'Outlook', 'Windy', 'Play?']:
    df[column] = le.fit_transform(df[column])
df

   Temperature   Outlook Humidity  Windy Play?
0          hot     sunny     High  False    no
1          hot     sunny     High   True    no
2          hot  overcast     High  False   yes
3         cool      rain      Low  False   yes
4         cool  overcast      Low   True   yes
5         mild     sunny     High  False    no
6         cool     sunny      Low  False   yes
7         mild      rain      Low  False   yes
8         mild     sunny      Low   True   yes
9         mild  overcast     High   True   yes
10         hot  overcast      Low  False   yes
11        mild      rain     High   True    no
12        cool      rain      Low   True    no
13        mild      rain     High  False   yes
14        cool  overcast     High   True    no


Unnamed: 0,Temperature,Outlook,Humidity,Windy,Play?
0,1,2,0,0,0
1,1,2,0,1,0
2,1,0,0,0,1
3,0,1,1,0,1
4,0,0,1,1,1
5,2,2,0,0,0
6,0,2,1,0,1
7,2,1,1,0,1
8,2,2,1,1,1
9,2,0,0,1,1


In [None]:

# The label is the 'Play?' column; every other column is a feature.
y = df['Play?']
X = df.drop(columns='Play?')

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:

# Fit scikit-learn's decision tree on the training split, with a fixed
# random_state for the estimator.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)


In [None]:
X_test

Unnamed: 0,Temperature,Outlook,Humidity,Windy
9,2,0,0,1
11,2,1,0,1
0,1,2,0,0


In [None]:
# Predict the (label-encoded) 'Play?' value for each held-out row.
y_pred = clf.predict(X_test)

In [None]:
y_pred

array([0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the scikit-learn tree: compares the held-out true labels with
# the predictions made for those same rows.
score = accuracy_score(y_test, y_pred)
score

0.6666666666666666

# From Scratch

In [None]:
import pandas as pd
import numpy as np


# Reload the CSV so `df` holds the original string values again; the earlier
# preprocessing cell overwrote its columns with integer codes.
# NOTE(review): relative path here vs. the absolute '/content/...' path used
# for the first load — confirm both resolve to the same file.
df= pd.read_csv('DecisionTreeDataset.csv')
df

Unnamed: 0,Temperature,Outlook,Humidity,Windy,Play?
0,hot,sunny,60,False,no
1,hot,sunny,70,True,no
2,hot,overcast,80,False,yes
3,cool,rain,40,False,yes
4,cool,overcast,30,True,yes
5,mild,sunny,100,False,no
6,cool,sunny,20,False,yes
7,mild,rain,45,False,yes
8,mild,sunny,25,True,yes
9,mild,overcast,85,True,yes


In [None]:
# Turn the numeric humidity reading into a two-level category:
# strictly above 50 becomes 'High', anything else 'Low'.
df['Humidity'] = (df['Humidity'] > 50).map({True: 'High', False: 'Low'})
df

Unnamed: 0,Temperature,Outlook,Humidity,Windy,Play?
0,hot,sunny,High,False,no
1,hot,sunny,High,True,no
2,hot,overcast,High,False,yes
3,cool,rain,Low,False,yes
4,cool,overcast,Low,True,yes
5,mild,sunny,High,False,no
6,cool,sunny,Low,False,yes
7,mild,rain,Low,False,yes
8,mild,sunny,Low,True,yes
9,mild,overcast,High,True,yes


In [None]:
df.shape

(15, 5)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Temperature  15 non-null     object
 1   Outlook      15 non-null     object
 2   Humidity     15 non-null     object
 3   Windy        15 non-null     bool  
 4   Play?        15 non-null     object
dtypes: bool(1), object(4)
memory usage: 623.0+ bytes


In [None]:
def entropy(data, target='Play?'):
    """Return the Shannon entropy, in bits, of the labels in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to measure; must contain the ``target`` column.
    target : str, default 'Play?'
        Name of the label column. The default is the column this notebook
        predicts, so existing one-argument calls behave exactly as before.

    Returns
    -------
    float or int
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the
        share of rows carrying that label. An empty frame yields ``0``.
    """
    # Imported here so the function also works when only the "From Scratch"
    # cells are run; that section's import cell does not bring in `math`.
    import math

    labels = data[target]
    total_instances = len(labels)
    entropy_val = 0

    # Labels are visited in order of first appearance, which keeps the
    # floating-point summation order stable between calls.
    for label in labels.unique():
        p = len(labels[labels == label]) / total_instances
        entropy_val -= p * math.log2(p)

    return entropy_val

In [None]:
def information_gain(data, attribute):
    """Return how much splitting ``data`` on ``attribute`` reduces entropy.

    The result is the entropy of ``data`` minus the size-weighted sum of the
    entropies of the subsets obtained by grouping rows on each distinct value
    of ``attribute``.
    """
    n_rows = len(data)

    # One partition per distinct attribute value, in order of first appearance.
    partitions = (
        data[data[attribute] == level] for level in data[attribute].unique()
    )
    weighted_child_entropy = sum(
        entropy(part) * len(part) / n_rows for part in partitions
    )

    return entropy(data) - weighted_child_entropy

In [None]:
def build_decision_tree(data, attributes):
    """Recursively build a decision tree for the 'Play?' column.

    Returns either a bare label (leaf) or a nested dict of the form
    ``{attribute: {value: subtree_or_label, ...}}``.

    data       -- DataFrame holding the candidate attributes and 'Play?'.
    attributes -- names of the columns still available for splitting.
    """
    target = data['Play?']

    # Every row agrees on the label: this node is a leaf.
    if len(target.unique()) == 1:
        return target.iloc[0]

    # Nothing left to split on: fall back to the most frequent label.
    if len(attributes) == 0:
        return target.value_counts().idxmax()

    # Choose the attribute with the highest information gain; on ties the
    # earliest attribute in the list wins.
    best_attribute = max(attributes, key=lambda name: information_gain(data, name))
    unused = [name for name in attributes if name != best_attribute]

    # Grow one branch per value of the chosen attribute seen in this subset.
    branches = {
        level: build_decision_tree(data[data[best_attribute] == level], unused)
        for level in data[best_attribute].unique()
    }
    return {best_attribute: branches}

In [None]:
import pprint
# Columns the tree may split on; the order matters only for breaking ties
# between attributes with equal information gain.
attributes = ['Outlook', 'Temperature', 'Humidity', 'Windy']

decision_tree = build_decision_tree(data=df, attributes=attributes)
pprint.pprint(decision_tree)

{'Humidity': {'High': {'Outlook': {'overcast': {'Temperature': {'cool': 'no',
                                                                'hot': 'yes',
                                                                'mild': 'yes'}},
                                   'rain': {'Windy': {False: 'yes',
                                                      True: 'no'}},
                                   'sunny': 'no'}},
              'Low': {'Outlook': {'overcast': 'yes',
                                  'rain': {'Windy': {False: 'yes', True: 'no'}},
                                  'sunny': 'yes'}}}}


In [None]:
def predict(instance, tree, default=None):
    """Classify ``instance`` by walking ``tree`` from the root to a leaf.

    Parameters
    ----------
    instance : mapping-like (e.g. a DataFrame row or a dict)
        Must support ``instance[attribute]`` for every attribute tested on
        the path taken through the tree.
    tree : dict or label
        Either a nested ``{attribute: {value: subtree_or_label}}`` dict, or a
        bare label when the whole tree is a single leaf.
    default : optional
        Label to return when the instance has an attribute value for which
        the tree has no branch. When left as ``None`` a ``KeyError`` is
        raised instead, as before.

    Returns
    -------
    The label stored at the leaf that was reached, or ``default``.

    Raises
    ------
    KeyError
        If an attribute value has no branch and no ``default`` was given.
    """
    node = tree
    # A non-dict node is a leaf label. Checking this first also covers a tree
    # that is a bare label at the root (all training labels identical).
    while isinstance(node, dict):
        attribute = next(iter(node))
        value = instance[attribute]
        branches = node[attribute]

        if value not in branches:
            if default is None:
                raise KeyError(
                    f"no branch for {attribute}={value!r} in decision tree"
                )
            return default

        node = branches[value]

    return node

In [None]:
# Load the rows to classify with the hand-built tree.
# NOTE(review): presumably this file has no 'Play?' column, since a later cell
# adds one from the predictions — confirm against the CSV.
df_= pd.read_csv('/content/Test data.csv')
df_

Unnamed: 0,Temperature,Outlook,Humidity,Windy
0,hot,sunny,60,False
1,hot,sunny,70,True
2,hot,overcast,80,False
3,cool,rain,40,False


In [None]:
# Apply the same humidity binning as for the training frame:
# strictly above 50 becomes 'High', anything else 'Low'.
df_['Humidity'] = (df_['Humidity'] > 50).map({True: 'High', False: 'Low'})
df_

Unnamed: 0,Temperature,Outlook,Humidity,Windy
0,hot,sunny,High,False
1,hot,sunny,High,True
2,hot,overcast,High,False
3,cool,rain,Low,False


In [None]:
# Run every test row through the hand-built tree, in row order.
predictions = [
    predict(df_.iloc[row_idx], decision_tree) for row_idx in range(len(df_))
]

# Store the predicted labels alongside the features.
df_['Play?'] = predictions

df_

Unnamed: 0,Temperature,Outlook,Humidity,Windy,Play?
0,hot,sunny,High,False,no
1,hot,sunny,High,True,no
2,hot,overcast,High,False,yes
3,cool,rain,Low,False,yes


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# df_['Play?'] holds the tree's own predictions, so scoring that column
# against itself always reports a perfect result. Score the tree against rows
# with known labels instead. The only labelled data available here is `df`,
# the frame the tree was built from, so these are training-set metrics.
class_labels = ['no', 'yes']  # fixed order so rows/columns line up with target_names
y_true_scratch = df['Play?']
y_pred_scratch = [predict(row, decision_tree) for _, row in df.iterrows()]

# Calculate accuracy
accuracy_scratch = accuracy_score(y_true_scratch, y_pred_scratch)

# Calculate confusion matrix and classification report
confusion_mat_scratch = confusion_matrix(
    y_true_scratch, y_pred_scratch, labels=class_labels
)
class_report_scratch = classification_report(
    y_true_scratch, y_pred_scratch, labels=class_labels, target_names=["No", "Yes"]
)

print("Accuracy (From Scratch):", accuracy_scratch)
print("\nConfusion Matrix (From Scratch):")
print(confusion_mat_scratch)
print("\nClassification Report (From Scratch):")
print(class_report_scratch)


Accuracy (From Scratch): 1.0

Confusion Matrix (From Scratch):
[[2 0]
 [0 2]]

Classification Report (From Scratch):
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         2
         Yes       1.00      1.00      1.00         2

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4

