In [48]:
import numpy as np
import pandas as pd

In [49]:
# Load the training data. The functions defined in the following cells treat
# the LAST column of this frame as the class label.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df=pd.read_csv(r'C:/Users/jasme/Downloads/weather.csv')

In [50]:
def find_entropy(df):
    """Return the Shannon entropy, in bits, of the label column of ``df``.

    The label column is the last column of the frame. Each distinct label
    contributes ``-p * log2(p)`` where ``p`` is its share of the rows.
    A frame with no rows yields the integer ``0``.
    """
    label_column = df[df.keys()[-1]]
    total_rows = len(df)
    result = 0
    for cls in label_column.unique():
        share = len(df[label_column == cls]) / total_rows
        result -= share * np.log2(share)
    return result

In [51]:
# Entropy of the label (last) column over the whole training frame.
find_entropy(df)

0.9402859586706309

In [52]:
def find_average_info_entropy(df,attribute):
    """Return the weighted average label entropy after splitting on ``attribute``.

    The frame is partitioned by each distinct value of ``attribute``; the
    entropy of every partition (via ``find_entropy``) is weighted by the
    fraction of rows that fall into it, and the weighted terms are summed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose last column is the label.
    attribute : str
        Name of the column to split on.

    Returns
    -------
    The weighted entropy; the integer ``0`` when ``df`` has no rows.
    """
    total_rows = len(df)
    average_info_entropy = 0
    for value in df[attribute].unique():
        sub_df = df[df[attribute] == value]
        # Reuse the partition that was just built for the weight instead of
        # filtering the frame a second time.
        weight = len(sub_df) / total_rows
        average_info_entropy += weight * find_entropy(sub_df)
    return average_info_entropy

In [53]:
# Weighted label entropy that remains after splitting the training frame on 'Windy'.
find_average_info_entropy(df,'Windy')

0.8921589282623617

In [54]:
def find_winner(df,feature_subset=None):
    """Return the attribute with the highest information gain.

    Information gain of an attribute is ``find_entropy(df)`` minus
    ``find_average_info_entropy(df, attribute)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose last column is the label; every other column is a
        potential split attribute.
    feature_subset : collection of str, optional
        When given, only attributes contained in it are considered.

    Returns
    -------
    The name of the winning attribute (first one on ties, per ``np.argmax``).

    Raises
    ------
    ValueError
        If no attribute is left to choose from.
    """
    attributes = list(df.keys()[:-1])
    if feature_subset is not None:
        attributes = [att for att in attributes if att in feature_subset]
    if not attributes:
        # np.argmax on an empty list would raise a far less helpful ValueError.
        raise ValueError("find_winner: no candidate attributes to split on")

    # The entropy of the whole frame is the same for every attribute.
    base_entropy = find_entropy(df)
    gains = [base_entropy - find_average_info_entropy(df, att) for att in attributes]
    return attributes[int(np.argmax(gains))]

In [55]:
# Attribute with the largest information gain on the full training frame.
find_winner(df)

'Outlook'

In [34]:
# `training` is defined in a later cell of this notebook and takes the list of
# candidate features as its second argument; pass every non-label column so a
# full decision tree is grown. Run the cell that defines `training` first.
tree=training(df,df.keys()[:-1].tolist())

In [35]:
import pprint
# Pretty-print the nested-dict tree: {feature: {feature value: subtree or label}}.
pprint.pprint(tree)

{'Outlook': {'overcast': 'yes',
             'rainy': {'Windy': {'strong': 'no', 'weak': 'yes'}},
             'sunny': {'Humidity': {'high': 'no', 'normal': 'yes'}}}}


In [56]:
def predict_instance(instance, tree):
    """Walk a nested-dict decision tree and return the label for ``instance``.

    Parameters
    ----------
    instance : mapping-like
        Anything with a ``.get(feature)`` method (e.g. a ``dict`` or a pandas
        ``Series``) giving the feature values of one sample.
    tree : dict or label
        Either an internal node of the form
        ``{feature: {feature_value: subtree_or_label}}`` or a bare leaf label.
        A bare label is what tree construction yields when a sample is pure,
        so it is returned unchanged instead of being treated as a dict.

    Returns
    -------
    The label reached, or ``None`` when the instance's value for a tested
    feature has no branch in the tree (or a node is an empty dict).
    """
    node = tree
    while isinstance(node, dict):
        if not node:
            return None
        # A node holds exactly one tested feature; use the first key.
        feature = next(iter(node))
        node = node[feature].get(instance.get(feature))
    return node

In [13]:
# Predict a label for every row of the test file with the single decision tree.
# The prediction helper defined in this notebook is `predict_instance`.
df1=pd.read_csv('C:/Users/jasme/Downloads/weather_test.csv')
Y_label=[]
for i in range(len(df1)):
    inst =df1.iloc[i,:]
    label = predict_instance(inst,tree)
    Y_label.append(label)

In [14]:
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred): ground truth first, predictions
# second. Swapping them exchanges precision and recall in the report.
print(classification_report(df1['Play'],Y_label))

              precision    recall  f1-score   support

          no       1.00      1.00      1.00         1
         yes       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [57]:
def bootstrap(df, frac=0.8, random_state=None):
    """Draw a bootstrap sample (rows sampled with replacement) from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample rows from.
    frac : float, default 0.8
        Size of the sample as a fraction of ``len(df)``; the row count is
        ``int(len(df) * frac)``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a draw can be reproduced.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh ``0..n-1`` index.
    """
    n_samples = int(len(df) * frac)
    sampled = df.sample(n_samples, replace=True, random_state=random_state)
    return sampled.reset_index(drop=True)

In [18]:
import random
# Try out the two sources of randomness: a bootstrap sample of the rows and a
# random pick of 2 of the non-label columns.
sub_df=bootstrap(df)
random.sample(sub_df.keys()[:-1].to_list(),2)

['Windy', 'Outlook']

In [59]:
def training(df, feature_subset=None):
    """Recursively grow a decision tree as nested dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column is the label, the others are features.
    feature_subset : collection of str, optional
        Features the tree is allowed to split on. ``None`` (the default)
        allows every feature column of ``df``.

    Returns
    -------
    Either a bare label (leaf) or ``{feature: {feature_value: subtree}}``
    where each subtree is again a label or a nested dict.
    """
    target = df.keys()[-1]
    labels = df[target]

    # Base case 1: every row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Features that are both still present in the frame and allowed.
    candidates = [
        col for col in df.columns[:-1]
        if feature_subset is None or col in feature_subset
    ]

    # Base case 2: nothing left to split on. The node is impure here, so use
    # the most common label rather than whichever row happens to be first.
    if not candidates:
        return labels.mode()[0]

    # Select the best feature among the candidates.
    best_feature = find_winner(df, candidates)

    # A feature is used at most once along any root-to-leaf path.
    if feature_subset is None:
        remaining = None
    else:
        remaining = [f for f in feature_subset if f != best_feature]

    tree = {best_feature: {}}
    for value in df[best_feature].unique():
        sub_df = (
            df[df[best_feature] == value]
            .drop(columns=[best_feature])
            .reset_index(drop=True)
        )
        tree[best_feature][value] = training(sub_df, remaining)
    return tree


In [60]:
import random
def build_random_forest(df,n_trees,max_features=None):
    """Train ``n_trees`` decision trees, each on its own bootstrap sample.

    Every tree gets a fresh bootstrap sample of ``df`` and a random subset of
    the feature columns that it is allowed to split on.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column is the label.
    n_trees : int
        Number of trees to build.
    max_features : int, optional
        Size of the per-tree feature subset. Defaults to
        ``int(sqrt(number of feature columns))``.

    Returns
    -------
    list
        The trained trees, in the order they were built.
    """
    features = df.keys()[:-1].tolist()

    if max_features is None:
        number_of_features = int(np.sqrt(len(features)))
    else:
        # Previously this branch left the variable unassigned and crashed.
        number_of_features = int(max_features)
    # random.sample cannot draw more items than exist; also keep at least one
    # feature whenever the frame has any.
    number_of_features = min(max(number_of_features, 1), len(features))

    trees = []
    for _ in range(n_trees):
        sub_df = bootstrap(df)
        feature_subset = random.sample(features, number_of_features)
        trees.append(training(sub_df, feature_subset))
    return trees

In [62]:
# Build a forest of 5 trees with the default per-tree feature-subset size.
trees=build_random_forest(df,5)

In [63]:
# Display the forest: a list with one nested-dict tree per entry.
trees

[{'Temp': {'mild': 'yes',
   'cool': 'yes',
   'hot': {'Humidity': {'normal': 'yes', 'high': 'no'}}}},
 {'Outlook': {'rainy': {'Humidity': {'high': 'yes', 'normal': 'yes'}},
   'overcast': 'yes',
   'sunny': {'Humidity': {'normal': 'yes', 'high': 'no'}}}},
 {'Outlook': {'overcast': 'yes',
   'rainy': {'Humidity': {'normal': 'yes', 'high': 'no'}},
   'sunny': {'Humidity': {'high': 'no', 'normal': 'yes'}}}},
 {'Outlook': {'rainy': {'Humidity': {'high': 'yes', 'normal': 'no'}},
   'sunny': {'Humidity': {'high': 'no', 'normal': 'yes'}},
   'overcast': 'yes'}},
 {'Outlook': {'overcast': 'yes',
   'rainy': {'Temp': {'cool': 'no', 'mild': 'yes'}},
   'sunny': {'Temp': {'hot ': 'no', 'mild': 'yes'}}}}]

In [78]:
from collections import Counter
def random_forest_predict(df, trees):
    """Predict a label for every row of ``df`` by majority vote over ``trees``.

    Each tree votes via ``predict_instance``. A tree that returns ``None``
    (it has no branch for the row's feature value) abstains and is left out
    of the tally, so a missing branch can never win the vote as if it were a
    class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to classify.
    trees : list
        Trees in the nested-dict format consumed by ``predict_instance``.

    Returns
    -------
    list
        One prediction per row, in row order; ``None`` for a row on which no
        tree cast a vote (including when ``trees`` is empty).
    """
    predictions = []
    for i in range(len(df)):
        instance = df.iloc[i]
        votes = [predict_instance(instance, tree) for tree in trees]
        tally = Counter(vote for vote in votes if vote is not None)
        predictions.append(tally.most_common(1)[0][0] if tally else None)
    return predictions


In [77]:
# Reload the test file and classify every row by majority vote across the forest.
# NOTE(review): the recorded output below is a tuple, whereas random_forest_predict
# as defined in this notebook returns a plain list - the output appears to come
# from an earlier version of the function; re-run to refresh it.
df1=pd.read_csv(r'C:/Users/jasme/Downloads/weather_test.csv')
random_forest_predict(df1,trees)

(['yes', 'yes', 'yes', 'yes', 'yes'], 'yes', ['yes', 'yes'])