### <span style='color:blue'> $$\bf{\text{Binary ID3 for Play Tennis}}$$ </span>

In [1]:
import os
import numpy as np
import pandas as pd

- [ ] <span style='color:blue'> $\bf{\text{Part 0: }}$ </span> $\bf{\text{Dataset Creation}}$

In [2]:
# Play Tennis dataset as a column-name -> list-of-values dictionary (14 rows):
# four categorical features plus the binary 'Label' target ('Yes' / 'No').
# NOTE: the key 'Temparature' is spelled this way on purpose of consistency --
# the test sample at the end of the notebook uses the same key, so rename it
# everywhere or nowhere.

data_dict = {'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 
                        'Rain', 'Rain', 'Overcast', 'Sunny', 
                        'Sunny', 'Rain', 'Sunny', 'Overcast', 
                       'Overcast', 'Rain'], 
            'Temparature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 
                           'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 
                           'Mild', 'Mild', 'Hot', 'Mild'], 
            'Humidity': ['High', 'High', 'High', 'High', 'Normal', 
                        'Normal', 'Normal', 'High', 'Normal', 
                         'Normal', 'Normal', 'High', 'Normal', 'High'], 
            'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 
                    'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 
                    'Strong', 'Strong', 'Weak', 'Strong'], 
            'Label': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 
                     'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']}

In [3]:
# Saving the DataFrame to a CSV file in the current directory

# `path` keeps its trailing '/' because the read-back cell concatenates it
# directly with `file_name`.
path = os.getcwd() + '/'
file_name = 'data1.csv'
df = pd.DataFrame(data_dict)
# index=False keeps the row index out of the file, so only the columns of
# `data_dict` are written.
df.to_csv(path + file_name, index=False)

In [4]:
# Read the dataset back from the CSV written above. `data_df` is the frame that
# S_new_creator binds as its default `orig_data`, so this cell must run before
# the function-definition cells.
data_df = pd.read_csv(path + file_name)
data_df

Unnamed: 0,Outlook,Temparature,Humidity,Wind,Label
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


- [x] <span style='color:blue'> $\bf{\text{End of Part 0}}$ </span>

- [ ] <span style='color:blue'> $\bf{\text{Part 1: }}$ </span> $\bf{\text{Function Definition}}$

In [5]:
#                                         ---------
# Input: (pd.Series, feature_value) ---> |  Prob   | --->  Output: probability of feature_value in the pd.Series
#                                         ---------                               -------------       ----------

def Prob(series: pd.Series, value: str) -> float:
    """Return the relative frequency of ``value`` among the entries of ``series``.

    Parameters
    ----------
    series : pd.Series
        Values to inspect.
    value : str
        The value whose share is wanted.

    Returns
    -------
    float
        Number of entries equal to ``value`` divided by the series length.

    Raises
    ------
    ZeroDivisionError
        If ``series`` is empty.
    """
    matching_entries = series[series == value]
    return len(matching_entries) / len(series)

In [6]:
#                                ----------
# Input: pd.Series(Labels) ---> |  Entropy | --->  Output: Entropy of input series
#                                ----------               --------   --------------


def Entropy(labels: pd.Series) -> float:
    """Binary entropy (in bits) of a label series whose positive class is 'Yes'.

    Every entry that is not 'Yes' is counted towards the negative class.

    Parameters
    ----------
    labels : pd.Series
        Class labels of the samples under consideration.

    Returns
    -------
    float
        ``-(p*log2(p) + q*log2(q))`` with ``p`` the share of 'Yes' and
        ``q = 1 - p``; 0 when the series is pure (``p`` is 0 or 1).
    """
    p_yes = Prob(series=labels, value='Yes')
    # A pure series has no uncertainty; returning early also avoids log2(0).
    if p_yes in (0, 1):
        return 0

    p_no = 1 - p_yes
    return -(p_yes*np.log2(p_yes) + p_no*np.log2(p_no))

In [7]:
#                                               ------------
# Input: (new_DataFrame, desired_feature) ---> |  Info_Gain | ---> Output: Information_Gain of the desired feature
#                                               ------------                ----------------       ---------------

def Info_Gain(df: pd.DataFrame, feature: str) -> float:
    """Information gain obtained by splitting ``df`` on ``feature``.

    Parameters
    ----------
    df : pd.DataFrame
        Samples to split; must contain ``feature`` and a 'Label' column.
    feature : str
        Name of the column to split on.

    Returns
    -------
    float
        Entropy of ``df['Label']`` minus the entropy of each subset selected by
        one value of ``feature``, weighted by that value's relative frequency.
    """
    distinct_values = np.unique(df[feature])
    # Weight of each subset = share of rows carrying that feature value.
    weights = np.array([Prob(series=df[feature], value=v) for v in distinct_values])
    # Label entropy inside each subset.
    subset_entropies = np.array(
        [Entropy(labels=df[df[feature] == v]['Label']) for v in distinct_values]
    )
    parent_entropy = Entropy(labels=df['Label'])
    return parent_entropy - np.sum(weights*subset_entropies)

In [8]:
#                                  ----------------------
# Input: Previous_branch_str ---> |  Branch_str_updater  | --->  Output: List of the Previous_branch + new_possible_mini_branches
#                                  ----------------------                           ---------------------------------------------

def branch_str_updater(previous_branch_str: str, mini_branches: list) -> list:
    """Extend one branch string with every candidate mini-branch.

    Parameters
    ----------
    previous_branch_str : str
        The branch built so far.
    mini_branches : list
        Strings to append, one new branch per entry.

    Returns
    -------
    list
        ``previous_branch_str + m`` for each ``m`` in ``mini_branches``,
        in the same order.
    """
    return [previous_branch_str + extension for extension in mini_branches]

In [9]:
#                                        ------------------
# Input: DataFrame with its Labels ---> |  Feature_Values  | --->  Output: Dictionary of Features(keys) and Feature_Values(values)
#                                        ------------------               -------------------------------------------------------

def feature_values(df_with_label: pd.DataFrame) -> dict:
    """Map every feature column to the sorted array of its distinct values.

    Parameters
    ----------
    df_with_label : pd.DataFrame
        Frame holding the feature columns and a 'Label' column. It is not
        modified.

    Returns
    -------
    dict
        ``{column_name: np.unique(column)}`` for every column except 'Label'.

    Raises
    ------
    KeyError
        If the frame has no 'Label' column.
    """
    features_only = df_with_label.drop(columns='Label')
    return {column: np.unique(features_only[column]) for column in features_only.columns}

In [10]:
#                                   ----------------------
# Input: (S_new, Feature_str) ---> |  mini_branch_creator | --->  Output: List of Mini_branches
#                                   ----------------------               ----------------------

def mini_branch_creator(S_new: pd.DataFrame, new_state_feature: str) -> list:
    """Build one mini-branch string per distinct value of the chosen feature.

    A mini-branch has the form ``'<feature>---<value>--->'``; appending it to a
    branch string records the decision "feature == value".

    Parameters
    ----------
    S_new : pd.DataFrame
        The samples that reach the current node.
    new_state_feature : str
        Name of the feature column selected for the split.

    Returns
    -------
    list
        Mini-branch strings, ordered by the sorted distinct values of the
        feature in ``S_new``.

    Raises
    ------
    KeyError
        If ``new_state_feature`` is 'Label' (the target is not a splittable
        feature) or is not a column of ``S_new``.
    """
    if new_state_feature == 'Label':
        raise KeyError(new_state_feature)

    # Only the selected column is needed, so there is no reason to compute the
    # distinct values of every other feature.
    distinct_values = np.unique(S_new[new_state_feature])
    return [new_state_feature + '---' + value + '--->' for value in distinct_values]

In [11]:
#                     ---------------------------------------
# Input: Branch ---> |  Feature_and_Feature_Value_Separator  | --->  Output: Dictionary of Features(keys) and Feature_Values(values)
#                     ---------------------------------------               -------------------------------------------------------

def feature_and_feature_value_separator(branch: str='') -> dict:
    """Parse a branch string into its ``{feature: value}`` conditions.

    A branch is a concatenation of ``'<feature>---<value>--->'`` segments,
    optionally followed by a trailing tag (e.g. ``'Yes'``). Whatever follows
    the last ``'--->'`` (the empty string for an untagged branch) is dropped.

    Parameters
    ----------
    branch : str
        Branch string to parse; ``''`` denotes the root.

    Returns
    -------
    dict
        Feature name -> required value, in the order the conditions appear
        in the branch.

    Raises
    ------
    IndexError
        If a segment does not contain the ``'---'`` separator.
    """
    # Drop the trailing piece by position. list.remove(last) would instead
    # delete the first element that merely compares equal to the last one.
    segments = branch.split('--->')[:-1]

    conditions = {}
    for segment in segments:
        parts = segment.split('---')
        conditions[parts[0]] = parts[1]

    return conditions

In [12]:
#                                                  -------------------------
# Input: (Branches_list, Mini_branches_list) ---> |  branches_list_updater  | --->  Output: List of the New_branches
#                                                  -------------------------               --------------------------

def branches_list_updater(branches_list: list, mini_branches_list: list) -> list:
    """Combine every branch with every mini-branch.

    Parameters
    ----------
    branches_list : list
        Existing branch strings.
    mini_branches_list : list
        Mini-branch strings to append.

    Returns
    -------
    list
        ``branch + mini_branch`` for all pairs, grouped by branch (outer) and
        ordered by mini-branch (inner).
    """
    return [
        prefix + suffix
        for prefix in branches_list
        for suffix in mini_branches_list
    ]

In [13]:
#                                                                        ------------------
# Input: (Original_DataFrame, Dictionary of Features and its Values)--->|  S_new_creator   |---> Output: New_DataFrame
#                                                                        ------------------              -------------

def S_new_creator(orig_data: pd.DataFrame=None, feature_and_feature_values: dict=None) -> pd.DataFrame:
    """Select the samples matching a set of ``feature == value`` conditions.

    For every condition the frame is narrowed to the matching rows and the
    tested feature column is removed, so the result contains only the columns
    that are still available for splitting (plus the label column).

    Parameters
    ----------
    orig_data : pd.DataFrame, optional
        Dataset to filter. When omitted, the notebook-level ``data_df`` is
        looked up at call time, so re-running the data-loading cell is picked
        up without re-defining this function.
    feature_and_feature_values : dict, optional
        Feature name -> required value. Omitted or empty means "no condition".

    Returns
    -------
    pd.DataFrame
        The filtered frame; ``orig_data`` itself when there are no conditions.
        The input frame is never modified.

    Raises
    ------
    KeyError
        If a feature named in the conditions is not a column of the frame.
    """
    if orig_data is None:
        # Resolved lazily: a default bound in the signature would freeze
        # whatever `data_df` was when this cell was executed.
        orig_data = data_df

    conditions = feature_and_feature_values or {}

    subset = orig_data
    for feature, required_value in conditions.items():
        subset = subset[subset[feature] == required_value]
        # The feature is constant within the subset now, so drop its column.
        subset = subset[[column for column in subset.columns if column != feature]]
    return subset

In [14]:
#                             ---------------------------------                                                               
# Input: New_DataFrame  ---> |  info_gain_for_S_new_features   | --->  Output: Dictionary of Info_Gain
#                             ---------------------------------               -----------------------

def info_gain_for_S_new_features(df: pd.DataFrame) -> dict:
    """Compute the information gain of every feature column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Samples at the current node, including the 'Label' column. The frame
        is copied before use and is not modified.

    Returns
    -------
    dict
        Feature name -> information gain, in column order, without an entry
        for 'Label'.

    Raises
    ------
    KeyError
        If the frame has no 'Label' column.
    """
    snapshot = df.copy()
    gains = {
        column: Info_Gain(df=snapshot, feature=column)
        for column in snapshot.keys()
    }
    # The target column is evaluated like the others and then discarded;
    # pop() without a default raises KeyError when it is missing.
    gains.pop('Label')
    return gains

In [15]:
#                                      ---------------------------
# Input: Dictionary of Info_Gain ---> |  max_info_gain_attribute  | --->  Output: (max_key, max_Info_Gain)
#                                      ---------------------------                ------------------------

def max_info_gain_attribute(info_gain_dict: dict) -> tuple:
    """Pick the feature with the largest information gain.

    Parameters
    ----------
    info_gain_dict : dict
        Feature name -> information gain.

    Returns
    -------
    tuple
        ``(feature_name, gain)`` of the best feature; on ties the feature that
        comes first in the dictionary wins. For an empty dictionary (no
        feature left to split on) ``(None, 0.0)`` is returned, i.e. "nothing
        to gain", which callers that test ``gain == 0`` treat as a finished
        branch.
    """
    if not info_gain_dict:
        # max() raises ValueError on an empty iterable; report zero gain instead.
        return (None, 0.0)

    max_key = max(info_gain_dict, key=info_gain_dict.get)
    return (max_key, info_gain_dict[max_key])

In [16]:
#                               ----------------------------
# Input: List_of_Branches ---> |  list_of_branches_updater  | --->  Output: (List_of_Branches, list_of_completed_branches)
#                               ----------------------------                ---------------------------------------------

def _branch_best_split(branch_str: str) -> tuple:
    """Return ``(subset, best_feature, best_gain)`` for the samples selected by ``branch_str``."""
    conditions = feature_and_feature_value_separator(branch=branch_str)
    subset = S_new_creator(feature_and_feature_values=conditions)
    gains = info_gain_for_S_new_features(df=subset)
    best_feature, best_gain = max_info_gain_attribute(info_gain_dict=gains)
    return subset, best_feature, best_gain


def list_of_branches_updater(list_of_branches: list) -> tuple:
    """Grow every branch by one level and report whether the tree is finished.

    Each branch whose best information gain is non-zero is replaced by its
    children (one per value of the best feature); a branch with zero best
    gain is a leaf and is carried over unchanged.

    Parameters
    ----------
    list_of_branches : list
        Branch strings of the current tree frontier (``['']`` for the root).

    Returns
    -------
    tuple
        ``(pending_branches, completed_branches)``. The result is
        all-or-nothing: while at least one grown branch can still be split,
        all grown branches (leaves included) are returned as pending and the
        completed list is empty. Once every branch is a leaf, pending is
        ``[]`` and the completed list holds all of them. Leaves stay in the
        pending list until the end because ``final_branches`` keeps only the
        completed list of the last call.
    """
    grown_branches = []
    for branch_str in list_of_branches:
        subset, best_feature, best_gain = _branch_best_split(branch_str)
        if best_gain == 0:
            # Leaf: nothing to gain from any remaining feature.
            grown_branches.append(branch_str)
        else:
            list_of_mini_branches = mini_branch_creator(S_new=subset, new_state_feature=best_feature)
            grown_branches += branch_str_updater(previous_branch_str=branch_str,
                                                 mini_branches=list_of_mini_branches)

    # Judge completion on each grown branch's OWN gain. The previous code
    # compared a variable left over from the loop above, so the verdict for
    # every branch was whatever the last processed parent branch produced.
    if all(_branch_best_split(branch_str)[2] == 0 for branch_str in grown_branches):
        return ([], grown_branches)
    return (grown_branches, [])

In [17]:
#                             ------------------
# Input: initial_branch ---> |  final_branches  | --->  Output: List_of_Completed_Branches
#                             ------------------                --------------------------

def final_branches(initial_branch: list=['']):
    """Grow the decision tree until no pending branch is left.

    Parameters
    ----------
    initial_branch : list
        Starting frontier; the default ``['']`` is the root of the tree.
        The list is only read, never modified.

    Returns
    -------
    list
        The completed-branches list returned by the last call to
        ``list_of_branches_updater`` (the call that left nothing pending).
    """
    pending, completed = list_of_branches_updater(list_of_branches=initial_branch)
    # Keep expanding the frontier until the updater reports nothing pending.
    while pending != []:
        pending, completed = list_of_branches_updater(list_of_branches=pending)

    return completed

In [18]:
#                                 ----------------------------------
# Input: Completed_branches ---> |  tagging_the_completed_branches  | --->  Output: List_of_Tagged_Branches, List_of_Yes_tagged_Branches
#                                 ----------------------------------                ----------------------------------------------------

def tagging_the_completed_branches(completed_branches: list) -> tuple:
    """Attach the predicted label to every completed branch.

    Parameters
    ----------
    completed_branches : list
        Branch strings that end in a leaf.

    Returns
    -------
    tuple
        ``(tagged_branches, yes_tagged_branches)``: the first list holds every
        branch with its label appended, the second holds the untagged strings
        of the branches whose label is 'Yes'.
    """
    tagged_branches_list = []
    yes_tagged_branches = []
    for branch in completed_branches:
        conditions = feature_and_feature_value_separator(branch=branch)
        leaf_samples = S_new_creator(feature_and_feature_values=conditions)
        # Majority label of the leaf. For a pure leaf this is its only label;
        # for a mixed leaf it replaces "alphabetically first label" with the
        # most frequent one (mode() sorts ties, so the choice is deterministic).
        tag = leaf_samples['Label'].mode().iloc[0]
        tagged_branches_list.append(branch + tag)
        if tag == 'Yes':
            yes_tagged_branches.append(branch)

    return tagged_branches_list, yes_tagged_branches

In [19]:
#                                 ------------------------
# Input: Positive_branches ---> |  hypothesis_dict_dict  | --->  Output: A Dictionary of the Dictionary of Positive_branches
#                                 ------------------------                ---------------------------------------------------

def hypothesis_dict_dict(completed_positive_branches: list) -> dict:
    """Turn the positive branches into a dictionary of condition dictionaries.

    Parameters
    ----------
    completed_positive_branches : list
        Branch strings whose leaf is labelled 'Yes'.

    Returns
    -------
    dict
        ``{'positive_branch1': {feature: value, ...}, 'positive_branch2': ...}``
        with keys numbered from 1 in the order of the input list.
    """
    return {
        'positive_branch' + str(position): feature_and_feature_value_separator(branch)
        for position, branch in enumerate(completed_positive_branches, start=1)
    }

In [20]:
#                                        -----------
# Input: (Hypothesis_dict, Sample) ---> |  predict  | --->  str(Yes or No)
#                                        -----------        --------------

def predict(hypothesis_dict: dict, sample: dict) -> str:
    """Classify a sample with the learned hypothesis.

    The hypothesis is a disjunction of branches, each branch a conjunction of
    ``feature == value`` conditions: the sample is positive when it satisfies
    every condition of at least one branch.

    Parameters
    ----------
    hypothesis_dict : dict
        Branch name -> ``{feature: required_value}``.
    sample : dict
        Feature name -> value of the sample to classify.

    Returns
    -------
    str
        ``'Yes'`` if some branch matches, otherwise ``'No'``. An empty
        hypothesis yields ``'No'``; a branch without conditions matches
        every sample.

    Raises
    ------
    KeyError
        If a condition that gets evaluated names a feature missing from
        ``sample``. Evaluation stops at the first failing condition of a
        branch and at the first matching branch.
    """
    is_positive = any(
        all(required_value == sample[feature] for feature, required_value in conditions.items())
        for conditions in hypothesis_dict.values()
    )
    return 'Yes' if is_positive else 'No'

- [x] <span style='color:blue'> $\bf{\text{End of Part 1}}$ </span>

- [ ] <span style='color:blue'> $\bf{\text{Part 2: }}$ </span> $\bf{\text{Test}}$

In [21]:
# Test:
# 1) Grow the tree from the root and collect the completed branches.
branches = final_branches()
# 2) Keep only the branches whose leaf is labelled 'Yes' (the tagged list is not needed here).
_, positive_branches = tagging_the_completed_branches(completed_branches=branches)
# 3) Convert the positive branches into {branch_name: {feature: value}} conditions.
hypo_dict = hypothesis_dict_dict(completed_positive_branches=positive_branches)
# 4) Classify one hand-written sample; its keys must match the dataset column names exactly.
arbitrary_sample = {'Outlook': 'Rain', 'Temparature': 'Mild', 'Humidity': 'High', 'Wind': 'Strong'}
result = predict(hypothesis_dict=hypo_dict, sample=arbitrary_sample)
print(f'Positive branches are:\n\n{positive_branches}\n\n')
print(f'Result of the Hypothesis for the arbitrary sample: {result}')

Positive branches are:

['Outlook---Overcast--->', 'Outlook---Rain--->Wind---Weak--->', 'Outlook---Sunny--->Humidity---Normal--->']


Result of the Hypothesis for the arbitrary sample: No


- [x] <span style='color:blue'> $\bf{\text{End of Part 2}}$ </span>