# Dataset

In [4]:
import pandas as pd

# Training table written one record per row; the last field is the class label.
column_names = ("age", "income", "student", "credit_rating", "buys_computer")
rows = [
    ("<=30",    "high",   "no",  "fair",      "no"),
    ("<=30",    "high",   "no",  "excellent", "no"),
    ("31...40", "high",   "no",  "fair",      "yes"),
    (">40",     "medium", "no",  "fair",      "yes"),
    (">40",     "low",    "yes", "fair",      "yes"),
    (">40",     "low",    "yes", "excellent", "no"),
    ("31...40", "low",    "yes", "excellent", "yes"),
    ("<=30",    "medium", "no",  "fair",      "no"),
    ("<=30",    "low",    "yes", "fair",      "yes"),
    (">40",     "medium", "yes", "fair",      "yes"),
    ("<=30",    "medium", "yes", "excellent", "yes"),
    ("31...40", "medium", "no",  "excellent", "yes"),
    ("31...40", "high",   "yes", "fair",      "yes"),
    (">40",     "medium", "no",  "excellent", "no"),
]

# Pivot the records into a column-oriented mapping: column name -> list of values in row order.
data = {name: list(values) for name, values in zip(column_names, zip(*rows))}

df = pd.DataFrame(data)

# Decision Tree Implementation

In [5]:
from math import log2


def entropy(arr, totalCount=None) -> float:
  """Shannon entropy, in bits, of a distribution given as per-class counts.

  Args:
    arr: Iterable of counts, one per class. Zero counts are skipped, so a
      class with no members contributes nothing (0 * log2(0) is taken as 0).
    totalCount: Denominator used to turn each count into a probability.
      When omitted (None) it defaults to the sum of ``arr``.

  Returns:
    ``-sum(p * log2(p))`` over the non-zero counts, with
    ``p = count / totalCount``. Returns ``0.0`` (never ``-0.0``) when the
    distribution is pure or when there are no non-zero counts.
  """
  counts = list(arr)  # materialise once so the default total can be derived safely
  if totalCount is None:
    totalCount = sum(counts)

  ans = 0.0
  for currCount in counts:
    if currCount == 0:
      continue
    prob = currCount / totalCount
    # Subtract each term instead of negating the final sum: the magnitude is
    # bit-identical, but a zero result comes out as 0.0 rather than -0.0.
    ans -= prob * log2(prob)
  return ans

def find_information_needed(df,feature,target):
  """Weighted average entropy of ``target`` after partitioning ``df`` by ``feature``.

  A contingency table of ``feature`` values against ``target`` values is
  built with ``pd.crosstab``; each row of that table is one partition. The
  entropy of every partition is weighted by the fraction of all rows that
  fall into it, and the weighted entropies are summed.
  """
  contingency = pd.crosstab(df[feature], df[target])
  grand_total = contingency.values.sum()

  weighted_entropy = 0.0
  for _, class_counts in contingency.iterrows():
    partition_size = class_counts.sum()
    partition_entropy = entropy(class_counts, partition_size)
    weighted_entropy += partition_entropy * (partition_size / grand_total)
  return weighted_entropy

def find_information_gain(df,feature,target):
    """Information gain obtained by splitting ``df`` on ``feature``.

    This is the entropy of the ``target`` value counts over the whole frame
    minus the weighted entropy that ``find_information_needed`` reports for
    the same ``feature``/``target`` pair.
    """
    class_counts = df[target].value_counts()
    entropy_before_split = entropy(class_counts, class_counts.sum())
    entropy_after_split = find_information_needed(df, feature, target=target)
    return entropy_before_split - entropy_after_split

# Root-split selection: score every candidate attribute and report the best one
def build_decision_tree(df:pd.DataFrame,features,target):
  """Print the information gain of each candidate feature and the best split.

  Every entry of ``features`` other than ``target`` is scored with
  ``find_information_gain``. One ``IG(<feature>): <gain>`` line is printed
  per candidate, followed by a ``Next Root`` line naming the feature with the
  largest gain. Only a gain strictly greater than the current best replaces
  it, so on ties the earliest feature wins; if no gain exceeds 0 the reported
  feature is the empty string with gain 0. Nothing is returned.
  """
  best_feature, best_gain = "", 0
  for candidate in features:
    if candidate == target:
      continue  # the label column is never a split candidate
    gain = find_information_gain(df, candidate, target)
    if gain > best_gain:
      best_feature, best_gain = candidate, gain
    print(f"IG({candidate}): {gain}")
  print(f"Next Root : {best_feature} , Gain: {best_gain}")

# Score every column of the training frame as a split for the "buys_computer" label.
build_decision_tree(df, features=df.columns, target="buys_computer")

IG(age): 0.24674981977443933
IG(income): 0.02922256565895487
IG(student): 0.15183550136234159
IG(credit_rating): 0.04812703040826949
Next Root : age , Gain: 0.24674981977443933


# Naive Bayes Implementation

In [6]:
# Class priors: relative frequency of each label in the target column.
priors = df['buys_computer'].value_counts(normalize=True).to_dict()

# Every column except the target is used as a predictor.
features = [col for col in df.columns if col != 'buys_computer']

# Rows of the training frame grouped by label, in order of first appearance.
class_subsets = {
    label: df[df['buys_computer'] == label]
    for label in df['buys_computer'].unique()
}

# conditionals[feature][label][value] = (rows with that label and value) / (rows with that label).
# A value that never occurs together with a label has no entry for that label.
conditionals = {}
for feature in features:
    per_class = conditionals.setdefault(feature, {})
    for label, label_rows in class_subsets.items():
        value_freq = label_rows[feature].value_counts()
        class_size = len(label_rows)
        per_class[label] = {val: value_freq[val] / class_size for val in value_freq.index}

def predict(sample):
    """Classify ``sample`` with the fitted ``priors`` and ``conditionals`` tables.

    Args:
        sample: Mapping of feature name -> observed value. Every key must be
            a feature present in ``conditionals`` (a missing one raises
            ``KeyError``).

    Returns:
        A ``(label, scores)`` pair. ``scores`` maps each class label to its
        prior multiplied by the conditional probability of every sample
        value given that label; the scores are not normalised to sum to 1.
        A value with no entry for a label contributes a factor of 0.
        ``label`` is the key of ``scores`` with the highest score.
    """
    scores = {}
    for label, prior in priors.items():
        score = prior
        for feature, value in sample.items():
            likelihood = conditionals[feature][label].get(value, 0)
            score = score * likelihood
        scores[label] = score
    best_label = max(scores, key=scores.get)
    return best_label, scores


# Classify a single hand-written example and show the per-class scores.
test_sample = dict(age="<=30", income="high", student="no", credit_rating="fair")
predicted_class, probs = predict(test_sample)

print("Prediction:", predicted_class)
print("Posterior Probabilities:", probs)

Prediction: no
Posterior Probabilities: {'yes': np.float64(0.007054673721340388), 'no': np.float64(0.02742857142857143)}
