<a href="https://colab.research.google.com/github/Tamoziit/Data-Mining/blob/main/Attribute_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas



In [None]:
import pandas as pd
import math

In [91]:
# Load the training table from the CSV file that sits next to the notebook.
df = pd.read_csv("./dataset.csv")

# Show every row of the loaded table.
print(df)

  Weekend Weather Parents Financial condition     Decision
0      W1   Sunny     Yes                Rich       Cinema
1      W2   Sunny      No                Rich  Play Tennis
2      W3   Windy     Yes                Rich       Cinema
3      W4   Rainy     Yes                Poor       Cinema
4      W5   Rainy      No                Rich      Stay in
5      W6   Rainy     Yes                Poor       Cinema
6      W7   Windy      No                Poor       Cinema
7      W8   Windy      No                Rich     Shopping
8      W9   Windy     Yes                Rich       Cinema
9     W10   Sunny      No                Rich  Play Tennis


In [92]:
# Sanity check: show only the rows whose "Weather" value is "Sunny".
print(df.loc[df["Weather"] == "Sunny"])

  Weekend Weather Parents Financial condition     Decision
0      W1   Sunny     Yes                Rich       Cinema
1      W2   Sunny      No                Rich  Play Tennis
9     W10   Sunny      No                Rich  Play Tennis


## **Attribute Selection using Information Gain (Entropy)**

In [93]:
# Shannon entropy helper
def entropy(attr):
    """Return the Shannon entropy, in bits, of the values held in ``attr``.

    Parameters
    ----------
    attr : pandas.Series
        Column whose value distribution is measured. The relative frequency
        of each distinct value is taken from ``value_counts(normalize=True)``.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct value; ``0`` when there is nothing to count.
    """
    frequencies = attr.value_counts(normalize=True)

    bits = 0
    for p in frequencies:
        # Accumulate one term at a time, in value_counts order, so the
        # floating-point result does not depend on a summation helper.
        bits -= p * math.log2(p)

    return bits

In [94]:
# Information gain of one attribute with respect to the target
def information_gain(df, attr, target="Decision"):
    """Return the information gain obtained by splitting ``df`` on ``attr``.

    The gain is the entropy of the ``target`` column minus the weighted
    average entropy of ``target`` inside each group of rows that share the
    same ``attr`` value. Weights are the relative frequencies of the
    ``attr`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both the ``attr`` and ``target`` columns.
    attr : str
        Name of the column to split on.
    target : str, default "Decision"
        Name of the column whose entropy is being reduced.

    Returns
    -------
    number
        ``entropy(df[target])`` minus the weighted entropy after the split.
    """
    parent_entropy = entropy(df[target])

    weighted_child_entropy = 0
    for value, weight in df[attr].value_counts(normalize=True).items():
        # Target values of the rows that fall into this branch of the split.
        branch_targets = df.loc[df[attr] == value, target]
        weighted_child_entropy += weight * entropy(branch_targets)

    return parent_entropy - weighted_child_entropy

In [95]:
# Report the information gain of every column except "Weekend" and the
# "Decision" target, which are dropped from the candidates here.
for col in df.columns.drop(["Weekend", "Decision"]):
    ig = information_gain(df, col, "Decision")
    print(f"Information Gain({col}): {ig}")

Information Gain(Weather): 0.6954618442383218
Information Gain(Parents): 0.6099865470109875
Information Gain(Financial condition): 0.2812908992306926


In [96]:
# Best attribute selection
def attribute_selection(df, target="Decision", omit=None, return_all=False):
    """Score the candidate attributes of ``df`` by information gain.

    Candidates are all columns of ``df`` except ``target``, the ``"Weekend"``
    column (skipped whether or not it is present) and anything listed in
    ``omit``. Labels in ``omit`` that are not columns of ``df`` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the attributes and the ``target`` column.
    target : str, default "Decision"
        Name of the column the gain is measured against.
    omit : iterable of labels, single label, or None
        Extra columns to exclude from the candidates. Any iterable (list,
        tuple, set) is accepted; a bare string is treated as one label.
    return_all : bool, default False
        Selects the return shape, see below.

    Returns
    -------
    dict
        When ``return_all`` is true: ``{attribute: gain}`` ordered from the
        highest gain to the lowest (empty when there are no candidates).
    tuple
        Otherwise ``(best_attribute, best_gain)``; on ties the candidate that
        appears first in ``df.columns`` wins.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    ValueError
        If ``return_all`` is false and no candidate attribute is left.
    """
    if target not in df.columns:
        raise KeyError(f"target column {target!r} not found in DataFrame")

    if omit is None:
        omit = []
    elif isinstance(omit, str):
        # A bare string would otherwise be split into single characters.
        omit = [omit]

    # Always omitting target and Weekend; filtering (instead of Index.drop)
    # keeps column order and does not fail when a label is absent.
    excluded = set(omit) | {"Weekend", target}
    candidates = [col for col in df.columns if col not in excluded]

    gains = {col: information_gain(df, col, target) for col in candidates}

    if return_all:
        return dict(sorted(gains.items(), key=lambda item: item[1], reverse=True))

    if not gains:
        raise ValueError(
            "no candidate attributes left to evaluate after applying omit"
        )

    best_attr = max(gains, key=gains.get)
    return best_attr, gains[best_attr]

In [97]:
# Collect the information gain of every candidate attribute, ordered from
# the highest gain to the lowest, and print one line per attribute.
all_gains = attribute_selection(df, return_all=True)
print("Info Gain for all attributes:")
for col, gain in all_gains.items():
    print(f"  {col}: {gain}")

Info Gain for all attributes:
  Weather: 0.6954618442383218
  Parents: 0.6099865470109875
  Financial condition: 0.2812908992306926


In [98]:
# Pick the single attribute with the highest information gain over the
# full dataset, together with that gain.
attr, gain = attribute_selection(df)
print(f"Best Attribute: {attr}, Info Gain: {gain}")

Best Attribute: Weather, Info Gain: 0.6954618442383218


In [107]:
def D_Tree(df, target="Decision", omit=None, level=0, branch=None):
    """Recursively print a decision tree for ``df``, one line per node.

    At each node the attribute returned by ``attribute_selection`` is used to
    split the rows; the function then recurses into the subset of rows for
    every distinct value of that attribute, in order of first appearance.
    Recursion stops at a node when all of its rows share one ``target`` value
    (printed as ``PURE``) or when no attribute is left to split on (printed
    as ``MIXED`` together with the target value counts).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to the current node. Must contain the ``target`` and
        ``"Weekend"`` columns.
    target : str, default "Decision"
        Name of the column being predicted.
    omit : list or None
        Attributes already used on the path from the root; they are not
        considered again.
    level : int, default 0
        Depth of the current node; controls the indentation and the ``lvl``
        tag of the printed line.
    branch : str or None
        ``"attribute=value"`` label of the edge leading to this node, or
        ``None`` for the root.

    Returns
    -------
    None
        The tree is only printed.
    """
    used = [] if omit is None else omit
    pad = "  " * level
    labels = df[target]

    # Leaf: every row carries the same target value -> PURE NODE
    if len(labels.unique()) == 1:
        print(f"{pad}lvl:{level} {branch} -> PURE ({labels.iloc[0]})")
        return

    # Leaf: no attribute left to split on -> MIXED NODE
    remaining = df.columns.drop(used + [target, "Weekend"])
    if len(remaining) == 0:
        print(f"{pad}lvl:{level} {branch} -> MIXED ({labels.value_counts().to_dict()})")
        return

    # Internal node: split on the attribute with the highest gain
    best_attr, gain = attribute_selection(df, target=target, omit=used)
    if branch is not None:
        print(f"{pad}lvl:{level} {branch} -> {best_attr} (Gain={gain:.3f})")
    else:
        # The root has no incoming branch label.
        print(f"{pad}lvl{level} {best_attr} -> (Gain={gain:.3f})")

    # One child per distinct value of the chosen attribute
    used_below = used + [best_attr]
    for value in df[best_attr].unique():
        child_rows = df[df[best_attr] == value]
        D_Tree(
            child_rows,
            target=target,
            omit=used_below,
            level=level + 1,
            branch=f"{best_attr}={value}",
        )

In [108]:
# Print the decision tree built from the full dataset.
D_Tree(df)

lvl0 Weather -> (Gain=0.695)
  lvl:1 Weather=Sunny -> Parents (Gain=0.918)
    lvl:2 Parents=Yes -> PURE (Cinema)
    lvl:2 Parents=No -> PURE (Play Tennis)
  lvl:1 Weather=Windy -> Parents (Gain=0.311)
    lvl:2 Parents=Yes -> PURE (Cinema)
    lvl:2 Parents=No -> Financial condition (Gain=1.000)
      lvl:3 Financial condition=Poor -> PURE (Cinema)
      lvl:3 Financial condition=Rich -> PURE (Shopping)
  lvl:1 Weather=Rainy -> Parents (Gain=0.918)
    lvl:2 Parents=Yes -> PURE (Cinema)
    lvl:2 Parents=No -> PURE (Stay in)
