<a href="https://colab.research.google.com/github/Tamoziit/Data-Mining/blob/main/Attribute_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas



In [None]:
import pandas as pd
import math
from collections import Counter

In [None]:
# Read dataset.csv from the current working directory into a DataFrame.
df = pd.read_csv("./dataset.csv")

# Plain-text dump of every row and column.
print(df)

  Weekend Weather Parents Financial condition     Decision
0      W1   Sunny     Yes                Rich       Cinema
1      W2   Sunny      No                Rich  Play Tennis
2      W3   Windy     Yes                Rich       Cinema
3      W4   Rainy     Yes                Poor       Cinema
4      W5   Rainy      No                Rich      Stay in
5      W6   Rainy     Yes                Poor       Cinema
6      W7   Windy      No                Poor       Cinema
7      W8   Windy      No                Rich     Shopping
8      W9   Windy     Yes                Rich       Cinema
9     W10   Sunny      No                Rich  Play Tennis


In [None]:
# Boolean-mask filter: print only the rows whose Weather column equals "Sunny".
print(df[df["Weather"] == "Sunny"])

  Weekend Weather Parents Financial condition     Decision
0      W1   Sunny     Yes                Rich       Cinema
1      W2   Sunny      No                Rich  Play Tennis
9     W10   Sunny      No                Rich  Play Tennis


## **Attribute Selection using Information Gain (Entropy)**

In [None]:
# Entropy function
def entropy(attr):
    """Return the Shannon entropy, in bits, of the values in ``attr``.

    Parameters
    ----------
    attr : pandas.Series
        Column of class labels (any object exposing ``value_counts``).

    Returns
    -------
    float or int
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value; ``0`` when there is nothing to count.
    """
    probs = attr.value_counts(normalize=True)
    info = 0

    for prob in probs:
        # Categorical columns list unobserved categories with frequency 0.
        # math.log2(0) raises ValueError, and such a term contributes nothing
        # to the sum (p * log2(p) -> 0 as p -> 0), so it is skipped. The
        # comparison is also False for NaN frequencies.
        if prob > 0:
            info += -prob * math.log2(prob)

    return info

In [None]:
# information gain of subset wrt target
def information_gain(df, attr, target="Decision"):
    """Information gain obtained by splitting ``df`` on column ``attr``.

    The result is the entropy of ``target`` over the whole frame minus the
    frequency-weighted entropy of ``target`` inside each distinct value of
    ``attr``.
    """
    value_weights = df[attr].value_counts(normalize=True)

    conditional_entropy = 0
    for value, weight in zip(value_weights.index, value_weights):
        # Target labels of the rows that take this attribute value.
        matching_targets = df.loc[df[attr] == value, target]
        conditional_entropy = conditional_entropy + weight * entropy(matching_targets)

    return entropy(df[target]) - conditional_entropy

In [None]:
# Score every predictor column; "Weekend" and "Decision" are left out.
for col in df.drop(columns=["Weekend", "Decision"]).columns:
    ig = information_gain(df, col, target="Decision")
    print(f"Information Gain({col}): {ig}")

Information Gain(Weather): 0.6954618442383218
Information Gain(Parents): 0.6099865470109875
Information Gain(Financial condition): 0.2812908992306926


In [None]:
# Best attribute selection
def attribute_selection_entropy(df, target="Decision", omit=None, return_all=False):
    """Pick the attribute of ``df`` with the highest information gain.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing the candidate attributes and the target column.
    target : str
        Name of the class-label column.
    omit : iterable of str, optional
        Extra columns to leave out of the comparison. The caller's object
        is never modified.
    return_all : bool
        When true, return every candidate's gain instead of only the best.

    Returns
    -------
    dict or tuple
        ``return_all=True``: ``{attribute: gain}`` ordered from highest to
        lowest gain (empty when no candidate remains).
        Otherwise: ``(best_attribute, best_gain)``.

    Raises
    ------
    KeyError
        If ``target`` or an entry of ``omit`` is not a column of ``df``.
    ValueError
        If no candidate attribute remains and ``return_all`` is false.
    """
    # Copy into a set so any iterable is accepted and duplicates collapse.
    excluded = set(omit) if omit is not None else set()
    excluded.add(target)

    # "Weekend" is always left out when the frame has it, but frames without
    # such a column are accepted too (Index.drop raises on unknown labels).
    if "Weekend" in df.columns:
        excluded.add("Weekend")

    gains = {}
    for col in df.columns.drop(list(excluded)):
        gains[col] = information_gain(df, col, target)

    if return_all:
        return dict(sorted(gains.items(), key=lambda x: x[1], reverse=True))

    if not gains:
        raise ValueError("no candidate attributes left to select from")

    best_attr = max(gains, key=gains.get)
    return best_attr, gains[best_attr]

In [None]:
# Ask for the gain of every candidate attribute rather than only the winner.
all_gains = attribute_selection_entropy(df, target="Decision", return_all=True)
print("Info Gain for all attributes:")
for col in all_gains:
    gain = all_gains[col]
    print(f"  {col}: {gain}")

Info Gain for all attributes:
  Weather: 0.6954618442383218
  Parents: 0.6099865470109875
  Financial condition: 0.2812908992306926


In [None]:
# Default call (return_all=False) yields only the top attribute and its gain.
attr, gain = attribute_selection_entropy(df)
print(f"Best Attribute: {attr}, Info Gain: {gain}")

Best Attribute: Weather, Info Gain: 0.6954618442383218


In [None]:
def D_Tree_entropy(df, target="Decision", omit=None, level=0, branch=None):
    """Recursively print a decision tree built from information-gain splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows that reach the current node.
    target : str
        Name of the class-label column.
    omit : iterable of str, optional
        Attributes already used on the path from the root.
    level : int
        Depth of the current node; controls the indentation.
    branch : str, optional
        ``"attribute=value"`` label of the edge leading into this node;
        ``None`` for the root.

    Returns
    -------
    None
        The tree is only printed, one node per line.
    """
    # Work on a private list so the caller's object is never touched and
    # tuples/sets are accepted as well.
    omit = [] if omit is None else list(omit)

    indent = "  " * level

    # If all values in target column are the same -> PURE NODE
    if len(df[target].unique()) == 1:
        print(f"{indent}lvl:{level} {branch} -> PURE ({df[target].iloc[0]})")
        return

    # If no attributes left to split -> MIXED NODE
    if len(df.columns.drop(omit + [target, "Weekend"])) == 0:
        print(f"{indent}lvl:{level} {branch} -> MIXED ({df[target].value_counts().to_dict()})")
        return

    # Finding best attribute
    best_attr, gain = attribute_selection_entropy(df, target=target, omit=omit)
    if branch is None:
        # The root line uses the same "lvl:<n>" prefix as every other node.
        print(f"{indent}lvl:{level} {best_attr} -> (Gain={gain})")
    else:
        print(f"{indent}lvl:{level} {branch} -> {best_attr} (Gain={gain})")

    # Recursing for each value of best_attr; the attribute just used is added
    # to omit so it cannot be chosen again further down this path.
    for val in df[best_attr].unique():
        subset = df[df[best_attr] == val]
        D_Tree_entropy(
            subset,
            target=target,
            omit=omit + [best_attr],
            level=level + 1,
            branch=f"{best_attr}={val}",
        )

In [None]:
# Print the information-gain tree for the full data set, starting at the root.
D_Tree_entropy(df)

lvl0 Weather -> (Gain=0.6954618442383218)
  lvl:1 Weather=Sunny -> Parents (Gain=0.9182958340544896)
    lvl:2 Parents=Yes -> PURE (Cinema)
    lvl:2 Parents=No -> PURE (Play Tennis)
  lvl:1 Weather=Windy -> Parents (Gain=0.31127812445913283)
    lvl:2 Parents=Yes -> PURE (Cinema)
    lvl:2 Parents=No -> Financial condition (Gain=1.0)
      lvl:3 Financial condition=Poor -> PURE (Cinema)
      lvl:3 Financial condition=Rich -> PURE (Shopping)
  lvl:1 Weather=Rainy -> Parents (Gain=0.9182958340544896)
    lvl:2 Parents=Yes -> PURE (Cinema)
    lvl:2 Parents=No -> PURE (Stay in)


## **Attribute Selection using Gini Index**

In [None]:
def gini(attr):
    """Return the Gini impurity of the labels in ``attr``.

    Parameters
    ----------
    attr : iterable with ``len``
        Class labels (for example a pandas Series or a list).

    Returns
    -------
    float or int
        ``1 - sum(p ** 2)`` over the relative frequency ``p`` of each
        distinct label; ``0`` for an empty input.
    """
    total = len(attr)
    if total == 0:
        # No samples means nothing is mixed; without this guard the formula
        # below would report the maximum value 1.
        return 0

    counts = Counter(attr)
    prob_coeff = 0

    for count in counts.values():
        prob_coeff += (count / total) ** 2

    return 1 - prob_coeff

In [None]:
def gini_index(df, attr, target="Decision"):
    """Weighted Gini impurity of ``target`` after splitting ``df`` on ``attr``.

    Each distinct value of ``attr`` defines a partition. The Gini impurity of
    the partition's target labels is weighted by the partition's share of the
    rows, and the weighted values are added up.
    """
    n_rows = len(df)
    split_impurity = 0

    for value in df[attr].unique():
        partition_targets = df.loc[df[attr] == value, target]
        if len(partition_targets) == 0:
            # No row matched this value, so it carries no weight.
            continue
        share = len(partition_targets) / n_rows
        split_impurity = split_impurity + share * gini(partition_targets)

    return split_impurity

In [None]:
# Best attribute selection
def attribute_selection_gini(df, target="Decision", omit=None, return_all=False):
    """Pick the attribute of ``df`` with the lowest weighted Gini index.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing the candidate attributes and the target column.
    target : str
        Name of the class-label column.
    omit : iterable of str, optional
        Extra columns to leave out of the comparison. The caller's object
        is never modified.
    return_all : bool
        When true, return every candidate's index instead of only the best.

    Returns
    -------
    dict or tuple
        ``return_all=True``: ``{attribute: gini_index}`` ordered from the
        highest to the lowest index (empty when no candidate remains).
        Otherwise: ``(best_attribute, lowest_gini_index)``.

    Raises
    ------
    KeyError
        If ``target`` or an entry of ``omit`` is not a column of ``df``.
    ValueError
        If no candidate attribute remains and ``return_all`` is false.
    """
    # Copy into a set so any iterable is accepted and duplicates collapse.
    excluded = set(omit) if omit is not None else set()
    excluded.add(target)

    # "Weekend" is always left out when the frame has it, but frames without
    # such a column are accepted too (Index.drop raises on unknown labels).
    if "Weekend" in df.columns:
        excluded.add("Weekend")

    ginis = {}
    for col in df.columns.drop(list(excluded)):
        ginis[col] = gini_index(df, col, target)

    if return_all:
        # NOTE(review): descending order puts the *worst* split first, since
        # the best attribute is the one with the minimum index (see below).
        # Kept as-is so existing output does not change; confirm intent.
        return dict(sorted(ginis.items(), key=lambda x: x[1], reverse=True))

    if not ginis:
        raise ValueError("no candidate attributes left to select from")

    best_attr = min(ginis, key=ginis.get)
    return best_attr, ginis[best_attr]

In [None]:
# Ask for the Gini index of every candidate attribute rather than only the winner.
all_indices = attribute_selection_gini(df, target="Decision", return_all=True)
print("GINI Index for all attributes:")
for col in all_indices:
    idx = all_indices[col]
    print(f"  {col}: {idx}")

GINI Index for all attributes:
  Financial condition: 0.48571428571428565
  Weather: 0.41666666666666663
  Parents: 0.36


In [None]:
def D_Tree_gini(df, target="Decision", omit=None, level=0, branch=None):
    """Recursively print a decision tree built from Gini-index splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows that reach the current node.
    target : str
        Name of the class-label column.
    omit : iterable of str, optional
        Attributes already used on the path from the root.
    level : int
        Depth of the current node; controls the indentation.
    branch : str, optional
        ``"attribute=value"`` label of the edge leading into this node;
        ``None`` for the root.

    Returns
    -------
    None
        The tree is only printed, one node per line.
    """
    # Work on a private list so the caller's object is never touched and
    # tuples/sets are accepted as well.
    omit = [] if omit is None else list(omit)

    indent = "  " * level

    # If all values in target column are the same -> PURE NODE
    if len(df[target].unique()) == 1:
        print(f"{indent}lvl:{level} {branch} -> PURE ({df[target].iloc[0]})")
        return

    # If no attributes left to split -> MIXED NODE
    if len(df.columns.drop(omit + [target, "Weekend"])) == 0:
        print(f"{indent}lvl:{level} {branch} -> MIXED ({df[target].value_counts().to_dict()})")
        return

    # Finding best attribute. The score is bound to gini_idx rather than
    # "gini" so the module-level gini() function is not shadowed in here.
    best_attr, gini_idx = attribute_selection_gini(df, target=target, omit=omit)
    if branch is None:
        print(f"{indent}lvl:{level} {best_attr} -> (GINI Idx.={gini_idx:.8f})")
    else:
        print(f"{indent}lvl:{level} {branch} -> {best_attr} (GINI Idx.={gini_idx:.8f})")

    # Recursing for each value of best_attr; the attribute just used is added
    # to omit so it cannot be chosen again further down this path.
    for val in df[best_attr].unique():
        subset = df[df[best_attr] == val]
        D_Tree_gini(
            subset,
            target=target,
            omit=omit + [best_attr],
            level=level + 1,
            branch=f"{best_attr}={val}",
        )

In [None]:
# Print the Gini-index tree for the full data set, starting at the root.
D_Tree_gini(df)

lvl:0 Parents -> (GINI Idx.=0.36000000)
  lvl:1 Parents=Yes -> PURE (Cinema)
  lvl:1 Parents=No -> Weather (GINI Idx.=0.20000000)
    lvl:2 Weather=Sunny -> PURE (Play Tennis)
    lvl:2 Weather=Rainy -> PURE (Stay in)
    lvl:2 Weather=Windy -> Financial condition (GINI Idx.=0.00000000)
      lvl:3 Financial condition=Poor -> PURE (Cinema)
      lvl:3 Financial condition=Rich -> PURE (Shopping)


In [None]:
!pip install pypandoc

Collecting pypandoc
  Downloading pypandoc-1.15-py3-none-any.whl.metadata (16 kB)
Downloading pypandoc-1.15-py3-none-any.whl (21 kB)
Installing collected packages: pypandoc
Successfully installed pypandoc-1.15


In [None]:
!apt-get install -y pandoc

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libcmark-gfm-extensions0.29.0.gfm.3 libcmark-gfm0.29.0.gfm.3 pandoc-data
Suggested packages:
  texlive-latex-recommended texlive-xetex texlive-luatex pandoc-citeproc
  texlive-latex-extra context wkhtmltopdf librsvg2-bin groff ghc nodejs php
  python ruby libjs-mathjax libjs-katex citation-style-language-styles
The following NEW packages will be installed:
  libcmark-gfm-extensions0.29.0.gfm.3 libcmark-gfm0.29.0.gfm.3 pandoc
  pandoc-data
0 upgraded, 4 newly installed, 0 to remove and 35 not upgraded.
Need to get 20.6 MB of archives.
After this operation, 156 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libcmark-gfm0.29.0.gfm.3 amd64 0.29.0.gfm.3-3 [115 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libcmark-gfm-extensions0.29.0.gfm.3 amd64 0.29.0.gfm.3-3 [25.1 kB

In [None]:
import pypandoc
# Convert Attribute_Selection.ipynb to docx, writing the result to output.docx.
pypandoc.convert_file("Attribute_Selection.ipynb", "docx", outputfile="output.docx")

''