In [6]:
import pandas as pd
import numpy as np
from math import log2
from collections import Counter

# ----- Training data -----
# One record per row, in the column order given to the DataFrame below;
# the last field (JobOffer) is the default target used by the functions
# in this file.
data = [
    [">=9", "Y", "VG", "G", "Y"],
    [">=8", "N", "G", "M", "Y"],
    [">=9", "N", "AVG", "P", "N"],
    ["<8", "N", "AVG", "G", "N"],
    [">=8", "Y", "G", "M", "Y"],
    [">=9", "Y", "G", "M", "Y"],
    ["<8", "Y", "G", "P", "N"],
    [">=9", "N", "VG", "G", "Y"],
    [">=8", "Y", "G", "G", "Y"],
    [">=8", "Y", "AVG", "G", "Y"],
]
df = pd.DataFrame(
    data,
    columns=["CGPA", "Interactive", "PracKnowledge", "CommSkill", "JobOffer"],
)

def entropy(col):
    """Return the Shannon entropy (base 2) of the values in ``col``.

    The probability of each distinct value is its relative frequency, as
    counted by ``np.unique``.
    """
    _, counts = np.unique(col, return_counts=True)
    probs = counts / counts.sum()
    return -(probs * np.log2(probs)).sum()

def info_gain(D, attr, target="JobOffer"):
    """Return the information gain obtained by partitioning ``D`` on ``attr``.

    Computed as the entropy of ``D[target]`` minus the size-weighted sum of
    the target entropies of the partitions, one partition per distinct
    value of ``D[attr]``.
    """
    base = entropy(D[target])
    total = len(D)
    remainder = 0.0
    for value in np.unique(D[attr]):
        part = D[D[attr] == value]
        # Each partition contributes in proportion to its share of the rows.
        remainder += (len(part) / total) * entropy(part[target])
    return base - remainder

def split_info(D, attr):
    """Return the split information of ``attr``: the base-2 entropy of the
    relative sizes of the partitions induced by the distinct values of
    ``D[attr]``.
    """
    _, sizes = np.unique(D[attr], return_counts=True)
    fractions = sizes / sizes.sum()
    return -(fractions * np.log2(fractions)).sum()

def gain_ratio(D, attr, target="JobOffer"):
    """Return ``(gain_ratio, info_gain, split_info)`` for ``attr`` on ``D``.

    The ratio is information gain divided by split information; when the
    split information is not positive the ratio is reported as ``0.0``
    instead of dividing.
    """
    gain = info_gain(D, attr, target)
    split = split_info(D, attr)
    if split > 0:
        ratio = gain / split
    else:
        ratio = 0.0
    return ratio, gain, split

def majority(col):
    """Return the most frequent value in ``col``."""
    top = Counter(col).most_common(1)
    label, _count = top[0]
    return label

def c45(D, features, target="JobOffer"):
    """Recursively build a decision tree, choosing splits by gain ratio.

    Returns either a leaf (a single target label) or a nested dict of the
    form ``{attribute: {value: subtree_or_label}}``. Each time a split is
    chosen, the chosen attribute and the scores of all candidate attributes
    are printed.

    Candidates are ranked by gain ratio (descending), then information gain
    (descending), then attribute name (ascending).
    """
    labels = D[target]

    # Leaf: every remaining row has the same class.
    if len(np.unique(labels)) == 1:
        return labels.iloc[0]
    # Leaf: nothing left to split on, so fall back to the majority class.
    if not features:
        return majority(labels)

    ranked = []
    for name in features:
        ratio, gain, _split = gain_ratio(D, name, target)
        ranked.append((ratio, gain, name))
    ranked.sort(key=lambda item: (-item[0], -item[1], item[2]))
    best = ranked[0][2]

    print(f"\nSplit on: {best}")
    for gr, ig, f in sorted(ranked, key=lambda item: -item[0]):
        print(f"  GR({f})={gr:.4f}  IG={ig:.4f}")

    # The chosen attribute is not offered again further down this path.
    remaining = [x for x in features if x != best]
    branches = {}
    for v in sorted(D[best].unique()):
        subset = D[D[best] == v]
        if subset.empty:
            branches[v] = majority(labels)
        else:
            branches[v] = c45(subset, remaining, target)
    return {best: branches}

# Grow the tree over all four predictor columns and show the nested dict.
features = ["CGPA", "Interactive", "PracKnowledge", "CommSkill"]
tree = c45(df, features)
print("\nC4.5 Decision Tree:", tree, sep="\n")
def predict(sample, tree, default=None):
    """Classify ``sample`` by walking a nested-dict decision tree.

    Args:
        sample: Mapping from attribute name to the sample's value for it.
        tree: Either a leaf label, or a dict of the form
            ``{attribute: {value: subtree_or_label}}``.
        default: Value returned when the sample's value for the attribute
            being tested has no branch in the tree (or a node is an empty
            dict). Defaults to ``None``, which is what the function
            returned in that situation before this parameter existed.

    Returns:
        The label of the leaf that was reached, or ``default`` if the walk
        cannot continue.

    Raises:
        KeyError: If ``sample`` has no entry for an attribute the tree
            tests on the path taken.
    """
    node = tree
    # Anything that is not a dict is a leaf label.
    while isinstance(node, dict):
        if not node:
            # An empty node has no attribute to test.
            return default
        # Only the first key of a node is used as the attribute to test.
        attr = next(iter(node))
        branches = node[attr]
        value = sample[attr]
        if value not in branches:
            # Unseen attribute value: make the fallback explicit instead of
            # falling off the end of the function.
            return default
        node = branches[value]
    return node
# Classify one hand-written sample with the tree built above.
sample = dict(CGPA=">=9", Interactive="Y", PracKnowledge="AVG", CommSkill="P")
prediction = predict(sample, tree)
print("Prediction with Test Sample:", prediction)




Split on: CGPA
  GR(CGPA)=0.3658  IG=0.5568
  GR(CommSkill)=0.3503  IG=0.5203
  GR(PracKnowledge)=0.1648  IG=0.2448
  GR(Interactive)=0.0940  IG=0.0913

Split on: CommSkill
  GR(CommSkill)=0.5409  IG=0.8113
  GR(PracKnowledge)=0.5409  IG=0.8113
  GR(Interactive)=0.3113  IG=0.3113

C4.5 Decision Tree:
{'CGPA': {'<8': 'N', '>=8': 'Y', '>=9': {'CommSkill': {'G': 'Y', 'M': 'Y', 'P': 'N'}}}}
Prediction with Test Sample: N
