In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Load the dataset from its local CSV export, then preview the first rows
# and the exact column names as pandas read them.
dataset_path = r"C:\Users\Pratham\Downloads\Dataset_2 - Sheet1.csv"
df = pd.read_csv(dataset_path)
print(df.head())
print(df.columns.tolist())


In [None]:
# ----------------
# ENTROPY (IMPURITY MEASURE)
# ----------------
def entropy(labels):
    """Return the Shannon entropy, in bits, of the values in ``labels``.

    Every distinct value with relative frequency ``p`` contributes
    ``-p * log2(p)``. A single-valued input gives ``0.0``; an input with
    no elements gives ``0``.
    """
    _, frequencies = np.unique(labels, return_counts=True)
    shares = frequencies / frequencies.sum()

    total = 0
    # Accumulate one term at a time, in np.unique's sorted-value order.
    for share in shares:
        total = total - share * np.log2(share)
    return total

# ----------------
# TOTAL DATASET ENTROPY
# ----------------
# Entropy of the 'Team' label column over the whole dataset, before any split.
team_labels = df['Team']
total_entropy = entropy(team_labels)
print("Total Entropy:", total_entropy)

# ----------------
# INFORMATION GAIN (NUMERICAL)
# ----------------
def information_gain_numeric(df, feature, threshold, target='Team'):
    """Return the information gain of the binary split ``feature > threshold``.

    The gain is the entropy of the ``target`` column minus the size-weighted
    entropy of the two partitions produced by the split. Entropy is computed
    with the module-level ``entropy`` function.

    Args:
        df: DataFrame holding both the feature and the target column.
        feature: Name of the numeric column to split on.
        threshold: Rows with ``df[feature] > threshold`` form the left
            partition; every other row forms the right partition.
        target: Name of the label column. Defaults to ``'Team'``.

    Returns:
        The information gain of the split, or ``0.0`` when ``df`` has no
        rows.
    """
    n = len(df)
    if n == 0:
        # No rows to split; also avoids dividing by zero in the weights.
        return 0.0

    labels = df[target]
    above = df[feature] > threshold

    # Partition with the mask and its complement so every row lands on
    # exactly one side. A NaN feature value compares False and therefore
    # goes right, which keeps the two weights summing to 1.
    left = labels[above]
    right = labels[~above]

    weighted_entropy = (
        (len(left) / n) * entropy(left)
        + (len(right) / n) * entropy(right)
    )
    return entropy(labels) - weighted_entropy

# ----------------
# INFORMATION GAIN (CATEGORICAL)
# ----------------
def information_gain_categorical(df, feature, target='Team'):
    """Return the information gain of splitting ``df`` on a categorical column.

    The gain is the entropy of the ``target`` column minus the size-weighted
    entropy of the target within each distinct value of ``feature``. Entropy
    is computed with the module-level ``entropy`` function.

    Args:
        df: DataFrame holding both the feature and the target column.
        feature: Name of the categorical column to split on.
        target: Name of the label column. Defaults to ``'Team'``.

    Returns:
        The information gain of the split. With no rows there are no groups,
        so the result is the (zero) parent entropy.
    """
    n = len(df)
    weighted_entropy = 0

    # One grouping pass instead of one boolean-mask scan per category.
    # sort=False visits groups in order of first appearance; dropna=False
    # keeps rows with a missing feature value as their own group, so they
    # are not silently left out of the weighted sum.
    groups = df.groupby(feature, sort=False, dropna=False)[target]
    for _, group_labels in groups:
        weighted_entropy += (len(group_labels) / n) * entropy(group_labels)

    return entropy(df[target]) - weighted_entropy

# ----------------
# INFORMATION GAIN OF CANDIDATE SPLITS
# ----------------
# Score the three candidate splits: income at a fixed threshold, then the
# two categorical columns.
ig_income, ig_gender, ig_department = (
    information_gain_numeric(df, 'Income', 40000),
    information_gain_categorical(df, 'Gender'),
    information_gain_categorical(df, 'Department'),
)

# Report one gain per line under a common heading.
print("\nInformation Gain Values")
print("Income >", 40000, ":", ig_income)
print("Gender:", ig_gender)
print("Department:", ig_department)

# ----------------
# DECISION TREE LOGIC
# ----------------
def predict_team(row):
    """Classify one record as team ``'A'`` or ``'B'`` with fixed rules.

    ``row`` is indexed by the keys ``'Gender'`` and ``'Income'``. A gender of
    ``'F'`` yields ``'A'`` without looking at income; otherwise the result is
    ``'A'`` when income is strictly greater than 40000 and ``'B'`` if not.
    """
    # First rule decides on gender alone; income is not read in this case.
    if row['Gender'] == 'F':
        return 'A'
    return 'A' if row['Income'] > 40000 else 'B'


# ----------------
# APPLY DECISION TREE
# ----------------
# Run the rule-based tree on every row and store its output next to the data.
df['Predicted_Team'] = df.apply(predict_team, axis=1)

# ----------------
# ACCURACY
# ----------------
# Fraction of rows whose prediction equals the recorded 'Team'.
is_correct = df['Team'] == df['Predicted_Team']
accuracy = is_correct.mean()
print("\nAccuracy:", accuracy)

# ----------------
# SHOW FINAL RESULTS
# ----------------
# Inputs, recorded team and predicted team side by side.
report_columns = ['Employee ID ', 'Income', 'Gender', 'Department', 'Team', 'Predicted_Team']
print("\nFinal Predictions:")
print(df[report_columns])

# ----------------
# SHOW MISCLASSIFIED (IF ANY)
# ----------------
# Rows where the prediction differs from the recorded team.
print("\nMisclassified Rows:")
print(df[~is_correct])
