## Question 1

#### Sri Lakshmi Swetha Addepalli 
#### FSUID: sa23m 
#### Data Mining 
#### Homework 2 - Calculations

In [1]:
import numpy as np

def entropy(labels):
    """Compute the Shannon entropy (base 2) of a collection of class labels.

    Parameters
    ----------
    labels : array-like
        One class label per sample. Any label type that ``np.unique`` can
        sort is accepted (ints, floats, strings); labels do not need to be
        the contiguous integers 0..k-1.

    Returns
    -------
    float
        Entropy in bits. 0.0 when there are fewer than two samples or when
        only one distinct class is present.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0.0
    # np.unique only reports classes that actually occur, so no probability
    # is ever zero and 0 * log2(0) (which evaluates to NaN) cannot appear.
    _, counts = np.unique(labels, return_counts=True)
    if counts.size <= 1:
        return 0.0
    probs = counts / n_labels
    return -np.sum(probs * np.log2(probs))

def information_gain(data, split_attribute_name, target_name):
    """Compute the information gain obtained by splitting ``data`` on one attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset; must contain both ``split_attribute_name`` and
        ``target_name`` as columns.
    split_attribute_name : str
        Name of the feature whose information gain is calculated.
    target_name : str
        Name of the target (class) column. Entropy is measured on this column.

    Returns
    -------
    float
        Entropy of the target over the whole dataset minus the size-weighted
        entropy of the target within each partition of the split attribute.
    """
    # Entropy of the target before splitting.
    total_entropy = entropy(data[target_name])

    # Distinct values of the split attribute and how many rows carry each.
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    n_rows = np.sum(counts)

    # Select each partition's target values with a boolean mask. Unlike
    # data.where(...).dropna(), this keeps rows that have missing values in
    # unrelated columns and leaves the target column's dtype untouched.
    weighted_entropy = np.sum([
        (count / n_rows)
        * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    ])

    return total_entropy - weighted_entropy

# Convert your dataset into a pandas DataFrame
import pandas as pd

# Toy lung-cancer dataset: one row per patient, one column per attribute.
data = pd.DataFrame({
    'Tobacco': ['Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No'],
    'Radon': ['Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No'],
    'Chronic Cough': ['Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No'],
    'Weight Loss': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes'],
    'Lung Cancer': ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'No']
})

# Encode the target column as integers ('Yes' -> 1, 'No' -> 0).
data = data.assign(**{'Lung Cancer': data['Lung Cancer'].map({'Yes': 1, 'No': 0})})

# Report the information gain of every candidate attribute at the root.
for attribute in ('Tobacco', 'Radon', 'Chronic Cough', 'Weight Loss'):
    gain = information_gain(data, attribute, 'Lung Cancer')
    print(f"Information gain for {attribute}: {gain}")

Information gain for Tobacco: 0.2780719051126377
Information gain for Radon: 0.2364527976600279
Information gain for Chronic Cough: 0.034851554559677034
Information gain for Weight Loss: 0.02904940554533142


In [2]:
# Second level: partition the records on 'Tobacco'.
data_tobacco_yes = data.loc[data['Tobacco'] == 'Yes']
data_tobacco_no = data.loc[data['Tobacco'] == 'No']

# Third level for 'Tobacco' = 'Yes': 'Chronic Cough' has the highest
# information gain in that subset, so it is the next split.
data_tobacco_yes_cough_yes = data_tobacco_yes.loc[data_tobacco_yes['Chronic Cough'] == 'Yes']
data_tobacco_yes_cough_no = data_tobacco_yes.loc[data_tobacco_yes['Chronic Cough'] == 'No']

# Third level for 'Tobacco' = 'No': 'Radon' has the highest information
# gain in that subset, so it is the next split.
data_tobacco_no_radon_yes = data_tobacco_no.loc[data_tobacco_no['Radon'] == 'Yes']
data_tobacco_no_radon_no = data_tobacco_no.loc[data_tobacco_no['Radon'] == 'No']

# One entry per report, in print order:
# (heading, subset to evaluate, attributes still available for splitting).
gain_reports = [
    ("Information gain for 'Tobacco' = 'Yes' subset:",
     data_tobacco_yes, ['Radon', 'Chronic Cough', 'Weight Loss']),
    ("\nInformation gain for 'Tobacco' = 'No' subset:",
     data_tobacco_no, ['Radon', 'Chronic Cough', 'Weight Loss']),
    ("\nInformation gain for 'Tobacco' = 'Yes' and 'Chronic Cough' = 'Yes' subset:",
     data_tobacco_yes_cough_yes, ['Radon', 'Weight Loss']),
    ("\nInformation gain for 'Tobacco' = 'Yes' and 'Chronic Cough' = 'No' subset:",
     data_tobacco_yes_cough_no, ['Radon', 'Weight Loss']),
    ("\nInformation gain for 'Tobacco' = 'No' and 'Radon' = 'Yes' subset:",
     data_tobacco_no_radon_yes, ['Chronic Cough', 'Weight Loss']),
    ("\nInformation gain for 'Tobacco' = 'No' and 'Radon' = 'No' subset:",
     data_tobacco_no_radon_no, ['Chronic Cough', 'Weight Loss']),
]

for heading, subset, candidates in gain_reports:
    print(heading)
    for attribute in candidates:
        print(f"Information gain for {attribute}: {information_gain(subset, attribute, 'Lung Cancer')}")


Information gain for 'Tobacco' = 'Yes' subset:
Information gain for Radon: 0.07290559532005603
Information gain for Chronic Cough: 0.7219280948873623
Information gain for Weight Loss: 0.17095059445466865

Information gain for 'Tobacco' = 'No' subset:
Information gain for Radon: 0.7219280948873623
Information gain for Chronic Cough: 0.3219280948873623
Information gain for Weight Loss: 0.17095059445466865

Information gain for 'Tobacco' = 'Yes' and 'Chronic Cough' = 'Yes' subset:
Information gain for Radon: 0.0
Information gain for Weight Loss: 0.0

Information gain for 'Tobacco' = 'Yes' and 'Chronic Cough' = 'No' subset:
Information gain for Radon: 0.0
Information gain for Weight Loss: 0.0

Information gain for 'Tobacco' = 'No' and 'Radon' = 'Yes' subset:
Information gain for Chronic Cough: 0.0
Information gain for Weight Loss: 0.0

Information gain for 'Tobacco' = 'No' and 'Radon' = 'No' subset:
Information gain for Chronic Cough: 0.0
Information gain for Weight Loss: 0.0


## Question 2

### 2 A

In [19]:
import math as m
def log(x):
    """Return the base-2 logarithm of ``x``, or 0 when ``x`` is outside the domain.

    Returning 0 for ``x <= 0`` lets callers write ``p * log(p)`` for a
    probability ``p == 0`` and get 0, the usual convention in entropy sums.

    Parameters
    ----------
    x : int or float
        Value to take the logarithm of.

    Returns
    -------
    float or int
        ``math.log(x, 2)`` for ``x > 0``; the integer 0 for ``x <= 0``.

    Raises
    ------
    TypeError
        If ``x`` is not a real number. This is no longer silently turned into 0.
    """
    try:
        return m.log(x, 2)
    except ValueError:
        # math.log raises ValueError ("math domain error") for x <= 0.
        return 0

In [20]:
def entropy(l):
    """Return the base-2 entropy, -sum(p * log(p)), of the probabilities in ``l``.

    Parameters
    ----------
    l : iterable of numbers
        Class probabilities. They are used as given; nothing checks that
        they sum to 1.

    Returns
    -------
    float or int
        The entropy in bits (the integer 0 for an empty iterable).

    NOTE(review): this rebinds the name ``entropy`` that ``information_gain``
    (defined earlier in the notebook) looks up at call time. Re-running the
    information-gain cells after this cell will use this probability-based
    version instead of the label-based one.
    """
    entr = 0
    for p in l:
        # Subtract each term instead of negating the final sum, so a pure
        # distribution yields 0.0 rather than -0.0.
        entr -= p * log(p)
    return entr

In [21]:
# Entropy (base 2) of the parent node, whose three class proportions are 0.41, 0.46 and 0.13.
parent_entropy=-(0.41*log(0.41)+0.46*log(0.46)+0.13*log(0.13))
parent_entropy

1.4253642047367425

### 2 B

In [22]:
# Entropy of a two-class node with proportions 0.8 and 0.2.
# NOTE(review): presumably the left-hand side of the x = 0.2 split; confirm against the figure.
x_2_lh=-(0.8*log(0.8) + 0.2*log(0.2))
x_2_lh

0.7219280948873623

In [23]:
# Class proportions on the other side of the split: each class mass is divided by 0.8,
# the same value this side is weighted by in the weighted entropy.
p_a_rh=0.41/0.8
p_b_rh=0.3/0.8
p_c_rh=0.09/0.8
# Entropy of that three-class node.
x_2_rh=-(p_a_rh*log(p_a_rh) + p_b_rh * log(p_b_rh) + p_c_rh * log(p_c_rh))
x_2_rh

1.3794821565051398

In [24]:
# Weighted average of the two child entropies (weights 0.2 and 0.8).
weighted_x_2=0.2*x_2_lh + 0.8 * x_2_rh

In [25]:
weighted_x_2

1.2479713441815843

In [26]:
# Information gain = parent entropy minus the weighted child entropy.
Info_gain_x2=parent_entropy- weighted_x_2
Info_gain_x2

0.17739286055515824

In [27]:
# For x<=0.7
# Each class proportion is a sum of width*height products divided by 0.7.
# NOTE(review): the products are presumably rectangle areas read off the figure; confirm.
p_a_lh = (0.5*0.4) / 0.7
p_b_lh = (0.7*0.6 + 0.2*0.2 ) / 0.7
p_c_lh=(0.2*0.2)/0.7
# Sanity check: p_a_lh + p_b_lh + p_c_lh should equal 1.
en_x_7_lh=-(p_a_lh*log(p_a_lh) + p_b_lh*log(p_b_lh) + p_c_lh*log(p_c_lh))
en_x_7_lh

1.1503914187111117

In [28]:
# For x>0.7
# NOTE: this reuses the names p_a_rh, p_b_rh and p_c_rh from an earlier cell and overwrites them.
p_a_rh=(0.3*0.4 + 0.3*.3)/0.3
p_b_rh=0  # no class-b mass on this side; log(0) returns 0, so its term contributes nothing
p_c_rh=(.3*.3)/.3
en_x_7_rh=-(p_a_rh*log(p_a_rh) + p_b_rh*log(p_b_rh) + p_c_rh*log(p_c_rh))
en_x_7_rh

0.8812908992306927

In [29]:
# Weighted average of the two child entropies (weights 0.7 and 0.3).
weighted_x_7=0.7*en_x_7_lh + 0.3*en_x_7_rh
weighted_x_7

1.069661262866986

In [30]:
# Information gain = parent entropy minus the weighted child entropy.
Info_gain_x7=parent_entropy- weighted_x_7
Info_gain_x7

0.3557029418697566

In [31]:
# For y<=0.6
# area_ly6 is the total mass of this side: three width*height terms, one per class (a, b, c in order).
area_ly6=0.3*0.3  + 0.6*.7 + .3*.3
# Each class proportion is that class's term divided by the total.
p_a_lh=(.3*.3) / area_ly6
p_b_lh=(0.6*.7)/area_ly6
p_c_lh=(0.3*.3)/area_ly6
en_y_6_lh=-(p_a_lh*log(p_a_lh) + p_b_lh*log(p_b_lh) + p_c_lh*log(p_c_lh))
en_y_6_lh

1.1812908992306925

In [32]:
# For y >0.6
# area_ry6 is the total mass of this side: three width*height terms, one per class (a, b, c in order).
area_ry6= .4*.8+.2*.2 + .2*.2
# Each class proportion is that class's term divided by the total.
p_a_rh=(.4*.8) / area_ry6
p_b_rh=(.2*.2)/area_ry6
p_c_rh=(.2*.2)/area_ry6
en_y_6_rh=-(p_a_rh*log(p_a_rh) + p_b_rh*log(p_b_rh) + p_c_rh*log(p_c_rh))
en_y_6_rh

0.9219280948873625

In [33]:
# Weight each side's entropy by its total mass (area_ly6 and area_ry6).
weighted_y_6=area_ly6*en_y_6_lh + area_ry6*en_y_6_rh
weighted_y_6

1.0775457774933606

In [34]:
# Information gain = parent entropy minus the weighted child entropy.
Info_gain_y6=parent_entropy- weighted_y_6
Info_gain_y6

0.34781842724338197

## Question 3

In [35]:
def gini(l):
    """Return the Gini impurity, 1 - sum(p**2), of the class probabilities in ``l``.

    The probabilities are used as given; nothing checks that they sum to 1.
    An empty ``l`` yields 1.
    """
    squared_terms = [p ** 2 for p in l]
    sum_of_squares = 0
    for term in squared_terms:
        sum_of_squares += term
    return 1 - sum_of_squares

#### Question A

In [67]:
from collections import Counter

# Define the dataset: 20 customer records, each a dict of attribute values plus its "Class" label.
# NOTE: this rebinds the name `data`, which earlier in the notebook held the lung-cancer DataFrame.
data = [
    {"Customer ID": 1, "Gender": "M", "Car Type": "Family", "Shirt Size": "Small", "Class": "C0"},
    {"Customer ID": 2, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Medium", "Class": "C0"},
    {"Customer ID": 3, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Medium", "Class": "C0"},
    {"Customer ID": 4, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Large", "Class": "C0"},
    {"Customer ID": 5, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Extra Large", "Class": "C0"},
    {"Customer ID": 6, "Gender": "M", "Car Type": "Sports", "Shirt Size": "Extra Large", "Class": "C0"},
    {"Customer ID": 7, "Gender": "F", "Car Type": "Sports", "Shirt Size": "Small", "Class": "C0"},
    {"Customer ID": 8, "Gender": "F", "Car Type": "Sports", "Shirt Size": "Small", "Class": "C0"},
    {"Customer ID": 9, "Gender": "F", "Car Type": "Sports", "Shirt Size": "Medium", "Class": "C0"},
    {"Customer ID": 10, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Large", "Class": "C0"},
    {"Customer ID": 11, "Gender": "M", "Car Type": "Family", "Shirt Size": "Large", "Class": "C1"},
    {"Customer ID": 12, "Gender": "M", "Car Type": "Family", "Shirt Size": "Extra Large", "Class": "C1"},
    {"Customer ID": 13, "Gender": "M", "Car Type": "Family", "Shirt Size": "Medium", "Class": "C1"},
    {"Customer ID": 14, "Gender": "M", "Car Type": "Luxury", "Shirt Size": "Extra Large", "Class": "C1"},
    {"Customer ID": 15, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Small", "Class": "C1"},
    {"Customer ID": 16, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Small", "Class": "C1"},
    {"Customer ID": 17, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Medium", "Class": "C1"},
    {"Customer ID": 18, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Medium", "Class": "C1"},
    {"Customer ID": 19, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Medium", "Class": "C1"},
    {"Customer ID": 20, "Gender": "F", "Car Type": "Luxury", "Shirt Size": "Large", "Class": "C1"}
]

# Function to compute Gini index
def gini_index(labels):
    """Return the Gini index of a sequence of class labels.

    Starts from 1.0 and subtracts the squared relative frequency of each
    distinct label. An empty ``labels`` yields 1.0.
    """
    n_samples = len(labels)
    impurity = 1.0
    for occurrences in Counter(labels).values():
        impurity -= (occurrences / n_samples) ** 2
    return impurity

# Overall Gini index: impurity of the class labels across the whole collection.
class_labels = [record["Class"] for record in data]
overall_gini = gini_index(class_labels)
print("Overall Gini index:", overall_gini)


Overall Gini index: 0.5


#### Question B

In [68]:
# Customer ID is unique per record, so every partition holds one record whose class has probability 1:
# p_c1=p_c2=p_c3=p_c4=...=p_c20= 1
gini([1])
# so for every customer the gini impurity = 0

0

In [69]:
# Weighted Gini index terms: each of the 20 partitions has weight 1/20 and impurity 0.
id_wise=[(1/20)*0]*20
id_wise

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [70]:
# Sum the per-partition terms to get the weighted Gini index of the Customer ID split.
overall_weighted_gini_id=np.sum(id_wise)
overall_weighted_gini_id

0.0

#### Question C

In [71]:
# Class proportions within each gender group (10 males, 10 females in the dataset).
# gm_* = male, gf_* = female; *_co = class C0, *_c1 = class C1.
gm_co=6/10
gm_c1=4/10
gf_co=4/10
gf_c1=6/10
# Gini impurity of each gender group.
gini_gm=gini([gm_co,gm_c1])
gini_gf=gini([gf_co,gf_c1])

In [72]:
# Weighted Gini index of the Gender split: each gender holds 10 of the 20 records.
# Computed explicitly instead of reusing gini_gm, so the result stays correct
# if the two groups ever differ in size or impurity.
weighted_gini=(10/20)*gini_gm + (10/20)*gini_gf

#### Question D

In [73]:
# Class proportions per car type: cf = Family (4 records), cs = Sports (8), cl = Luxury (8).
# *_co = class C0, *_c1 = class C1.
cf_co=1/4
cf_c1=3/4
cs_co=8/8
cs_c1=0
cl_co=1/8
cl_c1=7/8
# Gini impurity of each car-type group.
gini_cf=gini([cf_co,cf_c1])
gini_cs=gini([cs_co,cs_c1])
gini_cl=gini([cl_co,cl_c1])

In [74]:
# Weight each car type's impurity by its share of the 20 records (4, 8 and 8).
weighted_gini_car=(4/20)*gini_cf + gini_cs*(8/20) + gini_cl*(8/20)
weighted_gini_car

0.16250000000000003

#### Question E

In [44]:
# Class proportions per shirt size: ss = Small (5 records), sm = Medium (7),
# sl = Large (4), sxl = Extra Large (4). *_co = class C0, *_c1 = class C1.
ss_co=3/5
ss_c1=2/5
sm_co=3/7
sm_c1=4/7
sl_co=2/4
sl_c1=2/4
sxl_co=2/4
sxl_c1=2/4
# Gini impurity of each shirt-size group.
gini_ss=gini([ss_co,ss_c1])
gini_sm=gini([sm_co,sm_c1])
gini_sl=gini([sl_co,sl_c1])
gini_sxl=gini([sxl_co,sxl_c1])

In [45]:
# Weight each shirt size's impurity by its share of the 20 records (5, 7, 4 and 4).
weighted_gini_shirt=(5/20)*gini_ss + gini_sm*(7/20) + gini_sl*(4/20) + gini_sxl*(4/20)
weighted_gini_shirt

0.49142857142857144

## Question 6

In [46]:
#x>0.5
# The class proportions passed in are 1 and 0, i.e. this side is treated as pure.
above_x5=gini([1,0])
above_x5
# x<0.5 is handled in the next cell

0

In [47]:
# x<0.5: two-class proportions 0.26/0.5 and 0.24/0.5 (each class mass divided by 0.5).
less_x5=gini([0.26/0.5,0.24/0.5])
less_x5

0.4992

In [48]:
# Weighted Gini index of the x = 0.5 split; each side has weight 0.5.
weighted_gini_index=0.5*less_x5 + 0.5*above_x5
weighted_gini_index

0.2496

### y=0.4

In [49]:
# y >0.4
# Two-class proportions 0.36/0.6 and 0.24/0.6 (each class mass divided by 0.6).
above_y4=gini([0.36/0.6,0.24/0.6])
above_y4

0.48

In [50]:
# Other side of the y = 0.4 split: a single class with probability 1, so its impurity is 0.
less_y4=gini([1])
# Weighted Gini index of the split (weights 0.4 and 0.6).
weighted_gini_index_y4=0.4*less_y4 + 0.6*above_y4
weighted_gini_index_y4

0.288

### y=0.7

In [51]:
# One side of the y = 0.7 split: two-class proportions 0.21/0.3 and 0.09/0.3.
# NOTE(review): presumably the y > 0.7 side, going by the variable name; confirm against the figure.
above_y7=gini([0.21/0.3,0.09/0.3])
above_y7

0.42000000000000004

In [52]:
# Other side of the y = 0.7 split: two-class proportions 0.55/0.7 and 0.15/0.7.
less_y7=gini([0.55/0.7,0.15/0.7])
less_y7

0.33673469387755084

In [53]:
# Weighted Gini index of the y = 0.7 split (weights 0.7 and 0.3).
weighted_gini_y7=0.7*less_y7 + 0.3*above_y7
weighted_gini_y7

0.36171428571428554

### x = 0.2

In [54]:
# One side of the x = 0.2 split: two-class proportions 0.62/0.8 and 0.18/0.8.
# NOTE(review): presumably the x > 0.2 side, going by the variable name; confirm against the figure.
above_x2=gini([0.62/0.8,0.18/0.8])
above_x2

0.3487500000000001

In [55]:
# Other side of the x = 0.2 split: two-class proportions 0.14/0.2 and 0.06/0.2.
less_x2=gini([0.14/0.2,0.06/0.2])
less_x2

0.41999999999999993

In [56]:
# Weighted Gini index of the x = 0.2 split (weights 0.2 and 0.8).
weighted_gini_x2=0.2*less_x2 + 0.8*above_x2
weighted_gini_x2

0.3630000000000001

## Question 7

In [57]:
# Parent node: positive and negative classes each make up 10/20 of the records.
pos=10/20
neg=10/20
# Entropy of the parent node, using the probability-based entropy() helper.
parent_ent=entropy([pos,neg])
parent_ent

1.0

### 7 A

In [58]:
import numpy as np

In [59]:
# entropy([1]*20) accumulates the term 1*log(1) = 0 twenty times, so the result is zero.
# NOTE(review): [1]*20 is not a probability distribution. It presumably stands for
# 20 single-record (pure) partitions, each with entropy 0; confirm the intent.
ent=entropy([1]*20)
# np.absolute guards against a negative-zero result before the subtraction.
Info_gain_A=parent_ent-np.absolute(ent)
Info_gain_A

1.0

### 7 B

In [60]:
# Class proportions in each branch of the binary split.
lhand_pos=9/10
lhand_neg=1/10
rhand_pos=1/10
rhand_neg=9/10
# Entropy of each branch.
l_hand_entr=entropy([lhand_pos,lhand_neg])
r_hand_entr=entropy([rhand_pos,rhand_neg])
# Weighted average of the branch entropies; each branch holds 10 of the 20 records.
# The previous chained assignment (weighted_entr = l_hand_entr = r_hand_entr)
# overwrote l_hand_entr and was only right because both branches happen to
# have the same entropy.
weighted_entr=(10/20)*l_hand_entr + (10/20)*r_hand_entr

In [61]:
# Information gain = parent entropy minus the weighted branch entropy.
Info_gain_B=parent_ent - weighted_entr
Info_gain_B

0.5310044064107188

In [62]:
def splitInfo(li):
    """Return the split information, -sum(f * log(f)), of the partition fractions in ``li``.

    ``li`` holds the fraction of records that falls into each partition of a split.
    """
    weighted_logs = [fraction * log(fraction) for fraction in li]
    total = 0
    for term in weighted_logs:
        total += term
    return -total

### 7 D

In [63]:
# Gain ratio = information gain / split information.
# The split information here is for 20 partitions, each holding 1/20 of the records.
Gain_ratio_D=Info_gain_A/splitInfo([1/20]*20)
Gain_ratio_D

0.23137821315975915

### 7 E

In [64]:
# Gain ratio = information gain / split information.
# The split information here is for two partitions, each holding 10/20 of the records.
Gain_ratio_E=Info_gain_B/splitInfo([10/20,10/20])
Gain_ratio_E

0.5310044064107188