In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Read the sample car dataset (samplecar.csv) into a DataFrame and display it
data = pd.read_csv(filepath_or_buffer='samplecar.csv')
data

Unnamed: 0,Type,Price,Buy
0,SUV,H,No
1,HB,H,No
2,HB,H,Yes
3,HB,L,No
4,SEDAN,H,Yes
5,SEDAN,L,Yes
6,SUV,L,No
7,SEDAN,H,Yes
8,HB,L,Yes
9,SUV,Hot,Yes


We see that there is an error in the dataset: one row has the price value "Hot", while every other high-priced row uses "H" (High). Let's replace that incorrect value with the correct one.

# Sir, can we discuss the Find-S Algorithm and the Candidate Elimination Algorithm for this one, to replace "Hot" with "H"?

### This time, I've set it manually. 

In [3]:
# Map the mistyped label "Hot" onto the intended label "H" wherever it occurs,
# then display the corrected frame
data = data.replace({"Hot": "H"})
data

Unnamed: 0,Type,Price,Buy
0,SUV,H,No
1,HB,H,No
2,HB,H,Yes
3,HB,L,No
4,SEDAN,H,Yes
5,SEDAN,L,Yes
6,SUV,L,No
7,SEDAN,H,Yes
8,HB,L,Yes
9,SUV,H,Yes


In [4]:
# Attribute columns only: drop the class label and the leftover CSV row-index
# column ('Unnamed: 0'). The row index is an identifier, not a predictive
# attribute, so it must not be treated as a feature.
# errors='ignore' keeps this working if the index column is absent.
features = data.drop(columns=['Buy', 'Unnamed: 0'], errors='ignore')

# Class label, kept as a one-column DataFrame
target = data[['Buy']]

# Sanity check: every sample must have exactly one label
assert features.shape[0] == target.shape[0], " len(features) == len(target)"

In [5]:
# Total number of samples in the dataset
tot = data.shape[0]

# The target has two classes ('No' and 'Yes'); tally the samples in each
buy_counts = data['Buy'].value_counts()
tot_n = int(buy_counts.get('No', 0))
tot_y = int(buy_counts.get('Yes', 0))

In [6]:
# Prior probability of each class
prob_y, prob_n = tot_y / tot, tot_n / tot

# Entropy(S) = -p_yes*log2(p_yes) - p_no*log2(p_no): total entropy of the dataset
total_term_y = prob_y * np.log2(prob_y)
total_term_n = prob_n * np.log2(prob_n)
e_tot = -total_term_y - total_term_n
e_tot

0.9709505944546686

# Let us now go through each attribute of the dataset and calculate the average information we get from it. There are two attributes: Car Type and Car Price.


## 1. For Car Type

In [7]:
# Distinct car types present in the dataset
data.Type.unique()

array(['SUV', 'HB', 'SEDAN'], dtype=object)

### There are three types of cars : SUV, HB, SEDAN

#### 1.1. Entropy and Probability

For SUV, there are 3 total samples in the dataset. Where yes=1 and no=2

In [8]:
# Rows describing SUVs, and how many of them there are
suv_rows = data.loc[data['Type'] == 'SUV']
tot_suv = len(suv_rows)

# Share of SUV samples in each class
p_suv_yes = len(suv_rows.loc[suv_rows['Buy'] == 'Yes']) / tot_suv
p_suv_no = len(suv_rows.loc[suv_rows['Buy'] == 'No']) / tot_suv

# Entropy of the SUV subset
suv_term_yes = p_suv_yes * np.log2(p_suv_yes)
suv_term_no = p_suv_no * np.log2(p_suv_no)
e_suv = -suv_term_yes - suv_term_no

# Probability of SUV in the dataset
p_suv = tot_suv / tot

print(f"Entropy : {e_suv}\nProbability : {p_suv}")

Entropy : 0.9182958340544896
Probability : 0.3


For HB, there are 4 total samples in the dataset. Where yes=2 and no=2

In [9]:
# Boolean mask selecting the hatchback (HB) rows
is_hb = data['Type'] == 'HB'
tot_HB = int(is_hb.sum())

# Share of HB samples in each class
p_HB_y = int((is_hb & (data['Buy'] == 'Yes')).sum()) / tot_HB
p_HB_n = int((is_hb & (data['Buy'] == 'No')).sum()) / tot_HB

# Entropy of the HB subset
hb_term_y = p_HB_y * np.log2(p_HB_y)
hb_term_n = p_HB_n * np.log2(p_HB_n)
e_hb = -hb_term_y - hb_term_n

# Probability of HB in the dataset
p_hb = tot_HB / tot

print(f"Entropy : {e_hb}\nProbability : {p_hb}")

Entropy : 1.0
Probability : 0.4


For Sedan, there are 3 total samples in the dataset. Where yes=3 and no=0

In [10]:
# Number of Sedan samples and the share of them in each class
tot_sed = data[data['Type'] == 'SEDAN'].shape[0]
p_sed_y = len(data[(data.Type == 'SEDAN') & (data.Buy == 'Yes')])/tot_sed
p_sed_n = len(data[(data.Type == 'SEDAN') & (data.Buy == 'No')])/tot_sed

#calculating entropy
# Sum over BOTH classes ('Yes' and 'No'). A class with probability 0 is
# skipped: by convention 0*log2(0) = 0, whereas NumPy would evaluate it as
# 0 * -inf = nan (with a RuntimeWarning) and poison every later result.
e_sed = -sum(p*np.log2(p) for p in (p_sed_y, p_sed_n) if p > 0)

#probability of Sedan in dataset
p_sed = tot_sed/tot

print(f"Entropy : {e_sed}\nProbability : {p_sed}")

Entropy : -0.0
Probability : 0.3


#### 1.2. Average Information

In [11]:
# Average information of Type: each subset's entropy weighted by its probability
weighted_suv = p_suv * e_suv
weighted_hb = p_hb * e_hb
weighted_sed = p_sed * e_sed
I_cartype = weighted_suv + weighted_hb + weighted_sed
print(f"Average Information for the Cartype\nI(cartype) = {I_cartype}")

Average Information for the Cartype
I(cartype) = 0.6754887502163469


## 2. For Car price


In [12]:
# Distinct price categories present in the dataset
data.Price.unique()

array(['H', 'L'], dtype=object)

### There are two categories for the price, High or Low. 

#### 2.1. Entropy and Probability

For <b>High</b>, there are 6 total samples, where yes=4 and no=2

In [13]:
# 'Buy' labels of the high-priced rows, and how many such rows there are
high_buy = data.loc[data['Price'] == 'H', 'Buy']
tot_high = len(high_buy)

# Share of high-priced samples in each class
p_high_y = int((high_buy == 'Yes').sum()) / tot_high
p_high_n = int((high_buy == 'No').sum()) / tot_high

# Entropy of the high-price subset
high_term_y = p_high_y * np.log2(p_high_y)
high_term_n = p_high_n * np.log2(p_high_n)
e_high = -high_term_y - high_term_n

# Probability of High in the dataset
p_high = tot_high / tot

print(f"Entropy : {e_high}\nProbability : {p_high}")

Entropy : 0.9182958340544896
Probability : 0.6


For <b>Low</b>, there are 4 total samples, where yes=2 and no=2

In [14]:
# Rows with a low price, and how many of them there are
low_rows = data[data['Price'] == 'L']
tot_low = low_rows.shape[0]

# Share of low-priced samples in each class
p_low_y = low_rows[low_rows['Buy'] == 'Yes'].shape[0] / tot_low
p_low_n = low_rows[low_rows['Buy'] == 'No'].shape[0] / tot_low

# Entropy of the low-price subset
low_term_y = p_low_y * np.log2(p_low_y)
low_term_n = p_low_n * np.log2(p_low_n)
e_low = -low_term_y - low_term_n

# Probability of Low in the dataset
p_low = tot_low / tot

print(f"Entropy : {e_low}\nProbability : {p_low}")

Entropy : 1.0
Probability : 0.4


#### 2.2. Average Information

In [15]:
# Average information of Price: each subset's entropy weighted by its probability
weighted_high = p_high * e_high
weighted_low = p_low * e_low
I_price = weighted_high + weighted_low
print(f"Average Information for the Price\nI(price) = {I_price}")

Average Information for the Price
I(price) = 0.9509775004326937


# Information Gain.

## We calculate the information gain of each attribute in the dataset: the difference between the entropy of the whole dataset, Entropy(S), and the average information of that attribute.

## 1. For Car Type

In [16]:
# Information gain of Type = Entropy(S) - I(cartype)
gain_att1 = e_tot - I_cartype
gain_att1_report = f"Information Gain of Cartype : {gain_att1}"
print(gain_att1_report)

Information Gain of Cartype : 0.29546184423832167


## 2. For Car Price

In [17]:
# Information gain of Price = Entropy(S) - I(price)
gain_att2 = e_tot - I_price
gain_att2_report = f"Information Gain of Price : {gain_att2}"
print(gain_att2_report)

Information Gain of Price : 0.01997309402197489


# Since Car Type has a higher information gain than Price, it can be used as the root node of our tree.