### Problem Statement

- We have a sample of 50 people with three variables Gender (M/F), employment status( Student/ Working) and Age (years)
- Some of these 50 are planning to watch the movie.
- Now, we want to create a model to predict who will watch the movie? In this problem, we need to segregate the sample into who will watch the movie based on highly significant input variable among all three
- For the sake of simplicity, age feature is conveted into bins of >28 and <28

In [1]:
import pandas as pd
import numpy as np
films = pd.read_excel('films.xlsx', sheetname='films')
print(films.head())
print(films.shape)

   # gender  is_28+ employment_status watching
0  1      M       0           student      yes
1  2      M       1           working      yes
2  3      F       0           working      yes
3  4      F       0           student       no
4  5      M       1           working      yes
(50, 5)


### __I. Gini Index__

In [2]:
print("Viewers who watched the movie:{}".format(len(films[films['watching'] == 'yes'])))
print("Viewers who did not watch the movie:{}".format(len(films[films['watching'] == 'no'])))

Viewers who watched the movie:26
Viewers who did not watch the movie:24


__SPLIT BASED ON GENDER__

In [3]:
crosstab1 = pd.crosstab(index=films["watching"], columns=films["gender"])
crosstab1

gender,F,M
watching,Unnamed: 1_level_1,Unnamed: 2_level_1
no,8,16
yes,14,12


In [4]:
male_watched_yes = (12/float(28))
female_watched_yes = (14/float(22))

print("Probability of males that watched Dunkirk:{:.3f}".format(male_watched_yes))
print("Probability of females that watched Dunkirk:{:.3f}".format(female_watched_yes))

Probability of males that watched Dunkirk:0.429
Probability of females that watched Dunkirk:0.636


In [5]:
subnode_male = 1 - ((male_watched_yes)**2 + (1-male_watched_yes)**2)
subnode_female =1-( (female_watched_yes)**2 + (1-female_watched_yes)**2)

print("Gini(female):{:.3f}".format(subnode_female))
print("Gini(male):{:.3f}".format(subnode_male))

Gini(female):0.463
Gini(male):0.490


In [6]:
# Weighted Gini Index Calculation for Gender Split
calculated_wt_gender = (28/float(50))*subnode_male + (22/float(50))*subnode_female
print("Weighted Gini for Gender:{:.4f}".format(calculated_wt_gender))

Weighted Gini for Gender:0.4779


__SPLIT BASED ON EMPLOYMENT__

In [7]:
crosstab2 = pd.crosstab(index=films["watching"], columns=films["employment_status"])
crosstab2

employment_status,student,working
watching,Unnamed: 1_level_1,Unnamed: 2_level_1
no,5,19
yes,4,22


In [8]:
student_watched_yes = (4/float(9))
working_watched_yes = (22/float(41))
print("Probability of students that watched:{:.3f}".format(student_watched_yes))
print("Probability of working people that watched:{:.3f}".format(working_watched_yes))

Probability of students that watched:0.444
Probability of working people that watched:0.537


In [9]:
subnode_student =1-( (student_watched_yes)**2 + (1 - student_watched_yes)**2)
subnode_working = 1-((working_watched_yes)**2 + (1 - working_watched_yes)**2)

print("Gini(student):{:.3f}".format(subnode_student))
print("Gini(working):{:.3f}".format(subnode_working))

Gini(student):0.494
Gini(working):0.497


In [10]:
#Weighted Gini Index for Employment Split
calculated_wt_emp = (41/float(50))*subnode_working + (9/float(50))*subnode_student
print("Weighted Gini(employment):{:.4f}".format(calculated_wt_emp))

Weighted Gini(employment):0.4967


__SPLIT BASED ON AGE__

In [11]:
crosstab3 = pd.crosstab(index=films["watching"], columns=films["is_28+"])
crosstab3

is_28+,0,1
watching,Unnamed: 1_level_1,Unnamed: 2_level_1
no,11,13
yes,17,9


In [12]:
people_younger_than_28_watched_yes = (17/float(28))
people_older_than_28_watched_yes = (9/float(22))
print("Probability of people_younger_than_28_watched:{:.3f}".format(people_younger_than_28_watched_yes))
print("Probability of people_older_than_28_watched:{:.3f}".format(people_older_than_28_watched_yes))

Probability of people_younger_than_28_watched:0.607
Probability of people_older_than_28_watched:0.409


In [13]:
subnode_less_than28_watched_yes =1-( (people_younger_than_28_watched_yes)**2 + (1 - people_younger_than_28_watched_yes)**2)
subnode_more_than28_watched_yes = 1-((people_older_than_28_watched_yes)**2 + (1 - people_older_than_28_watched_yes)**2)

print("Gini(people_younger_than_28):{:.3f}".format(subnode_less_than28_watched_yes))
print("Gini(people_older_than_28):{:.3f}".format(subnode_more_than28_watched_yes))

Gini(people_younger_than_28):0.477
Gini(people_older_than_28):0.483


In [14]:
#Weighted Gini Index for Age Split
calculated_wt_emp = (28/float(50))*subnode_less_than28_watched_yes + (22/float(50))*subnode_more_than28_watched_yes
print("Weighted Gini(age):{:.4f}".format(calculated_wt_emp))

Weighted Gini(age):0.4799


- __Since weighted gini(gender)< weighted gini(age) < weighted gini(employment), the node split will take on Gender__

***

### II. Chi-Square

__Gender Node__

In [15]:
crosstab1 = pd.crosstab(index=films["gender"], columns=films["watching"])
crosstab1["Total"] = crosstab1.no + crosstab1.yes
crosstab1

watching,no,yes,Total
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,8,14,22
M,16,12,28


In [16]:
# calculate the expected  who watch movie

crosstab1["Expected watch"] = crosstab1.Total * 26/50
crosstab1["Expected not watch"] = crosstab1.Total * 24/50
crosstab1

watching,no,yes,Total,Expected watch,Expected not watch
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,8,14,22,11.44,10.56
M,16,12,28,14.56,13.44


In [17]:
#Calculating Deviation
crosstab1["E - O (Watch)"] = crosstab1["Expected watch"] - crosstab1.yes
crosstab1["E - O (Not Watch)"] = crosstab1["Expected not watch"] - crosstab1.no
crosstab1

watching,no,yes,Total,Expected watch,Expected not watch,E - O (Watch),E - O (Not Watch)
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,8,14,22,11.44,10.56,-2.56,2.56
M,16,12,28,14.56,13.44,2.56,-2.56


In [18]:
crosstab1["chi2_watch"] = np.sqrt(crosstab1["E - O (Watch)"]**2/crosstab1["Expected watch"])
crosstab1["chi2_not_watch"] = np.sqrt(crosstab1["E - O (Not Watch)"]**2/crosstab1["Expected not watch"])
crosstab1

watching,no,yes,Total,Expected watch,Expected not watch,E - O (Watch),E - O (Not Watch),chi2_watch,chi2_not_watch
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
F,8,14,22,11.44,10.56,-2.56,2.56,0.75688,0.787786
M,16,12,28,14.56,13.44,2.56,-2.56,0.670902,0.698297


In [19]:
chi2_gender = (crosstab1["chi2_watch"] + crosstab1["chi2_not_watch"]).sum()
chi2_gender

2.9138649533909593

__Employment node__

In [20]:
crosstab2 = pd.crosstab(index=films["employment_status"], columns=films["watching"])
crosstab2["Total"] = crosstab2.no + crosstab2.yes

crosstab2["Expected watch"] = crosstab2.Total * 26/50
crosstab2["Expected not watch"] = crosstab2.Total * 24/50

crosstab2["E - O (Watch)"] = crosstab2["Expected watch"] - crosstab2.yes
crosstab2["E - O (Not Watch)"] = crosstab2["Expected not watch"] - crosstab2.no

crosstab2["chi2_watch"] = np.sqrt(crosstab2["E - O (Watch)"]**2/crosstab2["Expected watch"])
crosstab2["chi2_not_watch"] = np.sqrt(crosstab2["E - O (Not Watch)"]**2/crosstab2["Expected not watch"])

crosstab2

watching,no,yes,Total,Expected watch,Expected not watch,E - O (Watch),E - O (Not Watch),chi2_watch,chi2_not_watch
employment_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
student,5,4,9,4.68,4.32,0.68,-0.68,0.31433,0.327165
working,19,22,41,21.32,19.68,-0.68,0.68,0.14727,0.153284


In [21]:
chi2_emp = (crosstab2["chi2_watch"] + crosstab2["chi2_not_watch"]).sum()
chi2_emp

0.9420494494487789

__Age node__

In [22]:
crosstab3 = pd.crosstab(index=films["is_28+"], columns=films["watching"])
crosstab3["Total"] = crosstab3.no + crosstab3.yes

crosstab3["Expected watch"] = crosstab3.Total * 26/50
crosstab3["Expected not watch"] = crosstab3.Total * 24/50

crosstab3["E - O (Watch)"] = crosstab3["Expected watch"] - crosstab3.yes
crosstab3["E - O (Not Watch)"] = crosstab3["Expected not watch"] - crosstab3.no

crosstab3["chi2_watch"] = np.sqrt(crosstab3["E - O (Watch)"]**2/crosstab3["Expected watch"])
crosstab3["chi2_not_watch"] = np.sqrt(crosstab3["E - O (Not Watch)"]**2/crosstab3["Expected not watch"])

crosstab3

watching,no,yes,Total,Expected watch,Expected not watch,E - O (Watch),E - O (Not Watch),chi2_watch,chi2_not_watch
is_28+,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,11,17,28,14.56,13.44,-2.44,2.44,0.639454,0.665565
1,13,9,22,11.44,10.56,2.44,-2.44,0.721401,0.750858


In [23]:
chi2_age = (crosstab3["chi2_watch"] + crosstab3["chi2_not_watch"]).sum()
chi2_age

2.777277533700757

- __Since chi2_gender < chi2_age < chi2_emp, the node split will take on Gender__

***

### III.Entropy

__Gender node__

In [24]:
crosstab1.iloc[:,:2]

watching,no,yes
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,8,14
M,16,12


In [25]:
# Female node entropy
p = 14/float(22)
q = 8/float(22)
female_entropy = -p*np.log2(p) - q*np.log2(q)
female_entropy

0.94566030460064021

In [26]:
# Male node entropy
p = 12/float(28)
q = 16/float(28)
male_entropy = -p*np.log2(p) - q*np.log2(q)
male_entropy

0.98522813603425163

In [28]:
#Weighted entropy for gender
total_entropy_gender = (28/float(50))*male_entropy + (22/float(50))*female_entropy
total_entropy_gender

0.96781829020346266

__Employment Node__

In [29]:
crosstab2.iloc[:,:2]

watching,no,yes
employment_status,Unnamed: 1_level_1,Unnamed: 2_level_1
student,5,4
working,19,22


In [30]:
#entropy for students
p = 4/float(9)
q = 5/float(9)
working_entropy = -p*np.log2(p) - q*np.log2(q)
working_entropy

0.99107605983822222

In [31]:
# entropy for working people
p = 22/float(41)
q = 19/float(41)

student_entropy = -p*np.log2(p) - q*np.log2(q)
student_entropy

0.99613448350957956

In [32]:
total_entropy_emp = (41/float(50))*working_entropy + (9/float(50))*student_entropy
total_entropy_emp

0.99198657609906649

__Age Node__

In [33]:
crosstab3.iloc[:,:2]

watching,no,yes
is_28+,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11,17
1,13,9


In [34]:
#entropy for less than 28 years
p = 17/float(28)
q = 11/float(28)
less_than_28_entropy = -p*np.log2(p) - q*np.log2(q)
less_than_28_entropy

0.96661863254810276

In [35]:
#entropy for more than 28 years
p = 9/float(22)
q = 13/float(22)
more_than_28_entropy = -p*np.log2(p) - q*np.log2(q)
more_than_28_entropy

0.97602064823661505

In [36]:
total_entropy_age = (28/float(50))*less_than_28_entropy + (22/float(50))*more_than_28_entropy
total_entropy_age

0.97075551945104821

- __Since entropy_gender < entropy_age < entropy_emp, the node split will take on Gender__

***