In [1]:
import pandas

In [2]:
def create_contingency_dict(df, target_feature):
    all_columns = list(df.columns.values)
    target_feature_values = list(df[target_feature].unique())
    contingency_dict = {}
    for column in all_columns:
        all_categories = df[column].unique()
        dict_cat_proba = {}
        for cat1 in all_categories:
            dict_target_feature_values = {}
            for cat2 in target_feature_values:
                dict_target_feature_values[cat2] = round(df[(df[target_feature] == cat2) & 
                                                      (df[column] == cat1)].shape[0] / \
                                                   df[df[target_feature] == cat2].shape[0], 2)
            dict_cat_proba[cat1] = dict_target_feature_values
        contingency_dict[column] = dict_cat_proba
    return contingency_dict

In [3]:
def print_contingency_table(dict):
    for category in dict:
        print(category)
        for feature in dict[category]:
            print(feature, end="")
            for target_class in dict[category][feature]:
                print("\t\t", target_class, "=", dict[category][feature][target_class])

# Question 1

In [4]:
df1 = pandas.read_csv('Tutorials/tuto3_table1.txt', sep=" ")

print(df1)

   RainRecently(RR) RainToday(RT) Temp(T)   Wind(W) Sunshine(S) Swimming
1          Moderate      Moderate    Warm     Light        Some      Yes
2             Light      Moderate    Warm  Moderate        None       No
3          Moderate      Moderate    Cold      Gale        None       No
4          Moderate      Moderate    Warm     Light        None      Yes
5          Moderate         Light    Cold     Light        Some       No
6             Heavy         Light    Cold  Moderate        Some      Yes
7             Light         Light    Cold  Moderate        Some       No
8          Moderate      Moderate    Cold      Gale        Some       No
9             Heavy         Heavy    Warm  Moderate        None      Yes
10            Light         Light    Cold     Light        Some       No


## A/ Construct the contingency table of conditional and prior probabilities that would be used by Naïve Bayes to build a classifier for this dataset.

In [5]:
contingency_q1 = create_contingency_dict(df1, "Swimming")

print_contingency_table(contingency_q1)

Temp(T)
Cold		 Yes = 0.25
		 No = 0.83
Warm		 Yes = 0.75
		 No = 0.17
RainToday(RT)
Moderate		 Yes = 0.5
		 No = 0.5
Heavy		 Yes = 0.25
		 No = 0.0
Light		 Yes = 0.25
		 No = 0.5
Wind(W)
Moderate		 Yes = 0.5
		 No = 0.33
Gale		 Yes = 0.0
		 No = 0.33
Light		 Yes = 0.5
		 No = 0.33
Sunshine(S)
None		 Yes = 0.5
		 No = 0.33
Some		 Yes = 0.5
		 No = 0.67
RainRecently(RR)
Moderate		 Yes = 0.5
		 No = 0.5
Heavy		 Yes = 0.5
		 No = 0.0
Light		 Yes = 0.0
		 No = 0.5
Swimming
Yes		 Yes = 1.0
		 No = 0.0
No		 Yes = 0.0
		 No = 1.0


In [6]:
p_yes_q1 = df1[df1['Swimming'] == 'Yes'].shape[0] / df1.shape[0]
p_no_q1 = df1[df1['Swimming'] == 'No'].shape[0] / df1.shape[0]

print("P(Swimming = Yes) =", p_yes_q1)
print("P(Swimming = No)  =", p_no_q1)

P(Swimming = Yes) = 0.4
P(Swimming = No)  = 0.6


## B/Based on the contingency table, classify the two new examples below using Naïve Bayes.

In [7]:
df1b = pandas.read_csv('Tutorials/tuto3_table1b.txt', sep=" ")

print(df1b)

   RainRecently(RR) RainToday(RT) Temp(T) Wind(W) Sunshine(S) Swimming
X1            Heavy      Moderate    Warm   Light        Some      ???
X2            Light      Moderate    Warm   Light        Some      ???


### First example

In [8]:
p_x1_yes = p_yes_q1 * \
           contingency_q1['RainRecently(RR)']['Heavy']['Yes'] * \
           contingency_q1['RainToday(RT)']['Moderate']['Yes'] * \
           contingency_q1['Temp(T)']['Warm']['Yes'] * \
           contingency_q1['Wind(W)']['Light']['Yes'] * \
           contingency_q1['Sunshine(S)']['Some']['Yes']

p_x1_no = p_no_q1 * \
          contingency_q1['RainRecently(RR)']['Heavy']['No'] * \
          contingency_q1['RainToday(RT)']['Moderate']['No'] * \
          contingency_q1['Temp(T)']['Warm']['No'] * \
          contingency_q1['Wind(W)']['Light']['No'] * \
          contingency_q1['Sunshine(S)']['Some']['No']

#normalising to 1
p_x1_yes *= 1 / (p_x1_yes + p_x1_no)
p_x1_no *= 1 / (p_x1_yes + p_x1_no)

print("P(Yes|X1) =", p_x1_yes)
print("P(No|X1)  =", p_x1_no)
print()
print("Prediction: it will be possible to swim")

P(Yes|X1) = 1.0
P(No|X1)  = 0.0

Prediction: it will be possible to swim


### Second example

In [9]:
p_x2_yes = p_yes_q1 * \
           contingency_q1['RainRecently(RR)']['Light']['Yes'] * \
           contingency_q1['RainToday(RT)']['Moderate']['Yes'] * \
           contingency_q1['Temp(T)']['Warm']['Yes'] * \
           contingency_q1['Wind(W)']['Light']['Yes'] * \
           contingency_q1['Sunshine(S)']['Some']['Yes']

p_x2_no = p_no_q1 * \
          contingency_q1['RainRecently(RR)']['Light']['No'] * \
          contingency_q1['RainToday(RT)']['Moderate']['No'] * \
          contingency_q1['Temp(T)']['Warm']['No'] * \
          contingency_q1['Wind(W)']['Light']['No'] * \
          contingency_q1['Sunshine(S)']['Some']['No']

#normalising to 1
p_x2_yes *= 1 / (p_x2_yes + p_x2_no)
p_x2_no *= 1 / (p_x2_yes + p_x2_no)

print("P(Yes|X2) =", p_x2_yes)
print("P(No|X2)  =", p_x2_no)
print()
print("Prediction: it will NOT be possible to swim")

P(Yes|X2) = 0.0
P(No|X2)  = 1.0

Prediction: it will NOT be possible to swim


# Question 2

In [10]:
df2 = pandas.read_csv('Tutorials/tuto3_table2.txt', sep=" ")

print(df2)

    Name    Hair   Height    Build Lotion     Result
1  Sarah  blonde  average    light     no  sunburned
2   Dana  blonde     tall  average    yes       none
3   Alex   brown    short  average    yes       none
4  Annie  blonde    short  average     no  sunburned
5  Emily     red  average    heavy     no  sunburned
6   Pete   brown     tall    heavy     no       none
7   John   brown  average    heavy     no       none
8  Katie   brown    short    light    yes       none


## A/ Provide the contingency table of conditional and prior probabilities that would be used by Naïve Bayes to build a classifier for this dataset.

In [11]:
contingency_q2 = create_contingency_dict(df2[['Hair', 'Height', 'Build', 'Lotion', 'Result']], "Result")

print_contingency_table(contingency_q2)

Result
none		 none = 1.0
		 sunburned = 0.0
sunburned		 none = 0.0
		 sunburned = 1.0
Height
average		 none = 0.2
		 sunburned = 0.67
short		 none = 0.4
		 sunburned = 0.33
tall		 none = 0.4
		 sunburned = 0.0
Build
average		 none = 0.4
		 sunburned = 0.33
heavy		 none = 0.4
		 sunburned = 0.33
light		 none = 0.2
		 sunburned = 0.33
Hair
blonde		 none = 0.2
		 sunburned = 0.67
red		 none = 0.0
		 sunburned = 0.33
brown		 none = 0.8
		 sunburned = 0.0
Lotion
no		 none = 0.4
		 sunburned = 1.0
yes		 none = 0.6
		 sunburned = 0.0


In [12]:
p_yes_q2 = df2[df2['Result'] == 'sunburned'].shape[0] / df2.shape[0]
p_no_q2 = df2[df2['Result'] == 'none'].shape[0] / df2.shape[0]

print("P(Sunburned)     =", p_yes_q2)
print("P(Not sunburned) =", p_no_q2)

P(Sunburned)     = 0.375
P(Not sunburned) = 0.625


## B/ Based on the contingency table, predict a risk level for the new loan application X below.

In [13]:
df2b = pandas.read_csv('Tutorials/tuto3_table2b.txt', sep=" ")

print(df2b)

     Hair   Height  Build Lotion Result
X  blonde  average  heavy     no    ???


In [14]:
q2_p_x_yes = p_yes_q2 * \
             contingency_q2['Hair']['blonde']['sunburned'] * \
             contingency_q2['Height']['average']['sunburned'] *\
             contingency_q2['Build']['heavy']['sunburned'] *\
             contingency_q2['Lotion']['no']['sunburned']

q2_p_x_no = p_no_q2 * \
             contingency_q2['Hair']['blonde']['none'] * \
             contingency_q2['Height']['average']['none'] *\
             contingency_q2['Build']['heavy']['none'] *\
             contingency_q2['Lotion']['no']['none']

#normalize to 1
q2_p_x_yes *= 1/(q2_p_x_yes+q2_p_x_no)
q2_p_x_no *= 1/(q2_p_x_yes+q2_p_x_no)

print("P(Sunburned|X)     =", q2_p_x_yes)
print("P(Not sunburned|X) =", q2_p_x_no)
print()
print("Predicted output: sunburned")

P(Sunburned|X)     = 0.9328311059148507
P(Not sunburned|X) = 0.0042697130515258145

Predicted output: sunburned


# Question 3

In [15]:
df3 = pandas.read_csv('Tutorials/tuto3_table3.txt', sep=" ")

print(df3)

   CreditHistory  Debt  Income    Risk
1            bad   low   0to30    high
2            bad  high  30to60    high
3            bad   low   0to30    high
4        unknown  high  30to60    high
5        unknown  high   0to30    high
6           good  high   0to30    high
7            bad   low  over60  medium
8        unknown   low  30to60  medium
9           good  high  30to60  medium
10       unknown   low  over60     low
11       unknown   low  over60     low
12          good   low  over60     low
13          good  high  over60     low
14          good  high  over60     low


## A/ Calculate the contingency table that would be used by Naïve Bayes to build a classifier using this training data.

In [16]:
contingency_q3 = create_contingency_dict(df3, 'Risk')

print_contingency_table(contingency_q3)

Income
0to30		 low = 0.0
		 high = 0.67
		 medium = 0.0
over60		 low = 1.0
		 high = 0.0
		 medium = 0.33
30to60		 low = 0.0
		 high = 0.33
		 medium = 0.67
Risk
low		 low = 1.0
		 high = 0.0
		 medium = 0.0
high		 low = 0.0
		 high = 1.0
		 medium = 0.0
medium		 low = 0.0
		 high = 0.0
		 medium = 1.0
Debt
low		 low = 0.6
		 high = 0.33
		 medium = 0.67
high		 low = 0.4
		 high = 0.67
		 medium = 0.33
CreditHistory
unknown		 low = 0.4
		 high = 0.33
		 medium = 0.33
bad		 low = 0.0
		 high = 0.5
		 medium = 0.33
good		 low = 0.6
		 high = 0.17
		 medium = 0.33


In [17]:
p_low = df3[df3['Risk'] == 'low'].shape[0] / df3.shape[0]
p_med = df3[df3['Risk'] == 'medium'].shape[0] / df3.shape[0]
p_high = df3[df3['Risk'] == 'high'].shape[0] / df3.shape[0]

print("P(Risk=low)    =", p_low)
print("P(Risk=medium) =", p_med)
print("P(Risk=high)   =", p_high)

P(Risk=low)    = 0.35714285714285715
P(Risk=medium) = 0.21428571428571427
P(Risk=high)   = 0.42857142857142855


## B/ Based on the contingency table, predict a risk level for the new loan application X below.

In [18]:
df3b = pandas.read_csv('Tutorials/tuto3_table3b.txt', sep=" ")

print(df3b)

  CreditHistory Debt  Income Risk
X           bad  low  30to60  ???


In [19]:
p_x_low = p_low *\
          contingency_q3['CreditHistory']['bad']['low'] *\
          contingency_q3['Debt']['low']['low'] *\
          contingency_q3['Income']['30to60']['low']

p_x_med = p_med *\
          contingency_q3['CreditHistory']['bad']['medium'] *\
          contingency_q3['Debt']['low']['medium'] *\
          contingency_q3['Income']['30to60']['medium']

p_x_high = p_high *\
          contingency_q3['CreditHistory']['bad']['high'] *\
          contingency_q3['Debt']['low']['high'] *\
          contingency_q3['Income']['30to60']['high']


#Normalize to 1
total = p_x_low + p_x_med + p_x_high
p_x_low *= 1/total
p_x_med *= 1/total
p_x_high *= 1/total

print("P(low|X)  =", p_x_low)
print("P(med|X)  =", p_x_med)
print("P(high|X) =", p_x_high)
print()
print("Predicted: medium risk")

P(low|X)  = 0.0
P(med|X)  = 0.5763255873667994
P(high|X) = 0.4236744126332007

Predicted: medium risk
