# Binary Classification - Play Tennis

In [105]:
import pandas as pd

# Build the Play Tennis table row by row: one tuple per observed day.
columns = ['Day', 'Outlook', 'Temperature', 'Humidity', 'Windy', 'PlayTennis']
rows = [
    ('D1', 'Sunny', 'Hot', 'High', 'Weak', 'No'),
    ('D2', 'Sunny', 'Hot', 'High', 'Strong', 'No'),
    ('D3', 'Overcast', 'Hot', 'High', 'Weak', 'Yes'),
    ('D4', 'Rain', 'Mild', 'High', 'Weak', 'Yes'),
    ('D5', 'Rain', 'Cool', 'Normal', 'Weak', 'Yes'),
    ('D6', 'Rain', 'Cool', 'Normal', 'Strong', 'No'),
    ('D7', 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'),
    ('D8', 'Overcast', 'Mild', 'High', 'Weak', 'No'),
    ('D9', 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'),
    ('D10', 'Rain', 'Mild', 'Normal', 'Weak', 'Yes'),
]

# Column-oriented view of the same records, keyed by column name.
data = {column: [row[position] for row in rows] for position, column in enumerate(columns)}

df = pd.DataFrame(data)
df

Unnamed: 0,Day,Outlook,Temperature,Humidity,Windy,PlayTennis
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Overcast,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


In [84]:
# Q1

# Class priors: relative frequency of each PlayTennis label in the training table
prior = df['PlayTennis'].value_counts(normalize=True)
p_yes, p_no = (prior[label] for label in ('Yes', 'No'))
print(f'1A.\nP("Play Tennis"="Yes")= {p_yes}, P("Play Tennis"="No")= {p_no}')

1A.
P("Play Tennis"="Yes")= 0.6, P("Play Tennis"="No")= 0.4


In [98]:
# Q2-3

import math

# Likelihood tables P(feature value | PlayTennis): one table per feature, with
# labels as rows and feature values as columns. The dictionary key 'Wind' maps
# to the DataFrame column 'Windy'.
feature_columns = {'Outlook': 'Outlook', 'Temperature': 'Temperature', 'Humidity': 'Humidity', 'Wind': 'Windy'}
cond_probs = {
    key: df.groupby('PlayTennis')[column].value_counts(normalize=True).unstack()
    for key, column in feature_columns.items()
}

# New instance to classify
x_cond = {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}

# Unnormalised posterior per label: prior times the likelihood of every observed value
posterior = {}
for label in prior.index:
    score = prior[label]
    for key, observed in x_cond.items():
        score = score * cond_probs[key].loc[label, observed]
    posterior[label] = score
print('2B.\nP("Play Tennis"="Yes"|X) ∝', round(posterior['Yes'], 4))
print('3C.\nP("Play Tennis"="No"|X) ∝', round(posterior['No'], 4))

2B.
P("Play Tennis"="Yes"|X) ∝ 0.0028
3C.
P("Play Tennis"="No"|X) ∝ 0.0188


In [112]:
# Q4

# Normalise the unnormalised scores so that they sum to one
evidence = sum(posterior.values())
posterior_norm = {name: score / evidence for name, score in posterior.items()}

# The prediction is the label with the largest normalised posterior
predicted_label = max(posterior_norm, key=lambda name: posterior_norm[name])
print('4B.\n"Play Tennis"=', predicted_label, '\n', posterior_norm)

4B.
"Play Tennis"= No 
 {'Yes': 0.12903225806451607, 'No': 0.8709677419354839}


In [101]:
# Re-display the normalised posteriors computed in the Q4 cell
posterior_norm

{'Yes': 0.12903225806451607, 'No': 0.8709677419354839}

# Multi-class Classification - Traffic Data

In [126]:
# Create a DataFrame

# Twenty observations; each list below is one column, all in the same row order.
traffic_data = {
    'Day': [
        'Weekday', 'Weekday', 'Weekday', 'Holiday', 'Saturday',
        'Weekday', 'Holiday', 'Sunday', 'Weekday', 'Weekday',
        'Saturday', 'Weekday', 'Weekday', 'Weekday', 'Weekday',
        'Saturday', 'Weekday', 'Holiday', 'Weekday', 'Weekday',
    ],
    'Season': [
        'Spring', 'Winter', 'Winter', 'Winter', 'Summer',
        'Autumn', 'Summer', 'Summer', 'Winter', 'Summer',
        'Spring', 'Summer', 'Winter', 'Summer', 'Winter',
        'Autumn', 'Autumn', 'Spring', 'Spring', 'Spring',
    ],
    'Fog': [
        'None', 'None', 'None', 'High', 'Normal',
        'Normal', 'High', 'Normal', 'High', 'None',
        'High', 'High', 'Normal', 'High', 'Normal',
        'High', 'None', 'Normal', 'Normal', 'Normal',
    ],
    'Rain': [
        'None', 'Slight', 'None', 'Slight', 'None',
        'None', 'Slight', 'None', 'Heavy', 'Slight',
        'Heavy', 'Slight', 'None', 'None', 'Heavy',
        'Slight', 'Heavy', 'Slight', 'None', 'Heavy',
    ],
    'Class': [
        'On Time', 'On Time', 'On Time', 'Late', 'On Time',
        'Very Late', 'On Time', 'On Time', 'Very Late', 'On Time',
        'Cancelled', 'On Time', 'Late', 'On Time', 'Very Late',
        'On Time', 'On Time', 'On Time', 'On Time', 'On Time',
    ],
}
traffic_df = pd.DataFrame.from_dict(traffic_data)
traffic_df

Unnamed: 0,Day,Season,Fog,Rain,Class
0,Weekday,Spring,,,On Time
1,Weekday,Winter,,Slight,On Time
2,Weekday,Winter,,,On Time
3,Holiday,Winter,High,Slight,Late
4,Saturday,Summer,Normal,,On Time
5,Weekday,Autumn,Normal,,Very Late
6,Holiday,Summer,High,Slight,On Time
7,Sunday,Summer,Normal,,On Time
8,Weekday,Winter,High,Heavy,Very Late
9,Weekday,Summer,,Slight,On Time


In [129]:
# Query instance for the traffic classifier; keys match the feature columns of traffic_df.
# This rebinds x_cond, which previously held the Play Tennis query.
x_cond = {'Day': 'Weekday', 'Season': 'Winter', 'Fog': 'High', 'Rain': 'Heavy'}

In [131]:
# Q5

# Prior probabilities: fraction of training rows that fall in each class
class_labels = traffic_df['Class']
prior = class_labels.value_counts(normalize=True)
print('5A.\n', prior)

5A.
 Class
On Time      0.70
Very Late    0.15
Late         0.10
Cancelled    0.05
Name: proportion, dtype: float64


In [171]:
# Q6-7-8-9

# Likelihood tables P(feature value | Class), one per feature
feature_names = ['Day', 'Season', 'Fog', 'Rain']
cond_probs = {
    feature: traffic_df.groupby('Class')[feature].value_counts(normalize=True).unstack()
    for feature in feature_names
}

# Unnormalised posterior per class: prior times the likelihood of each observed value
posterior = {}
for label in prior.index:
    score = prior[label]
    for feature in feature_names:
        score = score * cond_probs[feature].loc[label, x_cond[feature]]
    posterior[label] = score

# (class, value) pairs never seen in training are NaN in the tables; count them as 0
posterior = pd.Series(posterior).fillna(0)

print('6C.\nP("On Time"|X) ∝', round(posterior['On Time'], 4))
print('7D.\nP("Late"|X) ∝', round(posterior['Late'], 4))
print('8A.\nP("Very Late"|X) ∝', round(posterior['Very Late'], 4))
print('9D.\nP("Cancelled"|X) ∝', round(posterior['Cancelled'], 4))

6C.
P("On Time"|X) ∝ 0.0026
7D.
P("Late"|X) ∝ 0.0
8A.
P("Very Late"|X) ∝ 0.0222
9D.
P("Cancelled"|X) ∝ 0.0


In [182]:
# Q10

# Divide every score by their total so the posteriors sum to one
evidence = posterior.sum()
posterior_norm = {name: score / evidence for name, score in posterior.items()}

# The prediction is the class with the largest normalised posterior
predicted_label = max(posterior_norm, key=lambda name: posterior_norm[name])
print('10C.', predicted_label, posterior_norm, sep='\n')

10C.
Very Late
{'On Time': 0.10560625814863105, 'Very Late': 0.894393741851369, 'Late': 0.0, 'Cancelled': 0.0}


# IRIS Classification

In [184]:
# Create a DataFrame

# Length measurements, listed separately for each class
lengths_class_0 = [1.4, 1.0, 1.3, 1.9, 2.0, 1.8]
lengths_class_1 = [3.0, 3.8, 4.1, 3.9, 4.2, 3.4]

data_iris = {
    'Length': lengths_class_0 + lengths_class_1,
    'Class': [0] * len(lengths_class_0) + [1] * len(lengths_class_1),
}
df_iris = pd.DataFrame(data_iris)
df_iris

Unnamed: 0,Length,Class
0,1.4,0
1,1.0,0
2,1.3,0
3,1.9,0
4,2.0,0
5,1.8,0
6,3.0,1
7,3.8,1
8,4.1,1
9,3.9,1


In [208]:
# Q11-12

# Per-class mean and population variance (ddof=0) of Length, labelled directly
# through named aggregation
summary_stats = df_iris.groupby('Class')['Length'].agg(
    mean='mean',
    var=lambda lengths: lengths.var(ddof=0),
)

# Pull the four scalars out of the summary table
mean_0, var_0 = summary_stats.at[0, 'mean'], summary_stats.at[0, 'var']
mean_1, var_1 = summary_stats.at[1, 'mean'], summary_stats.at[1, 'var']

print(f'11A.\nmean = {round(mean_0, 4)} and variance = {round(var_0, 4)}')
print(f'12B.\nmean = {round(mean_1, 4)} and variance = {round(var_1, 4)}')

11A.
mean = 1.5667 and variance = 0.1289
12B.
mean = 3.7333 and variance = 0.1722


In [226]:
# Q13

from scipy.stats import norm
x_cond = 3.4
# Class priors, computed once and then indexed by class label
class_priors = df_iris['Class'].value_counts(normalize=True)
prior_0, prior_1 = class_priors[0], class_priors[1]
# Gaussian likelihood of the observed length under each class
std_0, std_1 = math.sqrt(var_0), math.sqrt(var_1)
likelihood_0 = norm.pdf(x_cond, loc=mean_0, scale=std_0)
likelihood_1 = norm.pdf(x_cond, loc=mean_1, scale=std_1)
# Prior times likelihood (these products are not normalised)
posterior_0 = prior_0 * likelihood_0
posterior_1 = prior_1 * likelihood_1

print(f'13A.\nP("Class"="0"|X) = {posterior_0} and P("Class"="1"|X) = {round(posterior_1, 4)}')


13A.
P("Class"="0"|X) = 1.2080820590230566e-06 and P("Class"="1"|X) = 0.3481


# Play Tennis Classifier Implementation

## 4.1 create_train_data()

In [281]:
# Q14

# Create data
import numpy as np

def create_train_data():
    """Return the Play Tennis training set as a 2-D NumPy array.

    The table is built with pandas, the 'Day' identifier column is dropped,
    and the remaining columns (Outlook, Temperature, Humidity, Windy,
    PlayTennis) are returned as an array with one row per observation. The
    label is the last column.
    """
    columns = ['Day', 'Outlook', 'Temperature', 'Humidity', 'Windy', 'PlayTennis']
    rows = [
        ('D1', 'Sunny', 'Hot', 'High', 'Weak', 'No'),
        ('D2', 'Sunny', 'Hot', 'High', 'Strong', 'No'),
        ('D3', 'Overcast', 'Hot', 'High', 'Weak', 'Yes'),
        ('D4', 'Rain', 'Mild', 'High', 'Weak', 'Yes'),
        ('D5', 'Rain', 'Cool', 'Normal', 'Weak', 'Yes'),
        ('D6', 'Rain', 'Cool', 'Normal', 'Strong', 'No'),
        ('D7', 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'),
        ('D8', 'Overcast', 'Mild', 'High', 'Weak', 'No'),
        ('D9', 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'),
        ('D10', 'Rain', 'Mild', 'Normal', 'Weak', 'Yes'),
    ]
    data_tennis = pd.DataFrame(rows, columns=columns)
    # 'Day' is only a row identifier, not a feature
    features_and_label = data_tennis.drop(columns='Day')
    return np.array(features_and_label.values)

# Build the training matrix and show it as the cell output
train_data = create_train_data()
train_data

array([['Sunny', 'Hot', 'High', 'Weak', 'No'],
       ['Sunny', 'Hot', 'High', 'Strong', 'No'],
       ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
       ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
       ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
       ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
       ['Overcast', 'Mild', 'High', 'Weak', 'No'],
       ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['Rain', 'Mild', 'Normal', 'Weak', 'Yes']], dtype=object)

## 4.2 compute_prior_probability

In [280]:
def compute_prior_probability(train_data):
    """Return the class priors as a NumPy array ordered ['No', 'Yes'].

    The label is read from the last element of every row of ``train_data``.
    Each prior is the number of rows carrying that label divided by the total
    number of rows.
    """
    y_unique = ['No', 'Yes']
    labels = [sample[-1] for sample in train_data]
    n_samples = len(labels)
    return np.array([labels.count(label) / n_samples for label in y_unique])

# Index 0 of the returned array is the "No" prior and index 1 the "Yes" prior
prior_probability = compute_prior_probability(train_data)
print(f'14A.\nP("PlayTennis"="Yes") = {prior_probability[1]}, P("PlayTennis"="No") = {prior_probability[0]}')

14A.
P("PlayTennis"="Yes") = 0.6, P("PlayTennis"="No") = 0.4


## 4.3 compute_conditional_probability

In [316]:
def compute_conditional_probability(train_data):
    """Estimate P(feature value | class) for every feature column.

    ``train_data`` is a 2-D array whose last column holds the label and whose
    other columns are categorical features.

    Returns a pair ``(conditional_probability, list_x_name)``:

    * ``list_x_name[f]`` is the ``np.unique`` array of values seen in feature
      column ``f``.
    * ``conditional_probability[f][v][c]`` is the fraction of class-``c`` rows
      (``c`` indexes ["No", "Yes"]) whose feature ``f`` equals
      ``list_x_name[f][v]``.
    """
    y_unique = ["No", "Yes"]
    labels = train_data[:, -1]
    feature_indices = range(0, train_data.shape[1] - 1)

    # Distinct values of every feature column, in np.unique order
    list_x_name = [np.unique(train_data[:, col]) for col in feature_indices]

    # Rows per class: the denominator of every conditional probability
    class_counts = [np.sum(labels == y_value) for y_value in y_unique]

    conditional_probability = [
        [
            [
                np.sum((train_data[:, col] == x_value) & (labels == y_value)) / count_y
                for y_value, count_y in zip(y_unique, class_counts)
            ]
            for x_value in list_x_name[col]
        ]
        for col in feature_indices
    ]
    return conditional_probability, list_x_name

In [317]:
# Q15

# Only the per-feature value arrays are needed here; the probability tables are discarded
train_data = create_train_data()
_, list_x_name = compute_conditional_probability(train_data)
print("15B.")
# x1..x4 follow the training column order: Outlook, Temperature, Humidity, Windy
print("x1 =", list_x_name[0])
print("x2 =", list_x_name[1])
print("x3 =", list_x_name[2])
print("x4 =", list_x_name[3])

15B.
x1 = ['Overcast' 'Rain' 'Sunny']
x2 = ['Cool' 'Hot' 'Mild']
x3 = ['High' 'Normal']
x4 = ['Strong' 'Weak']


## 4.4 get_index_from_value

In [318]:
def get_index_from_value(feature_name, list_features):
    """Return the index of the first entry of ``list_features`` equal to ``feature_name``.

    ``list_features`` is compared element-wise against ``feature_name``, so it
    is expected to be a NumPy array. When nothing matches, indexing the empty
    result raises ``IndexError``.
    """
    matches = list_features == feature_name
    positions = np.where(matches)[0]
    return positions[0]

In [319]:
# Q16

train_data = create_train_data()
_, list_x_name = compute_conditional_probability(train_data)
outlook = list_x_name[0]

# Position of each Outlook value inside the unique-value array
i1, i2, i3 = (get_index_from_value(name, outlook) for name in ("Overcast", "Rain", "Sunny"))

print('16A.')
print(i1, i2, i3)

16A.
0 1 2


In [328]:
# Q17

train_data = create_train_data()
conditional_probability, list_x_name = compute_conditional_probability(train_data)

# Compute P("Outlook"="Sunny"|"Play Tennis"="Yes")
outlook_values, outlook_table = list_x_name[0], conditional_probability[0]
x1 = get_index_from_value("Sunny", outlook_values)
p_sunny_given_yes = outlook_table[x1][1]  # class index 1 is "Yes"
print("17D.\nP('Outlook'='Sunny'|'Play Tennis'='Yes') =", np.round(p_sunny_given_yes, 2))

17D.
P('Outlook'='Sunny'|'Play Tennis'='Yes') = 0.17


In [330]:
# Q18
train_data = create_train_data()
conditional_probability, list_x_name = compute_conditional_probability(train_data)

# Compute P("Outlook"="Sunny"|"Play Tennis"="No")
outlook_values, outlook_table = list_x_name[0], conditional_probability[0]
x1 = get_index_from_value("Sunny", outlook_values)
p_sunny_given_no = outlook_table[x1][0]  # class index 0 is "No"
print("18A.\nP('Outlook'='Sunny'|'Play Tennis'='No') =", np.round(p_sunny_given_no, 2))

18A.
P('Outlook'='Sunny'|'Play Tennis'='No') = 0.5


## 4.5 train_naive_bayes

In [331]:
def train_naive_bayes(train_data):
    """Fit the categorical Naive Bayes model on ``train_data``.

    Parameters
    ----------
    train_data : 2-D array
        Training rows; forwarded unchanged to ``compute_prior_probability``
        and ``compute_conditional_probability``.

    Returns
    -------
    tuple
        ``(prior_probability, conditional_probability, list_x_name)``: the
        class priors, the per-feature conditional probability tables, and the
        per-feature arrays of known values.
    """
    # Step 1: class priors
    prior_probability = compute_prior_probability(train_data)

    # Step 2: per-feature conditional probabilities and the value lists that index them
    conditional_probability, list_x_name = compute_conditional_probability(train_data)

    return prior_probability, conditional_probability, list_x_name

## 4.6 prediction_play_tennis

In [332]:
# Prediction

def prediction_play_tennis(X, list_x_name, prior_probability, conditional_probability):
    """Predict the class of one sample with a trained Naive Bayes model.

    Parameters
    ----------
    X : sequence
        Feature values of the sample, in the same order as the feature
        columns used for training.
    list_x_name : sequence
        For each feature, the array of known values, as returned by
        ``compute_conditional_probability``.
    prior_probability : sequence of float
        Class priors; index 0 is class 0 and index 1 is class 1 (the "No" and
        "Yes" order used by ``compute_prior_probability``).
    conditional_probability : nested sequence
        ``conditional_probability[f][v][c]`` is P(feature f = value v | class c).

    Returns
    -------
    int
        0 when the class-0 score is strictly larger than the class-1 score,
        otherwise 1.

    Notes
    -----
    The score of each class is its prior multiplied by the conditional
    probability of every observed feature value. The two scores are kept
    separate: adding them together would make class 0 win for every input.
    """
    # Translate every observed value into its row index in that feature's table.
    value_indices = [
        get_index_from_value(value, known_values)
        for value, known_values in zip(X, list_x_name)
    ]

    # Unnormalised posterior of each class: prior * product of likelihoods.
    scores = []
    for class_idx in range(len(prior_probability)):
        score = prior_probability[class_idx]
        for feature_idx, value_idx in enumerate(value_indices):
            score = score * conditional_probability[feature_idx][value_idx][class_idx]
        scores.append(score)

    p0, p1 = scores[0], scores[1]
    if p0 > p1:
        y_pred = 0
    else:
        y_pred = 1

    return y_pred

In [335]:
# Q19
X = ['Sunny', 'Cool', 'High', 'Strong']
data = create_train_data()
prior_probability, conditional_probability, list_x_name = train_naive_bayes(data)
pred = prediction_play_tennis(X, list_x_name, prior_probability, conditional_probability)

# A truthy prediction (class 1) selects the 19B message, otherwise 19A
message = "19B.\nAd should go!" if pred else "19A.\nAd should not go!"
print(message)

19A.
Ad should not go!
