<a href="https://colab.research.google.com/github/RuchiRaina3/Baby-Cry-Project/blob/main/Naive_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import librosa
from math import sqrt
from math import pi
from math import exp

**STEP 1:** READING EXCEL SHEETS

In [None]:
train_df = pd.read_excel("/content/drive/MyDrive/Baby Cry Project/Train Data.xlsx")
test_df = pd.read_excel("/content/drive/MyDrive/Baby Cry Project/Test Data.xlsx")

**STEP 2:** DERIVING STATISTICS(mean, std. deviation, length) OF COLUMNS - ENERGY & ZCR TO SUMMARIZE DATASET

In [None]:
#Function to find statistics
def stats(df):
  stats_df = [(df[column].mean(), df[column].std(), len(df['Energy'])) for column in df.columns]
  stats_df.pop(2) #Removes the element at index 2 of list and returns that removed element.
  print(stats_df)
  return stats_df

In [None]:
#Calling the stats function
stats_train_df = stats(train_df)

[(21.988192872790943, 37.172097473674654, 325), (5955.88923076923, 2278.602503818592, 325)]


**STEP 3:** SUMMARIZE DATASET BY CLASS
1.   Separate dataset by class
2.   Calculate statistics for each class

In [None]:
#Fuction To separate df by class
def separate_by_class(df):
  grouped_df = df.groupby('Label')
  i = 0
  for key, value in grouped_df:
    print("Class ", key, end=" ") 
    if i == 0:
      print("DISCOMFORT")
    if i == 1:
      print("HUNGRY")
    if i == 2:
      print("TIRED")
    i+=1 
    #print(grouped_df.get_group(key)) and print(value) print the dataset of that particular class(key)
    print(value)
    #print(value.Energy) prints only the Energy column of that class
    print()
  print() 
  return grouped_df

In [None]:
def display(d):
  for key in d:
    i = 1
    print("Class ", key)
    print("Energy Statistics: ", end = "")
    for value in d[key]:
      print(value)
      if i == 1:
        print("ZCR Statistics: ", end = "")
        i = 0
  print()

In [None]:
#Fuction To calculate stats of each class and storing in dictionary
def stats_class(df):
  grouped_df = separate_by_class(df)
  d = {}
  for key, value in grouped_df:
    #value.Energy.mean(), df.groupby('Label').get_group(0)['Energy'].mean() returns the mean of Energy column of that class and
    #df.groupby('Label')['Energy'].mean() returns means of Energy column of all the classes
    stats = [(value.Energy.mean(), value.Energy.std(),value.Energy.size), (value.ZCR.mean(), value.ZCR.std(),value.ZCR.size)]
    d[key] = stats
  display(d)
  return d

**STEP 4:** CALCULATING GAUSSIAN PROBABILITY DISTRIBUTION FUNCTION

In [None]:
def gaussian_pdf(x, mean, std):
  exponent = exp(-((x-mean)**2 / (2 * std**2 )))
  return (1 / (sqrt(2 * pi) * std)) * exponent

**STEP 5:** CALCULATE PROBABILITIES
1. Calculate Class prob. i.e. P(class) = Rows in class / Total Rows in training Dataset
2. Calculate Prob. for each input value in the row using the Gaussian probability density function and the statistics for that column and of that class

In [None]:
def calculate_prob(stats, df):
  total_rows = float(sum([stats[class_name][0][2] for class_name in stats]))
  prob = {}
  for class_name, class_values in stats.items():
    #P(class)        = Rows in class/Total Rows in training Dataset
    prob[class_name] = stats[class_name][0][2]/(total_rows)

    #Calculate Prob. for each input value in the row
    for i in range(len(class_values)):
      mean, std, count = class_values[i]
      #P(class=0|Energy(x1),ZCR(x2)) = P(x1|class=0) * P(x2|class=0) * P(class=0)
      prob[class_name] *= gaussian_pdf(df[i], mean, std)
      #prob[class_name] = gaussian_pdf(df.loc[0][i], mean, std) * prob[class_name]
  return prob #Dictionary

In [None]:
def predict(stats, df):
  probabilities = calculate_prob(stats, df)
  best_class, best_prob = None, -1
  for class_value, probability in probabilities.items():
      if best_class is None or probability > best_prob:
        best_prob = probability
        best_class = class_value
  return best_class

In [None]:
model = stats_class(train_df)

Class  1 DISCOMFORT
       Energy    ZCR  Label
0    2.193834   5156      1
1    0.638413   5507      1
2    1.733535   5463      1
3    1.206897   5043      1
4    1.226826   3872      1
..        ...    ...    ...
60  69.308051  11333      1
61  30.942355  12357      1
62  72.360452  11377      1
63  40.173546  11582      1
64  37.373773  11855      1

[65 rows x 3 columns]

Class  2 HUNGRY
        Energy   ZCR  Label
65    9.716139  6913      2
66    1.212461  6753      2
67    5.151559  8863      2
68    8.644998  5772      2
69    2.307016  5450      2
..         ...   ...    ...
125  76.645378  7244      2
126  42.536263  7914      2
127  56.423792  9639      2
128  53.688319  7487      2
129  42.000551  8875      2

[65 rows x 3 columns]

Class  3 TIRED
       Energy   ZCR  Label
130  0.270054  4824      3
131  0.100029  5182      3
132  6.928105  5463      3
133  0.205488  4732      3
134  0.841568  8670      3
..        ...   ...    ...
190  5.089931  3291      3
191  2.553286

In [None]:
for i in range(len(train_df)):
  label = predict(model,train_df.loc[i])
  print(label)

5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
1
5
5
5
5
5
5
4
4
3
1
4
5
5
5
5
5
5
5
5
5
5
1
5
5
5
5
5
5
5
5
5
5
5
5
5
5
2
2
2
2
2
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
2
1
2
5
2
1
2
1
5
1
2
2
2
2
2
2
2
2
2
5
2
5
2
5
2
5
2
5
5
5
5
4
5
1
5
4
5
5
4
4
2
2
2
2
2
2
2
2
2
2
5
5
5
5
5
4
5
4
1
5
4
1
4
4
5
5
5
5
5
1
3
4
4
1
3
2
2
3
2
3
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
2
4
4
5
5
4
1
1
2
4
5
4
4
5
5
5
5
3
3
5
5
4
4
3
3
5
5
4
4
4
4
1
1
5
5
5
1
5
2
2
4
5
5
3
5
5
5
4
2
5
5
5
5
4
5
5
5
2
5
5
5
5
5
2
2
5
5
5
5
5
5
5
5
5
5
4
5
1
5
5
5
5
5
5
5
5
5
5
5
1
5
5
5
5
5
5
5
1
5
5
2
5
1
1
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5


In [None]:
model1 = stats_class(test_df)

Class  1 DISCOMFORT
      Energy   ZCR  Label
0   0.191338  4252      1
1   0.085532  4537      1
2   1.411783  6746      1
3   0.003414  4653      1
4   0.746864  5554      1
5   0.336817  3821      1
6   0.104141  4089      1
7   0.341549  3337      1
8   0.382417  3076      1
9   0.001429  6018      1
10  0.812897  4651      1
11  0.344007  5072      1
12  0.602507  4821      1
13  0.532433  4742      1
14  0.980484  3606      1

Class  2 HUNGRY
       Energy    ZCR  Label
15  45.918124  11746      2
16  55.537417  11079      2
17  23.298573  13410      2
18  26.811426  12355      2
19  51.142511  11803      2
20  50.299981  11021      2
21  34.232958  12285      2
22  27.620347  12137      2
23  29.459922  10666      2
24  43.678250  11647      2
25   5.255000   7418      2
26   8.728190   5884      2
27   2.235084   8589      2
28   4.826286   6178      2
29   2.471857   6389      2

Class  3 TIRED
       Energy   ZCR  Label
30  63.329537  4706      3
31  31.988644  5581      3
32

In [None]:
predicted_list = []
for i in range(len(test_df)):
  label = predict(model1,test_df.loc[i])
  print(label)
  predicted_list.append(label)

1
1
5
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
5
5
2
5
5
4
3
4
5
4
1
1
1
1
1
3
2
3
2
3
1
1
2
4
4
1
4
5
5
5
5
4
1
5
3
5
5
3
5
5
1
1
5
1
1
1
1
1
1
1


In [None]:
actual_list = []
for i in range(len(test_df)):
  actual_list.append(test_df.iloc[i][2])

In [None]:
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

In [None]:
accuracy = accuracy_metric(actual_list, predicted_list)
print(accuracy)

50.66666666666667


### Label 1 - bellypain
### Label 2 - burping
### Label 3 - discomfort
### Label 4 - hungry
### Label 5 - tired


