In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv("Pima.csv")
columns = data.columns
columns = [col_name for col_name in columns if not col_name=="Outcome"]

In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
y = data["Outcome"]
X = data.drop(["Outcome"], axis=1)

In [5]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3)

In [6]:
#generate summary
train_mean_pos = train_X[train_y==1].mean()
train_std_pos = train_X[train_y==1].std()
train_mean_neg = train_X[train_y==0].mean()
train_std_neg = train_X[train_y==0].std()

In [8]:
train_mean_pos

Pregnancies                   4.711230
Glucose                     142.657754
BloodPressure                69.283422
SkinThickness                22.390374
Insulin                     103.759358
BMI                          35.218717
DiabetesPedigreeFunction      0.537348
Age                          37.005348
dtype: float64

In [9]:
train_std_pos

Pregnancies                   3.688406
Glucose                      32.372670
BloodPressure                22.905601
SkinThickness                18.390113
Insulin                     139.974617
BMI                           7.201853
DiabetesPedigreeFunction      0.378570
Age                          10.877370
dtype: float64

In [10]:
train_mean_neg

Pregnancies                   3.282857
Glucose                     109.280000
BloodPressure                67.574286
SkinThickness                19.660000
Insulin                      71.600000
BMI                          30.446286
DiabetesPedigreeFunction      0.429740
Age                          31.440000
dtype: float64

In [11]:
train_std_neg

Pregnancies                   3.008582
Glucose                      26.259045
BloodPressure                19.131843
SkinThickness                15.169202
Insulin                     105.423936
BMI                           7.403598
DiabetesPedigreeFunction      0.291725
Age                          11.836063
dtype: float64

In [12]:
summary = { "train_mean_pos": train_mean_pos.tolist() , "train_std_pos": train_std_pos.tolist(), 
           "train_mean_neg": train_mean_neg.tolist(), "train_std_neg": train_std_neg.tolist()}

In [13]:
summary

{'train_mean_pos': [4.711229946524064,
  142.6577540106952,
  69.28342245989305,
  22.390374331550802,
  103.75935828877006,
  35.21871657754011,
  0.5373475935828876,
  37.00534759358289],
 'train_std_pos': [3.688405560113013,
  32.372670029221936,
  22.905600879799124,
  18.390112858120034,
  139.97461696012755,
  7.201853290444712,
  0.3785701906936314,
  10.877369882033621],
 'train_mean_neg': [3.282857142857143,
  109.28,
  67.57428571428571,
  19.66,
  71.6,
  30.446285714285715,
  0.42973999999999957,
  31.44],
 'train_std_neg': [3.008582347979917,
  26.25904455998807,
  19.131843013941722,
  15.169202323800713,
  105.42393610430676,
  7.403598244559471,
  0.29172466775455297,
  11.836062614885739]}

In [14]:
import numpy as np
def cond_prob(x, mn, stddv):  #value , mean, standard dev| this is that fancy formula: 1/root(sigma^2)....
    varnc = stddv*stddv
    p = 1/(np.sqrt(2*np.pi*varnc)) * np.exp((-(x-mn)**2)/(2*varnc))
    return p

In [15]:
def predict(row, summary):

  #prior probability obtained as probability of class
  #i.e. we find fraction of positive samples present in the whole dataset
  prob_positive=len(summary["train_mean_pos"])/( len(summary["train_mean_pos"])+len(summary["train_mean_neg"]) )
  #then multiply it with conditional probability of each feature
  for i in range(0, len(row)):
    prob_positive = prob_positive * cond_prob(row[i],summary["train_mean_pos"][i], summary["train_std_pos"][i])
  

  #exact same process for negative
  prob_negative=len(summary["train_mean_neg"])/( len(summary["train_mean_pos"])+len(summary["train_mean_neg"]) )
  for i in range(0, len(row)):
    prob_negative = prob_negative * cond_prob(row[i],summary["train_mean_neg"][i], summary["train_std_neg"][i])
  
  return [prob_positive, prob_negative]

In [16]:
predictions_raw = []
for row in test_X.values.tolist():
  predictions_raw.append(predict(row, summary))

In [17]:
predictions_raw

[[8.775303286592785e-14, 2.3049352077149994e-12],
 [2.5028790522966333e-14, 4.6421765378565444e-17],
 [1.527080786397166e-13, 1.86448504361148e-14],
 [7.65955039458099e-14, 5.462837841215855e-14],
 [8.684093991856395e-14, 8.150820786199172e-13],
 [2.789194890070318e-14, 5.1367384527295604e-14],
 [2.440912909352727e-13, 8.717510908022303e-15],
 [3.100682725395452e-14, 5.65860376559198e-13],
 [6.367452986680356e-16, 2.849229504965584e-19],
 [3.4081031994387e-14, 1.382407374707936e-12],
 [1.3031159410275536e-13, 3.192724639161891e-15],
 [1.8424838881612864e-13, 2.187125695721851e-14],
 [8.939752381884506e-15, 4.56322597871825e-13],
 [4.5741058796831955e-15, 1.2172299271344691e-15],
 [3.74902765920378e-15, 1.2751418209374436e-14],
 [3.71131419978473e-14, 9.957453616886732e-13],
 [2.574748051718121e-13, 9.462390249484296e-14],
 [1.6309063487316587e-14, 6.730743834805536e-17],
 [5.470454218172114e-14, 1.4038616833653735e-12],
 [2.815207560161022e-13, 8.849881867006112e-13],
 [5.2777735212973

In [18]:
predictions = []
for row in predictions_raw:
  if(row[0]>row[1]):
    predictions.append(1)
  else:
    predictions.append(0)

In [19]:
confusion_matrix(test_y.tolist(), predictions)

array([[112,  38],
       [ 28,  53]], dtype=int64)