# **Naive Bayes using Diabetes Dataset**

***Problem Statement***

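Predict whether a person has Diabetes (1) or No Diabetes (0) based on medical measurements. Note: the target in scikit-learn's `load_diabetes` dataset is a quantitative measure of disease progression one year after baseline, not a diagnosis, so the binary label used here (created below by thresholding that score at 120) is a proxy for high vs. low progression.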

Learning Type: Supervised Learning

Problem Type: Binary Classification

Algorithm: Naive Bayes (GaussianNB)

***Step 1: Import Required Libraries***

In [1]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score,recall_score,f1_score

***Step 2: Load the Dataset***

In [2]:
# Fetch the scikit-learn diabetes bunch and wrap its feature matrix in a
# DataFrame whose columns carry the bunch's feature names.
data = load_diabetes()
df = pd.DataFrame(
    data=data.data,
    columns=data.feature_names,
)
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [3]:
# Attach the dataset's target values to the feature frame as a `diabetes` column.
df = df.assign(diabetes=data.target)
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,diabetes
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


***Step 3: Create Binary Diabetes Label***

In [4]:
# Binarise the target: 1 when the score is at least the threshold, else 0.
# The label is derived from the untouched `data.target` rather than from the
# `diabetes` column itself, so re-running this cell always yields the same
# labels (thresholding the already-binarised 0/1 column would turn every
# label into 0).
PROGRESSION_THRESHOLD = 120
df['diabetes'] = (data.target >= PROGRESSION_THRESHOLD).astype(int)
df.head()


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,diabetes
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,1
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,1
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,1
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,1


***Step 4: Separate Features and Target***

In [5]:
# Target vector is the `diabetes` column; the feature matrix is every other column.
y = df["diabetes"]
X = df.drop(columns=["diabetes"])
y.head()


Unnamed: 0,diabetes
0,1
1,0
2,1
3,1
4,1


***Step 5: Train-Test Split***

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


***Step 6: Create Naive Bayes Model and Train the Model***

In [7]:
# Build a Gaussian Naive Bayes classifier and fit it on the training split.
# `fit` returns the estimator itself, so the chained call binds the fitted model.
model = GaussianNB().fit(X_train, y_train)
model

***Step 7: Make Predictions***

In [8]:
# Predicted class labels for every row of the held-out test split.
yper = model.predict(X=X_test)
yper

array([1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1])

***Step 8: Model Evaluation***

In [9]:
# `score` on a classifier reports mean accuracy; compare train vs. test to
# gauge how well the model generalises.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("training Eff:", train_accuracy)
print("testing Eff:", test_accuracy)

training Eff: 0.7443365695792881
testing Eff: 0.7218045112781954


In [10]:
# Report each classification metric for the test-set predictions, in a fixed order.
for metric_label, metric_fn in (
    ("accuracy", accuracy_score),
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
):
    print(metric_label, metric_fn(y_test, yper))

accuracy 0.7218045112781954
precision 0.7681159420289855
recall 0.7162162162162162
f1 0.7412587412587412


***Step 9: Predict for a New Input***

In [11]:
# Draw one row from the held-out test features so the demo uses a sample the
# model was not trained on; the fixed seed makes the cell reproducible under
# Restart & Run All (an unseeded `sample` picks a different row every run).
newinput = X_test.sample(1, random_state=42)

pred = model.predict(newinput)[0]
prob = model.predict_proba(newinput)  # one row; columns follow model.classes_ (0, then 1)
print("pred", pred)
print("prob", prob)
print("-------------------------------------")

# Show the probability, as a percentage, of whichever class was predicted.
if pred == 0:
    print("No diabetes", prob[0, 0] * 100)
else:
    print("diabetes", prob[0, 1] * 100)

newinput

pred 1
prob [[0.1430735 0.8569265]]
-------------------------------------
diabetes 85.69265015453348


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
154,0.067136,0.05068,0.020739,-0.00567,0.020446,0.026243,-0.002903,-0.002592,0.008641,0.003064
