## Diabetes Predictor

Importing the libraries


In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt     
from sklearn.utils import shuffle
import seaborn as sns  

Mean Normalisation

In [None]:
def meannorm(nd):
    """Mean-normalise ``nd``: subtract its mean, then divide by its range.

    Returns a three-element list ``[normalised, mean, scale]``. Note that
    ``scale`` is ``max - min`` (the value range), not a standard deviation.
    """
    centre = nd.mean()
    spread = nd.max() - nd.min()
    return [(nd - centre) / spread, centre, spread]

In [None]:
def datacleanser(x, random_state=None):
    """Shuffle the rows of ``x``, drop an unused column and mean-normalise the rest.

    Parameters
    ----------
    x : DataFrame containing at least the columns
        "Diabetes Pedigree Function" and "Class".
    random_state : optional seed forwarded to ``shuffle``. Leave it as
        ``None`` for a different shuffle on every call, or pass an int to
        make the row order (and everything derived from it) reproducible.

    Returns
    -------
    list ``[nd, men, std]`` - the normalised frame plus the two scaling
    values returned by ``meannorm`` (needed to scale new samples later).
    """
    x = shuffle(x, random_state=random_state)
    x = x.reset_index(drop=True)
    print("Data before Cleansing:\n")
    print(x.head(5))
    x = x.drop(columns="Diabetes Pedigree Function")
    nd, men, std = meannorm(x)
    # meannorm() rescales every column, the label included; restore the raw labels.
    nd["Class"] = x["Class"]
    print("Data after Cleansing:\n")
    print(nd.head(5))
    return [nd, men, std]

In [None]:
def sigmoid(x):
    """Logistic function ``1 / (1 + e^(-x))``, evaluated through ``np.exp``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
def grad(x, y, theta, alpha=9):
    """Perform one gradient-descent step for the logistic model and report the cost.

    Parameters
    ----------
    x : feature DataFrame (one row per sample).
    y : label DataFrame; in this notebook it has the single column "Class".
    theta : current parameters, one row per column of ``x``.
    alpha : learning rate. Defaults to 9, the value previously hard-coded.

    Returns
    -------
    tuple ``(theta, J)`` - the updated parameters and the cost evaluated at
    those updated parameters, converted with ``.to_numpy()``.
    """
    # When theta is a plain array the product's column is labelled 0; rename it
    # so that the subtraction below lines up with y's "Class" column.
    h = sigmoid(x.dot(theta).rename(columns={0: "Class"}))
    step = (alpha / x.shape[0]) * x.T.dot(h.subtract(y))
    theta = theta - step
    # The cost is evaluated once (it used to be computed twice, with the first result thrown away).
    J = costfunc(x, y, theta).to_numpy()
    return (theta, J)

In [None]:
def costfunc(x, y, theta):
    """Cross-entropy (log-loss) cost of the logistic model ``sigmoid(x . theta)``.

    Parameters
    ----------
    x : feature matrix with one row per sample.
    y : 0/1 labels, one row per sample.
    theta : model parameters, one row per column of ``x``.

    Returns
    -------
    ``-(1/m) * (y^T log(h) + (1 - y)^T log(1 - h))`` where ``m`` is the number
    of rows of ``x``; the result has whatever shape the ``dot`` products yield.
    """
    # A saturated sigmoid returns exactly 0.0 or 1.0, and log(0) would turn the
    # cost into inf/NaN. Keep the probabilities strictly inside (0, 1).
    eps = 1e-15
    h = np.clip(sigmoid(x.dot(theta)), eps, 1 - eps)
    m = x.shape[0]
    J = -(1 / m) * ((y.T).dot(np.log(h)) + ((1 - y).T).dot(np.log(1 - h)))
    return J

In [None]:
def fit(x, y, iters=100):
    """Train the logistic model by repeated calls to ``grad`` and plot the cost curve.

    Parameters
    ----------
    x : feature DataFrame. An intercept column named "ones" is inserted
        into it IN PLACE (later cells of this notebook expect to find it).
    y : label DataFrame passed through to ``grad``.
    iters : number of gradient-descent steps (default 100, the value
        previously hard-coded).

    Returns
    -------
    The parameters returned by the final ``grad`` call.

    Raises
    ------
    ValueError if ``iters`` is smaller than 1.
    """
    if iters < 1:
        raise ValueError("iters must be at least 1")
    # Insert the intercept column only once, so re-running the training cell
    # does not fail with "cannot insert ones, already exists".
    if "ones" not in x.columns:
        x.insert(0, "ones", 1)
    J = np.zeros(shape=[iters, 1], dtype=float)
    n = x.shape[1]
    theta = np.zeros(shape=[n, 1], dtype=float)
    for i in range(iters):
        (theta, J[i]) = grad(x, y, theta)
    plt.figure()
    plt.plot(J)
    plt.xlabel("No. of Iterations -->")
    plt.ylabel("Cost Function -->")
    plt.title("Cost vs Iters")
    # Reports the cost after the last step, which is not necessarily the smallest one seen.
    print("The minimum value of cost:", J[-1, -1])
    return theta

In [None]:
def predict(x, theta, threshold):
    """Classify each row of ``x`` by thresholding the model's sigmoid output.

    Parameters
    ----------
    x : feature DataFrame. An intercept column named "ones" is inserted
        into it IN PLACE if it is not there yet (callers in this notebook
        drop that column again afterwards).
    theta : model parameters, one row per column of ``x`` including "ones".
    threshold : decision cut-off applied to the sigmoid output. ``None``
        falls back to 0.5, the value that used to be hard-coded.

    Returns
    -------
    Boolean result of ``sigmoid(x . theta) >= threshold``.
    """
    if threshold is None:
        threshold = 0.5
    if "ones" not in x.columns:
        x.insert(0, "ones", 1)
    h = sigmoid(x.dot(theta))
    # Honour the caller's threshold instead of a fixed 0.5.
    return (h >= threshold)

In [None]:
def thresholdfinder(X_train, theta, y):
    """Scan thresholds 0.0, 0.1, ..., 0.9 and return the one with the best F1 score.

    Parameters
    ----------
    X_train : feature DataFrame; a "ones" column is ignored if present.
        The caller's frame is not modified.
    theta : model parameters passed through to ``predict``.
    y : label DataFrame with a "Class" column of 0/1 values.

    Returns
    -------
    float - the threshold with the highest F1 score (the lowest such
    threshold when several tie). The list of F1 scores is printed.
    """
    thresholds = np.arange(0, 1, 0.1)
    f1score = []
    for i in thresholds:
        # predict() adds its own "ones" column in place, so give it a fresh
        # copy without that column on every pass.
        features = X_train.drop(columns="ones", errors="ignore")
        ypred = predict(features, theta, i)
        tp = ((y == 1) & (ypred == 1))["Class"].sum()
        fp = ((y == 0) & (ypred == 1))["Class"].sum()  # predicted positive, actually negative
        fn = ((y == 1) & (ypred == 0))["Class"].sum()  # predicted negative, actually positive
        # Guard every ratio: a threshold that predicts no positives has tp + fp == 0.
        prec = tp / (tp + fp) if (tp + fp) else 0.0
        rec = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = (2 * prec * rec) / (prec + rec) if (prec + rec) else 0.0
        # list.append returns None, so its result must not be assigned back.
        f1score.append(float(f1))
    print(f1score)
    best = max(range(len(f1score)), key=f1score.__getitem__)
    return float(thresholds[best])

Importing DataSets

In [None]:
# The CSV has no header row: its first line is the record
# 6,148,72,35,0,33.6,0.627,50,1. Reading it with the default header handling
# turns that record into column names and silently drops one sample, so the
# column names are supplied explicitly instead.
data = pd.read_csv(
    "pima-indians-diabetes.csv",
    header=None,
    names=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "Diabetes Pedigree Function",
        "Age",
        "Class",
    ],
)

In [None]:
# Number of samples in each outcome class, split by number of pregnancies.
sns.countplot(data=data, x="Class", hue="Pregnancies");

In [None]:
# Clean the raw frame; keep the two scaling values so new samples can be scaled the same way later.
nd, men, std = datacleanser(data)

Separating the data into training (first 60% of the rows) and test (remaining 40%) sets. Note that no separate cross-validation set is created here.

In [None]:
# The first 60% of the rows train the model; the remaining rows are held out for testing.
m, n = nd.shape
split_at = int(0.6 * m)
trnd = nd.iloc[:split_at, :]
tstnd = nd.iloc[split_at:, :]

# Features are every column except the last one - assumes "Class" is the last column.
X_train = trnd.iloc[:, :-1]
Y_train = trnd["Class"].to_frame()
X_test = tstnd.iloc[:, :-1]
Y_test = tstnd["Class"].to_frame()

In [None]:
# Train on the training split, pick a decision threshold, then score the held-out rows.
theta = fit(X_train, Y_train)
threshold = thresholdfinder(X_train, theta, Y_train)
y = predict(X_test, theta, threshold)

# Share of test rows whose predicted class equals the true class.
accuracy_pct = (y == Y_test).mean()["Class"] * 100
print("The percentage of Accuaracy attained:", accuracy_pct, "%")

# predict() added an intercept column to X_test in place; remove it again.
X_test = X_test.drop(columns="ones")

Testing on a specific data point

In [None]:
# Classify one record typed in by the user.
# The prompt lists the fields with commas, so accept comma- as well as
# whitespace-separated values (a bare .split() only handles whitespace).
raw_fields = input("Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Age").replace(",", " ").split()
dnww = [float(field) for field in raw_fields]
if len(dnww) != 7:
    raise ValueError("Expected 7 values, got %d" % len(dnww))

# Scale with the first seven entries of the mean/range kept from the cleansing step.
dn = (np.array(dnww) - men.iloc[:7]) / (std.iloc[:7])

# Prepend the intercept term. Series.append was removed in pandas 2.0; pd.concat replaces it.
a = pd.Series([1])
dn = pd.concat([a, dn])
print(dn)

# Linear score theta . dn, then the same 0.5 cut-off as before.
dn = (theta.T.values * dn.values).sum()
print(sigmoid(dn) >= 0.5)
#Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Age