In [81]:
import numpy as np
import pandas as pd


###### Fetching Data:

In [82]:
fileurl = 'https://raw.githubusercontent.com/mabhay3420/Deep-Into-CNN/master/Datasets/diabetes2.csv'
data = pd.read_csv(fileurl)

In [83]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


###### Checking for NaN values

In [86]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

###### funcions:

In [87]:
def hypothesis(X, theta):
    z = np.dot(theta, X.T)
    return 1/(1+np.exp(-(z))) - 0.0000001

In [88]:
def cost(X, y, theta):
    y1 = hypothesis(X, theta)
    return -(1/len(X)) * np.sum(y*np.log(y1) + (1-y)*np.log(1-y1))

In [89]:
def gradient_descent(X, y, theta, alpha, epochs):
    m =len(X)
    J = [cost(X, y, theta)] 
    for i in range(0, epochs):
        h = hypothesis(X, theta)
        
        if i%500 == 0:
          print("Loss after Epoch ",i,' ',J[i])
        
        for i in range(0, len(X.columns)):
            theta[i] -= (alpha/m) * np.sum((h-y)*X.iloc[:, i])
        J.append(cost(X, y, theta))
    return J, theta

In [90]:
def predict(X, y, theta, alpha, epochs):
     
    J, th = gradient_descent(X, y, theta, alpha, epochs) 
    h = hypothesis(X, th)
    for i in range(len(h)):
      h[i] = 1 if h[i]>=0.5 else 0
  
    
    y = list(y)
    acc = np.sum([y[i] == h[i] for i in range(len(y))])/len(y)
    return J, acc, th

###### Test-train split

In [91]:
msk = np.random.rand(len(data)) < 0.8
train = data[msk]
test = data[~msk]

###### Prediction and Accuracy on train set

In [None]:
y_train = train["Outcome"]
train.drop(['Outcome'], axis=1, inplace=True)
X_train = train

In [93]:
X_train.head(1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50


In [94]:
theta = [0.5]*len(X_train.columns)
J, acc ,learned_theta = predict(X_train, y_train, theta, 0.0001, 5000)

Loss after Epoch  0   10.472671321269234


  This is separate from the ipykernel package so we can avoid doing imports until


Loss after Epoch  500   1.8654392558439232
Loss after Epoch  1000   1.064190254663696
Loss after Epoch  1500   0.8338715077185128
Loss after Epoch  2000   0.7283343049512477
Loss after Epoch  2500   0.6822623328650708
Loss after Epoch  3000   0.6608635037650649
Loss after Epoch  3500   0.6494118890945183
Loss after Epoch  4000   0.6422412154794948
Loss after Epoch  4500   0.6371085861683703


In [95]:
acc

0.7038917089678511

###### Prediction and Accuracy on test set

In [96]:
y_test = test["Outcome"]
test.drop(['Outcome'], axis=1, inplace=True)
X_test = test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [97]:
J, acc,_ = predict(X_test, y_test, learned_theta, 0.0001, 1)

Loss after Epoch  0   0.6352919999989184


In [98]:
acc

0.6949152542372882