In [5]:
import sys
sys.path.append('/Users/user/miniconda3/lib/python3.8/site-packages')

In [269]:
import pandas as pd
import math
import numpy as np
from sklearn.preprocessing import scale
from tqdm import tqdm
from sklearn.linear_model.logistic import LogisticRegression



In [212]:
f = np.array(pd.read_csv('diabetes_train.txt',header=None).values)
f

array([[6.00e+00, 1.48e+02, 7.20e+01, ..., 6.27e-01, 5.00e+01, 1.00e+00],
       [1.00e+00, 8.50e+01, 6.60e+01, ..., 3.51e-01, 3.10e+01, 0.00e+00],
       [8.00e+00, 1.83e+02, 6.40e+01, ..., 6.72e-01, 3.20e+01, 1.00e+00],
       ...,
       [2.00e+00, 8.10e+01, 7.20e+01, ..., 5.47e-01, 2.50e+01, 0.00e+00],
       [7.00e+00, 1.95e+02, 7.00e+01, ..., 1.63e-01, 5.50e+01, 1.00e+00],
       [6.00e+00, 1.54e+02, 7.40e+01, ..., 8.39e-01, 3.90e+01, 0.00e+00]])

In [267]:
def loadData(filename):
    """
    split data into feature and label
    standardize feature val
    """
    f=np.array(pd.read_csv(filename,header=None).values)
    feature=f[:,:-1]
    feature=scale(feature)
    label=f[:,-1]
    return np.array(feature),np.array(label)

In [268]:
feature,label=loadData('diabetes_train.txt')
feature

array([[ 0.65618344,  0.82013306,  0.16718959, ...,  0.19778399,
         0.40819719,  1.45499222],
       [-0.84058535, -1.12374891, -0.14184847, ..., -0.65641636,
        -0.39130339, -0.18060429],
       [ 1.25489095,  1.90006748, -0.24486116, ..., -1.05911081,
         0.53855054, -0.09452026],
       ...,
       [-0.54123159, -1.24716999,  0.16718959, ..., -0.22931619,
         0.17645789, -0.69710845],
       [ 0.9555372 ,  2.27033072,  0.06417691, ..., -0.83945929,
        -0.93589074,  1.88541235],
       [ 0.65618344,  1.00526467,  0.27020228, ..., -0.32693908,
         1.02230632,  0.50806793]])

In [279]:
def sigmoid(z):
    """
    sigmoid func
    """
    return 1/(1+np.exp(z))

def kernal(x,y):
    return sum(x[i]*y[i] for i in range(len(x)))

In [280]:
def cost(feature, label,theta):
    m=len(feature)
    sumOfcost=0
    for i in range(m):
        if label[i]==1:
            sumOfcost-=np.log(sigmoid(kernal(feature[i],theta)))
        else:
            sumOfcost-=np.log(1-sigmoid(kernal(feature[i],theta)))
    sumOfcost/=m
    return sumOfcost

In [281]:
def gradOfcost(feature,label,theta):
    gradTheta=[]
    m=len(feature)
    for p in range(len(theta)):
        gradTheta.append(sum(((sigmoid(np.dot(feature[i],theta))-label[i])*feature[i][p])
                         for i in range(m))/(-m))
    return gradTheta

def gradDecent(alpha,theta,feature,label,maxIter):
    m=len(feature)
    for it in tqdm(range(maxIter)):
        gradTheta=gradOfcost(feature,label,theta)
        # 更新每个维度
        for p in range(len(theta)):
            theta[p]+=alpha*gradTheta[p]
        # print current cost
#         print('current cost is ',cost(feature,label,theta))
#         print('current theta0 is',theta[0])
        alpha*=0.8
    return theta

In [282]:
theta=np.random.rand(len(feature[0]),1)
theta=gradDecent(0.01,theta,feature,label,100)
theta

100%|██████████| 100/100 [00:03<00:00, 27.66it/s]


array([[0.27440002],
       [0.97875216],
       [0.02785456],
       [0.91669502],
       [0.13358772],
       [0.23719691],
       [0.44972389],
       [0.24477537]])

In [283]:
testFeature,testLabel=loadData('diabetes_test.txt') 

In [284]:
def predict(testFeature,bestTheta,thold):
    """
    single prediction
    """
    res=[]
    m=len(testFeature)
    for sampleId in range(m):
        prediction=sigmoid(np.dot(testFeature[sampleId],bestTheta))
        if prediction > thold:
            res.append(0)
        else:
            res.append(1)
    return res

In [285]:
prediction=predict(testFeature,theta,0.5)

In [286]:
def accu(testLabel,prediction,thold):
    record=[]
    for sampleId in range(len(testLabel)):
        if prediction[sampleId]==testLabel[sampleId]:
            record.append(1)
        else:
            record.append(0)
    return sum(record)/len(record)
        

In [287]:
acc=accu(testLabel,prediction,0.9)
acc

0.7052238805970149

In [288]:
classifier=LogisticRegression()
classifier.fit(feature,label)
pred=classifier.predict(testFeature)
pred

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1.,
       1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [274]:
acc1=accu(testLabel,pred,0.5)
acc1

0.8134328358208955