In [1]:
#importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import ProbPlot

In [2]:
#plot settings
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and is missing
# from newer releases, where the same sheet is registered as 'seaborn-v0_8'.
# Use whichever name this installation provides so the cell runs on both.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Font sizes: base text, figure-level title, then axis labels and axis titles.
plt.rc('font', size=14)
plt.rc('figure', titlesize=18)
plt.rc('axes', labelsize=15, titlesize=18)

In [3]:
#loading dataset
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()

# x = [data]     -> one column per name in diabetes.feature_names
# y = [response] -> single-column frame built from diabetes.target
x = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(data=diabetes.target)


In [4]:
# A bare `x.shape` in the middle of a cell is evaluated and discarded (only the
# last expression is displayed), so print it to make the check visible.
print(x.shape)

# Give the response column a readable name; later cells select it by this label.
# NOTE(review): scikit-learn describes this target as a disease-progression
# measure, so 'Glucose' may be a misleading label - confirm before renaming,
# because downstream cells index y_test['Glucose'].
y.columns = ['Glucose']
y.head()

Unnamed: 0,Glucose
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)


In [6]:
# Place the features and the response side by side in one frame for easier plot handling.
dataframe = pd.concat(objs=[x, y], axis='columns')
dataframe.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Glucose
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [7]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the training split. fit() is left as the last
# expression so the notebook displays the fitted estimator.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# Predictions for the held-out rows, used by the evaluation cells that follow.
y_pred = regressor.predict(X=x_test)

In [9]:
# Report the fitted parameters, each under its own heading.
for heading, fitted in (
    ('intercept:\n', regressor.intercept_),
    ('coefficient:\n', regressor.coef_),
):
    print(heading, fitted)


intercept:
 [153.06798218]
coefficient:
 [[ -43.26774487 -208.67053951  593.39797213  302.89814903 -560.27689824
   261.47657106   -8.83343952  135.93715156  703.22658427   28.34844354]]


In [10]:
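#Fitted equation (intercept and coefficients as printed by the previous cell, rounded to 4 decimals):
#y = 153.0680+age*(-43.2677)+sex*(-208.6705)+bmi*(593.3980)+bp*(302.8981)+s1*(-560.2769)+s2*261.4766+s3*(-8.8334)+s4*135.9372+s5*703.2266+s6*28.3484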

In [11]:
# Sanity check: dimensions of the prediction array (rows, columns).
y_pred.shape

(111, 1)

In [12]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error between the held-out targets and the predictions.
# '%2f' only sets a minimum field width of 2 (precision stays at the default
# six decimals); '%.2f' is the two-decimal format that was intended.
print('Mean Squared Error: %.2f' % mean_squared_error(y_test, y_pred))

Mean Squared Error: 3180.198837


In [13]:
# r2_score is the coefficient of determination (R^2), so label it as such
# rather than "Variance score". '%.2f' replaces '%2f', which set a field
# width instead of a two-decimal precision.
print('R^2 score: %.2f' % r2_score(y_test, y_pred))

Variance score: 0.359401


In [14]:
# Score the fitted model on the held-out split and show the raw value.
test_score = regressor.score(x_test, y_test)
print(test_score)

0.35940090989715534


In [15]:
# Line up the observed test targets with the model's predictions.
# 'Actual' is a Series, so the resulting frame carries the test rows' index.
comparison_columns = {
    'Actual': y_test['Glucose'],
    'Predicted': y_pred.flatten(),
}
df = pd.DataFrame(comparison_columns)
df.head()

Unnamed: 0,Actual,Predicted
362,321.0,241.847303
249,215.0,250.123039
271,127.0,164.964565
435,64.0,119.116393
400,175.0,188.231203


In [16]:
# Signed residual: positive when the actual value exceeds the prediction.
df['difference'] = df['Actual'].sub(df['Predicted'])

In [17]:
# Preview the first five rows, now including the 'difference' column.
df.head(n=5)

Unnamed: 0,Actual,Predicted,difference
362,321.0,241.847303,79.152697
249,215.0,250.123039,-35.123039
271,127.0,164.964565,-37.964565
435,64.0,119.116393,-55.116393
400,175.0,188.231203,-13.231203


In [18]:
# Signed error expressed as a percentage of the actual value.
df['pererror'] = df['difference'].div(df['Actual']).mul(100)
df.head()

Unnamed: 0,Actual,Predicted,difference,pererror
362,321.0,241.847303,79.152697,24.658161
249,215.0,250.123039,-35.123039,-16.336297
271,127.0,164.964565,-37.964565,-29.893359
435,64.0,119.116393,-55.116393,-86.119365
400,175.0,188.231203,-13.231203,-7.560687


In [19]:
# Attach the actual/predicted/error columns to the test features.
# Both frames share the test rows' index, so concat aligns them row by row.
final = pd.concat([x_test, df], axis=1)

# Spot-check one row. A fixed random_state keeps the displayed row the same on
# every Restart & Run All instead of changing with each execution.
final.sample(n=1, random_state=0)


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Actual,Predicted,difference,pererror
142,0.041708,0.05068,0.012117,0.039087,0.054845,0.044406,0.00446,-0.002592,0.045601,-0.001078,235.0,172.245304,62.754696,26.704126


In [20]:
# Persist the combined features and error table, written relative to the working directory.
final.to_csv(path_or_buf='Diabetes_Final.csv')

In [21]:
# Rows with a positive percentage error. Since difference = Actual - Predicted,
# these are rows where the prediction fell below the actual value
# (assuming Actual is positive - TODO confirm).
final.loc[final['pererror'] > 0]

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Actual,Predicted,difference,pererror
362,0.019913,0.05068,0.104809,0.070073,-0.035968,-0.026679,-0.024993,-0.002592,0.003712,0.040343,321.0,241.847303,79.152697,24.658161
403,-0.020045,-0.044642,0.097264,-0.005671,-0.005697,-0.023861,-0.021311,-0.002592,0.061686,0.040343,275.0,260.560794,14.439206,5.25062
12,0.016281,-0.044642,-0.02884,-0.009113,-0.004321,-0.009769,0.044958,-0.039493,-0.030751,-0.042499,179.0,113.075838,65.924162,36.829141
399,0.059871,0.05068,0.022895,0.049415,0.016318,0.011838,-0.013948,-0.002592,0.03954,0.019633,232.0,190.541175,41.458825,17.870183
78,0.005383,-0.044642,-0.057941,-0.022885,-0.067615,-0.068328,-0.054446,-0.002592,0.042896,-0.08392,252.0,168.768441,83.231559,33.028396
298,0.023546,0.05068,-0.037463,-0.046985,-0.091006,-0.07553,-0.032356,-0.039493,-0.030751,-0.013504,129.0,109.16037,19.83963,15.379558
268,0.063504,0.05068,0.088642,0.070073,0.020446,0.037517,-0.050764,0.07121,0.0293,0.07348,264.0,244.739905,19.260095,7.295491
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0,66.97735,8.02265,10.696866
10,-0.096328,-0.044642,-0.083808,0.008101,-0.103389,-0.090561,-0.013948,-0.076395,-0.062913,-0.034215,101.0,98.046734,2.953266,2.924026
37,-0.009147,-0.044642,0.011039,-0.057314,-0.02496,-0.042963,0.030232,-0.039493,0.017037,-0.00522,276.0,160.917691,115.082309,41.696489
