In [43]:
import pandas as pd
import numpy as np 
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
import warnings
warnings.filterwarnings('ignore')

In [14]:
#sns.get_dataset_names()
df=datasets.load_diabetes()
dir(df)

['DESCR',
 'data',
 'data_filename',
 'feature_names',
 'target',
 'target_filename']

In [25]:
df.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [22]:
#seprate input and output
df_input=pd.DataFrame(df.data,columns=df.feature_names)
df_output=pd.DataFrame(df.target,columns=['DiseaseProgression'])

In [23]:
df_input   #all values are in scaled format already

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [24]:
df_output

Unnamed: 0,DiseaseProgression
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0
...,...
437,178.0
438,104.0
439,132.0
440,220.0


In [26]:
#lets check whether it contains any null values or not
df_input.isnull().sum()
#no null value present in input

age    0
sex    0
bmi    0
bp     0
s1     0
s2     0
s3     0
s4     0
s5     0
s6     0
dtype: int64

In [27]:
df_output.isnull().sum()   # no null value present in output

DiseaseProgression    0
dtype: int64

In [28]:
#lets check the correlation matrix
pd.concat([df_input,df_output],axis=1).corr()
# most of features have week correlation with target only some are around 0.5

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,DiseaseProgression
age,1.0,0.173737,0.185085,0.335427,0.260061,0.219243,-0.075181,0.203841,0.270777,0.301731,0.187889
sex,0.173737,1.0,0.088161,0.241013,0.035277,0.142637,-0.37909,0.332115,0.149918,0.208133,0.043062
bmi,0.185085,0.088161,1.0,0.395415,0.249777,0.26117,-0.366811,0.413807,0.446159,0.38868,0.58645
bp,0.335427,0.241013,0.395415,1.0,0.24247,0.185558,-0.178761,0.257653,0.393478,0.390429,0.441484
s1,0.260061,0.035277,0.249777,0.24247,1.0,0.896663,0.051519,0.542207,0.515501,0.325717,0.212022
s2,0.219243,0.142637,0.26117,0.185558,0.896663,1.0,-0.196455,0.659817,0.318353,0.2906,0.174054
s3,-0.075181,-0.37909,-0.366811,-0.178761,0.051519,-0.196455,1.0,-0.738493,-0.398577,-0.273697,-0.394789
s4,0.203841,0.332115,0.413807,0.257653,0.542207,0.659817,-0.738493,1.0,0.617857,0.417212,0.430453
s5,0.270777,0.149918,0.446159,0.393478,0.515501,0.318353,-0.398577,0.617857,1.0,0.46467,0.565883
s6,0.301731,0.208133,0.38868,0.390429,0.325717,0.2906,-0.273697,0.417212,0.46467,1.0,0.382483


In [29]:
# lets describe the dataset
pd.concat([df_input,df_output],axis=1).describe()
# scaled data

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,DiseaseProgression
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-3.634285e-16,1.308343e-16,-8.045349e-16,1.281655e-16,-8.835316000000001e-17,1.327024e-16,-4.574646e-16,3.777301e-16,-3.830854e-16,-3.412882e-16,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123996,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260974,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665645,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324879,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670611,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947634,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564384,0.02835801,0.02984439,0.0293115,0.03430886,0.03243323,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320442,0.1539137,0.198788,0.1811791,0.1852344,0.133599,0.1356118,346.0


In [31]:
# lets check outlier using z score
from scipy.stats import zscore
df_combined=pd.concat([df_input,df_output],axis=1)
print(df_combined.shape)
z_scr=zscore(df_combined)
df_combined_new=df_combined.loc[(abs(z_scr)<3).all(axis=1)]
print(df_combined_new.shape)

(442, 11)
(430, 11)


In [47]:
# now again seprating input and output
df_x=df_combined_new.drop(columns=['DiseaseProgression'])
df_y=df_combined_new[["DiseaseProgression"]]

In [52]:
def maxr2_score(regr,df_x,df_y):
    max_r_score=0
    for r_state in range(42,100):
        x_train,x_test,y_train,y_test=train_test_split(df_x,df_y,random_state=r_state,test_size=0.20)
        
        regr.fit(x_train,y_train)
        y_pred=regr.predict(x_test)
        r2_scr=r2_score(y_test,y_pred)
        print("r2_score corrosponding to random state :",r_state," is: ",r2_scr)
        if r2_scr>max_r_score:
            max_r_score=r2_scr
            final_r_state=r_state

    print("max r2 score corrosponding to:",final_r_state," is ",max_r_score)
    return final_r_state

In [53]:
# lets use linear regression and check max r2_score to different random states
from sklearn.linear_model import LinearRegression
lreg=LinearRegression()
r_state=maxr2_score(lreg,df_x,df_y)

r2_score corrosponding to random state : 42  is:  0.3666696589544701
r2_score corrosponding to random state : 43  is:  0.5414633719716159
r2_score corrosponding to random state : 44  is:  0.502830202394811
r2_score corrosponding to random state : 45  is:  0.41940208977825444
r2_score corrosponding to random state : 46  is:  0.49944594921344987
r2_score corrosponding to random state : 47  is:  0.5696168464053883
r2_score corrosponding to random state : 48  is:  0.48941614789926113
r2_score corrosponding to random state : 49  is:  0.4157614115642505
r2_score corrosponding to random state : 50  is:  0.5113404267738513
r2_score corrosponding to random state : 51  is:  0.25353904234282654
r2_score corrosponding to random state : 52  is:  0.6129709896007236
r2_score corrosponding to random state : 53  is:  0.41643342507750947
r2_score corrosponding to random state : 54  is:  0.5374683464284008
r2_score corrosponding to random state : 55  is:  0.48857235252489817
r2_score corrosponding to ran

In [54]:
# lets use grid search to find the optimal value of n neighbor for KNN model
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
neighbors={'n_neighbors':range(1,30)}
knr=KNeighborsRegressor()
gsc=GridSearchCV(knr,neighbors,cv=10)
gsc.fit(df_x,df_y)
gsc.best_params_


{'n_neighbors': 19}

In [55]:
#lets use KNN regression and check max r2_score corrosponding different random state
knr=KNeighborsRegressor(n_neighbors=19)
r_state=maxr2_score(knr,df_x,df_y)

r2_score corrosponding to random state : 42  is:  0.3754639421781797
r2_score corrosponding to random state : 43  is:  0.4850176749925311
r2_score corrosponding to random state : 44  is:  0.5070124907462026
r2_score corrosponding to random state : 45  is:  0.34696634001528726
r2_score corrosponding to random state : 46  is:  0.451542905441379
r2_score corrosponding to random state : 47  is:  0.4895196971602195
r2_score corrosponding to random state : 48  is:  0.4958809239179024
r2_score corrosponding to random state : 49  is:  0.42630690730410414
r2_score corrosponding to random state : 50  is:  0.45313119612132746
r2_score corrosponding to random state : 51  is:  0.2678001378172621
r2_score corrosponding to random state : 52  is:  0.5358847489904657
r2_score corrosponding to random state : 53  is:  0.4586727843601762
r2_score corrosponding to random state : 54  is:  0.46474045120456287
r2_score corrosponding to random state : 55  is:  0.41518128514327657
r2_score corrosponding to rand

In [56]:
# lets check mean r2_score for KNN and linear regression
from sklearn.model_selection import cross_val_score
print('Mean r2_score for Linear Regression :',cross_val_score(lreg,df_x,df_y,cv=5,scoring='r2').mean())
print('Standard deviation  for Linear Regression :',cross_val_score(lreg,df_x,df_y,cv=5,scoring='r2').std())
print()

Mean r2_score for Linear Regression : 0.4676911221693185
Standard deviation for Linear Regression : 0.05595217785841939



In [57]:
print('Mean r2_score for KNN :',cross_val_score(knr,df_x,df_y,cv=5,scoring='r2').mean())
print('Mean r2_score for KNN :',cross_val_score(knr,df_x,df_y,cv=5,scoring='r2').std())


Mean r2_score for KNN : 0.4447393304649888
Mean r2_score for KNN : 0.06885364198445655


In [58]:
# lets check the lasso regression find the best value of alpha
from sklearn.linear_model import Lasso
lsreg=Lasso()
parameter={"alpha":[0.0001,0.001,0.01,0.1,1]}
lgsc=GridSearchCV(lsreg,parameter,cv=10)
lgsc.fit(df_x,df_y)
lgsc.best_params_

{'alpha': 0.1}

In [59]:
#lets check max r2_score when we use Lasso
lsreg=Lasso(alpha=0.1)
r_state=maxr2_score(lsreg,df_x,df_y)

r2_score corrosponding to random state : 42  is:  0.39559614133403775
r2_score corrosponding to random state : 43  is:  0.5279704191720399
r2_score corrosponding to random state : 44  is:  0.50503294565865
r2_score corrosponding to random state : 45  is:  0.4036196797761409
r2_score corrosponding to random state : 46  is:  0.5035821455123144
r2_score corrosponding to random state : 47  is:  0.5557723339863485
r2_score corrosponding to random state : 48  is:  0.4951902457271492
r2_score corrosponding to random state : 49  is:  0.41898867700436127
r2_score corrosponding to random state : 50  is:  0.4841336844871694
r2_score corrosponding to random state : 51  is:  0.2705971586987356
r2_score corrosponding to random state : 52  is:  0.5965279705889219
r2_score corrosponding to random state : 53  is:  0.4227011451196119
r2_score corrosponding to random state : 54  is:  0.514897039321015
r2_score corrosponding to random state : 55  is:  0.4921979652112449
r2_score corrosponding to random st

In [60]:
# lets use cross_val_score with lasso
print('Mean r2_score for lasso regression:',cross_val_score(lsreg,df_x,df_y,cv=5,scoring='r2').mean())
print('Std. deviation in r2_score for lasso regression :',cross_val_score(lsreg,df_x,df_y,cv=5,scoring='r2').std())
#based on below output i can say lasso performed just like Linear regression

Mean r2_score for lasso regression: 0.4668358456202258
Std. deviation in r2_score for lasso regression : 0.05024883603439513


In [61]:
# now try to boost the result using gradient boosting technique
# for getting best set of parameter we will use grid search
from sklearn.ensemble import GradientBoostingRegressor
gbr=GradientBoostingRegressor()
parameters={'learning_rate':[0.001,0.01,0.1,1],'n_estimators':[10,100,500,1000]}
clf=GridSearchCV(gbr,parameters,cv=5)
clf.fit(df_x,df_y)
clf.best_params_

{'learning_rate': 0.01, 'n_estimators': 500}

In [62]:
# now we will use cross_val_score to check the mean r2 score and standard deviation
gbr=GradientBoostingRegressor(learning_rate=0.01,n_estimators=500)
print('Mean r2_score for gradient boosting regression:',cross_val_score(gbr,df_x,df_y,cv=5,scoring='r2').mean())
print('Std. deviation in r2_score for gradient boosting regression :',cross_val_score(gbr,df_x,df_y,cv=5,scoring='r2').std())


Mean r2_score for gradient boosting regression: 0.4087664974730674
Std. deviation in r2_score for gradient boosting regression : 0.07535985977794878


In [66]:
#now lets use AdaBoost Regression algorithm
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
ada_reg=AdaBoostRegressor()
parameters={'learning_rate':[0.001,0.01,0.1,1],'n_estimators':[10,100,500,1000],'base_estimator':[lreg,lsreg,DecisionTreeRegressor()]}
clf1=GridSearchCV(ada_reg,parameters,cv=5)
clf1.fit(df_x,df_y)
clf1.best_params_



{'base_estimator': Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
       normalize=False, positive=False, precompute=False, random_state=None,
       selection='cyclic', tol=0.0001, warm_start=False),
 'learning_rate': 0.001,
 'n_estimators': 10}

In [68]:
ada_reg=AdaBoostRegressor(base_estimator=lsreg,learning_rate=0.001,n_estimators=10)
print('Mean r2_score for Ada boosting regression:',cross_val_score(ada_reg,df_x,df_y,cv=5,scoring='r2').mean())
print('Std. deviation in r2_score for Ada boosting regression :',cross_val_score(ada_reg,df_x,df_y,cv=5,scoring='r2').std())



Mean r2_score for Ada boosting regression: 0.46626601160322617
Std. deviation in r2_score for Ada boosting regression : 0.051101306359568985


In [69]:
# lets check maximum r2_score corresponding to this 
r_state=maxr2_score(ada_reg,df_x,df_y)

r2_score corrosponding to random state : 42  is:  0.3935442373118436
r2_score corrosponding to random state : 43  is:  0.5226269199319575
r2_score corrosponding to random state : 44  is:  0.4965225409076255
r2_score corrosponding to random state : 45  is:  0.39763305500819424
r2_score corrosponding to random state : 46  is:  0.49970891131838324
r2_score corrosponding to random state : 47  is:  0.5667103267251337
r2_score corrosponding to random state : 48  is:  0.4933377610816234
r2_score corrosponding to random state : 49  is:  0.4182429167577235
r2_score corrosponding to random state : 50  is:  0.4796276583872824
r2_score corrosponding to random state : 51  is:  0.2567198997338441
r2_score corrosponding to random state : 52  is:  0.605477999490541
r2_score corrosponding to random state : 53  is:  0.42671427701471965
r2_score corrosponding to random state : 54  is:  0.524443741016313
r2_score corrosponding to random state : 55  is:  0.48378568848001247
r2_score corrosponding to random

In [70]:
# we tried all the models and till now linear regression is best
#random state corrosponding to highest r2_score is 99
x_train,x_test,y_train,y_test=train_test_split(df_x,df_y,random_state=99,test_size=0.20)
lreg=LinearRegression()
lreg.fit(x_train,y_train)
y_pred=lreg.predict(x_test)


In [71]:
#find rmse and r2_score using sklearn.metrics
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print('RMSE is :',np.sqrt(mean_squared_error(y_test,y_pred)))
print('r2_score is :',r2_score(y_test,y_pred))

RMSE is : 48.95593018443329
r2_score is : 0.626951653171429


In [72]:
from sklearn.externals import joblib
#save file as pickle
joblib.dump(lreg,'diabetes.pkl')

['diabetes.pkl']