In [73]:
# Ask IPython to render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call appears in the visible cells — confirm this magic is still needed.
%matplotlib inline

In [74]:
# Dependencies for this cell.
# NOTE(review): `np` is not referenced in any of the visible cells.
import numpy as np
from sklearn import datasets

# Echo the docstring of the interactive module.
print(__doc__)

# Load the diabetes dataset that ships with scikit-learn and show its description text.
diabetes = datasets.load_diabetes()
print(diabetes.DESCR)

Automatically created module for IPython interactive environment
.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.

In [75]:
# Inventory of the object returned by load_diabetes(): number of entries,
# their keys, the two backing file names, and the feature labels.
# One item per output line.
print(
    len(diabetes),
    diabetes.keys(),
    diabetes.data_filename,
    diabetes.target_filename,
    diabetes.feature_names,
    sep="\n",
)

6
dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])
C:\ProgramData\Anaconda3\envs\IT1602-17-737-114\lib\site-packages\sklearn\datasets\data\diabetes_data.csv.gz
C:\ProgramData\Anaconda3\envs\IT1602-17-737-114\lib\site-packages\sklearn\datasets\data\diabetes_target.csv.gz
['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


In [76]:
# Inspect the regression target: container type, number of samples,
# and the first ten values (one item per output line).
print(
    type(diabetes.target),
    len(diabetes.target),
    diabetes.target[:10],
    sep="\n",
)


<class 'numpy.ndarray'>
442
[151.  75. 141. 206. 135.  97. 138.  63. 110. 310.]


In [77]:
# Inspect the feature matrix: container type, (rows, columns) shape,
# and the first two rows (one item per output line).
print(
    type(diabetes.data),
    diabetes.data.shape,
    diabetes.data[:2],
    sep="\n",
)

<class 'numpy.ndarray'>
(442, 10)
[[ 0.03807591  0.05068012  0.06169621  0.02187235 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990842 -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 -0.02632783 -0.00844872 -0.01916334
   0.07441156 -0.03949338 -0.06832974 -0.09220405]]


In [78]:
import pandas as pd

# Wrap the raw feature matrix in a DataFrame to get quick summary statistics.
df = pd.DataFrame(diabetes.data)
print(df.describe())

# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line after the report.
df.info()

                  0             1             2             3             4  \
count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02   
mean  -3.639623e-16  1.309912e-16 -8.013951e-16  1.289818e-16 -9.042540e-17   
std    4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02   
min   -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123996e-01 -1.267807e-01   
25%   -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665645e-02 -3.424784e-02   
50%    5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670611e-03 -4.320866e-03   
75%    3.807591e-02  5.068012e-02  3.124802e-02  3.564384e-02  2.835801e-02   
max    1.107267e-01  5.068012e-02  1.705552e-01  1.320442e-01  1.539137e-01   

                  5             6             7             8             9  
count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  
mean   1.301121e-16 -4.563971e-16  3.863174e-16 -3.848103e-16 -3.398488e-16  
std    4.761905e-02  4.761905e-02  4.761905e-02  4.761

In [93]:
# Feature matrix used for modelling: every row, columns 0 through 9.
diabetes_X = diabetes.data[:, :10]

# Sanity check: shape, then the first four rows.
print(diabetes_X.shape, diabetes_X[:4], sep="\n")

(442, 10)
[[ 0.03807591  0.05068012  0.06169621  0.02187235 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990842 -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 -0.02632783 -0.00844872 -0.01916334
   0.07441156 -0.03949338 -0.06832974 -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 -0.00567061 -0.04559945 -0.03419447
  -0.03235593 -0.00259226  0.00286377 -0.02593034]
 [-0.08906294 -0.04464164 -0.01159501 -0.03665645  0.01219057  0.02499059
  -0.03603757  0.03430886  0.02269202 -0.00936191]]


In [80]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for evaluation; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    diabetes_X,
    diabetes.target,
    test_size=0.1,
    random_state=42,
)

In [94]:
from sklearn import linear_model

# Fit a linear regression model on the training split.
regr = linear_model.LinearRegression()
regr.fit(X_train, Y_train)

# Report the learned parameters. Both lines use the same
# "label, newline, value" layout; the intercept label carries no stray
# comma, so the value is printed on its own line without a leading ", ".
print('Coefficients:\n', regr.coef_)
print('Bias:\n', regr.intercept_)

Coefficients:
 [  19.92576904 -262.55453086  509.19112446  336.09693678 -849.29530342
  480.22076125  120.68418641  236.71853501  716.61035542   70.41045019]
Bias:
, 151.7227046642232


In [91]:
# Score the fitted model on the held-out split.
print(regr.score(X=X_test, y=Y_test))

0.5514251914993505


In [90]:
# Predictions for the first ten held-out samples.
print(regr.predict(X_test[:10]))

[143.06621271 177.70923973 134.80159283 288.66523611 123.58429291
  96.64399491 252.70865552 183.51563317  93.96508916 109.83316004]


In [89]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the whole held-out split, then report error and goodness of fit
# against the true targets, each rounded to two decimals.
diabetes_y_pred = regr.predict(X_test)

print("mean squared error: %.2f" % mean_squared_error(Y_test, diabetes_y_pred))
# NOTE(review): the value printed under this label comes from r2_score
# (coefficient of determination); the label text is kept unchanged.
print('variance score: %.2f' % r2_score(Y_test, diabetes_y_pred))

mean squared error: 2743.88
variance score: 0.55
