# The goal of this assignment is to load scikit learn datasets, perform some data exploration and train a regression model.
#### Use Manual Hypertuning

## We use the diabetics dataset which is modelled for regression task

In [16]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
import pandas as pd

In [17]:
diabetes=load_diabetes()
diabetes.keys()

dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])

In [18]:
diabetes.data.shape

(442, 10)

In [19]:
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [20]:
print (diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bra

In [21]:
diabetes.target # Progression of diabetics

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

# Load the data into a pandas frame and explore

In [22]:
df = pd.DataFrame(diabetes.data)
df.columns = diabetes.feature_names
df['PROGRESS'] = diabetes.target

In [23]:
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,PROGRESS
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [24]:
df.age.describe()

count    4.420000e+02
mean    -3.634285e-16
std      4.761905e-02
min     -1.072256e-01
25%     -3.729927e-02
50%      5.383060e-03
75%      3.807591e-02
max      1.107267e-01
Name: age, dtype: float64

#### This dataset is already scaled as per the description. So we can skip on the preprocessing or scaling aspect

In [25]:
#Split the dataset into train and test

In [26]:
X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, random_state=5, test_size=0.2) 

In [27]:
#Fit a regression model and score it

regressor = Ridge()  
regressor.fit(X_train, y_train) #training the algorithm
print("R^2 on training set: %f" % regressor.score(X_train, y_train))
print("R^2 on test set: %f" % regressor.score(X_test, y_test))

R^2 on training set: 0.430110
R^2 on test set: 0.431605


In [28]:
features = diabetes.feature_names
coefficents =regressor.coef_ # these are the coeff for our 13 features
df_diabetes_model=pd.DataFrame(features,
                             columns= ['feature'])
df_diabetes_model['coeff'] = coefficents
df_diabetes_model.head(15)

Unnamed: 0,feature,coeff
0,age,34.980295
1,sex,-70.418618
2,bmi,289.990899
3,bp,197.301093
4,s1,14.361706
5,s2,-6.028385
6,s3,-140.961042
7,s4,112.855692
8,s5,215.648129
9,s6,95.414386


In [29]:
# Find the rmse metrics for both train and test data
y_test_pred = regressor.predict(X_test)
y_train_pred = regressor.predict(X_train)
print (mean_squared_error(y_train_pred, y_train))
print (mean_squared_error(y_test_pred, y_test))

3319.3669227403043
3584.0955479468953


In [15]:
# Hypertune your model parameters

for value in [0.001, 0.01, 0.1, 1, 10, 100]:
    print ("="* 10)
    print (f"The value for regularization is: {value}")
    clf_ridge = Ridge(alpha=value)
    clf_ridge.fit(X_train, y_train)
    print("R^2 on training set: %f" % clf_ridge.score(X_train, y_train))
    print("R^2 on test set: %f" % clf_ridge.score(X_test, y_test))
    

The value for regularization is: 0.001
R^2 on training set: 0.511486
R^2 on test set: 0.527259
The value for regularization is: 0.01
R^2 on training set: 0.509993
R^2 on test set: 0.526460
The value for regularization is: 0.1
R^2 on training set: 0.504853
R^2 on test set: 0.517717
The value for regularization is: 1
R^2 on training set: 0.430110
R^2 on test set: 0.431605
The value for regularization is: 10
R^2 on training set: 0.158308
R^2 on test set: 0.157495
The value for regularization is: 100
R^2 on training set: 0.021484
R^2 on test set: 0.014421
