This example is a modification of an example in https://towardsdatascience.com/from-linear-regression-to-ridge-regression-the-lasso-and-the-elastic-net-4eaecaf5f7e6

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
import os
cancerData=pd.read_csv('data/prostate.txt',delimiter='\t')

In [3]:
print(cancerData)

    id    lcavol   lweight  age      lbph  svi       lcp  gleason  pgg45  \
0    1 -0.579818  2.769459   50 -1.386294    0 -1.386294        6      0   
1    2 -0.994252  3.319626   58 -1.386294    0 -1.386294        6      0   
2    3 -0.510826  2.691243   74 -1.386294    0 -1.386294        7     20   
3    4 -1.203973  3.282789   58 -1.386294    0 -1.386294        6      0   
4    5  0.751416  3.432373   62 -1.386294    0 -1.386294        6      0   
..  ..       ...       ...  ...       ...  ...       ...      ...    ...   
92  93  2.830268  3.876396   68 -1.386294    1  1.321756        7     60   
93  94  3.821004  3.896909   44 -1.386294    1  2.169054        7     40   
94  95  2.907447  3.396185   52 -1.386294    1  2.463853        7     10   
95  96  2.882564  3.773910   68  1.558145    1  1.558145        7     80   
96  97  3.471966  3.974998   68  0.438255    1  2.904165        7     20   

        lpsa train  
0  -0.430783     T  
1  -0.162519     T  
2  -0.162519     T  
3  

In [4]:
trainCancer=cancerData[cancerData.loc[:,'train']=='T']
testCancer=cancerData[cancerData.loc[:,'train']=='F']
x_train=trainCancer.drop(columns=['id','lpsa','train','svi','gleason'])
x_test= testCancer.drop(columns=['id','lpsa','train','svi','gleason'])
y_train=trainCancer.loc[:,'lpsa']
x_train_scaled=sklearn.preprocessing.scale(x_train, axis=0, with_mean=True,with_std=True, copy=True)
x_test_scaled=sklearn.preprocessing.scale(x_test, axis=0, with_mean=True,with_std=True, copy=True)
y_test=testCancer.loc[:,'lpsa']


In [6]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(x_train_scaled,y_train)
print("lr.coef_:", lr.coef_)
print("lr.intercept_:", lr.intercept_)

lr.coef_: [ 0.78439116  0.3273385  -0.15323211  0.16153859 -0.15480369  0.28357531]
lr.intercept_: 2.4523450850746262


In [7]:
print("Training set score:  {:.2f}".format(lr.score(x_train_scaled,y_train)))
print("Test set score:  {:.2f}".format(lr.score(x_test_scaled,y_test)))

Training set score:  0.66
Test set score:  0.40


Vary alpha in example below to check effect of regularisation

In [13]:
from sklearn.linear_model import Ridge
lr = Ridge(alpha=30).fit(x_train_scaled,y_train)
print("lr.coef_:", lr.coef_)
print("lr.intercept_:", lr.intercept_)
print("Training set score:  {:.2f}".format(lr.score(x_train_scaled,y_train)))
print("Test set score:  {:.2f}".format(lr.score(x_test_scaled,y_test)))

lr.coef_: [ 0.45113504  0.25401597 -0.02744823  0.13620735  0.09472754  0.17215893]
lr.intercept_: 2.4523450850746262
Training set score:  0.61
Test set score:  0.47


Vary alpha in example below to check effect of regularisation

In [17]:
from sklearn.linear_model import Lasso
lr = Lasso(alpha=0.2).fit(x_train_scaled,y_train)
print("lr.coef_:", lr.coef_)
print("lr.intercept_:", lr.intercept_)
print("Training set score:  {:.2f}".format(lr.score(x_train_scaled,y_train)))
print("Test set score:  {:.2f}".format(lr.score(x_test_scaled,y_test)))

lr.coef_: [0.60493138 0.19780351 0.         0.         0.         0.03015419]
lr.intercept_: 2.4523450850746267
Training set score:  0.58
Test set score:  0.53
