In [207]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

### Load dataset

In [208]:
# Read the Hitters data from the CSV file in the working directory
# and preview its first rows.
hitters_csv = 'Hitters.csv'
df = pd.read_csv(hitters_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,...,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,-Andy Allanson,293,66,1,30,29,14,1,293,66,...,30,29,14,A,E,446,33,20,,A
1,-Alan Ashby,315,81,7,24,38,39,14,3449,835,...,321,414,375,N,W,632,43,10,475.0,N
2,-Alvin Davis,479,130,18,66,72,76,3,1624,457,...,224,266,263,A,W,880,82,14,480.0,A
3,-Andre Dawson,496,141,20,65,78,37,11,5628,1575,...,828,838,354,N,E,200,11,3,500.0,N
4,-Andres Galarraga,321,87,10,39,42,30,2,396,101,...,48,46,33,N,E,805,40,4,91.5,N


In [209]:
# Drop the leading 'Unnamed: 0' column (the player names read from the CSV) by
# name rather than by position: a positional `df.iloc[:, 1:]` strips one more
# real column every time the cell is re-run, whereas this is a no-op once the
# column is gone (`errors='ignore'`).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Count the missing values per column; as before, only Salary has any (59).
df.isnull().sum()

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64

In [210]:
# 3. Keep only the rows without any missing value
# (`axis=0, how='any'` are the defaults, spelled out for the reader).
df_hitter = df.dropna(axis=0, how='any')

In [211]:
# Sanity check: the per-column count of missing values should now be zero everywhere.
df_hitter.isna().sum()

AtBat        0
Hits         0
HmRun        0
Runs         0
RBI          0
Walks        0
Years        0
CAtBat       0
CHits        0
CHmRun       0
CRuns        0
CRBI         0
CWalks       0
League       0
Division     0
PutOuts      0
Assists      0
Errors       0
Salary       0
NewLeague    0
dtype: int64

In [212]:
# Report how many rows were lost by dropping the incomplete observations.
shape_message = 'Shape {} removal of null values {}'
print(shape_message.format('before', df.shape))
print(shape_message.format('after', df_hitter.shape))

Shape before removal of null values (322, 20)
Shape after removal of null values (263, 20)


In [213]:
# Boolean mask over the column names: True where the column has object dtype.
object_col = df_hitter.dtypes == object
# A boolean Series can filter itself directly; show the names of the selected columns.
object_col[object_col].index

Index(['League', 'Division', 'NewLeague'], dtype='object')

In [214]:
# Preview only the object-dtype columns, selected with the boolean column mask.
df_hitter.loc[:, object_col].head()

Unnamed: 0,League,Division,NewLeague
1,N,W,N
2,A,W,A
3,N,E,N
4,N,E,N
5,A,W,A


In [215]:
# Summarise the object-dtype columns: count, number of distinct values, most frequent value.
df_hitter.loc[:, object_col].describe()

Unnamed: 0,League,Division,NewLeague
count,263,263,263
unique,2,2,2
top,A,W,A
freq,139,134,141


In [216]:
# One-hot encode the object-dtype columns: one indicator column per category level.
categorical_part = df_hitter.loc[:, object_col]
dummies = pd.get_dummies(categorical_part)
dummies.head()

Unnamed: 0,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
1,0,1,0,1,0,1
2,1,0,0,1,1,0
3,0,1,1,0,0,1
4,0,1,1,0,0,1
5,1,0,0,1,1,0


In [217]:
# In total there are 6 dummy columns, but each pair is redundant (one column is
# the complement of the other), so keep a single indicator per variable.
# Selecting by name instead of the positional `iloc[:, 1::2]` keeps the choice
# explicit and fails loudly (KeyError) if the encoding ever produces different
# columns, rather than silently picking the wrong ones.
df_dummies = dummies[['League_N', 'Division_W', 'NewLeague_N']]
df_dummies.head()

Unnamed: 0,League_N,Division_W,NewLeague_N
1,1,1,1
2,0,1,0
3,1,0,1
4,1,0,1
5,0,1,0


In [218]:
# Remove the object-dtype columns now that they are encoded as dummy variables.
# Rebinding the name instead of using `inplace=True` avoids pandas'
# SettingWithCopyWarning, which is raised when a frame derived from another
# frame is modified in place. `errors='ignore'` keeps the cell re-runnable once
# the columns have already been removed.
df_hitter = df_hitter.drop(columns=object_col[object_col].index, errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [219]:
# Put the numeric columns and the dummy indicators side by side
# (column-wise concatenation, aligned on the row index).
df = pd.concat((df_hitter, df_dummies), axis='columns')
df.shape

(263, 20)

In [220]:
# Preview the combined frame; `head()` shows the first five rows by default.
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,League_N,Division_W,NewLeague_N
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,1,1,1
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,0,1,0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0,1,0,1
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5,1,0,1
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0,0,1,0


In [221]:
# Start from every column name of the frame ...
features = list(df.columns)

# ... and take the target out, leaving only the predictor names.
del features[features.index('Salary')]
features

['AtBat',
 'Hits',
 'HmRun',
 'Runs',
 'RBI',
 'Walks',
 'Years',
 'CAtBat',
 'CHits',
 'CHmRun',
 'CRuns',
 'CRBI',
 'CWalks',
 'PutOuts',
 'Assists',
 'Errors',
 'League_N',
 'Division_W',
 'NewLeague_N']

In [222]:
# Split the frame into the predictor matrix and the target (response) vector.
x = df.loc[:, features]
y = df.loc[:, 'Salary']

In [223]:
# Show the dimensions of the predictors and of the target on one line.
print(*(part.shape for part in (x, y)))

(263, 19) (263,)


### Ridge Regression

In [227]:
# Regularisation strengths to try: 13 log-spaced values from 1e-2 to 1e10,
# followed by three extra hand-picked values.
log_spaced_alphas = np.logspace(-2, 10, num=13)
extra_alphas = [2 * 11.498, 2 * 705, 1.5199]
alphas = np.concatenate([log_spaced_alphas, extra_alphas])
alphas

array([1.0000e-02, 1.0000e-01, 1.0000e+00, 1.0000e+01, 1.0000e+02,
       1.0000e+03, 1.0000e+04, 1.0000e+05, 1.0000e+06, 1.0000e+07,
       1.0000e+08, 1.0000e+09, 1.0000e+10, 2.2996e+01, 1.4100e+03,
       1.5199e+00])

In [228]:
# Fit one ridge regression per regularisation strength and collect the fitted
# intercept and coefficients (in the original units of the predictors) in a
# table with one column per alpha.
#
# `Ridge(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The same fit is obtained with a pipeline that scales every predictor by its
# standard deviation and a Ridge whose penalty is multiplied by the number of
# samples: `normalize=True` divided each centred column by its l2 norm, which
# equals std * sqrt(n_samples), so alpha must be rescaled by n_samples.
n_samples = x.shape[0]

ridge_models = []
results = []
coefficient_columns = {}
for alpha in alphas:
    model = make_pipeline(
        StandardScaler(with_mean=False),
        Ridge(alpha=alpha * n_samples, fit_intercept=True),
    )
    model.fit(x, y)
    ridge_models.append(model)

    scaler = model.named_steps['standardscaler']
    ridge = model.named_steps['ridge']
    # The ridge step sees x / scale_, so dividing its coefficients by scale_
    # expresses them per unit of the original predictors; the intercept is
    # unaffected because the scaler does not shift the data.
    coefficient_columns['Alpha {}'.format(alpha)] = np.hstack(
        [ridge.intercept_, ridge.coef_ / scaler.scale_]
    )

index = ['Intercept'] + features
# Build the table in one go instead of growing an empty frame column by column.
df_model = pd.DataFrame(coefficient_columns, index=index)

In [229]:
df_model.head()

Unnamed: 0,Alpha 0.01,Alpha 0.1,Alpha 1.0,Alpha 10.0,Alpha 100.0,Alpha 1000.0,Alpha 10000.0,Alpha 100000.0,Alpha 1000000.0,Alpha 10000000.0,Alpha 100000000.0,Alpha 1000000000.0,Alpha 10000000000.0,Alpha 22.996,Alpha 1410.0,Alpha 1.5199
Intercept,147.109548,52.604119,26.666878,290.028874,496.797786,531.766371,535.50729,535.883996,535.921693,535.9255,535.9258,535.9259,535.9259,396.482569,532.969849,52.026887
AtBat,-1.583572,-0.397827,0.09798,0.069853,0.011285,0.0012,0.000121,1.2e-05,1e-06,1.208959e-07,1.20896e-08,1.20896e-09,1.20896e-10,0.040057,0.000853,0.111494
Hits,5.546088,2.062878,0.767038,0.273515,0.041257,0.004358,0.000438,4.4e-05,4e-06,4.385436e-07,4.385439e-08,4.385439e-09,4.385439e-10,0.150266,0.003096,0.662912
HmRun,0.632268,-1.32131,0.893927,0.952606,0.163672,0.017532,0.001766,0.000177,1.8e-05,1.767093e-06,1.767094e-07,1.767095e-08,1.767095e-09,0.566905,0.012463,1.16382
Runs,-0.280001,1.144877,1.017932,0.448171,0.069536,0.007367,0.000741,7.4e-05,7e-06,7.416103e-07,7.416108e-08,7.416108e-09,7.416108e-10,0.250534,0.005235,0.943735


In [199]:
# Coefficients for alpha = 2 * 11.498 and the l_2 norm of those coefficients.
# The column label is built from the same expression that was used for the
# alpha grid, so it cannot drift from the value's printed form.
ridge_column = 'Alpha {}'.format(2 * 11.498)

# Skip the first row (the intercept): the norm is taken over the slope
# coefficients only. `np.linalg.norm` replaces the hand-rolled
# sqrt-of-sum-of-squares, whose lambda argument shadowed the predictor frame `x`.
l_2_norm = np.linalg.norm(df_model[ridge_column].iloc[1:])
print('l_2 norm of coefficients: {}'.format(l_2_norm))
df_model[ridge_column]

l_2 norm of coefficients: 7.000275850415389


Intercept      396.482569
AtBat            0.040057
Hits             0.150266
HmRun            0.566905
Runs             0.250534
RBI              0.260034
Walks            0.314859
Years            1.195985
CAtBat           0.003391
CHits            0.012634
CHmRun           0.094870
CRuns            0.025346
CRBI             0.026170
CWalks           0.027059
PutOuts          0.018061
Assists          0.002852
Errors          -0.023451
League_N         0.145566
Division_W      -6.843212
NewLeague_N      0.368163
Name: Alpha 22.996, dtype: float64