In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [6]:
x = df.drop(['MEDV'], axis =1)
y = df['MEDV']

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)


In [8]:
Ridge_model = Ridge(alpha=1).fit(x_train, y_train)
Ridge_model.intercept_

24.878370472969436

In [9]:
y_pred = Ridge_model.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

4.741357980709098

In [10]:
Ridge_model.coef_

array([-0.12383039,  0.03139178,  0.01767668,  2.54277179, -8.77249222,
        4.37980204, -0.01537349, -1.29086084,  0.24406848, -0.01082435,
       -0.83346553,  0.01348642, -0.53435396])

In [11]:
r2_score(y_test, y_pred)

0.6789748327846081

In [13]:
from sklearn.model_selection import GridSearchCV
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#define grid
grid = dict()
grid['alpha'] = np.arange(0, 1, 0.1)
model= Ridge()
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv =cv, n_jobs=-1)

results = search.fit(x_train, y_train)
print('MAE: %.3f' % results.best_score_)
print('config: %s' % results.best_params_)

MAE: -3.500
config: {'alpha': 0.7000000000000001}


In [14]:
Ridge_model = Ridge(alpha = 0.7,).fit(x_train, y_train)
y_pred = Ridge_model.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

4.7314372103936835

In [15]:
r2_score(y_test, y_pred)

0.680316847044105

In [17]:
pd.Series(Ridge_model.coef_, index = x_train.columns)

CRIM       -0.124643
ZN          0.031027
INDUS       0.023550
CHAS        2.595725
NOX       -10.178221
RM          4.382223
AGE        -0.014263
DIS        -1.311503
RAD         0.246439
TAX        -0.010654
PTRATIO    -0.850029
B           0.013431
LSTAT      -0.530846
dtype: float64