In [12]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# Read the feature matrix and the matching price targets from disk.
training_data = np.load('./training_data.npy')
prices = np.load('./prices.npy')
# Preview the leading four rows of each array while they are still in on-disk order.
print('The first 4 samples are:\n ', training_data[0:4])
print('The first 4 prices are:\n ', prices[0:4])
# Reorder samples and prices together so rows stay aligned; the fixed seed
# keeps the ordering reproducible from run to run.
training_data, prices = shuffle(training_data, prices, random_state=0)

The first 4 samples are:
  [[2.0150e+03 4.1000e+04 1.9670e+01 1.5820e+03 1.2620e+02 5.0000e+00
  1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0110e+03 4.6000e+04 1.8200e+01 1.1990e+03 8.8700e+01 5.0000e+00
  1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0120e+03 8.7000e+04 2.0770e+01 1.2480e+03 8.8760e+01 7.0000e+00
  1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0130e+03 8.6999e+04 2.3080e+01 1.4610e+03 6.3100e+01 5.0000e+00
  1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]]
The first 4 prices are:
  [12.5  4.5  6.   3.5]


In [13]:
def normalize_data(training_data, norm_type = 'standardize'):
    """Fit a scaler on ``training_data`` and return the scaled data with the scaler.

    ``norm_type == 'standardize'`` selects ``StandardScaler``; any other value
    selects ``MinMaxScaler``.

    Returns a ``(scaled_data, scaler)`` tuple, where ``scaler`` is the fitted
    instance that produced ``scaled_data``.
    """
    if norm_type == 'standardize':
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()

    # fit_transform both learns the scaling statistics and applies them.
    return scaler.fit_transform(training_data), scaler


In [22]:
def cross_validate_regression_model(model, X, y, kfolds = 3):
    """Cross-validate ``model`` on ``(X, y)`` and report the mean MAE and MSE.

    Parameters
    ----------
    model : estimator handed to ``sklearn.model_selection.cross_validate``.
    X : feature matrix.
    y : regression targets, one per row of ``X``.
    kfolds : forwarded to ``cross_validate`` as ``cv`` (default 3).

    Returns
    -------
    (mae, mse) : the per-fold test mean absolute error and mean squared error,
        each averaged over the folds. Both values are also printed.
    """
    # Score against the caller-supplied targets `y`, not a notebook-level
    # variable, so the function works for any (X, y) pair.
    scores = cross_validate(
        model,
        X,
        y,
        scoring=('neg_mean_absolute_error', 'neg_mean_squared_error'),
        cv=kfolds,
    )
    # The scorers return negated errors; flip the sign back before averaging.
    mae = np.mean(-scores['test_neg_mean_absolute_error'])
    mse = np.mean(-scores['test_neg_mean_squared_error'])
    print("Mean MAE", mae)
    print("MEAN MSE", mse)
    return mae, mse


In [23]:
# Standardize all features up front; the fitted scaler is discarded here.
# NOTE(review): the scaler is fit on every row before the folds are split, so
# each fold's held-out rows contribute to its scaling statistics.
X, _ = normalize_data(training_data, norm_type='standardize')
linear_regression_model = LinearRegression()
# Kept as the final expression so the notebook displays the (mae, mse) tuple.
cross_validate_regression_model(model=linear_regression_model, X=X, y=prices, kfolds=4)

Mean MAE 1.317816585302353
MEAN MSE 3.161262333393097


(1.317816585302353, 3.161262333393097)

In [30]:
# Pick the Ridge regularization strength with the lowest 3-fold CV MSE.
# Start from +inf / None so the first finite score always wins, whatever the
# scale of the targets, and an alpha that was never evaluated cannot be
# reported as the best one.
best_mse = np.inf
best_alpha = None
for alpha in [1,10,100,1000]:
    ridge_model = Ridge(alpha=alpha)
    print('Alpha',alpha)
    mae, mse = cross_validate_regression_model(ridge_model,X, prices, kfolds=3)
    # Strict `<` keeps the earliest alpha when two candidates tie.
    if mse < best_mse:
        best_mse = mse
        best_alpha = alpha

print('Best alpha', best_alpha)
print(best_mse)

Alpha 1
Mean MAE 1.319583574930827
MEAN MSE 3.1674206256866455
Alpha 10
Mean MAE 1.319376826286316
MEAN MSE 3.1672795613606772
Alpha 100
Mean MAE 1.318572719891866
MEAN MSE 3.1722089449564614
Alpha 1000
Mean MAE 1.3664931058883667
MEAN MSE 3.4328153928120932
Best alpha 10
3.1672795613606772


In [36]:
# Refit Ridge on every row with alpha=10, then inspect the learned parameters.
ridge_model = Ridge(alpha=10).fit(X, prices)
print('Bias',ridge_model.intercept_)
print('Coeffs', ridge_model.coef_)
# Negating before argsort lists feature indices from the largest coefficient
# down to the smallest (signed values, not magnitudes).
print("Coeffs inds desc",np.argsort(-ridge_model.coef_))

Bias 5.69513
Coeffs [ 1.6635183  -0.15533449 -0.4603473   0.40462846  1.3356851   0.13251413
 -0.08683531  0.          0.36667007 -0.36665967  0.          0.
 -0.22936955  0.22937755]
Coeffs inds desc [ 0  4  3  8 13  5  7 10 11  6  1 12  9  2]
