## Effect of min-max scaling on regression in three of its variants (ordinary least squares, ridge and lasso)

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn import datasets

### load data

In [None]:
# Read the space-delimited murders table, then separate the three predictor
# columns from the 'murders' column that the models are fitted against.
murders = pd.read_csv('data/murders.txt', delimiter=" ")
attributes = ['inhabitants', 'income', 'unemployment']
X, y = murders[attributes], murders['murders']

FileNotFoundError: [Errno 2] No such file or directory: 'data/murders.txt'

In [None]:
# Preview the first five rows of the predictor columns.
X.head(n=5)

Unnamed: 0,inhabitants,income,unemployment
0,587,16.5,6.2
1,643,20.5,6.4
2,635,26.3,9.3
3,692,16.5,5.3
4,1248,19.2,7.3


### split into train and test ..

In [None]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=55)

# Report the shapes of each split: training first, then test.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(15, 3) (15,)
(5, 3) (5,)


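### now, fit the scaler on the training data only and apply the same transformation to the test data
(fitting on the training data alone avoids **_leakage_**, an important concept in machine learning: no information from the test data may influence preprocessing or training)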

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training split only; the same fitted scaler
# is then used to transform both the training and the test split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

### visualize the transformation of _training_ data .. all values are between 0 and 1

In [1]:
# Wrap the scaled training array in a DataFrame that reuses X's column names,
# then show its first rows.
train_scaled_frame = pd.DataFrame(X_train_scaled, columns=X.columns.tolist())
train_scaled_frame.head()

NameError: name 'pd' is not defined

### visualize the transformation of _test_ data .. values can fall outside the [0, 1] range, because the scaler's minimum and maximum come from the training data only

In [None]:
# Wrap the scaled test array in a DataFrame that reuses X's column names,
# then show its first rows.
test_scaled_frame = pd.DataFrame(X_test_scaled, columns=X.columns.tolist())
test_scaled_frame.head()

Unnamed: 0,inhabitants,income,unemployment
0,0.299637,0.321429,0.648649
1,1.253853,0.047619,0.486486
2,0.021759,1.166667,1.189189
3,0.073436,-0.261905,0.405405
4,3.312783,0.190476,0.297297


### now, apply regression _with_ and _without_ scaling

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Penalty strength passed to both Lasso and Ridge.
alpha = 1


def _fit_linear_models(features, target, penalty):
    """Fit OLS, lasso and ridge on the same data.

    Returns the fitted estimators as a tuple ``(ols, lasso, ridge)``;
    ``penalty`` is passed as ``alpha`` to Lasso and Ridge.
    """
    return (
        LinearRegression().fit(features, target),
        Lasso(alpha=penalty).fit(features, target),
        Ridge(alpha=penalty).fit(features, target),
    )


# Same three estimators, once on the raw features and once on the scaled ones.
ols, lasso, ridge = _fit_linear_models(X_train, y_train, alpha)
ols_scaled, lasso_scaled, ridge_scaled = _fit_linear_models(X_train_scaled, y_train, alpha)

In [None]:
# Each entry: (row label, fitted model, training features, test features).
# The labels keep their exact trailing padding so the printed table is unchanged.
unscaled_rows = [
    ("ols      ", ols, X_train, X_test),
    ("lasso    ", lasso, X_train, X_test),
    ("ridge    ", ridge, X_train, X_test),
]
scaled_rows = [
    ("ols_sc  ", ols_scaled, X_train_scaled, X_test_scaled),
    ("lasso_sc", lasso_scaled, X_train_scaled, X_test_scaled),
    ("ridge_sc", ridge_scaled, X_train_scaled, X_test_scaled),
]

print("model    \ttrain\ttest")
for group in (unscaled_rows, scaled_rows):
    # A blank line precedes every group of rows.
    print()
    for label, model, train_features, test_features in group:
        train_score = model.score(train_features, y_train)
        test_score = model.score(test_features, y_test)
        print("{}\t{:.4f}\t{:.4f}".format(label, train_score, test_score))



model    	train	test

ols      	0.8387	0.3677
lasso    	0.8238	0.4810
ridge    	0.8375	0.4169

ols_sc  	0.8387	0.3677
lasso_sc	0.6830	0.5992
ridge_sc	0.7765	0.6137
