# Scikit-learn

In [64]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the advertising data set; the first CSV column is used as the row index.
advertising_csv = "../Data/Advertising.csv"
df = pd.read_csv(advertising_csv, index_col=0)
df.head()


Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [65]:
# Every row is a sample; every column except the "Sales" label is a feature.
row_count, column_count = df.shape
number_of_samples, number_of_features = row_count, column_count - 1
number_of_samples, number_of_features


(200, 3)

In [66]:
# Feature matrix X: everything except the label column. Label vector y: "Sales".
X = df.drop(columns="Sales")
y = df["Sales"]
X.head()


Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [67]:
# Preview the first rows of the label Series to confirm it holds the "Sales" values.
y.head()

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: Sales, dtype: float64

## Sklearn- steps

1. train|test split - in some cases a train|validation|test split
2. Scaling sometimes required
   - min-max scaling
   - standardization
   - ....
   - scale the training data
   - Scale test data to the training data ---> avoiding data leakage
3. Fit the algorithm to the training data - model training
4. Predict the test data
5. Evaluate


## Train|test-split

In [68]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows as test data; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((140, 3), (60, 3), (140,), (60,))

## Feature Scaling

Normalization (min-max feature scaling)
- $X' = \frac{X-X_{\min}}{X_{\max}-X_{\min}}$

Feature Standardization

- $X' = \frac{X-\mu}{\sigma}$

In [69]:
from sklearn.preprocessing import MinMaxScaler

# Normalization (min-max scaling): the scaler learns each feature's minimum and
# maximum from the training split only, and that same mapping is then applied
# to both splits so that no information from the test data leaks into training.
scaler = MinMaxScaler().fit(X_train)

scaled_X_train, scaled_X_test = scaler.transform(X_train), scaler.transform(X_test)

print(f"{scaled_X_train.min():.2f} ≤ scaled_X_train ≤ {scaled_X_train.max():.2f}")
# The test range is naturally not exactly [0, 1] because the scaler was fitted on
# the training data: scaled_X_test.min() != 0 and scaled_X_test.max() != 1.
print(f"{scaled_X_test.min():.2f} ≤ scaled_X_test ≤ {scaled_X_test.max():.2f}")

0.00 ≤ scaled_X_train ≤ 1.00
0.01 ≤ scaled_X_test ≤ 1.13


In [70]:
# Check the (samples, features) shapes of both scaled splits.
scaled_X_train.shape, scaled_X_test.shape

((140, 3), (60, 3))

## Linear regression algorithm

### LinearRegression()

In [71]:
from sklearn.linear_model import LinearRegression

# LinearRegression solves the least-squares problem with an SVD-based solver (scipy.linalg.lstsq) rather than by explicitly inverting the normal equation
#model_SVD = LinearRegression()
#model_SVD.fit(scaled_X_train, y_train)
#print(f"Parameters: {model_SVD.coef_}")
#print(f"Intercept: {model_SVD.intercept_}")

In [72]:
# Ordinary least squares on the scaled training data; fit() returns the fitted
# estimator, so construction and training can be chained.
model_OLS = LinearRegression().fit(scaled_X_train, y_train)
print(f"Parameters : {model_OLS.coef_}")  # beta1, beta2, beta3 (one weight per feature)
print(f"Intercept: {model_OLS.intercept_}")  # beta0

Parameters : [13.02832938  9.88465985  0.69237469]
Intercept: 2.7418553248528124


### Stochastic gradient descent

In [73]:
from sklearn.linear_model import SGDRegressor

# Linear regression fitted iteratively with stochastic gradient descent on the
# squared-error loss. SGDRegressor shuffles the training data between epochs by
# default, so without a fixed random_state the coefficients (and every metric
# computed from them further down) change from run to run. The seed matches the
# one used for the train|test split.
model_SGD = SGDRegressor(loss = "squared_error", max_iter = 10000, random_state = 42)
model_SGD.fit(scaled_X_train, y_train)
print(f"Parameters: {model_SGD.coef_}") # beta1,beta2,beta3
print(f"Intercept: {model_SGD.intercept_}") # beta0, stored as a length-1 array

Parameters: [11.97010773  9.02302067  1.32337779]
Intercept: [3.5634186]


### Manual test
We predict a single test sample and compare it with its true label as a manual sanity check.


In [74]:
# Keep the first test row as a 2-D (1, n_features) array by slicing instead of
# indexing, and pick out its true label by position.
test_sample_features = scaled_X_test[:1]
test_sample_label = y_test.iloc[0]
test_sample_features, test_sample_label

(array([[0.54988164, 0.63709677, 0.52286282]]), 16.9)

In [75]:
# Confirm the single sample is stored as a 2-D (1, n_features) array.
test_sample_features.shape

(1, 3)

In [76]:
# OLS prediction for the single sample; [0] unwraps the one-element result array.
model_OLS.predict(test_sample_features)[0]

16.56539629743484

In [77]:
# SGD prediction for the same sample; [0] unwraps the one-element result array.
model_SGD.predict(test_sample_features)[0]

16.586043440798882

In [78]:
# Side-by-side sanity check: the true label next to both model predictions,
# each prediction formatted with two decimals.
print(f"Scaled features {test_sample_features}, label {test_sample_label}")
print(f"Prediction OLS: {model_OLS.predict(test_sample_features)[0]:.2f}")
print(f"Prediction SGD: {model_SGD.predict(test_sample_features)[0]:.2f}")

Scaled features [[0.54988164 0.63709677 0.52286282]], label 16.9
Prediction OLS: 16.57
Prediction SGD: 16.59


##  Evaluation

In [80]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict the whole scaled test set with both fitted models (OLS first, then SGD)
# before any metric is computed.
y_pred_OLS, y_pred_SGD = (
    fitted_model.predict(scaled_X_test) for fitted_model in (model_OLS, model_SGD)
)

y_pred_OLS[:5]

array([16.5653963 , 21.18822792, 21.55107058, 10.88923816, 22.20231988])

In [81]:
# First five SGD predictions on the test set, for comparison with the OLS ones.
y_pred_SGD[:5]

array([16.58604344, 20.81431349, 21.10440273, 11.31204848, 21.40497442])

In [84]:
# Mean absolute error: average magnitude of the residuals on the test set.
mae_OLS, mae_SGD = (
    mean_absolute_error(y_test, predicted) for predicted in (y_pred_OLS, y_pred_SGD)
)

# Mean squared error: penalises large residuals more heavily.
mse_OLS, mse_SGD = (
    mean_squared_error(y_test, predicted) for predicted in (y_pred_OLS, y_pred_SGD)
)

# Root mean squared error: the MSE brought back to the unit of the label.
rmse_OLS, rmse_SGD = np.sqrt(mse_OLS), np.sqrt(mse_SGD)

print(f"OLS, MAE: {mae_OLS:.2f}, MSE: {mse_OLS:.2f}, RMSE: {rmse_OLS:.2f}")
print(f"SGD, MAE: {mae_SGD:.2f}, MSE: {mse_SGD:.2f}, RMSE: {rmse_SGD:.2f}")

OLS, MAE: 1.51, MSE: 3.80, RMSE: 1.95
SGD, MAE: 1.52, MSE: 4.08, RMSE: 2.02


In [87]:
# Same metrics with four decimals; the "=" specifier makes the f-string print
# each value as "name=value", and the tabs line the columns up.
print(f"{mae_OLS=:.4f} \t\t {mse_OLS=:.4f} \t {rmse_OLS=:.4f}")
print(f"{mae_SGD=:.4f} \t\t {mse_SGD=:.4f} \t {rmse_SGD=:.4f}")

mae_OLS=1.5117 		 mse_OLS=3.7968 	 rmse_OLS=1.9485
mae_SGD=1.5235 		 mse_SGD=4.0816 	 rmse_SGD=2.0203
