# Scikit-learn

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the advertising data and discard the "Unnamed: 0" column (presumably the
# row index that was saved into the CSV) in one chained expression, so the
# frame is never mutated in place.
df = pd.read_csv("../data/Advertising.csv").drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [41]:
# Derive the sample and feature counts from the frame's shape.
number_of_samples, number_of_columns = df.shape
# One column (Sales) is the prediction target, so it does not count as a feature.
number_of_features = number_of_columns - 1
print("Number of samples: ", number_of_samples)
print("Number of features: ", number_of_features)


Number of samples:  200
Number of features:  3


In [42]:
# Split the frame into the feature matrix and the response variable.
# By convention the features get a capital X and the response a lowercase y.
y = df["Sales"]
X = df.drop(columns="Sales")  # columns= is the explicit spelling of axis=1 (axis=0 would drop rows)
X.head()

Unnamed: 0,TV,Radio,Newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [43]:
# Preview the first five values of the response variable (Sales)
y.head(n=5)

0    22.1
1    10.4
2     9.3
3    18.5
4    12.9
Name: Sales, dtype: float64

## Sklearn - typical steps
  
1. train|test split, sometimes train|val|test split
  (a separate validation set is needed if we want to tune hyperparameters and evaluate the candidates).
2. scaling sometimes required.
    - min-max scaling
    - standardization
    - ...
    - fit the scaler on the training data and scale the training data,
    - scale the test data with the scaler fitted on the training data --> avoids data leakage.
3. Fit algorithm to training data (Actually training the model).
4. Predict test data
5. Evaluate

This recipe applies to many algorithms.

## Train|test split

In [44]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Report the size of each partition (rows = samples, columns = features).
train_samples, train_features = X_train.shape
test_samples, test_features = X_test.shape
print("Number of samples in training set: ", train_samples, " and number of features: ", train_features)
print("Number of samples in testing set: ", test_samples, " and number of features: ", test_features)


Number of samples in training set:  140  and number of features:  3
Number of samples in testing set:  60  and number of features:  3


## Feature scaling
Normalization (min-max feature scaling)
### $X' = \frac{X-X_{\min}}{X_{\max}-X_{\min}}$

Feature standardization
### $X' = \frac{X-\mu}{\sigma}$


In [45]:
from sklearn.preprocessing import MinMaxScaler

# Scaling the data
scaler = MinMaxScaler()

# Fit the scaler on the training data ONLY and scale it in the same step.
# (fit_transform already fits, so a separate scaler.fit(X_train) call beforehand is redundant.)
scaled_X_train = scaler.fit_transform(X_train)
# Reuse the min/max learned from the training data for the test data --> avoids data leakage.
scaled_X_test = scaler.transform(X_test)

print(f"{scaled_X_train.min() = }")
print(f"{scaled_X_train.max() = }")
print(f"{scaled_X_test.min() = }")
print(f"{scaled_X_test.max() = }")
# Note: scaled_X_test min and max are not exactly 0 and 1 because the test data is scaled
# with the min and max of the training data:
# 0 <= scaled_X_train <= 1
# 0.005964 <= scaled_X_test <= 1.130218

# Note: this type of scaling is very sensitive to outliers because it is based on the min and max values of the data.
# It is good practice to remove outliers before scaling the data.

# The choice of model and the choice of scaling can both be treated as hyperparameters.


scaled_X_train.min() = 0.0
scaled_X_train.max() = 1.0
scaled_X_test.min() = 0.005964214711729622
scaled_X_test.max() = 1.1302186878727631


In [46]:
# The scaler returns NumPy arrays; inspect the (rows, columns) shape of each one
tuple(scaled.shape for scaled in (scaled_X_train, scaled_X_test))

((140, 3), (60, 3))

# Linear regression

In [47]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares: LinearRegression fits coefficients w = (w1, ..., wp) that
# minimize the residual sum of squares between the observed targets and the targets
# predicted by the linear approximation.
# fit() returns the estimator itself, so the model is created and trained on the
# scaled training data in one expression.
model_OLS = LinearRegression().fit(scaled_X_train, y_train)

# coef_ holds beta_1, beta_2, beta_3 (one per feature); intercept_ holds beta_0
print(f"Parameters: {model_OLS.coef_}")
print(f"Intercept: {model_OLS.intercept_}")


Parameters: [13.02832938  9.88465985  0.69237469]
Intercept: 2.7418553248528124


## Stochastic gradient descent

In [53]:
from sklearn.linear_model import SGDRegressor

# Creating a model
# SGDRegressor is fitted by minimizing a regularized empirical loss with stochastic
# gradient descent. The optimizer shuffles the training data, so without a seed the
# coefficients (and every metric computed from them) change on each run.
# random_state pins the shuffling so Restart & Run All reproduces the same numbers;
# 42 matches the seed used for the train|test split.
model_SGD = SGDRegressor(loss="squared_error", max_iter=10000, random_state=42)
model_SGD.fit(scaled_X_train, y_train)  # fit the model to the scaled training data

print(f"Parameters: {model_SGD.coef_}") # beta_1, beta_2, beta_3
print(f"Intercept: {model_SGD.intercept_}") # beta_0 (stored as a 1-element array)

Parameters: [11.97674049  9.01777722  1.34945842]
Intercept: [3.55935577]


## Manual prediction

In [61]:
# First row of the scaled test set, kept 2-D with shape (1, n_features) so it can be
# passed to predict() as a single sample.
test_sample_features = scaled_X_test[:1]
# The actual Sales value of that same first test sample
test_sample_label = y_test.iloc[0]
test_sample_features, test_sample_label

(array([[0.54988164, 0.63709677, 0.52286282]]), 16.9)

In [64]:
# Predict the label of the single test sample with the OLS model.
# predict() returns one value per input row, so index 0 picks out the prediction
# for the first sample in the test set.
ols_sample_prediction = model_OLS.predict(test_sample_features)
ols_sample_prediction[0]

16.56539629743484

In [65]:
# Predict the same test sample with the SGD model.
# predict() returns one value per input row, so index 0 picks out the prediction
# for the first sample in the test set.
sgd_sample_prediction = model_SGD.predict(test_sample_features)
sgd_sample_prediction[0]

16.595923854092426

## Evaluation

In [70]:
# Evaluating the models on the held-out test set
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict the labels of the test set. The models were fitted on scaled features,
# so it is important to feed them the scaled test set as well.
y_pred_OLS, y_pred_SGD = (
    fitted_model.predict(scaled_X_test) for fitted_model in (model_OLS, model_SGD)
)

# Mean absolute error: the closer to 0, the better the model
mae_OLS = mean_absolute_error(y_true=y_test, y_pred=y_pred_OLS)
mae_SGD = mean_absolute_error(y_true=y_test, y_pred=y_pred_SGD)

# Mean squared error: penalizes large errors more heavily than MAE
mse_OLS = mean_squared_error(y_true=y_test, y_pred=y_pred_OLS)
mse_SGD = mean_squared_error(y_true=y_test, y_pred=y_pred_SGD)

# Root mean squared error: brings the MSE back to the units of the target
rmse_OLS, rmse_SGD = np.sqrt([mse_OLS, mse_SGD])

# Report each metric for OLS first, then SGD. In the run recorded in this notebook
# OLS scores lower (better) than SGD on all three metrics.
print(f"Mean absolute error of OLS: {mae_OLS}")
print(f"Mean absolute error of SGD: {mae_SGD}")
print(f"Mean squared error of OLS: {mse_OLS}")
print(f"Mean squared error of SGD: {mse_SGD}")
print(f"Root mean squared error of OLS: {rmse_OLS}")
print(f"Root mean squared error of SGD: {rmse_SGD}")


Mean absolute error of OLS: 1.511669222454909
Mean absolute error of SGD: 1.5231020384464784
Mean squared error of OLS: 3.7967972367152223
Mean squared error of SGD: 4.0876225249670926
Root mean squared error of OLS: 1.9485372043446392
Root mean squared error of SGD: 2.0217869632993217
