In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the advertising dataset; the first CSV column is used as the row index
# rather than as a feature.
df = pd.read_csv("../data/Advertising.csv", index_col=0)
# Expected shape: (200, 4)
df.head(n=5)  # preview the first five rows

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [7]:
# Target vector: the "Sales" column.
Y = df["Sales"]
# Feature matrix: every remaining column once "Sales" is removed.
X = df.drop(columns="Sales")
X, Y  # display both as a tuple

(        TV  Radio  Newspaper
 1    230.1   37.8       69.2
 2     44.5   39.3       45.1
 3     17.2   45.9       69.3
 4    151.5   41.3       58.5
 5    180.8   10.8       58.4
 ..     ...    ...        ...
 196   38.2    3.7       13.8
 197   94.2    4.9        8.1
 198  177.0    9.3        6.4
 199  283.6   42.0       66.2
 200  232.1    8.6        8.7
 
 [200 rows x 3 columns],
 1      22.1
 2      10.4
 3       9.3
 4      18.5
 5      12.9
        ... 
 196     7.6
 197     9.7
 198    12.8
 199    25.5
 200    13.4
 Name: Sales, Length: 200, dtype: float64)

### A machine-learning workflow

1. Train / test split; if the algorithm needs separate validation data (e.g. k-fold cross-validation), we have to split multiple times
2. Data processing, e.g. polynomial features, scaling, centering, etc.
3. Train, in other words perform regression, a.k.a. fit to the data
4. Predict / transform data / apply the model
5. Evaluate: compute metrics, statistics, etc.

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every downstream number) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, so that no
# information from the test split leaks into the preprocessing step.
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)  # fit + transform in one call

# Re-use the training-set statistics for the test split (no re-fitting).
scaled_X_test = scaler.transform(X_test)

# Sanity check of the value ranges. The test split may fall outside [0, 1]
# because its extremes were not seen when the scaler was fitted.
print(f"{scaled_X_train.min():.2f} <= {scaled_X_train.max():.2f}")
print(f"{scaled_X_test.min():.2f} <= {scaled_X_test.max():.2f}")

0.00 <= 1.00
0.01 <= 1.13


In [16]:
from sklearn.linear_model import LinearRegression # under the hood, svd is used to solve the linear regression

# Ordinary least squares on the scaled training features; fit() returns the
# fitted estimator, so construction and fitting can be chained.
model = LinearRegression().fit(scaled_X_train, y_train)

# One coefficient per (scaled) feature column, plus the intercept term.
print(f"Parameters: {model.coef_}")
print(f"Intercept: {model.intercept_}")
# NOTE: this R^2 is evaluated on the training data, i.e. it is an in-sample score.
print(f"Coefficient of multiple determination R^2: {model.score(scaled_X_train, y_train):.3f}")

Parameters: [13.02832938  9.88465985  0.69237469]
Intercept: 2.7418553248528124
Coefficient of multiple determination R^2: 0.906


In [25]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on the held-out (scaled) test features.
y_pred = model.predict(scaled_X_test)

# Error metrics comparing the true test targets against the predictions.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike MSE

# Adjacent f-string literals are concatenated into a single message.
print(
    f"Mean Absolute Error, MAE: {mae:.3f}\n"
    f"Mean Squared Error, MSE: {mse:.3f}\n"
    f"Root Mean Squared Error, RMSE: {rmse:.3f}"
)

Mean Absolute Error, MAE: 1.512
Mean Squared Error, MSE: 3.797
Root Mean Squared Error, RMSE: 1.949
