In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('./data/Advertising.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [6]:
X = df.drop("Sales", axis = 'columns')
Y = df["Sales"]

X, Y

(     Unnamed: 0     TV  Radio  Newspaper
 0             1  230.1   37.8       69.2
 1             2   44.5   39.3       45.1
 2             3   17.2   45.9       69.3
 3             4  151.5   41.3       58.5
 4             5  180.8   10.8       58.4
 ..          ...    ...    ...        ...
 195         196   38.2    3.7       13.8
 196         197   94.2    4.9        8.1
 197         198  177.0    9.3        6.4
 198         199  283.6   42.0       66.2
 199         200  232.1    8.6        8.7
 
 [200 rows x 4 columns],
 0      22.1
 1      10.4
 2       9.3
 3      18.5
 4      12.9
        ... 
 195     7.6
 196     9.7
 197    12.8
 198    25.5
 199    13.4
 Name: Sales, Length: 200, dtype: float64)

# Steps we will take
1. Train | Test split, if algorithm needs separate validation (i.e. k-dolds) we may have to split multiple times
2. data processing e.g. polynomial features, scaling, centering etc.
3. train, in other words perform regression aka fit to the data
4. predict, transform data / apply data
5. evaluate, compute metrics, statistics, etc

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state=42)

In [9]:
# Scaling the data, i.e. normalizing the data, fitting to the training data and transforming the training and test data

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

print(f"{scaled_X_train.min():.2f} <= scaled_X_train <= {scaled_X_train.max():.2f}")
print(f"{scaled_X_test.min():.2f} <= scaled_X_train <= {scaled_X_test.max():.2f}")


0.00 <= scaled_X_train <= 1.00
0.01 <= scaled_X_train <= 1.13


In [14]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(scaled_X_train, Y_train)

print(f"Parameters: \n\nCoefficients: {model.coef_}\nIntercept:{model.intercept_}")
print(f"Score: {model.score(scaled_X_train, Y_train)}")

Parameters: 

Coefficients: [ 0.02018551 13.02798143  9.88460702  0.69598082]
Intercept:2.7309799792161247
Score: 0.9055173150726941


In [15]:
# 5. Evaluating the model

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(scaled_X_test)
y_pred

array([16.56581778, 21.1804797 , 21.54440211, 10.89594674, 22.20422155,
       13.35811188, 21.19371189,  7.35723223, 13.28243942, 15.11907355,
        9.01025527,  6.53399652, 14.31067157,  8.9675858 ,  9.46556446,
       12.01266538,  8.91173461, 16.160977  , 10.29431598, 18.72083349,
       19.77226614, 13.76939567, 12.50085507, 21.53582335,  7.60425655,
        5.60788293, 20.91614465, 11.80237377,  9.08337367,  8.51666065,
       12.16659861,  9.96068717, 21.72664731, 12.77756633, 18.10644734,
       20.07544506, 14.26503177, 20.94306381, 10.84517829,  4.37943779,
        9.51113547, 12.41397184, 10.16460264,  8.08293205, 13.1656671 ,
        5.23369548,  9.28235707, 14.08483506,  8.69496881, 11.65949021,
       15.72025743, 11.63848467, 13.3458554 , 11.1568059 ,  6.33779081,
        9.75906265,  9.42324492, 24.25567017,  7.70231778, 12.15200343])

In [17]:
MAE = mean_absolute_error(Y_test, y_pred)
MSE = mean_squared_error(Y_test, y_pred)
RMSE = np.sqrt(MSE)

print(f"MAE: {MAE:.2f}\nMSE: {MSE:.2f}\nRMSE: {RMSE:.2f}\nR2: {r2_score(Y_test, y_pred):.2f}")

MAE: 1.51
MSE: 3.80
RMSE: 1.95
R2: 0.86
