# Linear regression

Advertisement data

In [None]:

import pandas as pd
import numpy as np

# load the advertising data; the first CSV column is used as the row index
df = pd.read_csv("advertising.csv", index_col=0)

n_samples, n_columns = df.shape
print(f"{n_samples} samples")
# one column ("Sales") is the label rather than a feature, hence the minus one
print(f"{n_columns - 1} features")

df

In [None]:
# df.shape is (rows, columns); the column count includes the label column
# 200 samples
# 3 features
# 1 label

df.shape

Dependent and independent variables

In [None]:
# target (dependent variable): the "Sales" column
y = df["Sales"]
# features (independent variables): every remaining column
X = df.drop(columns="Sales")

X.head(2), y.head(2)

In [None]:
# preview the first rows of the target series (the "Sales" column)
y.head()

Train|test split

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# hold out 30 % of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# shapes of the four resulting pieces, in the order they were returned
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

Feature scaling

In [None]:
# Normalization (min-max scaling): create the scaler and fit it on the
# training data only, so the scaling parameters come from the training set.
scaler = MinMaxScaler().fit(X_train)

# Both splits are transformed with those training-set parameters. Scaling the
# test data with parameters derived from the test data itself would leak data,
# which might give misleading results.
scaled_X_train, scaled_X_test = scaler.transform(X_train), scaler.transform(X_test)

print(f"{scaled_X_train.min():.2f} ≤ scaled_X_train ≤ {scaled_X_train.max():.2f}")
# the test range is not expected to be exactly [0, 1], since the scaler was fitted on the training data
print(f"{scaled_X_test.min():.2f} ≤ scaled_X_test ≤ {scaled_X_test.max():.2f}")

# the target variable y is left unscaled in this lecture

Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# LinearRegression fits ordinary least squares; per the scikit-learn docs it wraps
# scipy.linalg.lstsq (an SVD-based solver by default) rather than explicitly
# inverting the normal equation
model = LinearRegression().fit(scaled_X_train, y_train)

print(f"Parameters: {model.coef_}")
print(f"Intercept parameter: {model.intercept_}")

Predict on test data

In [None]:
# predict on the held-out test features, which were scaled with the scaler fitted on the training data
y_pred = model.predict(scaled_X_test)

Evaluate performance

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# error metrics comparing the test labels with the model's predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of MSE, which puts it back in the units of the target
rmse = np.sqrt(mse)

print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, RMSE: {rmse:.2f}")