# Linear regression

# Advertisement data

## Short EDA on ads data

In [None]:
# read in the data into pandas dataframe

import pandas as pd
import numpy as np

# index_col=0: use the first CSV column as the row index instead of a feature.
df = pd.read_csv("advertising.csv", index_col=0)

# Start the EDA with the size of the dataset.
print(f"{df.shape[0]} samples")
print(f"{df.shape[1]-1} features")  # subtract one: Sales is the label, not a feature

# Preview only the first rows. A bare expression is displayed only when it is
# the last statement of a cell, so the preview belongs here.
df.head()

- features/independent variables: TV, Radio, Newspaper

- label/target/dependent variable: Sales

In [None]:
# Summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# 200 rows/samples
# 4 columns: 3 features and 1 label
# df.shape is a (rows, columns) tuple


df.shape

In [None]:
import seaborn as sns
# A single example plot: TV spending on the x-axis against Sales on the y-axis.
sns.scatterplot(x="TV", y="Sales", data=df)

In [None]:
import matplotlib.pyplot as plt

# One row with three panels: TV, Radio and Newspaper spending, each against Sales.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 4))
ax_tv, ax_radio, ax_newspaper = axes

sns.scatterplot(x="TV", y="Sales", data=df, ax=ax_tv)
sns.scatterplot(x="Radio", y="Sales", data=df, ax=ax_radio)
sns.scatterplot(x="Newspaper", y="Sales", data=df, ax=ax_newspaper)

ax_tv.set(title="TV spending vs sales")

# ...and so on for the remaining panels

Same as above, but put into a loop to keep the code DRY (don't repeat yourself).

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 4))

# Pair every feature column with its own panel. For a single row of subplots
# `axes` is already one-dimensional, so it can be iterated directly.
features = ("TV", "Radio", "Newspaper")
for ax, feature in zip(axes, features):
    sns.scatterplot(x=feature, y="Sales", data=df, ax=ax)
    ax.set(title=f"{feature} vs sales")

## Scikit-learn steps

See this as a recipe to follow; it works for most machine learning algorithms with some modifications.

steps:

0. divide into feature matrix X and label y
1. train|test split
2. scale dataset (some algorithms don't need scaling)
3. fit algorithm with training data
4. transform training data and test data
5. evaluate on test data


### 0. divide into features X and label y

In [None]:
# The label/target vector: the Sales column selected from the frame.
y = df["Sales"]
y

In [None]:
# Alternative: select the feature columns explicitly with df[["TV", "Radio", "Newspaper"]].
# Here we instead drop the label column and keep everything else as features.
X = df.drop("Sales", axis=1)
X.head()

A common way to do the above, and the one I will use from now on, is tuple unpacking:

In [None]:
# Tuple unpacking: build the feature matrix and the label vector in one statement.
X, y = df.drop(columns="Sales"), df["Sales"]

X.head(3)

In [None]:
# Peek at the first three label values.
y.head(3)

### 1. train|test split

- possible to do manually, but sklearn has this implemented off the shelf, so we can use that

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33 % of the rows as test data; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# The `=` specifier prints the expression text together with its value.
print(f"{X_train.shape = }")
print(f"{X_test.shape = }")
print(f"{y_train.shape = }")
print(f"{y_test.shape = }")


In [None]:
# First rows of the training features.
X_train.head()

In [None]:
# First values of the test labels.
y_test.head()

### 2. scale dataset

common scaling techniques
- min-max also called normalization
- feature standardization

many algorithms work better, or only work at all, when the features are scaled
- values "closer" to each other

for normalization
-  $X' = \frac{X-X_{min}}{X_{max}-X_{min}}$

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Important: fit the scaler on X_train only, never on X_test, to avoid data leakage.
# fit() returns the scaler itself, so instantiation and fitting can be chained.
scaler = MinMaxScaler().fit(X_train)

# Scale both splits with the parameters learned from the training data.
scaled_X_train, scaled_X_test = scaler.transform(X_train), scaler.transform(X_test)

scaled_X_train.shape, scaled_X_test.shape


In [None]:
# Sanity check: the scaler was fit on X_train, so the scaled training values
# are expected to span the range from 0 to 1.
scaled_X_train.min(), scaled_X_train.max()

In [None]:
# we have used the parameters X_min and X_max from X_train to scale X_test
# if you get exactly 0 and 1 here, you have probably fit the scaler on X_test, which would leak data
scaled_X_test.min(), scaled_X_test.max()

### 3. Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Create the model and fit it on the scaled training features and the training labels.
# fit() returns the estimator, so both steps are chained.
model = LinearRegression().fit(scaled_X_train, y_train)

# Learned parameters: the intercept and the coefficients for the features.
model.intercept_, model.coef_


test manual prediction

In [53]:
# First test sample: its TV, Radio and Newspaper values after scaling
test_sample_feature = scaled_X_test[0]

test_sample_feature

array([0.54988164, 0.63709677, 0.52286282])

In [56]:
w = model.coef_
w0 = model.intercept_

print("prediction on the test sample")

# y_hat = w0 + w1*x1 + w2*x2 + w3*x3, accumulated term by term from left to
# right so the floating-point result matches the written-out formula exactly.
manual_prediction = w0
for weight, value in zip(w, test_sample_feature):
    manual_prediction = manual_prediction + weight * value

manual_prediction

prediction on the test sample


np.float64(16.586730852231778)

Below is not in class

Dependent and independent variable

In [None]:
# Features: every column except Sales. Label: the Sales column.
X = df.drop(columns="Sales")
y = df["Sales"]
X.head(2), y.head(2)

In [None]:
# First values of the label vector.
y.head()

Train|test split

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# 30 % of the rows are held out as test data; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Shapes of the four parts, in the same order as they were unpacked.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

Feature scaling

In [None]:
# Normalization (min-max scaling): create the scaler and fit it in one chained
# call, using the training data only.
scaler = MinMaxScaler().fit(X_train)

# It is very important that the scaler is fit on the training data, i.e. that the
# training data's parameters are used to transform both the training and the test
# data. Scaling the test data with its own parameters would leak data and might
# give misleading results.
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

print(f"{scaled_X_train.min():.2f} ≤ scaled_X_train ≤ {scaled_X_train.max():.2f}")
# It is natural that the test range isn't exactly [0, 1], since the scaler was fit on the training data.
print(f"{scaled_X_test.min():.2f} ≤ scaled_X_test ≤ {scaled_X_test.max():.2f}")

# The target variable y is not scaled in this lecture.

Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# LinearRegression fits ordinary least squares; according to the scikit-learn
# documentation it wraps scipy.linalg.lstsq rather than inverting the normal
# equation explicitly.
model = LinearRegression().fit(scaled_X_train, y_train)

print(f"Parameters: {model.coef_}")
print(f"Intercept parameter: {model.intercept_}")

Predict on test data

In [None]:
# Predict on the test data. The model was fit on scaled features, so the
# scaled test features are passed in here as well.
y_pred = model.predict(scaled_X_test)

Evaluate performance

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Compare the true test labels with the predictions.
# RMSE is the square root of MSE, which puts it back in the units of the label.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, RMSE: {rmse:.2f}")