# Linear regression code alongs

- we have labels -> supervised learning
- trying to predict a real number -> regression
- trying to predict discrete values -> classification

In [2]:
import pandas as pd

# Location of the advertising dataset, relative to this notebook.
DATA_PATH = "../../data/Advertising.csv"

# The first CSV column only holds row numbers, so it is used as the index
# instead of being kept as a data column.
df = pd.read_csv(DATA_PATH, index_col=0)
df.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [4]:
# (rows, columns) of the full table, target column included
df.shape

(200, 4)

In [8]:
# Each row is one sample; every column except the "sales" target is a feature.
n_rows, n_columns = df.shape
n_features = n_columns - 1

print(f"{n_rows} samples")
print(f"{n_features} features")
print("Sales column is our label/target")

200 samples
3 features
Sales column is our label/target


## Divide data into X and y

In [None]:
# y: target variable (also called label or dependent variable)
y = df["sales"]

# X: feature matrix (also called design matrix or independent variables),
# i.e. every column except the target
X = df.drop(columns="sales")

X.head()

Unnamed: 0,TV,radio,newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [11]:
# Peek at the first rows of the target column
y.head()

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: sales, dtype: float64

In [12]:
# Confirm the container types: a single column is a Series, several columns a DataFrame
type(y), type(X)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

## Scikit-learn steps

1. train|test split or train|val|test split
2. scale dataset
    - many algorithms require scaling, some don't
    - there exist different types of scaling (e.g. feature standardization, min-max scaling)
    - scale both the training data and the test data using parameters fitted on the training data only, to avoid data leakage
3. Fit algorithm to training data
4. Predict on test data
5. Evaluation metrics

In [None]:
from sklearn.model_selection import train_test_split

# Print the function's docstring as a quick reference for its parameters
help(train_test_split)

## 1. Train|test split

In [17]:
# Hold out 33% of the rows as a test set; the fixed random_state keeps the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Features and labels of each split must have the same number of rows.
for split_name, split in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name}.shape = {split.shape}")

X_train.shape = (134, 3)
y_train.shape = (134,)
X_test.shape = (66, 3)
y_test.shape = (66,)


In [19]:
# Inspect the training features; the index shows the rows were shuffled by the split
X_train.head()

Unnamed: 0,TV,radio,newspaper
43,293.6,27.7,1.8
190,18.7,12.1,23.4
91,134.3,4.9,9.3
137,25.6,39.0,9.3
52,100.4,9.6,3.6


In [20]:
# Inspect the training labels; their index should line up row by row with X_train
y_train.head()

43     20.7
190     6.7
91     11.2
137     9.5
52     10.7
Name: sales, dtype: float64

## 2. Feature scaling

- min-max scaling
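- values are transformed into the range 0.0 to 1.0, using each feature's min and max from the training data (test values can therefore fall slightly outside this range)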

In [21]:
from sklearn.preprocessing import MinMaxScaler


# Create the scaler object; it is fitted to the training data in a later cell
scaler = MinMaxScaler()
# Show which class the scaler object is an instance of
type(scaler)

sklearn.preprocessing._data.MinMaxScaler

In [25]:
# Learn each feature's min and max from the training data only and scale it in
# one step, then reuse those same parameters for the test data so that no
# information from the test set leaks into the preprocessing.
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Training data spans exactly [0, 1]; test data may fall slightly outside it
# because it is scaled with the training min and max.
for array_name, array in (
    ("scaled_X_train", scaled_X_train),
    ("scaled_X_test", scaled_X_test),
):
    print(f"{array_name}.min() = {array.min()!r}")
    print(f"{array_name}.max() = {array.max()!r}")

scaled_X_train.min() = np.float64(0.0)
scaled_X_train.max() = np.float64(1.0)
scaled_X_test.min() = np.float64(0.005964214711729622)
scaled_X_test.max() = np.float64(1.1302186878727631)


In [27]:
# Scaling changes the values only; the number of rows and columns stays the same
scaled_X_train.shape

(134, 3)

In [28]:
# First five scaled training rows
scaled_X_train[:5]

array([[0.99053094, 0.55846774, 0.01491054],
       [0.06087251, 0.24395161, 0.22962227],
       [0.45180927, 0.09879032, 0.08946322],
       [0.08420697, 0.78629032, 0.08946322],
       [0.33716605, 0.19354839, 0.03280318]])

In [29]:
# Check which container type the scaler returned for the transformed data
type(scaled_X_train)

numpy.ndarray

## 3. Linear regression

$y = w_0 + w_1x_1 + w_2x_2 + w_3x_3$

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled training features.
# fit() returns the estimator itself, so creation and fitting can be chained.
model = LinearRegression().fit(scaled_X_train, y_train)

# coef_ holds one weight per feature (w_1..w_3); intercept_ is the bias term w_0.
weights = model.coef_
intercept = model.intercept_
print(f"Parameters or weights: {weights}")
print(f"Intercept parameter: {intercept}")

Parameters or weights: [13.20747617  9.75285112  0.61108329]
Intercept parameter: 2.7911595196243653


## 4. Prediction

In [37]:
# Raw (unscaled) feature values of the first test sample, for reference
X_test.iloc[0]

TV           163.3
radio         31.6
newspaper     52.9
Name: 96, dtype: float64

In [34]:
# predict() expects a 2D array (samples x features), so take the first test
# row as a one-row slice instead of a flat 1D vector.
sample_features = scaled_X_test[:1]
sample_features

array([[0.54988164, 0.63709677, 0.52286282]])

In [35]:
# Predicted sales for the single scaled sample
model.predict(sample_features)

array([16.58673085])

In [36]:
# True label of the first test sample, to compare against the model's prediction
y_test.iloc[0]

np.float64(16.9)

## Predict on whole test data

In [39]:
# Predict for the whole test set; the model was fitted on scaled features,
# so it is given the scaled test features as well
y_pred = model.predict(scaled_X_test)
# Show the first five predictions
y_pred[:5]

array([16.58673085, 21.18622524, 21.66752973, 10.81086512, 22.25210881])

In [40]:
# First five true labels, selected by position, to compare with the predictions
y_test.iloc[:5]

96     16.9
16     22.4
31     21.4
159     7.3
129    24.7
Name: sales, dtype: float64

## 5. Evaluate

Common metrics for regression:
- mae - mean absolute error
- mse - mean squared error
- rmse - root mean squared error

In [41]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Compare the true test labels with the model's predictions.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of MSE, which puts the error back in the target's units.
rmse = np.sqrt(mse)

for metric_name, metric_value in (("mae", mae), ("mse", mse), ("rmse", rmse)):
    print(f"{metric_name} = {metric_value!r}")

mae = 1.4937750024728977
mse = 3.72792833068152
rmse = np.float64(1.9307843822347228)
