# Linear regression

In [3]:
import pandas as pd

# index_col=0: the first CSV column appears to hold only the row number,
# so use it as the index instead of keeping it as a feature column.
df = pd.read_csv("../data/Advertising.csv", index_col=0)
df

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
...,...,...,...,...
196,38.2,3.7,13.8,7.6
197,94.2,4.9,8.1,9.7
198,177.0,9.3,6.4,12.8
199,283.6,42.0,66.2,25.5


In [None]:
# Shape is (rows, columns):
# 200 samples
# 3 features
# 1 label
df.shape

(200, 4)

In [9]:
# Features: every column except the target. Label: the "sales" column.
x = df.drop(columns="sales")
y = df["sales"]

x.head()

Unnamed: 0,TV,radio,newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [11]:
# Peek at the first label values (the "sales" column).
y.head()

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: sales, dtype: float64

## Scikit-learn steps
1. train|test split (in some cases a train|validation|test split)
2. Scale the dataset
    • many algorithms require scaling, some don't
    • which type of scaling to use?
    • fit the scaler on the training data only, then use it to transform both the training and the test data, to avoid data leakage
3. Fit the algorithm to the training data
4. Transform the training data, transform the test data
5. Calculate evaluation metrics

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

X_train

Unnamed: 0,TV,radio,newspaper
170,284.3,10.6,6.4
98,184.9,21.0,22.0
32,112.9,17.4,38.6
13,23.8,35.1,65.9
36,290.7,4.1,8.5
...,...,...,...
107,25.0,11.0,29.7
15,204.1,32.9,46.0
93,217.7,33.5,59.0
180,165.6,10.0,17.6


In [13]:
# With test_size=0.3 on 200 rows this should be 60 rows by 3 feature columns.
X_test.shape

(60, 3)

In [14]:
# The label splits must line up with the feature splits: 140 training and 60 test rows.
y_train.shape, y_test.shape

((140,), (60,))

# Feature scaling

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Fit on the training split only, so the per-feature min/max never see the test data.
scaler = MinMaxScaler().fit(X_train)

scaler

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [16]:
# Apply the scaler that was fitted on X_train to both splits.
scaled_X_train, scaled_X_test = (
    scaler.transform(X_train),
    scaler.transform(X_test),
)

scaled_X_train

array([[0.95908015, 0.21370968, 0.06063618],
       [0.62292864, 0.4233871 , 0.21570577],
       [0.37943862, 0.35080645, 0.38071571],
       [0.07811972, 0.70766129, 0.65208748],
       [0.98072371, 0.08266129, 0.08151093],
       [0.06323977, 0.32258065, 0.21868787],
       [0.99053094, 0.55846774, 0.01491054],
       [0.06087251, 0.24395161, 0.22962227],
       [0.45180927, 0.09879032, 0.08946322],
       [0.08420697, 0.78629032, 0.08946322],
       [0.33716605, 0.19354839, 0.03280318],
       [0.26885357, 0.        , 0.08846918],
       [0.63476496, 0.36491935, 0.25149105],
       [0.59621238, 0.6733871 , 0.38170974],
       [0.42272574, 0.74395161, 0.78429423],
       [0.70645925, 0.41532258, 0.10337972],
       [0.4808928 , 0.59072581, 0.1222664 ],
       [0.62292864, 0.88508065, 0.0139165 ],
       [0.74974636, 0.08669355, 0.49204771],
       [0.81501522, 0.76612903, 0.22763419],
       [0.0557998 , 0.92540323, 0.68588469],
       [0.40514034, 0.57459677, 0.13817097],
       [0.

In [18]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares fit. Per the scikit-learn docs this wraps
# scipy.linalg.lstsq (an SVD-based solver by default), so the normal
# equation is never inverted explicitly.
model = LinearRegression().fit(scaled_X_train, y_train)

print(f"Parameters: {model.coef_}")
print(f"Intercept parameter: {model.intercept_}")

Parameters: [13.02832938  9.88465985  0.69237469]
Intercept parameter: 2.7418553248528124
