# Linear Regression Code Along
- labels -> supervised learning
- try to predict a real number -> regression (float, i.e. a decimal number)
- predict discrete values -> classification (class labels, e.g. integers)


In [6]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

# Load the advertising data; the first CSV column holds the row ids,
# so it becomes the DataFrame index rather than a regular column.
df = pd.read_csv(
    "../../data/Advertising.csv",
    index_col=0,
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [10]:
df.shape # (rows, columns): 200 rows, 4 columns

(200, 4)

In [12]:
len(df) # number of rows; each row is one sample

200

In [16]:
# Every row is one sample; every column except the target is a feature.
print(f"{len(df)} samples")
print(f"{len(df.columns) - 1} features")
print("sales column is our label/target")

200 samples
3 features
sales column is our label/target


# Divide data into X & y

In [22]:
# X - design matrix / feature matrix / features / independent variables
X = df.drop(columns="sales")
# y - target variable / label / dependent variable
y = df["sales"]
# Preview the feature matrix (all columns except sales).
X.head()

Unnamed: 0,TV,radio,newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [23]:
y.head() # first five values of the sales target

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: sales, dtype: float64

In [26]:
type(y), type(X) # check the container types: y is a pandas Series, X is a pandas DataFrame

(pandas.core.series.Series, pandas.core.frame.DataFrame)

# Scikit-learn Steps:
1. train | test split or train|val|test split
2. scale data set
    - many algorithms require scaling, some don't
    - different types of scaling exist (e.g. feature standardization, min-max scaling)
    - fit the scaler on the training data only, then scale both training and test data with those parameters to avoid data leakage
3. fit algorithm to training data
4. Predict on test data
5. Evaluation metrics


# 1. train|test split

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Show the shape of each of the four pieces as one tuple.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((140, 3), (60, 3), (140,), (60,))

In [30]:

# Print the built-in documentation for train_test_split (parameters such as test_size).
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
    Split arrays or matrices into random train and test subsets.

    Quick utility that wraps input validation,
    ``next(ShuffleSplit().split(X, y))``, and application to input data
    into a single call for splitting (and optionally subsampling) data into a
    one-liner.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        com

In [34]:
# Re-split with 33% of the rows held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,  # seed 42 controls the shuffle, so the split is reproducible
)

# Report the shape of every piece of the split.
print(f"{X_train.shape = }")
print(f"{y_train.shape = }")
print(f"{X_test.shape = }")
print(f"{y_test.shape = }")

X_train.shape = (134, 3)
y_train.shape = (134,)
X_test.shape = (66, 3)
y_test.shape = (66,)


In [35]:
# Peek at the training features; the index labels are no longer in order because the rows were shuffled.
X_train.head()

Unnamed: 0,TV,radio,newspaper
43,293.6,27.7,1.8
190,18.7,12.1,23.4
91,134.3,4.9,9.3
137,25.6,39.0,9.3
52,100.4,9.6,3.6


In [36]:
# Peek at the training targets; they carry the same index labels as the X_train preview.
y_train.head()

43     20.7
190     6.7
91     11.2
137     9.5
52     10.7
Name: sales, dtype: float64

# 2. Feature scaling
- min-max scaling
- values are transformed into the range 0 to 1 (based on each feature's min and max in the training data)

In [40]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate a MinMaxScaler with its default settings, then confirm its class.
scaler = MinMaxScaler()
type(scaler)

sklearn.preprocessing._data.MinMaxScaler

In [39]:
# Display the scaler's parameters.
scaler

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [48]:
# Learn each feature's min/max from the training data only and scale it in one step,
# then reuse those training parameters for the test data (avoids data leakage).
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Training values span exactly [0, 1]; test values may fall slightly outside that
# range because the scaler never saw the test data.
print(f"{scaled_X_train.min() = }")
print(f"{scaled_X_train.max() = }")
print(f"{scaled_X_test.min() = }")
print(f"{scaled_X_test.max() = }")


scaled_X_train.min() = np.float64(0.0)
scaled_X_train.max() = np.float64(1.0)
scaled_X_test.min() = np.float64(0.005964214711729622)
scaled_X_test.max() = np.float64(1.1302186878727631)


In [50]:
scaled_X_train[:5] # NumPy arrays have no .head(); slicing with [:5] shows the first five rows instead

array([[0.99053094, 0.55846774, 0.01491054],
       [0.06087251, 0.24395161, 0.22962227],
       [0.45180927, 0.09879032, 0.08946322],
       [0.08420697, 0.78629032, 0.08946322],
       [0.33716605, 0.19354839, 0.03280318]])

In [51]:
# Check what kind of container the scaler returned.
type(scaled_X_train)

numpy.ndarray

# 3. Linear regression

$y = w_0 + w_1x_1 + w_2x_2 + w_3x_3$

In [54]:
from sklearn.linear_model import LinearRegression

# instantiate an instance of the LinearRegression class (default settings)
model = LinearRegression()
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [56]:
model.fit(scaled_X_train, y_train) # fit on the scaled training features and their known answers (labels)
print(f"Parameters or weights: {model.coef_}") # w1, w2, w3, i.e. one coefficient per feature
print(f"Intercept: {model.intercept_}") # w0 in the linear model

Parameters or weights: [13.20747617  9.75285112  0.61108329]
Intercept: 2.7911595196243653


# 4. Prediction

In [62]:
# Raw (unscaled) feature values of the first test sample.
X_test.iloc[0]

TV           163.3
radio         31.6
newspaper     52.9
Name: 96, dtype: float64

In [59]:
# Slice (rather than index) the first scaled test row so it stays 2-D with shape (1, 3),
# one row holding the three feature values.
sample_features = scaled_X_test[:1]
sample_features

array([[0.54988164, 0.63709677, 0.52286282]])

In [60]:
# Predict sales for the single scaled sample.
model.predict(sample_features)

array([16.58673085])

In [61]:
# Actual sales value of the first test sample, for comparison with the prediction.
y_test.iloc[0]

np.float64(16.9)

### Predict on whole test data

In [65]:
# Predict for every row of the scaled test set, then preview the first five predictions.
y_pred = model.predict(scaled_X_test)
y_pred[:5]

array([16.58673085, 21.18622524, 21.66752973, 10.81086512, 22.25210881])

In [66]:
y_test.head() # first five true sales values, to compare with y_pred[:5]

96     16.9
16     22.4
31     21.4
159     7.3
129    24.7
Name: sales, dtype: float64

# 5. Evaluate

Common metrics for the regression case
- mae - mean absolute error
- mse - mean squared error
- rmse - root mean squared error

In [70]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# numpy is already imported as np in the notebook's first code cell,
# so it is not imported again here.

# Compare the true test targets with the model's predictions.
mae = mean_absolute_error(y_test, y_pred)  # mean of the absolute errors
mse = mean_squared_error(y_test, y_pred)  # mean of the squared errors
rmse = np.sqrt(mse)  # square root of mse

print(f"{mae = }")
print(f"{mse = }")
print(f"{rmse = }")

mae = 1.4937750024728977
mse = 3.72792833068152
rmse = np.float64(1.9307843822347228)


In [8]:
df.info() # column dtypes and non-null counts (editor tip: Option+Enter moves on to the next code block)

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   radio      200 non-null    float64
 2   newspaper  200 non-null    float64
 3   sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB
