# Linear Regression Code Along

    - We have labels -> supervised learning
    - Try to predict real number -> regression
    - Predict discrete values -> classification


In [7]:
import pandas as pd

df = pd.read_csv("../../data/Advertising.csv", index_col=0)
df.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [11]:
df.shape

(200, 4)

In [15]:
print(f"{df.shape[0]} samples")
print(f"{df.shape[1]-1} features")
print("sales column is our label/target")

200 samples
3 features
sales column is our label/target


## EDA left for the reader (me)

## divide data into X and Y

In [None]:
# Tuple unpacking
# X will contain all columns except 'sales' / Design matrix /features / independent variables
# y will contain the 'sales' column / Target variable / label / dependent variable

X, y = df.drop("sales", axis=1), df["sales"]
X.head()

Unnamed: 0,TV,radio,newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [21]:
y.head()
type(y)

pandas.core.series.Series

## Scikit-learn Steps

1. Train | test split  or  train | val | test split
2. Scale dataset
    - Many algorithms require scaling, some don't
    - Different types of scaling (ex. feature standardization, min-max scaling)
    - Scale training data and test data to the training dats parameters, to avoid data leakage
3. Fit algorithm to training data
4. Predict on test data
5. Evaluation metrics

## 1. Train|Test Split

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

print(f"{X_train.shape = }")
print(f"{y_train.shape = }")
print(f"{X_test.shape = }")
print(f"{y_test.shape = }")

X_train.shape = (134, 3)
y_train.shape = (134,)
X_test.shape = (66, 3)
y_test.shape = (66,)


In [27]:
X_train.head()

Unnamed: 0,TV,radio,newspaper
43,293.6,27.7,1.8
190,18.7,12.1,23.4
91,134.3,4.9,9.3
137,25.6,39.0,9.3
52,100.4,9.6,3.6


In [31]:
y_train.head()

43     20.7
190     6.7
91     11.2
137     9.5
52     10.7
Name: sales, dtype: float64

## 2. Feature scaling
    - Min - Max scaling
    - Values transformed into 0 to 1


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate an instance of the MinMaxScaler Class
# This will scale the data to a range between 0 and 1
scaler = MinMaxScaler()
type(scaler)

sklearn.preprocessing._data.MinMaxScaler

In [39]:
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

print(f"{scaled_X_train.min() = }")
print(f"{scaled_X_train.max() = }")
print(f"{scaled_X_test.min() = }")
print(f"{scaled_X_test.max() = }")

scaled_X_train.min() = np.float64(0.0)
scaled_X_train.max() = np.float64(1.0)
scaled_X_test.min() = np.float64(0.005964214711729622)
scaled_X_test.max() = np.float64(1.1302186878727631)


In [42]:
scaled_X_train.shape

(134, 3)

In [44]:
scaled_X_train[:5]  # Display the first 5 rows of the scaled training data

array([[0.99053094, 0.55846774, 0.01491054],
       [0.06087251, 0.24395161, 0.22962227],
       [0.45180927, 0.09879032, 0.08946322],
       [0.08420697, 0.78629032, 0.08946322],
       [0.33716605, 0.19354839, 0.03280318]])

In [48]:
type(scaled_X_train)

numpy.ndarray

## 3. Linear Regression

$y = w_0 + w_1x_1 + w_2x_2 + w_3x_3$

In [49]:
from sklearn.linear_model import LinearRegression
    # Instantiate an instance of the LinearRegression class
model = LinearRegression()
model


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [51]:
model.fit(scaled_X_train, y_train)
print(f"Parameters or weights: {model.coef_}")
print(f"Intercept: {model.intercept_}")

Parameters or weights: [13.20747617  9.75285112  0.61108329]
Intercept: 2.7911595196243653


## 4. Prediction


In [None]:
sample_features = scaled_X_test[0].reshape(1, -1)  # Reshape to 2D array for prediction
sample_features

array([[0.54988164],
       [0.63709677],
       [0.52286282]])

In [55]:
model.predict([sample_features])  # Predicting the sales for the first sample in the test set

ValueError: Found array with dim 3, while dim <= 2 is required by LinearRegression.