### Importing libraries

In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import pandas as pd
from tqdm import tqdm

### Reading data

In [2]:
df = pd.read_csv("../../Datasets/mpg Data/auto-mpg.data.csv")
df.columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','car name']
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140,3449,10.5,70,1,ford torino
4,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
393,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
394,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
395,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


### Checking for missing values
This data has missing values where a '?' appears in the data, we find the indices of those instances and remove those rows from the data

In [3]:
missing_dict = {}
for col in df.columns:
    missing_dict[col]= np.where(df[col]=='?')
missing_dict

{'mpg': (array([], dtype=int64),),
 'cylinders': (array([], dtype=int64),),
 'displacement': (array([], dtype=int64),),
 'horsepower': (array([ 31, 125, 329, 335, 353, 373], dtype=int64),),
 'weight': (array([], dtype=int64),),
 'acceleration': (array([], dtype=int64),),
 'model year': (array([], dtype=int64),),
 'origin': (array([], dtype=int64),),
 'car name': (array([], dtype=int64),)}

Dropping the missing values from the data

In [4]:
df.drop(missing_dict['horsepower'][0], inplace=True)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140,3449,10.5,70,1,ford torino
4,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
393,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
394,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
395,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


Separating features and labels

In [5]:
X = df.iloc[:,1:-1]
X = np.array(X)
y = df.iloc[:,0]
y = np.array(y)

Split arrays or matrices into random train and test subsets

**random_state**: Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.15, random_state=42)

LinearRegression fits a linear model with coefficients w = (w1, …, wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation

In [7]:
reg = LinearRegression().fit(X_train, y_train)

Predict using the linear model

In [8]:
preds = reg.predict(X_test)

### Calculating performance metrics of the trained model

**max_error** metric calculates the maximum residual error

In [9]:
metrics.max_error(preds, y_test)

9.885881126770684

Mean absolute error regression loss

In [10]:
metrics.mean_absolute_error(preds, y_test)

2.4505444932911526

Mean squared error regression loss

In [11]:
metrics.mean_squared_error(preds, y_test)

10.223707672057884

If **squared** = True, returns MSE value, if False returns RMSE value

In [12]:
metrics.mean_squared_error(preds, y_test, squared=False)

3.197453310379666

R<sup>2</sup> (coefficient of determination) regression score function.

Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a
score of 0.0.

In [13]:
metrics.r2_score(preds, y_test, multioutput='variance_weighted')

0.8112193400669614

### Training using normalization

This estimator scales and translates each feature individually such that it is in the given range on the training set, e.g. between zero and one

In [14]:
scaler = MinMaxScaler()

In [15]:
scaler.fit(X)

MinMaxScaler()

In [16]:
norm_X = scaler.transform(X)

In [17]:
norm_X

array([[1.        , 0.72868217, 0.64673913, ..., 0.20833333, 0.        ,
        0.        ],
       [1.        , 0.64599483, 0.56521739, ..., 0.17857143, 0.        ,
        0.        ],
       [1.        , 0.60981912, 0.56521739, ..., 0.23809524, 0.        ,
        0.        ],
       ...,
       [0.2       , 0.17312661, 0.20652174, ..., 0.21428571, 1.        ,
        0.        ],
       [0.2       , 0.13436693, 0.17934783, ..., 0.63095238, 1.        ,
        0.        ],
       [0.2       , 0.13178295, 0.19565217, ..., 0.67857143, 1.        ,
        0.        ]])

In [18]:
Xn_train, Xn_test, yn_train, yn_test = train_test_split(norm_X,y,test_size = 0.15, random_state=42)

In [19]:
reg_norm = LinearRegression().fit(Xn_train, yn_train)

In [20]:
norm_preds = reg_norm.predict(Xn_test)

In [21]:
metrics.max_error(norm_preds, y_test)

9.88588112677067

In [22]:
metrics.mean_absolute_error(norm_preds, y_test)

2.4505444932911535

In [23]:
metrics.mean_squared_error(norm_preds, y_test)

10.223707672057916

In [24]:
metrics.mean_squared_error(norm_preds, y_test, squared=False)

3.1974533103796707

In [25]:
metrics.r2_score(norm_preds, y_test, multioutput='variance_weighted')

0.8112193400669614