<a href="https://colab.research.google.com/github/SinghReena/MachineLearning/blob/master/Multiple_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Imports
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import sklearn.linear_model as linear_module

In [None]:
#@title Read the data as CSV file.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# The file is read without a header row and split on runs of whitespace.
# The separator is a raw string so that `\s` reaches the regex engine as written
# instead of being parsed as an (invalid) Python string escape.
df = pd.read_csv(url, header=None, sep=r'\s+')

In [None]:
 #@title Clean up.
 # Give the nine positional columns of the raw file readable names.
 df.columns = [
     "mpg",
     "cylinders",
     "displacement",
     "horsepower",
     "weight",
     "acceleration",
     "year",
     "origin",
     "name",
 ]
 # Horsepower values that cannot be parsed as numbers are coerced to NaN ...
 df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')
 # ... and every row that contains a NaN is then discarded.
 df = df.dropna()
 

In [None]:
# Display the DataFrame after the clean-up step (the notebook renders it truncated).
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0,1.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


# Two variable regression

In [None]:
from sklearn.model_selection import train_test_split
import sklearn.linear_model as linear_module

# Predict mpg from two predictors: cylinder count and horsepower.
X = df[['cylinders', 'horsepower']]
Y = df[['mpg']]

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=1,
)

# training: ordinary least squares on the training split
model = linear_module.LinearRegression()
reg = model.fit(X_train, y_train)  # fit() returns the fitted estimator itself

print("intercept", reg.intercept_)
print("slope", reg.coef_)

intercept [41.99863077]
slope [[-1.67720319 -0.09341019]]


In [None]:
# training error
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Predictions of the fitted model on the rows it was trained on.
y_train_predict = reg.predict(X_train)

# Root of the mean squared error on the training split
# (the squared error is symmetric in its two arguments).
regression_train_rse = math.sqrt(
    mean_squared_error(y_train_predict, y_train)
)

# Coefficient of determination on the training split; shown as the cell output.
r2_score(y_train, y_train_predict)


0.6489048239448725

In [None]:

# test error
from sklearn.metrics import r2_score

# Predictions of the fitted model on the held-out rows.
y_predict = reg.predict(X_test)

# Root of the mean squared error on the test split.
regression_test_rse = math.sqrt(
    mean_squared_error(y_predict, y_test)
)

# Coefficient of determination on the test split; shown as the cell output.
r2_score(y_test, y_predict)


0.664572220012015

## Standardization

Suppose we pick `cylinders`, which ranges from 3 to 8, and `weight`, which is in the thousands, as our predictors.

The fitted coefficients (see the output of the next cell) differ by a factor of about 100: -0.6 for `cylinders` and -0.006 for `weight`. It will be hard to know the relative importance of the variables from these coefficients alone.

In [None]:
from sklearn.model_selection import train_test_split
import sklearn.linear_model as linear_module

# Same target as before, but now with cylinders and weight as the predictors.
X = df[['cylinders', 'weight']]
Y = df[['mpg']]

# 75/25 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=1,
)

# training: ordinary least squares on the unscaled predictors
model = linear_module.LinearRegression()
reg = model.fit(X_train, y_train)  # fit() returns the fitted estimator itself

print("intercept", reg.intercept_)
print("slope", reg.coef_)

intercept [45.22774304]
slope [[-0.60265114 -0.00629629]]


So we will standardize the predictors before fitting the model.

In [None]:
from sklearn import preprocessing

# Learn the per-column mean and standard deviation used for standardization.
# NOTE(review): the scaler is fitted on the full X (training and test rows together);
# fitting on X_train only would keep test-set statistics out of the preprocessing
# step -- confirm whether this is intended.
scaler = preprocessing.StandardScaler()
scaler.fit(X)

# Show the fitted scaler as the cell output.
scaler


StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Per-column mean and scale factor learned by the fitted scaler.
print(scaler.mean_, scaler.scale_)


[   5.47193878 2977.58418367] [  1.70360611 848.31844657]


In [None]:
# Standardize the training predictors with the fitted scaler.
X_train_scaled = scaler.transform(X_train)

# Preview the first five standardized rows.
X_train_scaled[:5]

array([[ 0.30996673,  0.10894001],
       [ 1.48394702,  1.62959537],
       [-0.86401356, -0.82349286],
       [-0.86401356, -0.48635531],
       [ 0.30996673, -0.03841032]])

In [None]:
# The same five training rows before scaling, for comparison.
X_train.head(5)

Unnamed: 0,cylinders,weight
259,6,3070.0
289,8,4360.0
108,4,2279.0
206,4,2565.0
99,6,2945.0


In [None]:
# training
# Refit the linear model on the standardized training predictors. The array is
# named X_train_scaled (from scaler.transform(X_train)); no X_scaled variable exists.
import sklearn.linear_model as linear_module
model = linear_module.LinearRegression()
reg = model.fit(X_train_scaled, y_train)
print("intercept", reg.intercept_)
print("slope", reg.coef_)

intercept [23.18234238]
slope [[-1.02668016 -5.3412581 ]]


Do not forget to scale the test dataset too!

In [None]:
# test error
from sklearn.metrics import r2_score

# Apply the already-fitted scaler to the held-out predictors, then predict.
X_test_scaled = scaler.transform(X_test)
y_predict = reg.predict(X_test_scaled)

# Root of the mean squared error on the test split.
regression_test_rse = math.sqrt(
    mean_squared_error(y_predict, y_test)
)

# Coefficient of determination on the test split; shown as the cell output.
r2_score(y_test, y_predict)


0.7333980039099433

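Notice that the `r2` score is now 0.73, compared with 0.66 for the earlier `cylinders`/`horsepower` model. Be careful about attributing this to scaling: standardizing the predictors does not change the predictions (or the `r2`) of an ordinary least-squares fit, so the gain comes from using `weight` instead of `horsepower`. What scaling buys us is coefficients that can be compared directly.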

### Min-max scaling



In [None]:
from sklearn.preprocessing import MinMaxScaler

# Use the class imported above directly, so this cell does not depend on the
# `preprocessing` name bound in an earlier cell.
min_max_scaler = MinMaxScaler()
# Learn each column's min and max from the training predictors and rescale them.
X_train_minmax = min_max_scaler.fit_transform(X_train)
# Preview the first five rescaled rows.
X_train_minmax[0:5]

array([[0.6       , 0.41309895],
       [1.        , 0.77884888],
       [0.2       , 0.18882903],
       [0.2       , 0.26991778],
       [0.6       , 0.37765807]])

In [None]:
# The same five training rows before min-max scaling, for comparison.
X_train.head(5)

Unnamed: 0,cylinders,weight
259,6,3070.0
289,8,4360.0
108,4,2279.0
206,4,2565.0
99,6,2945.0


### Exercise

Use min-max scaling to see if it improves the r2 score.

## Exercise

Use min-max scaling for all the variables and see if it improves.

## Categorical variables: One-Hot encoding

What should we do with categorical variables, like car model names?



In [None]:
# Extract the first word of each `name` entry (the make) and list the distinct values.
df.name.str.split().str[0].unique()


array(['chevrolet', 'buick', 'plymouth', 'amc', 'ford', 'pontiac',
       'dodge', 'toyota', 'datsun', 'volkswagen', 'peugeot', 'audi',
       'saab', 'bmw', 'chevy', 'hi', 'mercury', 'opel', 'fiat',
       'oldsmobile', 'chrysler', 'mazda', 'volvo', 'renault', 'toyouta',
       'maxda', 'honda', 'subaru', 'chevroelt', 'capri', 'vw',
       'mercedes-benz', 'cadillac', 'mercedes', 'vokswagen', 'triumph',
       'nissan'], dtype=object)

Notice that there is some duplication. `'chevrolet', 'chevroelt', 'chevy'` are the same make. Similarly, `'volkswagen', 'vw', 'vokswagen'` are the same make, as are `'toyota', 'toyouta'`, `'mazda', 'maxda'` and `'mercedes-benz', 'mercedes'`. After this clean up, we can use one-hot encoding.


Sample code for one-hot encoding from [scikit documentation](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features).
```
>>> genders = ['female', 'male']
>>> locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
>>> browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
>>> enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])
>>> # Note that for there are missing categorical values for the 2nd and 3rd
>>> # feature
>>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
>>> enc.fit(X)
OneHotEncoder(categories=[['female', 'male'],
                          ['from Africa', 'from Asia', 'from Europe',
                           'from US'],
                          ['uses Chrome', 'uses Firefox', 'uses IE',
                           'uses Safari']])
>>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])
```

### Exercise: try to transform the name column using one-hot vector encoding.

## Forward Selection

In [None]:
# Show the metadata (version, location, dependencies) of the installed scikit-learn package.
!pip show scikit-learn

Name: scikit-learn
Version: 0.22.2.post1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /usr/local/lib/python3.7/dist-packages
Requires: scipy, numpy, joblib
Required-by: yellowbrick, textgenrnn, sklearn, sklearn-pandas, mlxtend, lightgbm, librosa, imbalanced-learn, fancyimpute


In [None]:
# The notebook's other imports bind only submodule aliases (e.g.
# `import sklearn.linear_model as linear_module`), never the name `sklearn`,
# so import the top-level package here before using it.
import sklearn

# Print the Python, scikit-learn and dependency versions of this runtime.
sklearn.show_versions()


System:
    python: 3.7.10 (default, Feb 20 2021, 21:17:23)  [GCC 7.5.0]
executable: /usr/bin/python3
   machine: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic

Python dependencies:
       pip: 19.3.1
setuptools: 56.0.0
   sklearn: 0.22.2.post1
     numpy: 1.19.5
     scipy: 1.4.1
    Cython: 0.29.22
    pandas: 1.1.5
matplotlib: 3.2.2
    joblib: 1.0.1

Built with OpenMP: True


In [None]:
!pip install  -U sklearn

Requirement already up-to-date: sklearn in /usr/local/lib/python3.7/dist-packages (0.0)


In [None]:
#@title Forward and backward selection Works with sklearn version 0.24 onwards.
# only available in sklearn version 0.24 and above
# This code is modified from scikit's user guide

from sklearn.feature_selection import SequentialFeatureSelector
from time import time

# Sequential selection needs more candidate predictors than it is asked to keep
# (n_features_to_select must be smaller than the number of columns), so offer it
# every numeric predictor instead of the two-column X_train used for scaling.
feature_columns = ["cylinders", "displacement", "horsepower", "weight",
                   "acceleration", "year", "origin"]
feature_names = np.array(feature_columns)  # array form allows boolean-mask indexing

# Re-use the rows of the existing training split so they line up with y_train.
X_select_train = df.loc[X_train.index, feature_columns]
y_select_train = y_train.values.ravel()  # 1-D target

# The selector clones and cross-validates the estimator itself, so it is passed unfitted.
model = linear_module.LinearRegression()

# Forward selection: start with no features and greedily add up to four.
tic_fwd = time()
sfs_forward = SequentialFeatureSelector(model, n_features_to_select=4,
                                        direction='forward').fit(X_select_train, y_select_train)
toc_fwd = time()

# Backward selection: start with all features and greedily remove down to two.
tic_bwd = time()
sfs_backward = SequentialFeatureSelector(model, n_features_to_select=2,
                                         direction='backward').fit(X_select_train, y_select_train)
toc_bwd = time()

print("Features selected by forward sequential selection: "
      f"{feature_names[sfs_forward.get_support()]}")
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print("Features selected by backward sequential selection: "
      f"{feature_names[sfs_backward.get_support()]}")
print(f"Done in {toc_bwd - tic_bwd:.3f}s")

# Boolean mask over feature_columns marking the forward-selected predictors.
sfs_forward.get_support()

### Advanced exercise: Try to do forward and backward selection by hand.

Hint: you might have to write a function for not repeating the code.