# Selecting and Training Models
1.Select and Train a few Algorithms(Linear Regression, Decision Tree, RandomForest)
2.Evaluation using Mean Squared Error
3. Model Evaluation using Cross Validation
4. Hyperparameter Tuning using GridSearchCV
5. Check Feature Importance
6. Evaluate the Final System on test data
7. Saving the Model


In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
os.listdir()

['.ipynb_checkpoints',
 'auto-mpg.data',
 'Data pipeline - Part 2.ipynb',
 'data_preparation_part1',
 'EDA - Part 1.ipynb',
 'Training models - Part 3.ipynb']

In [2]:
col = ['MPG','Cylinders','Displacement','Horsepower','Weight','Acceleration','Model Year','Origin']
df = pd.read_csv('auto-mpg.data',names = col, comment ='\t', na_values = '?',skipinitialspace= True, sep = ' ')
df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB


In [4]:
data = df.copy()

# shuffle splitting of data and 
getting categorical data in proportion in train and test data with the help of StratifiedShuffleSplit

In [5]:
#shuffle splitting of data and 
#getting categorical data in proportion in train and test data with the help of StratifiedShuffleSplit

strat = StratifiedShuffleSplit(n_splits= 1, test_size = 0.2, random_state= 42)
strat.split(data, data.Cylinders)

<generator object BaseShuffleSplit.split at 0x000001F8FE72CB30>

In [41]:
for train_index, test_index in strat.split(data, data.Cylinders):
    train_set = data.loc[train_index]
    test_set = data.loc[test_index]

In [7]:
train_set

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,32.0,4,83.0,61.0,2003.0,19.0,74,3
151,31.0,4,79.0,67.0,2000.0,16.0,74,2
388,26.0,4,156.0,92.0,2585.0,14.5,82,1
48,18.0,6,250.0,88.0,3139.0,14.5,71,1
114,26.0,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...,...
147,24.0,4,90.0,75.0,2108.0,15.5,74,2
156,16.0,8,400.0,170.0,4668.0,11.5,75,1
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
14,24.0,4,113.0,95.0,2372.0,15.0,70,3


In [8]:
#Seperating Features and Labels
data_features = train_set.drop('MPG',axis = 1)
data_target = train_set[['MPG']]

In [9]:
# Mapping Origin column
def preprocess_origin_col(df) :
    df['Origin'] = df['Origin'].map({1 : 'India',2 : 'USA', 3: 'Germany'})
    return df

In [10]:
#Adding custom attributes
from sklearn.base import BaseEstimator, TransformerMixin
acc_i = 4
cyl_i = 0
hp_i =  2

class CustomAttrAdder(BaseEstimator, TransformerMixin) :
    def __init__(self, acc_on_power = True) :
        self.acc_on_power = acc_on_power
    def fit(self, x , y = None) :
        return self #nothing else to do
    def transform(self,x) :
        acc_on_cyl = x[: , acc_i]/x[: , cyl_i]
        if self.acc_on_power :
            acc_on_power = x[:, acc_i]/x[:, hp_i]
            return np.c_[x,acc_on_power,acc_on_cyl]
        return np.c_[x,acc_on_cyl]
    




In [11]:
def num_pipeline_transformer(data) :

    #pipeline for numerical attributes
    #imputing >> attribute adding >> scaling
    num_data = data.drop('Origin', axis = 1)

    num_pipeline = Pipeline([
         ('imputer', SimpleImputer(strategy='median')),
         ('attr',CustomAttrAdder()),
         ('scale', StandardScaler())
         ])
    return num_pipeline, num_data

def pipeline_transformer(data) :
 #coloumn transformer to each column
    cat_data = ['Origin']
    num_pipeline , num_data = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ('num', num_pipeline, list(num_data)),
        ('ohe', OneHotEncoder(), cat_data)
        ])
    prepared_pipeline = full_pipeline.fit_transform(data)
    return prepared_pipeline
    

In [12]:
#finalize our data in the 2 steps only
preprocess_origin = preprocess_origin_col(data_features)
prepared_data = pipeline_transformer(preprocess_origin)

In [13]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

# Selection of ML models
1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM regressor

In [14]:
#linear Regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(prepared_data, data_target)

LinearRegression()

In [15]:
#testing prediction
sample_feature = data_features.iloc[ :5]
sample_target = data_target.iloc[:5]

#preprocess_origin = preprocess_origin_col(sample_feature)
prepare_data = pipeline_transformer(sample_feature)

print('Linear Regression Prediction is \n', lr.predict(prepared_data))

Linear Regression Prediction is 
 [[30.87084533]
 [28.8942444 ]
 [29.18645219]
 [18.38508216]
 [24.4280997 ]
 [36.18469432]
 [13.60551672]
 [24.65953039]
 [17.26766558]
 [19.38905099]
 [35.92755441]
 [16.45468724]
 [20.28204604]
 [35.02573532]
 [33.65240691]
 [11.59232805]
 [24.4514598 ]
 [26.37971264]
 [30.84711092]
 [22.2452215 ]
 [22.28715891]
 [12.90717675]
 [26.87113214]
 [20.14368333]
 [12.70989269]
 [16.38274284]
 [26.5104452 ]
 [28.18498378]
 [15.81148568]
 [13.63278255]
 [31.73294078]
 [15.94760202]
 [26.58408924]
 [26.02732585]
 [19.04426774]
 [23.33090062]
 [28.34023775]
 [19.17203363]
 [25.09467495]
 [28.1435552 ]
 [11.27625745]
 [18.57828147]
 [19.66093927]
 [24.77875193]
 [32.13270977]
 [25.39252605]
 [27.25586437]
 [29.46045033]
 [23.86950707]
 [33.48782217]
 [30.75444731]
 [33.35219367]
 [15.27668031]
 [30.77045717]
 [20.11194899]
 [17.52541891]
 [23.68997615]
 [24.52375464]
 [29.47727946]
 [25.82115371]
 [27.1329692 ]
 [21.02810991]
 [25.58557306]
 [19.23905262]
 [14.9

In [16]:
print("Actual Labels of samples: \n", sample_target.values)

Actual Labels of samples: 
 [[32.]
 [31.]
 [26.]
 [18.]
 [26.]]


# Mean Squared Error

In [17]:
from sklearn.metrics import mean_squared_error
lr_predicted = lr.predict(prepared_data)
actual = sample_target.values
mse = mean_squared_error(lr_predicted,data_target)
rmse = np.sqrt(mse)
rmse

2.9590402225760872

# Decision Tree

In [18]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(prepared_data, data_target)

DecisionTreeRegressor()

In [19]:
pred = dt.predict(prepared_data)
dt_mse = mean_squared_error(pred,data_target)
rmse = np.sqrt(dt_mse)
rmse

0.0

But no model is perfect, this means that our model has overfit the data to a great extent.

We won't be touching out test data until we finalize our model. So, how do we check for what's happening?

# Model Evaluation using Cross Validation
Scikit-Learn’s K-fold cross-validation feature randomly splits the training set into K distinct subsets called folds, then it trains and evaluates the model K times, picking a different fold for evaluation every time and training on the other K-1 folds.

The result is an array containing the K evaluation scores:

In [20]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dt, prepared_data, data_target, scoring = 'neg_mean_squared_error', cv = 10)

dt_score = np.sqrt(-scores)

In [21]:
dt_score

array([2.97000631, 3.04400025, 3.07393274, 3.07774268, 2.22696542,
       2.94475169, 3.66891503, 5.02882317, 4.12451366, 2.5790721 ])

In [22]:
dt_score.mean()

3.2738723052435668

In [23]:
#for linear regression
scores = cross_val_score(lr, prepared_data, data_target, scoring = 'neg_mean_squared_error', cv = 10)

lr_score = np.sqrt(-scores)

In [24]:
lr_score.mean()

3.0757081793709324

# Random Forest Model

In [25]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(prepared_data, data_target)

rf_scores = cross_val_score(rf, prepared_data, data_target, scoring = 'neg_mean_squared_error', cv = 10)

rf_score = np.sqrt(-scores)
rf_score.mean()


3.0757081793709324

# Support Vector Machine

In [26]:
from sklearn.svm import SVR

svm = SVR(kernel='linear')

svm.fit(prepared_data, data_target)

svm_scores = cross_val_score(svm, prepared_data, data_target, scoring= 'neg_mean_squared_error' , cv =10 )

svm_score = np.sqrt(-svm_scores)
svm_score.mean()



3.086591620802819

# After concluding RMSE from various ML algorithm we selected Random forest regressor for our model

# Fine-Tuning Hyperparameters
After testing all the models, you’ll find that RandomForestRegressor has performed the best but it still needs to be fine-tuned.

A model is like a radio station with a lot of knobs to handle and tune. Now, you can either tune all these knobs manually or provide a range of values/combinations that you want to test.

We use GridSearchCV to find out the best combination of hyperparameters for the RandomForest model:

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
param_grid = [{'n_estimators' : [3,10,30], 'max_features' :[2,4,6,8]},
             {'bootstrap': [False],'n_estimators' : [3,10],'max_features' :[2,4,3] }]
grid_search = GridSearchCV(rf, param_grid, scoring = 'neg_mean_squared_error', return_train_score = True, cv = 10 )
grid_search.fit(prepared_data,data_target)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 4, 3],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [29]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [30]:
cv_scores = grid_search.cv_results_
for mean_score , params in zip(cv_scores['mean_test_score'],cv_scores['params']):
    print(np.sqrt(-mean_score), params)

3.3285873639737527 {'max_features': 2, 'n_estimators': 3}
2.8736346179253043 {'max_features': 2, 'n_estimators': 10}
2.9007393857592554 {'max_features': 2, 'n_estimators': 30}
3.3017015950777666 {'max_features': 4, 'n_estimators': 3}
2.7539660733321525 {'max_features': 4, 'n_estimators': 10}
2.7399652344303744 {'max_features': 4, 'n_estimators': 30}
3.05266475342023 {'max_features': 6, 'n_estimators': 3}
2.782055849184892 {'max_features': 6, 'n_estimators': 10}
2.7039125781516566 {'max_features': 6, 'n_estimators': 30}
2.9831582591147807 {'max_features': 8, 'n_estimators': 3}
2.9338548557343676 {'max_features': 8, 'n_estimators': 10}
2.7672077288254537 {'max_features': 8, 'n_estimators': 30}
3.2004060352278083 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.908311126559703 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.1664332822882586 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
2.8397933605537498 {'bootstrap': False, 'max_features': 4, 'n_est

# Checking feature importance

In [33]:
feature_imp = grid_search.best_estimator_.feature_importances_
feature_imp


array([0.15642082, 0.22057712, 0.16216312, 0.22054461, 0.01766554,
       0.10987796, 0.04467523, 0.05776908, 0.00260542, 0.00439351,
       0.00330759])

In [40]:
extra_attr = ['acc_on_power', 'acc_on_cyl']
num_attr = list(data_features.select_dtypes(['int64','float64']))
attr = num_attr + extra_attr
sorted(zip(attr,feature_imp),reverse=True)

[('acc_on_power', 0.04467522718086139),
 ('acc_on_cyl', 0.057769075226421454),
 ('Weight', 0.2205446125454955),
 ('Model Year', 0.10987796326688817),
 ('Horsepower', 0.16216312280299816),
 ('Displacement', 0.22057711664334564),
 ('Cylinders', 0.15642081585173764),
 ('Acceleration', 0.017665542995828693)]

# Evaluating the entire system on test data

In [42]:
test_set

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
128,15.0,6,250.0,100.0,3336.0,17.0,74,1
100,18.0,6,250.0,88.0,3021.0,16.5,73,1
330,40.9,4,85.0,,1835.0,17.3,80,2
57,24.0,4,113.0,95.0,2278.0,15.5,72,3
160,17.0,6,231.0,110.0,3907.0,21.0,75,1
...,...,...,...,...,...,...,...,...
266,30.0,4,98.0,68.0,2155.0,16.5,78,1
389,22.0,6,232.0,112.0,2835.0,14.7,82,1
217,30.0,4,111.0,80.0,2155.0,14.8,77,1
66,17.0,8,304.0,150.0,3672.0,11.5,72,1


In [48]:
x_test = test_set.drop('MPG',axis = 1) 
y_test = test_set[['MPG']]  #()

x_test_preprocessed= preprocess_origin_col(x_test)
x_test_prepared = pipeline_transformer(x_test_preprocessed)

final_model = grid_search.best_estimator_

pred = final_model.predict(x_test_prepared)
mse = mean_squared_error(y_test,pred)
rmse = np.sqrt(mse)

In [49]:
rmse

2.9441253181359137

# Creating funtion to cover this entire flow

In [50]:
def predict_mpg(config,model) :
    if type(config)== dict :
        df =  pd.DataFrame(config)
    else :
        df = config
    
    preprocess = preprocess_origin_col(df)
    final_prepare = pipeline_transformer( preprocess)
    pred = model.predict(final_prepare)
    return pred

In [12]:
#cheking on random sample
vehical_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

In [52]:
predict_mpg(vehical_config,final_model)

array([32.19666667, 17.53      , 20.20666667])

# Saving The model

In [53]:
import pickle
pickle.dump(final_model,open('model.bin','wb'))

In [54]:
model = pickle.load(open('model.bin','rb'))

In [55]:
predict_mpg(vehical_config,model)

array([32.19666667, 17.53      , 20.20666667])

In [14]:
import requests

url = 'http://127.0.0.1:9696/'
r = requests.post(url, json = vehical_config)
r.text.strip()

'{\n  "My Prediction ": [\n    32.19666666666665, \n    17.529999999999998, \n    20.206666666666663\n  ]\n}'