<a href="https://colab.research.google.com/github/Otobi1/Fuel-Effficiency-Prediction-End-to-End/blob/master/End_to_End_MPG_Part_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Selecting and Training Models 

"""
Select and train a few algos, - Linear Reg, Decision tree, RandomForest 
Evaluation using Mean Squared Error
Model Eval using Cross Validation 
Hyperparameter Tuning using GridSearchCV
Check Feature Importance 
Eval final data 
Save Model 
"""

'\nSelect and train a few algos, - Linear Reg, Decision tree, RandomForest \nEvaluation using Mean Squared Error\nModel Eval using Cross Validation \nHyperparameter Tuning using GridSearchCV\nCheck Feature Importance \nEval final data \nSave Model \n'

In [None]:
# Importing the necessary libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings("ignore")

In [None]:
!wget http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data

--2021-01-02 11:51:13--  http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30286 (30K) [application/x-httpd-php]
Saving to: ‘auto-mpg.data’


2021-01-02 11:51:13 (1.05 MB/s) - ‘auto-mpg.data’ saved [30286/30286]



In [None]:
# Reading the .data file using pandas

cols = ["MPG", "Cylinders", "Displacement", "Horsepower", "Weight", "Acceleration", "Model Year", "Origin"]

df = pd.read_csv("./auto-mpg.data", names = cols, na_values = "?", comment = "\t", sep = " ", skipinitialspace = True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(data, data["Cylinders"]):
  strat_train_set = data.loc[train_index]
  strat_test_set = data.loc[test_index]

In [None]:
# Separating the Target and the Feature Variables 

data = strat_train_set.drop("MPG", axis = 1)
data_labels = strat_train_set["MPG"].copy()
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [None]:
# Preprocessing the Origin Column
#data["Origin"].head()

def preprocess_origin_cols(df):
  df["Origin"].head()
  df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
  
  return df 
#data_tr = preprocess_origin_cols(data)
#data_tr.head()


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

acc_ix, hpower_ix, cyl_ix = 4, 2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power = True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y = None):
        return self # nothing else to do
    def transform(self, X): 
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power: 
          acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
          return np.c_[X, acc_on_power, acc_on_cyl]

        return np.c_[X, acc_on_cyl]

In [None]:
def num_pipeline_transformer(data):
    """ 
    Function to process numerical transformations 
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical  dataframe
        num_pipeline: numerical pipeline object 
    """

    numerics = ["float64", "int64"]

    num_attrs = data.select_dtypes(include = numerics)

    num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy = "median")), 
        ("attrs_adder", CustomAttrAdder()),                   
        ("std_scaler", StandardScaler()), 
        ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    """
    Complete transformation pipeline for both
    numerical and categorical data. 

    Argument: 
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use 
    """

    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
         ("num", num_pipeline, list(num_attrs)), 
         ("cat", OneHotEncoder(), cat_attrs), 
         ])                             
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

In [None]:
# From raw data to processed data in 2 steps

preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [None]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

In [None]:
# Selecting and Training Models

# Linear Regression 

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# testing the predictions with the 

sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_data_prepared))

Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [None]:
print("Actual Labels of Samples: ", list(sample_labels))

Actual Labels of Samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


In [None]:
# Mean Squared Error 

from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.9590402225760872

In [None]:
# Decision Tree 

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [None]:
# Model Evaluation using Cross Validation 

from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, 
                         prepared_data, 
                         data_labels, 
                         scoring = "neg_mean_squared_error", 
                         cv = 10)
tree_reg_rmse_scores = np.sqrt(-scores)

In [None]:
tree_reg_rmse_scores

array([3.79345654, 3.22959943, 2.90037713, 3.60312364, 2.18717855,
       3.04912897, 3.44229211, 4.21870833, 4.20767348, 2.76510514])

In [None]:
tree_reg_rmse_scores.mean()

3.3396643340013

In [None]:
scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring = "neg_mean_squared_error")
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.47334514, 3.17094688, 2.6556235 , 2.92218787, 3.33366761])

In [None]:
lin_reg_rmse_scores.mean()

3.111154198042705

In [None]:
# Random Forest Model 

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg, 
                                       prepared_data, 
                                       data_labels, 
                                       scoring = "neg_mean_squared_error", 
                                       cv = 10)
forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()

2.588366432183667

In [None]:
# Support Vector Machine Regressor 

from sklearn.svm import SVR

svm_reg = SVR(kernel = "linear")
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels, scoring = "neg_mean_squared_error", cv = 10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.08659162080283

In [None]:
# Hyperparameter Tu[ning using GridSearchCV

from sklearn.model_selection import GridSearchCV

param_grid = [
              {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]}, 
              {"bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3]}
]
forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, 
                           scoring = "neg_mean_squared_error", 
                           return_train_score = True, 
                           cv = 10,
                           )
grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_j

In [None]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [None]:
cv_scores = grid_search.cv_results_

In [None]:
cv_scores = grid_search.cv_results_

# printing all the parameters along with their scores 

for mean_score, params in zip(cv_scores["mean_test_score"], cv_scores["params"] ):
  print(np.sqrt(-mean_score), params)

3.4361357018936194 {'max_features': 2, 'n_estimators': 3}
3.013765610534757 {'max_features': 2, 'n_estimators': 10}
2.9363935182111263 {'max_features': 2, 'n_estimators': 30}
3.243095540189296 {'max_features': 4, 'n_estimators': 3}
2.861352398849751 {'max_features': 4, 'n_estimators': 10}
2.7394335767973255 {'max_features': 4, 'n_estimators': 30}
3.0689781165898733 {'max_features': 6, 'n_estimators': 3}
2.779106845529867 {'max_features': 6, 'n_estimators': 10}
2.761132058648354 {'max_features': 6, 'n_estimators': 30}
3.0854213360234173 {'max_features': 8, 'n_estimators': 3}
2.753533131752965 {'max_features': 8, 'n_estimators': 10}
2.641967393884615 {'max_features': 8, 'n_estimators': 30}
3.407287809248547 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.901277973764042 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.191530886524769 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.878697916341412 {'bootstrap': False, 'max_features': 3, 'n_estimators

In [None]:
# Checking Feature Importance 

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.23459717, 0.27366968, 0.12804363, 0.17745672, 0.01846406,
       0.11143703, 0.02789091, 0.02329464, 0.00191294, 0.00122205,
       0.00201117])

In [None]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ["float64", "int64"]
num_attrs = list(data.select_dtypes(include = numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse = True)

[('acc_on_power', 0.027890911437804047),
 ('acc_on_cyl', 0.023294639327013453),
 ('Weight', 0.17745672169542728),
 ('Model Year', 0.11143703021158649),
 ('Horsepower', 0.12804363270644917),
 ('Displacement', 0.27366968034259703),
 ('Cylinders', 0.23459717215414663),
 ('Acceleration', 0.01846406005161365)]

In [None]:
forest_reg = RandomForestRegressor(bootstrap = False, 
                                   max_features = 3, 
                                   n_estimators = 10)

forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg, prepared_data, 
                                       scoring = "neg_mean_squared_error", 
                                       cv = 10)
forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()

nan

In [None]:
# Evaluating the Entire System on Test Data 

final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("MPG", axis = 1)
y_test = strat_test_set["MPG"].copy()

X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse

3.093732934857537

In [None]:
# Creating a fucntion to cover the entire workflow

def predict_mpg(config, model):
  if type(config) == dict:
    df = pd.DataFrame(config)
  else:
    df = config

  preproc_df = preprocess_origin_cols(df)
  prepared_df = pipeline_transformer(preproc_df)
  print(prepared_df)
  y_pred = model.predict(prepared_df)
  return y_pred

In [None]:
# Checking it on random sample 

vehicle_config = {
    "Cylinder": [4, 6, 8], 
    "Displacement": [155.0, 160.0, 165.5], 
    "Horsepower": [93.0, 130.0, 98.0], 
    "Weight": [2500.0, 3150.0, 2600.0], 
    "Acceleration": [15.0, 14.0, 16.0],
    "Model Year": [81, 80, 78], 
    "Origin": [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

[[-1.22474487 -1.20484922 -0.85412443 -0.87481777  0.          1.06904497
   0.6684025   1.39127885  1.          0.          0.        ]
 [ 0.         -0.0388661   1.40320441  1.39970842 -1.22474487  0.26726124
  -1.41351982 -0.47596382  0.          0.          1.        ]
 [ 1.22474487  1.24371532 -0.54907999 -0.52489066  1.22474487 -1.33630621
   0.74511732 -0.91531503  0.          1.          0.        ]]


array([33.15666667, 18.50333333, 19.22333333])

In [None]:
# Save the Model

import pickle

In [None]:
# saving the model 

with open("model.bin", "wb") as f_out:
  pickle.dump(final_model, f_out)
  f_out.close()

In [None]:
# loading the model from the saved files 

with open ("model.bin", "rb") as f_in:
  model = pickle.load(f_in)

predict_mpg(vehicle_config, model)

[[-1.22474487 -1.20484922 -0.85412443 -0.87481777  0.          1.06904497
   0.6684025   1.39127885  1.          0.          0.        ]
 [ 0.         -0.0388661   1.40320441  1.39970842 -1.22474487  0.26726124
  -1.41351982 -0.47596382  0.          0.          1.        ]
 [ 1.22474487  1.24371532 -0.54907999 -0.52489066  1.22474487 -1.33630621
   0.74511732 -0.91531503  0.          1.          0.        ]]


array([33.15666667, 18.50333333, 19.22333333])

In [None]:
import requests

url = "https://fuel-efficiency.herokuapp.com/"
r = requests.post(url, json = vehicle_config)
r.text.strip()

'{"mpg_predictions":[26.270000000000003,23.78,19.240000000000002]}'