# Auto MPG Prediction

In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [61]:
#reading .data file

cols=['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', ' Model_Year', 'Origin']

df=pd.read_csv('./auto-mpg.data', names=cols, na_values="?",
              comment='\t',
              sep=" ",
              skipinitialspace=True)

data=df.copy()

In [62]:
data.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model_Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [63]:
data.dtypes

MPG             float64
Cylinders         int64
Displacement    float64
Horsepower      float64
Weight          float64
Acceleration    float64
 Model_Year       int64
Origin            int64
dtype: object

In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6    Model_Year   398 non-null    int64  
 7   Origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB


In [65]:
data.isnull().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
 Model_Year     0
Origin          0
dtype: int64

In [66]:
median=data['Horsepower'].median()
median

93.5

In [67]:
data['Horsepower']=data['Horsepower'].fillna(median)

In [68]:
data.isnull().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      0
Weight          0
Acceleration    0
 Model_Year     0
Origin          0
dtype: int64

In [69]:
def add_Origin_names(df):
    df["Origin"]=df["Origin"].map({1:"India", 2:"USA", 3:"Germany"})
    return df

data1=add_Origin_names(data)
data1.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model_Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,India
394,44.0,4,97.0,52.0,2130.0,24.6,82,USA
395,32.0,4,135.0,84.0,2295.0,11.6,82,India
396,28.0,4,120.0,79.0,2625.0,18.6,82,India
397,31.0,4,119.0,82.0,2720.0,19.4,82,India


In [70]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    398 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6    Model_Year   398 non-null    int64  
 7   Origin        398 non-null    object 
dtypes: float64(5), int64(2), object(1)
memory usage: 25.0+ KB


In [71]:
data2=pd.get_dummies(data=data1, columns=['Origin'])
data2.columns

Index(['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
       'Acceleration', ' Model_Year', 'Origin_Germany', 'Origin_India',
       'Origin_USA'],
      dtype='object')

In [72]:
data2.dtypes

MPG               float64
Cylinders           int64
Displacement      float64
Horsepower        float64
Weight            float64
Acceleration      float64
 Model_Year         int64
Origin_Germany      uint8
Origin_India        uint8
Origin_USA          uint8
dtype: object

In [74]:
data3=data2.drop('MPG', axis='columns')
y=data2.MPG
data3.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model_Year,Origin_Germany,Origin_India,Origin_USA
0,8,307.0,130.0,3504.0,12.0,70,0,1,0
1,8,350.0,165.0,3693.0,11.5,70,0,1,0
2,8,318.0,150.0,3436.0,11.0,70,0,1,0
3,8,304.0,150.0,3433.0,12.0,70,0,1,0
4,8,302.0,140.0,3449.0,10.5,70,0,1,0


In [75]:
y.head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: MPG, dtype: float64

In [78]:
cols_to_scale=['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

from sklearn.preprocessing import MinMaxScaler
scaler= MinMaxScaler()

data3[cols_to_scale]=scaler.fit_transform(data3[cols_to_scale])

In [81]:
data3.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model_Year,Origin_Germany,Origin_India,Origin_USA
0,1.0,0.617571,0.456522,0.53615,0.238095,70,0,1,0
1,1.0,0.728682,0.646739,0.589736,0.208333,70,0,1,0
2,1.0,0.645995,0.565217,0.51687,0.178571,70,0,1,0
3,1.0,0.609819,0.565217,0.516019,0.238095,70,0,1,0
4,1.0,0.604651,0.51087,0.520556,0.14881,70,0,1,0


In [87]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(data3, y, test_size=0.2)

In [88]:
X_train.shape

(318, 9)

In [89]:
X_test.shape

(80, 9)

## Linear Regression Model

In [111]:
from sklearn.linear_model import LinearRegression

lin_reg=LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression()

In [112]:
yp=lin_reg.predict(X_test)

In [113]:
yp[:5]

array([11.01231866, 19.96431506, 15.16167726, 15.52970474, 23.09564727])

In [114]:
y_test[:5]

137    13.0
200    18.0
12     15.0
2      18.0
264    18.1
Name: MPG, dtype: float64

In [115]:
lin_reg.score(X_test, y_test)

0.8301662576042547

##### Mean Squared Error

In [116]:
from sklearn.metrics import mean_squared_error

yp=lin_reg.predict(X_test)
lin_mse=mean_squared_error(y_test, yp)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

3.1533786952620146

 ### Decision Tree

In [117]:
from sklearn.tree import DecisionTreeRegressor

tree_reg=DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [118]:
yp_tree=tree_reg.predict(X_test)


In [119]:
tree_reg.score(X_test, y_test)

0.7700754526318622

In [120]:
tree_mse=mean_squared_error(y_test, yp_tree)
tree_rmse=np.sqrt(tree_mse)
tree_rmse

3.669076859374848

Now we will check with

## Model Evaluation using Cross Validation

In [129]:
from sklearn.model_selection import cross_val_score

scores=cross_val_score(lin_reg, X_test, y_test, scoring="neg_mean_squared_error", cv=10 )
lin_reg_rmse_cross=np.sqrt(-scores)

In [130]:
lin_reg_rmse_cross.mean()

3.2100586926542256

In [131]:
scores=cross_val_score(tree_reg, X_test, y_test, scoring="neg_mean_squared_error", cv=10 )
tree_reg_rmse_cross=np.sqrt(-scores)

In [132]:
tree_reg_rmse_cross

array([2.76947648, 5.65574929, 3.08342342, 3.80952753, 2.22120463,
       4.18120796, 3.02014073, 4.10974452, 5.55709007, 5.70931257])

In [133]:
tree_reg_rmse_cross.mean()

4.011687718821106

## Random Forest

In [134]:
from sklearn.ensemble import RandomForestRegressor

forest_reg=RandomForestRegressor()
forest_reg.fit(X_train, y_train)

forest_reg_cv=cross_val_score(forest_reg, X_test, y_test, scoring="neg_mean_squared_error", cv=10)

forest_reg_rmse=np.sqrt(-forest_reg_cv)


In [135]:
forest_reg_rmse


array([2.60800772, 3.25229542, 2.16059688, 3.41210034, 1.96140157,
       2.92930169, 3.12583405, 4.03358074, 4.16580312, 4.02434964])

In [136]:
forest_reg_rmse.mean()

3.167327116713398

## SVM

In [137]:
from sklearn.svm import SVR

svm_reg=SVR(kernel='linear')
svm_reg.fit(X_train, y_train)
svm_cv=cross_val_score(svm_reg, X_test, y_test, scoring='neg_mean_squared_error', cv=10)

svm_rmse=np.sqrt(-svm_cv)


In [138]:
svm_rmse

array([2.67812896, 2.85486734, 3.72279942, 2.1306659 , 2.93424843,
       3.59453528, 3.52875931, 5.14162383, 4.64989762, 4.83591239])

In [139]:
svm_rmse.mean()

3.607143846488667

Random forest is giving goos mse score.

So we will hyper tune Random forest

## Hyper Parameter tunning from GridSearchCV

In [141]:
from sklearn.model_selection import GridSearchCV

param_grid=[
    {'n_estimators':[3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# passing two dictionaries

forest_reg=RandomForestRegressor()

grid_search=GridSearchCV(forest_reg, param_grid,
                        scoring='neg_mean_squared_error',
                        return_train_score=True,
                        cv=10,
                        )

grid_search.fit(X_train, y_train)


GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [142]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 30}

In [143]:
cv_scores=grid_search.cv_results_

# printing all parameters along with their scores

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score), params)

3.025225079750947 {'max_features': 2, 'n_estimators': 3}
2.9674611480729753 {'max_features': 2, 'n_estimators': 10}
2.9229474544090865 {'max_features': 2, 'n_estimators': 30}
3.1848129970478447 {'max_features': 4, 'n_estimators': 3}
2.9874112046461163 {'max_features': 4, 'n_estimators': 10}
2.819970772531182 {'max_features': 4, 'n_estimators': 30}
3.1814138783303525 {'max_features': 6, 'n_estimators': 3}
2.879308776817461 {'max_features': 6, 'n_estimators': 10}
2.8796346810068374 {'max_features': 6, 'n_estimators': 30}
3.0832688165293143 {'max_features': 8, 'n_estimators': 3}
2.8299743334211676 {'max_features': 8, 'n_estimators': 10}
2.8847497497966117 {'max_features': 8, 'n_estimators': 30}
3.2052981558540146 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.8622609316987564 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.2799948060049355 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
3.0793338526061853 {'bootstrap': False, 'max_features': 3, 'n_es