# Task for Today  

***

## Automobile MPG Prediction  

Given *data about various cars*, let's try to predict the **miles per gallon** of a given vehicle.  
  
We will use linear regression, decision tree, and random forest models to make our predictions.

# Getting Started

In [1]:
import numpy as np
import pandas as pd

import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
data = pd.read_csv('../input/autompg-dataset/auto-mpg.csv')

In [3]:
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


# Preprocessing

In [5]:
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [6]:
def onehot_encode(df, column_dict):
    df = df.copy()
    for column, prefix in column_dict.items():
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [7]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Fill in missing horsepower values with the column mean
    df['horsepower'] = df['horsepower'].replace('?', np.NaN).astype(np.float)
    df['horsepower'] = df['horsepower'].fillna(df['horsepower'].mean())
    
    # Create make feature
    df['make'] = df['car name'].apply(lambda x: re.search(r'^\w+', x).group(0))
    df = df.drop('car name', axis=1)
    
    # Fix typos in make names
    make_typo_correction = {
        'vw': 'volkswagen',
        'chevy': 'chevrolet',
        'maxda': 'mazda',
        'vokswagen': 'volkswagen',
        'toyouta': 'toyota',
        'chevroelt': 'chevrolet'
    }
    df['make'] = df['make'].replace(make_typo_correction)
    
    # One-hot encode nominal features
    nominal_feature_dict = {
        'origin': 'orig',
        'make': 'mk'
    }
    df = onehot_encode(df, nominal_feature_dict)
    
    # Split df into X and y
    y = df['mpg'].copy()
    X = df.drop('mpg', axis=1).copy()
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
    
    # Scale X_train and X_test with a standard scaler fit only on X_train
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [9]:
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,orig_1,orig_2,orig_3,mk_amc,...,mk_peugeot,mk_plymouth,mk_pontiac,mk_renault,mk_saab,mk_subaru,mk_toyota,mk_triumph,mk_volkswagen,mk_volvo
0,1.497785,1.620399,1.711111,1.968967,-0.958513,-0.827971,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,3.591657,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
1,1.497785,2.288862,2.446571,1.601619,-2.072349,-1.640357,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
2,-0.844742,-0.714375,-0.337673,-0.636509,-0.215956,1.067595,-1.313579,-0.427760,1.926620,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
3,0.326521,0.312538,0.135123,0.755426,1.157775,0.526005,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
4,-0.844742,-0.927508,-1.545930,-1.339518,1.826077,-1.640357,-1.313579,2.337759,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,4.342481,-0.135333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,1.497785,1.523521,1.711111,1.395058,-1.552559,0.255210,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
274,0.326521,0.554735,-0.127541,0.354042,0.897880,-0.827971,0.761279,-0.427760,-0.519044,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
275,-0.844742,-1.034075,-1.046867,-1.016767,0.860752,1.067595,-1.313579,-0.427760,1.926620,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,-0.278423,0.0,-0.230283,-0.135333
276,-0.844742,-0.820942,-0.915535,-0.858326,0.489473,1.609186,-1.313579,-0.427760,1.926620,-0.255214,...,-0.120824,-0.278423,-0.212398,-0.104447,-0.104447,-0.104447,3.591657,0.0,-0.230283,-0.135333


# Training

In [10]:
# Using a simple linear model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_r2 = linear_model.score(X_test, y_test)
print("Linear Regression R^2: {:.1f}".format(linear_r2))

Linear Regression R^2: -103413915227363066511360.0


In [11]:
# Using a decision tree model
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)

tree_r2 = tree_model.score(X_test, y_test)
print("Decision Tree R^2: {:.5f}".format(tree_r2))

Decision Tree R^2: 0.75984


In [12]:
# Using random forest regression model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

rf_r2 = rf_model.score(X_test, y_test)
print("Random Forest R^2: {:.5f}".format(rf_r2))

Random Forest R^2: 0.84344


# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/Q-mEPW2Zf4Q