In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf 

from sklearn.model_selection import train_test_split


In [4]:
df = pd.read_csv('Auto.csv')
df.shape

(397, 9)

In [5]:
df.head(4)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    object 
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.0+ KB


In [13]:
# Convert object datatype of horsepower to float datatype 
df['horsepower'] =  pd.to_numeric(df['horsepower'],errors='coerce')

In [14]:
# To see if we have any nan value
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      5
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [15]:
df.dropna(inplace=True)
print(df.shape)
df.isna().sum()

(392, 9)


mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [66]:
x = df['horsepower']
y = df['mpg']

In [68]:
def model_MSE(x, y, order): 
    
    # Split 50% of data into training and 50% into test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.5, random_state = 42, shuffle = False)
    
    df_train = pd.DataFrame({'mpg': y_train, 'horsepower': x_train})
    
    if order == '1st':
        formula = 'mpg ~ horsepower'
    elif order == '2nd':
        formula = 'mpg ~ horsepower + I(horsepower**2)'
    elif order == '3rd':
        formula = 'mpg ~ horsepower + I(horsepower**2) + I(horsepower**3)'
    # Create the polynomial model   
    model = smf.ols(formula = formula, data = df_train)
    
    # Fit the model
    result = model.fit()
    
    # Predict mpg on test dataset
    y_test_prediction = result.predict(x_test)
    
    # Print Mean Squared Error
    print('MSE of {} order polynomial fit function: {}'.format(order, np.mean(y_test - y_test_prediction)**2))
  

In [71]:
# MSE of evaluation dataset for a linear model
model_MSE(x, y, '1st')  

# MSE of evaluation dataset for a quadratic model
model_MSE(x, y, '2nd') 

# MSE of evaluation dataset for a cubic model
model_MSE(x, y, '3rd') 

MSE of 1st order polynomial fit function: 24.521263367513804
MSE of 2nd order polynomial fit function: 22.918389468935647
MSE of 3rd order polynomial fit function: 22.913525056358036


### Leave-One-Out Cross-Validation