In [2]:
import pandas as pd 

# Read the auto-mpg CSV from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="auto-mpg.csv")

# Preview the first five rows.
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
from sklearn.model_selection import train_test_split
# Split the whole frame with the default proportions; random_state pins the
# shuffle so re-running the notebook puts the same rows in each partition.
train, test = train_test_split(data, random_state=42)

In [5]:
# Dimensions of the full dataset as (rows, columns)
data.shape

(392, 9)

In [6]:
# Dimensions of the training partition returned by train_test_split
train.shape

(294, 9)

In [7]:
# Dimensions of the test partition returned by train_test_split
test.shape

(98, 9)

In [8]:
# Target column
y = data["mpg"]

# Features: everything except the target itself and the string-valued car name
X = data.drop(columns=["mpg", "car name"])

# Split the data with the default proportions. A fixed random_state makes the
# partition (and every output derived from it) reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
106,6,232.0,100,2789,15.0,73,1
68,8,350.0,160,4456,13.5,72,1
239,6,146.0,97,2815,14.5,77,3
18,4,97.0,88,2130,14.5,70,3
276,4,89.0,71,1990,14.9,78,2


In [10]:
# Dimensions of the training features as (rows, columns)
X_train.shape

(294, 7)

In [11]:
# Shape of the training target (a single column, so a 1-tuple)
y_train.shape

(294,)

In [12]:
# Row counts of the four partitions, in the order train/test features then train/test target
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

294 98 294 98


In [14]:
# Redo the split with an explicit 80/20 proportion and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Row counts of the four partitions after the 80/20 split
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

313 79 313 79


## Log Transformation

In [18]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Source columns and the names they take once log-transformed.
log_columns = ["displacement", "horsepower", "weight"]
new_log_columns = ["log_disp", "log_hp", "log_wt"]

# Wrap np.log in a scikit-learn transformer (validate=True is passed through
# to FunctionTransformer).
log_transformer = FunctionTransformer(func=np.log, validate=True)

# Fit/transform the training columns, then rebuild a DataFrame that keeps the
# training row index so it can be concatenated back later.
train_log_values = log_transformer.fit_transform(X_train[log_columns])
X_train_log = pd.DataFrame(
    train_log_values,
    columns=new_log_columns,
    index=X_train.index,
)

X_train_log.head()

Unnamed: 0,log_disp,log_hp,log_wt
258,5.4161,4.70048,8.194229
182,4.941642,4.521789,7.852439
172,5.141664,4.574711,8.00102
63,5.762051,5.010635,8.327243
340,4.454347,4.158883,7.536364


In [19]:
# Apply the already-fitted log transformer to the test columns and rebuild a
# DataFrame on the test row index.
test_log_values = log_transformer.transform(X_test[log_columns])
X_test_log = pd.DataFrame(
    test_log_values,
    columns=new_log_columns,
    index=X_test.index,
)

X_test_log.head()

Unnamed: 0,log_disp,log_hp,log_wt
78,4.564348,4.234107,7.6912
274,4.795791,4.744932,7.935587
246,4.51086,4.094345,7.495542
55,4.51086,4.248495,7.578145
387,4.941642,4.454347,7.933797


In [20]:
from sklearn.preprocessing import OneHotEncoder

# Instantiate OneHotEncoder with dense (non-sparse) output.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and the old
# name was removed in 1.4, so try the current keyword first and fall back to
# the legacy one only on releases that do not know `sparse_output`.
try:
    ohe = OneHotEncoder(drop="first", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop="first", sparse=False)

# Create X_train_cat, which contains only the categorical variables
cat_columns = ["origin"]
X_train_cat = X_train.loc[:, cat_columns]

# Fit the encoder on the training set and keep the training row index
X_train_ohe = pd.DataFrame(ohe.fit_transform(X_train_cat),
                           index=X_train.index)

X_train_ohe.head()

Unnamed: 0,0,1
258,0.0,0.0
182,0.0,0.0
172,0.0,0.0
63,0.0,0.0
340,0.0,0.0


In [21]:
# Preview the training features, which still contain the raw columns that
# were log-transformed / one-hot encoded above
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
258,6,225.0,110,3620,18.7,78,1
182,4,140.0,92,2572,14.9,76,1
172,6,171.0,97,2984,14.5,75,1
63,8,318.0,150,4135,13.5,72,1
340,4,86.0,64,1875,16.4,81,1


In [22]:
# Columns that now exist in transformed form and must not be duplicated
cols_to_drop = log_columns + cat_columns

# Drop into a new frame instead of overwriting X_train: reassigning X_train
# would make this cell (and the transform cells that read
# X_train[log_columns] / X_train[cat_columns]) fail with a KeyError on re-run.
X_train_rest = X_train.drop(columns=cols_to_drop)

# Combine the three datasets into training; all three share X_train's index,
# so the column-wise concat aligns row by row.
X_train_tr = pd.concat([X_train_rest, X_train_log, X_train_ohe], axis=1)
X_train_tr.head()

Unnamed: 0,cylinders,acceleration,model year,log_disp,log_hp,log_wt,0,1
258,6,18.7,78,5.4161,4.70048,8.194229,0.0,0.0
182,4,14.9,76,4.941642,4.521789,7.852439,0.0,0.0
172,6,14.5,75,5.141664,4.574711,8.00102,0.0,0.0
63,8,13.5,72,5.762051,5.010635,8.327243,0.0,0.0
340,4,16.4,81,4.454347,4.158883,7.536364,0.0,0.0


In [23]:
# Encode the test-set categorical column with the encoder fitted on the
# training data, keeping the test row index.
test_ohe_values = ohe.transform(X_test.loc[:, cat_columns])
X_test_ohe = pd.DataFrame(test_ohe_values, index=X_test.index)

X_test_ohe.head()

Unnamed: 0,0,1
78,1.0,0.0
274,1.0,0.0
246,0.0,1.0
55,0.0,0.0
387,0.0,0.0


In [24]:
# Drop into a new frame instead of overwriting X_test: reassigning X_test
# would make this cell (and the cells that read X_test[log_columns] /
# X_test[cat_columns]) fail with a KeyError on re-run.
X_test_rest = X_test.drop(columns=cols_to_drop)

# Combine test set; all three frames share X_test's index, so the column-wise
# concat aligns row by row.
X_test_tr = pd.concat([X_test_rest, X_test_log, X_test_ohe], axis=1)
X_test_tr.head()

Unnamed: 0,cylinders,acceleration,model year,log_disp,log_hp,log_wt,0,1
78,4,18.0,72,4.564348,4.234107,7.6912,1.0,0.0
274,4,15.7,78,4.795791,4.744932,7.935587,1.0,0.0
246,4,16.4,78,4.51086,4.094345,7.495542,0.0,1.0
55,4,20.5,71,4.51086,4.248495,7.578145,0.0,0.0
387,4,15.6,82,4.941642,4.454347,7.933797,0.0,0.0


## Building, Evaluating and Validating a Model 

In [25]:
# The one-hot columns carry integer labels while the rest are strings; cast
# every label to str so sklearn does not raise a TypeError on the feature names.
for frame in (X_train_tr, X_test_tr):
    frame.columns = frame.columns.astype(str)

In [26]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the transformed training features
# (fit returns the estimator itself, so it can be chained).
linreg = LinearRegression().fit(X_train_tr, y_train)

# Predictions for both partitions
y_hat_train, y_hat_test = linreg.predict(X_train_tr), linreg.predict(X_test_tr)

In [27]:
# Residual = observed - predicted (the standard sign convention, and the same
# order the MSE computation below uses).
train_residuals = y_train - y_hat_train
test_residuals = y_test - y_hat_test

In [28]:
# Mean squared error by hand: sum of squared errors divided by sample count.
squared_err_train = (y_train - y_hat_train) ** 2
squared_err_test = (y_test - y_hat_test) ** 2

mse_train = np.sum(squared_err_train) / len(y_train)
mse_test = np.sum(squared_err_test) / len(y_test)

print('Train Mean Squared Error:', mse_train)
print('Test Mean Squared Error:', mse_test)

Train Mean Squared Error: 9.091818811315939
Test Mean Squared Error: 10.010059484009497


In [29]:
from sklearn.metrics import mean_squared_error

# Same metric via scikit-learn, for comparison with the manual computation.
train_mse = mean_squared_error(y_true=y_train, y_pred=y_hat_train)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_hat_test)

print('Train Mean Squared Error:', train_mse)
print('Test Mean Squared Error:', test_mse)

Train Mean Squared Error: 9.091818811315939
Test Mean Squared Error: 10.010059484009497


### Overfitting with a Different Model

In [30]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree regressor with a fixed seed on the same features
# (fit returns the estimator itself, so it can be chained).
other_model = DecisionTreeRegressor(random_state=42).fit(X_train_tr, y_train)

# Score it on both partitions to compare train vs. test error.
other_pred_train = other_model.predict(X_train_tr)
other_pred_test = other_model.predict(X_test_tr)
other_train_mse = mean_squared_error(y_train, other_pred_train)
other_test_mse = mean_squared_error(y_test, other_pred_test)

print('Train Mean Squared Error:', other_train_mse)
print('Test Mean Squared Error:', other_test_mse)

Train Mean Squared Error: 0.0
Test Mean Squared Error: 11.403164556962025
