In [51]:
import pandas as pd
import numpy as np 
import sklearn.linear_model as lm
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
#from sklearn.metrics import accuracy
#from sklearn.metrics import report

In [52]:
df = pd.read_csv('auto-mpg.csv',na_values='?').dropna()
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [53]:
train_df = df.sample(170,random_state=1)
train_df.head()
test_df = df[~df.isin(train_df)].dropna(how='all')
print(train_df.head())
#the x parts of data will be reshaped (-1,1) due to the requirement of linear regression
#y parts do not really need to be shaped 
x_train =  train_df['horsepower'].values.reshape(-1,1)
y_train = train_df['mpg']
x_test = test_df['horsepower'].values.reshape(-1,1)
y_test = test_df['mpg']
print(x_train)

      mpg  cylinders  displacement  horsepower  weight  acceleration  \
82   23.0          4         120.0        97.0    2506          14.5   
167  29.0          4          97.0        75.0    2171          16.0   
356  32.4          4         108.0        75.0    2350          16.8   
120  19.0          4         121.0       112.0    2868          15.5   
385  38.0          4          91.0        67.0    1995          16.2   

     model year  origin                     car name  
82           72       3  toyouta corona mark ii (sw)  
167          75       3               toyota corolla  
356          81       3               toyota corolla  
120          73       2                  volvo 144ea  
385          82       3                datsun 310 gx  
[[ 97.]
 [ 75.]
 [ 75.]
 [112.]
 [ 67.]
 [ 83.]
 [ 69.]
 [150.]
 [ 92.]
 [ 72.]
 [115.]
 [ 76.]
 [130.]
 [ 70.]
 [ 84.]
 [140.]
 [ 84.]
 [ 48.]
 [220.]
 [100.]
 [ 75.]
 [110.]
 [150.]
 [170.]
 [198.]
 [150.]
 [ 68.]
 [100.]
 [110.]
 [ 84

In [54]:
lin_reg = lm.LinearRegression()
model = lin_reg.fit(x_train,y_train)

In [55]:
predictions = model.predict(x_test)
error = mse(y_test,predictions)
print("Mean Squared Error ",error)

Mean Squared Error  24.331941916661272


In [56]:
#Fitting the data to a quardatic equation
#create constructor of polynomial
poly = PolynomialFeatures(degree=2)
x_train2 = poly.fit_transform(x_train)
x_test2 = poly.fit_transform(x_test)

model = lin_reg.fit(x_train2,y_train)
pred2= model.predict(x_test2)
mse2 = mse(y_test,pred2)
print("Quardatic")
print("Mean Squared Error ", mse2)


Quardatic
Mean Squared Error  21.053740590984457


In [57]:
#Fitting the data to a cubic equation
#create constructor of polynomial
poly = PolynomialFeatures(degree=3)
x_train3 = poly.fit_transform(x_train)
x_test3 = poly.fit_transform(x_test)

model = lin_reg.fit(x_train3,y_train)
pred3= model.predict(x_test3)
mse3 = mse(y_test,pred3)
print("Cubic")
print("Mean Squared Error ", mse3)



Cubic
Mean Squared Error  21.22820771535413


In [58]:
#KFOLD
model = lin_reg.fit(x_train,y_train)
crossvalid = KFold(n_splits=150, random_state=None, shuffle=True)
score = cross_val_score(model,x_train,y_train,scoring= 'neg_mean_squared_error',cv=crossvalid)
print("Folds: ",str(len(score)))
print("Mean Squared Error: ",str(np.mean(np.abs(score))))

Folds:  150
Mean Squared Error:  24.37718392581454


In [60]:
#LeaveOneOut
loo = LeaveOneOut()
it  = loo.get_n_splits(x_train)
scoress = cross_val_score(model,x_train,y_train,scoring= 'neg_mean_squared_error',cv=loo)
print("Folds: ",str(len(scoress)))
print("Mean Squared Error: ",str(np.mean(np.abs(scoress))))

Folds:  170
Mean Squared Error:  24.414813500554192
