# Construct a linear model that explains the relationship a car's mileage (mpg) has with its other attributes

## Import libraries

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import os


## Load the file

In [40]:
os.chdir('E:\Programming\Anaconda\Learning Python\Pandas')
car_df = pd.read_csv("auto-mpg.csv")

In [41]:
car_df.shape

(398, 10)

In [42]:
car_df.sample(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_America,origin_Asia,origin_Europe
157,15.0,8,350.0,145.0,4440,14.0,75,1,0,0
32,25.0,4,98.0,0.0,2046,19.0,71,1,0,0
100,18.0,6,250.0,88.0,3021,16.5,73,1,0,0
303,31.8,4,85.0,65.0,2020,19.2,79,0,1,0
250,19.4,8,318.0,140.0,3735,13.2,78,1,0,0
220,33.5,4,85.0,70.0,1945,16.8,77,0,1,0
112,19.0,4,122.0,85.0,2310,18.5,73,1,0,0
148,26.0,4,116.0,75.0,2246,14.0,74,0,0,1
274,20.3,5,131.0,103.0,2830,15.9,78,0,0,1
156,16.0,8,400.0,170.0,4668,11.5,75,1,0,0


In [43]:
car_df.drop("car name",axis=1,inplace=True)

KeyError: "['car name'] not found in axis"

In [44]:
#Replacing categorical var with actual values
car_df['origin'] = car_df['origin'].replace({1: 'America', 2: 'Europe',3: 'Asia'})
car_df.sample(10)

KeyError: 'origin'

In [45]:
#One hot encoding
car_df = pd.get_dummies(car_df,columns=['origin'])
car_df.sample(10)

KeyError: "None of [Index(['origin'], dtype='object')] are in the [columns]"

In [46]:
car_df.isnull().sum()

mpg               0
cylinders         0
displacement      0
horsepower        0
weight            0
acceleration      0
model year        0
origin_America    0
origin_Asia       0
origin_Europe     0
dtype: int64

In [47]:
car_df.dtypes

mpg               float64
cylinders           int64
displacement      float64
horsepower        float64
weight              int64
acceleration      float64
model year          int64
origin_America      int64
origin_Asia         int64
origin_Europe       int64
dtype: object

## Dealing with Missing Values

In [48]:
car_df['horsepower'] = car_df['horsepower'].replace(to_replace='?',value=np.nan)


In [49]:
car_df['horsepower'] = car_df['horsepower'].astype(float)

In [50]:
mean1=car_df["horsepower"].mean()
mean1

102.89447236180905

In [51]:
car_df["horsepower"].replace(np.nan,mean1,inplace=True)

In [52]:
car_df.dtypes

mpg               float64
cylinders           int64
displacement      float64
horsepower        float64
weight              int64
acceleration      float64
model year          int64
origin_America      int64
origin_Asia         int64
origin_Europe       int64
dtype: object

## Split Data

In [53]:
x = car_df.drop(['mpg'], axis=1) #independent
y = car_df[['mpg']] #dependent

In [54]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.30,random_state=1)

## Fit Linear Model

In [55]:
model_1 = LinearRegression()
model_1.fit(x_train,y_train)

LinearRegression()

In [56]:
model_1.score(x_train,y_train)

0.8134399204354176

In [57]:
model_1.score(x_test,y_test)

0.8452190244999926

In [58]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

poly = PolynomialFeatures(degree=4,interaction_only=True)
x_train2 = poly.fit_transform(x_train)
x_test2 = poly.fit_transform(x_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(x_train2, y_train)
#y_pred = poly_clf.predict(x_test2)

#print(y_pred)

#In sample (training) R^2 will always improve with the number of variables
print(poly_clf.score(x_train2,y_train))

0.9618031120030835


In [59]:
#Out of sample (testing) R^2 is our measure of success and does improve
print(poly_clf.score(x_test2, y_test))

-67.83774000496005
