### PROBLEM STATEMENT: We will construct a linear model that explains the relationship between a car's mileage (mpg) and its other attributes

## Import Libraries

### Step 1:Import the libraries

In [1]:
import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
  

In [2]:
### Step 2:Load the file

## Load and review data

In [3]:
# Read the auto-mpg dataset; the path is relative to the notebook's working directory.
car_df = pd.read_csv("auto-mpg.csv")


In [4]:
# (rows, columns) of the frame as loaded from the CSV
car_df.shape

(398, 9)

In [5]:
# Peek at 10 randomly drawn rows (no random_state is set, so the rows differ between runs)
car_df.sample(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
346,32.3,4,97.0,67,2065,17.8,81,3,subaru
150,26.0,4,108.0,93,2391,15.5,74,3,subaru
168,23.0,4,140.0,83,2639,17.0,75,1,ford pinto
230,15.5,8,350.0,170,4165,11.4,77,1,chevrolet monte carlo landau
199,20.0,6,225.0,100,3651,17.7,76,1,dodge aspen se
170,23.0,4,140.0,78,2592,18.5,75,1,pontiac astro
225,17.5,6,250.0,110,3520,16.4,77,1,chevrolet concours
378,38.0,4,105.0,63,2125,14.7,82,1,plymouth horizon miser
125,20.0,6,198.0,95,3102,16.5,74,1,plymouth duster
280,21.5,6,231.0,115,3245,15.4,79,1,pontiac lemans v6


In [6]:
# The free-text car name is not used as a predictor, so remove that column.
car_df = car_df.drop(columns=["car name"])

In [7]:
# Swap the numeric origin codes for readable region labels, so the one-hot
# columns created later get self-describing names.
car_df['origin'] = car_df['origin'].replace(
    {1: 'america', 2: 'europe', 3: 'asia'}
)
car_df.sample(n=10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
48,18.0,6,250.0,88,3139,14.5,71,america
12,15.0,8,400.0,150,3761,9.5,70,america
333,32.7,6,168.0,132,2910,11.4,80,asia
14,24.0,4,113.0,95,2372,15.0,70,asia
372,27.0,4,151.0,90,2735,18.0,82,america
326,43.4,4,90.0,48,2335,23.7,80,europe
77,22.0,4,121.0,76,2511,18.0,72,europe
392,27.0,4,151.0,90,2950,17.3,82,america
101,23.0,6,198.0,95,2904,16.0,73,america
367,28.0,4,112.0,88,2605,19.6,82,america


In [8]:
# One-hot encode `origin` into origin_america / origin_asia / origin_europe.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every
# pandas version: pandas >= 2.0 otherwise produces bool columns, which the
# default describe() leaves out of its numeric summary.
# NOTE(review): all three indicators are kept, so they sum to 1 on every row
# and are collinear with a fitted intercept; consider drop_first=True if the
# individual coefficients are going to be interpreted.
car_df = pd.get_dummies(car_df, columns=['origin'], dtype='uint8')
car_df.sample(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
8,14.0,8,455.0,225,4425,10.0,70,1,0,0
393,27.0,4,140.0,86,2790,15.6,82,1,0,0
387,38.0,6,262.0,85,3015,17.0,82,1,0,0
13,14.0,8,455.0,225,3086,10.0,70,1,0,0
336,23.6,4,140.0,?,2905,14.3,80,1,0,0
230,15.5,8,350.0,170,4165,11.4,77,1,0,0
146,28.0,4,90.0,75,2125,14.5,74,1,0,0
155,15.0,6,250.0,72,3158,19.5,75,1,0,0
216,31.5,4,98.0,68,2045,18.5,77,0,1,0
95,12.0,8,455.0,225,4951,11.0,73,1,0,0


In [9]:
# Count NaN entries per column. The "?" placeholders in horsepower are
# ordinary strings, so they are not counted here.
car_df.isna().sum()

mpg               0
cylinders         0
displacement      0
horsepower        0
weight            0
acceleration      0
model year        0
origin_america    0
origin_asia       0
origin_europe     0
dtype: int64

In [10]:
# Column dtypes: horsepower shows up as object instead of a numeric type
car_df.dtypes


mpg               float64
cylinders           int64
displacement      float64
horsepower         object
weight              int64
acceleration      float64
model year          int64
origin_america      uint8
origin_asia         uint8
origin_europe       uint8
dtype: object

## Dealing with Missing Values

In [11]:
# A quick summary of the numeric columns (horsepower is object-typed, so it is left out)
car_df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin_america,origin_asia,origin_europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0,0.0,0.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0,1.0,1.0


In [12]:
# include="all" also summarises the non-numeric horsepower column (count / unique / top / freq)
car_df.describe(include="all")

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
unique,,,,94.0,,,,,,
top,,,,150.0,,,,,,
freq,,,,22.0,,,,,,
mean,23.514573,5.454774,193.425879,,2970.424623,15.56809,76.01005,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,,846.841774,2.757689,3.697627,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,,1613.0,8.0,70.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,,2223.75,13.825,73.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,,2803.5,15.5,76.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,,3608.0,17.175,79.0,1.0,0.0,0.0


In [13]:
# Per-column non-null counts and dtypes, plus the frame's memory usage
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cylinders       398 non-null    int64  
 2   displacement    398 non-null    float64
 3   horsepower      398 non-null    object 
 4   weight          398 non-null    int64  
 5   acceleration    398 non-null    float64
 6   model year      398 non-null    int64  
 7   origin_america  398 non-null    uint8  
 8   origin_asia     398 non-null    uint8  
 9   origin_europe   398 non-null    uint8  
dtypes: float64(3), int64(3), object(1), uint8(3)
memory usage: 23.1+ KB


In [14]:
# horsepower is absent from the describe() summary because pandas stores it as
# object (string) rather than as a numeric column.
car_df.dtypes

mpg               float64
cylinders           int64
displacement      float64
horsepower         object
weight              int64
acceleration      float64
model year          int64
origin_america      uint8
origin_asia         uint8
origin_europe       uint8
dtype: object

In [15]:
# Flag, row by row, whether the horsepower string is made up solely of digits.
# Kept as a one-column frame named 'horsepower' because later cells index it that way.
hpIsDigit = car_df["horsepower"].str.isdigit().to_frame()

# Show the rows whose horsepower entry is not a plain number.
car_df[hpIsDigit["horsepower"].eq(False)]


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,?,2046,19.0,71,1,0,0
126,21.0,6,200.0,?,2875,17.0,74,1,0,0
330,40.9,4,85.0,?,1835,17.3,80,0,0,1
336,23.6,4,140.0,?,2905,14.3,80,1,0,0
354,34.5,4,100.0,?,2320,15.8,81,0,0,1
374,23.0,4,151.0,?,3035,20.5,82,1,0,0


In [16]:
# Turn the "?" placeholders into NaN, then cast the column to float so pandas
# treats horsepower as numeric.
car_df["horsepower"] = car_df["horsepower"].replace("?", np.nan).astype(float)

In [17]:
# Median horsepower; used below as the fill value for the missing entries
median1=car_df["horsepower"].median()
median1

93.5

In [18]:
# Impute the missing horsepower values with the median computed above.
# The result is assigned back rather than calling replace(..., inplace=True) on
# the column selection: under pandas Copy-on-Write that form only mutates a
# temporary object and leaves car_df unchanged (pandas 2.2 already warns about it).
car_df["horsepower"] = car_df["horsepower"].fillna(median1)

In [19]:
# Re-display the six formerly non-numeric rows (mask computed before the fix) to confirm the fill
car_df[hpIsDigit['horsepower'] == False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,93.5,2046,19.0,71,1,0,0
126,21.0,6,200.0,93.5,2875,17.0,74,1,0,0
330,40.9,4,85.0,93.5,1835,17.3,80,0,0,1
336,23.6,4,140.0,93.5,2905,14.3,80,1,0,0
354,34.5,4,100.0,93.5,2320,15.8,81,0,0,1
374,23.0,4,151.0,93.5,3035,20.5,82,1,0,0


In [20]:
# Confirm that horsepower is now float64
car_df.dtypes

mpg               float64
cylinders           int64
displacement      float64
horsepower        float64
weight              int64
acceleration      float64
model year          int64
origin_america      uint8
origin_asia         uint8
origin_europe       uint8
dtype: object

In [21]:
# Duplicates? Mark rows that exactly repeat an earlier row, then count them.
duplicate=car_df.duplicated()
duplicate.sum()

0

There are various ways to handle missing values: drop the affected rows, replace the missing values with the column median, etc. Of the 398 rows, 6 have NaN in the horsepower column. We could drop those 6 rows, but that is not a good idea in every situation, so here they were replaced with the median instead.


## BiVariate Plots

A bivariate analysis of the variables can be done with a scatter-matrix plot. Seaborn's `pairplot` creates a dashboard-style grid that shows useful information about every pair of dimensions. The result can be saved as a .png file.

In [None]:
# Scatter plots for every pair of columns, with KDE curves on the diagonal
sns.pairplot(car_df,diag_kind="kde")

<seaborn.axisgrid.PairGrid at 0x2148a1d34c0>

The plots of 'mpg' against the other attributes indicate that the relationships are not really linear. However, they also suggest that a linear model would still capture quite a bit of the useful pattern. Several assumptions of classical linear regression seem to be violated, including homoscedasticity (constant error variance, i.e. no heteroscedasticity).


## Split Data

In [None]:
# Assemble the regression inputs for the linear model.
# Predictors: every column other than the target.
X = car_df.drop(columns=['mpg'])
# Target: selected with double brackets, so it stays a one-column DataFrame.
y = car_df[['mpg']]

In [None]:
# Hold out 30% of the rows for testing (70:30 split); the fixed random_state
# keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

## Fit Linear Model

In [None]:
# Fit a linear regression of mpg on all predictors, using the training split only.
model_1 = LinearRegression()
model_1.fit(X_train, y_train)


Here are the coefficients for each variable and the intercept (available on the fitted model as `model_1.coef_` and `model_1.intercept_`)

The score (R^2) for in-sample and out of sample

In [None]:
# In-sample score (R^2) on the training split
model_1.score(X_train, y_train)

In [None]:
# Out-of-sample score (R^2) on the held-out test split

model_1.score(X_test, y_test)


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# Extend the predictors with the pairwise products of distinct features
# (interaction_only=True leaves out the squared terms).
poly = PolynomialFeatures(degree=2, interaction_only=True)

# Fit the transformer on the training split only ...
X_train2 = poly.fit_transform(X_train)
# ... and apply that already-fitted transformer to the test split. Calling
# fit_transform here would refit the preprocessing step on test data.
X_test2 = poly.transform(X_test)

poly_clf = linear_model.LinearRegression()

poly_clf.fit(X_train2, y_train)

# In-sample (training) R^2 cannot get worse as more variables are added to a
# least-squares fit, so it is not evidence of a better model on its own.
print(poly_clf.score(X_train2, y_train))

In [None]:
# Out-of-sample (testing) R^2 is our measure of success; compare it with model_1's
# test score above to confirm that the interaction terms improve the fit.
print(poly_clf.score(X_test2, y_test))