# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
%matplotlib inline


## Load  and review data

In [2]:
# Read the Auto MPG dataset from a CSV file in the working directory.
carData = pd.read_csv(filepath_or_buffer='auto-mpg.csv')

In [3]:
# Preview the first ten rows of the loaded frame.
carData.head(n=10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl


In [4]:
# 8 Variables
#
# MPG(Miles per gallon)
# cylinders,
# engine displacement (cu. inches),
# horsepower,
# Vehicle weight (lbs),
# time to accelerate from 0 to 60 mph (secs.),
# model year (modulo 100), and
# origin of car (1. American, 2. European, 3. Japanese).
#
# Also provided are the car labels (types)
# Missing data values are marked by series of question marks


In [5]:
# (number of rows, number of columns) of the loaded frame.
carData.shape

(398, 9)

## Create Dummy Variable

Values like 'america' cannot be used directly in a regression equation. Substituting numbers such as 1 for america, 2 for europe and 3 for asia would imply
that European cars fall exactly halfway between American and Asian cars! We don't want to impose such a baseless assumption.
So we create 3 simple true/false columns with titles equivalent to "Is this car American?", "Is this car European?" and "Is this car Asian?".
These will be used as independent variables without imposing any kind of ordering between the three regions.

In [6]:
# Swap the numeric origin codes for region labels so that the column can be
# one-hot encoded later instead of being treated as an ordered number.
carData = carData.assign(
    origin=carData['origin'].replace(to_replace={1: 'america', 2: 'europe', 3: 'asia'})
)

In [7]:
# Display the frame to confirm that 'origin' now holds region labels.
carData

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,america,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,america,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,america,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,america,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,america,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,america,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,america,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,america,ford ranger


In [8]:
# Remove the 'car name' column from the frame.
carData = carData.drop(columns='car name')

In [9]:
# Display the frame to confirm that 'car name' has been removed.
carData

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,america
1,15.0,8,350.0,165,3693,11.5,70,america
2,18.0,8,318.0,150,3436,11.0,70,america
3,16.0,8,304.0,150,3433,12.0,70,america
4,17.0,8,302.0,140,3449,10.5,70,america
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,america
394,44.0,4,97.0,52,2130,24.6,82,europe
395,32.0,4,135.0,84,2295,11.6,82,america
396,28.0,4,120.0,79,2625,18.6,82,america


## Dealing with missing values

In [10]:
# Per-column count of values pandas recognises as missing (NaN/None).
carData.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
dtype: int64

In [11]:
# Inspect the dtype of every column; an 'object' dtype on a column expected
# to be numeric hints at non-numeric entries.
carData.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin           object
dtype: object

In [12]:
# Flag each horsepower entry that consists solely of digits. The result is kept
# as a one-column DataFrame (column 'horsepower') because later cells index it
# by that column name.
horsepowerisDigit = carData['horsepower'].str.isdigit().to_frame()

In [13]:
# Show the rows whose horsepower entry was flagged as not all-digits.
carData.loc[horsepowerisDigit['horsepower'].eq(False)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
32,25.0,4,98.0,?,2046,19.0,71,america
126,21.0,6,200.0,?,2875,17.0,74,america
330,40.9,4,85.0,?,1835,17.3,80,europe
336,23.6,4,140.0,?,2905,14.3,80,america
354,34.5,4,100.0,?,2320,15.8,81,europe
374,23.0,4,151.0,?,3035,20.5,82,america


There are various ways to handle missing values: drop the rows, replace the missing values with the median, etc. Of the 398 rows, 6 have a missing value ('?') in
the horsepower column. We could drop those six rows, but that might not be a good idea in all situations, so here we replace them with the column median.

In [14]:
# Convert every '?' placeholder in the frame to NaN so pandas counts it as missing.
carData = carData.replace(to_replace='?', value=np.nan)

In [15]:
# Re-count missing values per column now that '?' has been turned into NaN.
carData.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
dtype: int64

In [16]:
# Median of the known horsepower values.
# The column is still object dtype at this point (numeric strings plus NaN), so
# convert it explicitly before aggregating: recent pandas releases raise
# TypeError when median() is called on an object column that holds strings.
pd.to_numeric(carData['horsepower'], errors='coerce').median()

93.5

In [17]:
# Fill the missing horsepower entries with the column median.
# The median is computed on an explicit numeric conversion because the column
# is still object dtype (numeric strings plus NaN); calling median() directly
# on such a column raises TypeError on recent pandas releases.
carData['horsepower'] = carData['horsepower'].fillna(
    pd.to_numeric(carData['horsepower'], errors='coerce').median()
)

In [18]:
# Confirm that no missing values remain after the median fill.
carData.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
dtype: int64

In [19]:
# Re-display the rows flagged earlier as not all-digits to inspect their
# horsepower values after the fill.
carData.loc[horsepowerisDigit['horsepower'].eq(False)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
32,25.0,4,98.0,93.5,2046,19.0,71,america
126,21.0,6,200.0,93.5,2875,17.0,74,america
330,40.9,4,85.0,93.5,1835,17.3,80,europe
336,23.6,4,140.0,93.5,2905,14.3,80,america
354,34.5,4,100.0,93.5,2320,15.8,81,europe
374,23.0,4,151.0,93.5,3035,20.5,82,america


In [20]:
# Re-check the column dtypes after filling the missing values.
carData.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin           object
dtype: object

In [21]:
# Cast horsepower to a floating-point column so it can be used numerically.
carData = carData.astype({'horsepower': 'float64'})

In [22]:
# Verify the dtype of horsepower after the cast.
carData.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
origin           object
dtype: object

## Creating Dummy Variables (cont.)

In [23]:
# One-hot encode the region labels into origin_<region> indicator columns.
# `columns` is passed by keyword on purpose: the second positional parameter of
# pd.get_dummies is `prefix`, so a positional ['origin'] only sets the prefix
# and leaves the choice of encoded columns to dtype inference.
carData = pd.get_dummies(carData, columns=['origin'])

In [24]:
# Display the frame to check the indicator columns produced by get_dummies.
carData

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130.0,3504,12.0,70,True,False,False
1,15.0,8,350.0,165.0,3693,11.5,70,True,False,False
2,18.0,8,318.0,150.0,3436,11.0,70,True,False,False
3,16.0,8,304.0,150.0,3433,12.0,70,True,False,False
4,17.0,8,302.0,140.0,3449,10.5,70,True,False,False
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,True,False,False
394,44.0,4,97.0,52.0,2130,24.6,82,False,False,True
395,32.0,4,135.0,84.0,2295,11.6,82,True,False,False
396,28.0,4,120.0,79.0,2625,18.6,82,True,False,False


### Bivariate Plots

A bivariate analysis among the different variables can be done using a scatter matrix plot. Seaborn creates a dashboard reflecting
useful information about the distributions and the pairwise relationships. The result can be stored as a .png file.