In [178]:
# To enable plotting graphs in Jupyter notebook
%matplotlib inline 

In [179]:
# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression

# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [180]:
# Load the car-mpg CSV file into a pandas DataFrame.
# A raw string is used because the path contains a backslash: in a normal string
# literal "\c" is an invalid escape sequence (DeprecationWarning, and SyntaxWarning
# from Python 3.12). The raw string yields exactly the same path text as before.
# NOTE(review): the backslash before "car-mpg.csv" looks like a typo for "/", and the
# absolute path only exists on the author's machine - confirm the real file location.
mpg_df = pd.read_csv(r"/home/sunil/Desktop/Great Lakes/supervised learning/Day1 - KNN - Linear Reg - Blr - 15 Nov 2019 \car-mpg.csv")

In [181]:
# Peek at the first five rows to get a feel for the columns and their values
mpg_df.head(n=5)

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [182]:
# The free-text 'car name' column is not used as a predictor, so remove it
mpg_df = mpg_df.drop(columns=['car name'])

In [183]:
# Swap the numeric codes in the categorical 'origin' column for readable labels
origin_labels = {1: 'america', 2: 'europe', 3: 'asia'}
mpg_df['origin'] = mpg_df['origin'].replace(origin_labels)

In [184]:
# One-hot encode 'origin': one indicator column is created per distinct value
mpg_df = pd.get_dummies(data=mpg_df, columns=['origin'])

In [185]:
# Summary statistics for every numeric column (including the dependent variable mpg),
# transposed so that each column of the data becomes one row of the summary
mpg_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,398.0,23.514573,7.815984,9.0,17.5,23.0,29.0,46.6
cyl,398.0,5.454774,1.701004,3.0,4.0,4.0,8.0,8.0
disp,398.0,193.425879,104.269838,68.0,104.25,148.5,262.0,455.0
wt,398.0,2970.424623,846.841774,1613.0,2223.75,2803.5,3608.0,5140.0
acc,398.0,15.56809,2.757689,8.0,13.825,15.5,17.175,24.8
yr,398.0,76.01005,3.697627,70.0,73.0,76.0,79.0,82.0
origin_america,398.0,0.625628,0.484569,0.0,0.0,1.0,1.0,1.0
origin_asia,398.0,0.198492,0.399367,0.0,0.0,0.0,0.0,1.0
origin_europe,398.0,0.175879,0.381197,0.0,0.0,0.0,0.0,1.0


In [186]:
# Note: the 'hp' column is missing from the describe() output above. That indicates the column is not numeric, i.e. something is not right with its values

In [190]:
# Find the rows whose 'hp' value is not made up purely of digits.
# str.isdigit() yields True/False per row; the result is kept in a one-column
# DataFrame named temp, and only the rows flagged False are displayed.
# Six records turn out to hold a non-digit value in 'hp'.
temp = mpg_df['hp'].str.isdigit().to_frame()

temp.loc[temp['hp'] == False]

Unnamed: 0,hp
32,False
126,False
330,False
336,False
354,False
374,False


In [191]:
# Rows 32, 126, 330, 336, 354 and 374 hold "?" instead of a number in 'hp'.
# Turn every "?" into NaN so that it is treated as a missing value.
mpg_df = mpg_df.replace('?', np.nan)

# The remaining 'hp' values are still strings (object dtype). Convert the column to
# numbers right away so that the median()/fillna() cells further down work on it;
# recent pandas versions raise a TypeError for the median of a string column.
# Any other non-numeric text makes to_numeric raise, so it cannot slip through unnoticed.
mpg_df['hp'] = pd.to_numeric(mpg_df['hp'])

In [193]:
# There are various ways to handle missing values. Drop the rows, replace missing values with median values etc. 

In [194]:
# Of the 398 rows, 6 have NaN in the hp column. One option is to drop those 6 rows (the disabled line below); this is not a good idea in all situations.
# Note: hp values are missing because of the non-numeric values in the column.
#mpg_df = mpg_df.dropna()

In [195]:
# Rather than dropping rows, the missing values will be filled with the column median.
# Start by looking at the median of every column.
mpg_df.median(axis=0)

mpg                 23.0
cyl                  4.0
disp               148.5
hp                  93.5
wt                2803.5
acc                 15.5
yr                  76.0
origin_america       1.0
origin_asia          0.0
origin_europe        0.0
dtype: float64

In [217]:
# Fill missing values column by column: each NaN is replaced by the median of its own
# column, so no column names have to be spelled out. Per the describe() counts above,
# 'hp' is the only column with gaps.
#mpg_df = mpg_df.fillna(mpg_df.median())

def _fill_with_own_median(column):
    """Return the column with its NaN entries replaced by the column's median."""
    return column.fillna(column.median())

mpg_df = mpg_df.apply(_fill_with_own_median, axis=0)


In [219]:
# Check the data type of every column after the missing values have been filled
mpg_df.dtypes

mpg               float64
cyl                 int64
disp              float64
hp                float64
wt                  int64
acc               float64
yr                  int64
origin_america      uint8
origin_asia         uint8
origin_europe       uint8
dtype: object

In [220]:
# Make sure 'hp' is stored as 64-bit floats
mpg_df = mpg_df.assign(hp=mpg_df['hp'].astype('float64'))

In [265]:
# Target: keep 'mpg' on its own as a single-column DataFrame (the dependent variable)
y = mpg_df.loc[:, ['mpg']]

# Predictors: every remaining column goes into X
X = mpg_df.drop(columns=['mpg'])


In [266]:
#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function

from sklearn.model_selection import train_test_split

In [267]:
# Hold out 25% of the rows for testing (75:25 split); random_state pins the shuffle
# so that the same split is produced on every run

train_test_parts = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_test, y_train, y_test = train_test_parts

In [268]:
# Create an ordinary least squares model and fit it on the training data.
# (The fit call used to be indented under a plain statement, which is an
# IndentationError and stopped the cell from running.)

regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [269]:
# Show the fitted coefficient of every independent attribute, in column order

fitted_coefs = regression_model.coef_[0]
for col_name, coef in zip(X_train.columns, fitted_coefs):
    print("The coefficient for {} is {}".format(col_name, coef))

The coefficient for cyl is -0.3587938318113527
The coefficient for disp is 0.029786224589748327
The coefficient for hp is -0.017326691282575718
The coefficient for wt is -0.00755315974082148
The coefficient for acc is 0.11679979864255319
The coefficient for yr is 0.845003689561729
The coefficient for origin_america is -1.9029961213759472
The coefficient for origin_asia is 0.7383618604870851
The coefficient for origin_europe is 1.1646342608888638


In [270]:
# Read the fitted intercept of the model and report it

intercept_message = "The intercept for our model is {}"
intercept = regression_model.intercept_[0]

print(intercept_message.format(intercept))

The intercept for our model is -21.11359559723781


In [271]:
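# Rounding the intercept and coefficients printed above, we can write our linear model as:
# Y = -21.11 - 0.36×X1 + 0.03×X2 - 0.02×X3 - 0.01×X4 + 0.12×X5 + 0.85×X6 - 1.90×X7 + 0.74×X8 + 1.16×X9
# where X1..X9 are cyl, disp, hp, wt, acc, yr, origin_america, origin_asia, origin_europe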

In [272]:
# Model score on the held-out test set: R^2, the coefficient of determination
# R^2 = 1 - RSS / TSS

regression_model.score(X=X_test, y=y_test)

0.84423196365199848

In [273]:
# So the model explains 84.4% of the variability in Y using X

In [274]:
# Mean squared error on the test set: predict y for the test rows, subtract the
# actual values, square the differences and average them.
# y_test is converted to a numpy array first so that the mean is taken over plain
# numbers and mse is a single float (np.mean on a DataFrame expression can return a
# pandas Series instead, which the later math.sqrt(mse) would have to convert implicitly).

test_residuals = regression_model.predict(X_test) - y_test.to_numpy()
mse = float(np.mean(test_residuals ** 2))

In [275]:
# The square root of the mean squared error is the root mean squared error (RMSE):
# the typical size of a prediction error, in the same units as the predicted variable

import math

math.sqrt(mse)

2.921101066036313

In [277]:
# Predict mileage (mpg) for the rows of the held-out test set
y_pred = regression_model.predict(X=X_test)

In [279]:
# ------------------------------------------------- ITERATION 2  ---------------------------------------------------

In [280]:
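# How do we improve the model? The R^2 is 0.844; how can we improve it?
# The independent attributes have different units and scales of measurement.
# It is good practice to scale all the dimensions using z-scores or some other method to address the problem of different scales.
# Note: for ordinary least squares, rescaling the columns does not change R^2 (compare the two scores in this notebook); it mainly makes the coefficients comparable with each other.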


In [281]:
# The z-score helper `scale` is part of scikit-learn's preprocessing module

from sklearn import preprocessing

# Standardise every column of mpg_df (the mpg target and the origin indicator columns
# included). The result is a plain numpy array, not a DataFrame.
# NOTE(review): the scaling statistics are computed on the full data set, before the
# train/test split below, so the test rows influence them - confirm this is acceptable.
mpg_df_scaled = preprocessing.scale(X=mpg_df)

In [282]:
# Inspect the scaled values (still a numpy array at this point)
mpg_df_scaled

array([[-0.7064387 ,  1.49819126,  1.0906037 , ...,  0.77355903,
        -0.49764335, -0.46196822],
       [-1.09075062,  1.49819126,  1.5035143 , ...,  0.77355903,
        -0.49764335, -0.46196822],
       [-0.7064387 ,  1.49819126,  1.19623199, ...,  0.77355903,
        -0.49764335, -0.46196822],
       ..., 
       [ 1.08701694, -0.85632057, -0.56103873, ...,  0.77355903,
        -0.49764335, -0.46196822],
       [ 0.57460104, -0.85632057, -0.70507731, ...,  0.77355903,
        -0.49764335, -0.46196822],
       [ 0.95891297, -0.85632057, -0.71467988, ...,  0.77355903,
        -0.49764335, -0.46196822]])

In [283]:
# Wrap the scaled numpy array in a DataFrame again, reusing the original column labels

mpg_df_scaled = pd.DataFrame(data=mpg_df_scaled, columns=mpg_df.columns)

In [284]:
# Browse the first ten rows of the scaled DataFrame and check that the values are now
# z scores. Only a sample is displayed instead of dumping all rows into the output.

mpg_df_scaled.head(10)

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin_america,origin_asia,origin_europe
0,-0.706439,1.498191,1.090604,0.673118,0.630870,-1.295498,-1.627426,0.773559,-0.497643,-0.461968
1,-1.090751,1.498191,1.503514,1.589958,0.854333,-1.477038,-1.627426,0.773559,-0.497643,-0.461968
2,-0.706439,1.498191,1.196232,1.197027,0.550470,-1.658577,-1.627426,0.773559,-0.497643,-0.461968
3,-0.962647,1.498191,1.061796,1.197027,0.546923,-1.295498,-1.627426,0.773559,-0.497643,-0.461968
4,-0.834543,1.498191,1.042591,0.935072,0.565841,-1.840117,-1.627426,0.773559,-0.497643,-0.461968
5,-1.090751,1.498191,2.262118,2.454408,1.620492,-2.021656,-1.627426,0.773559,-0.497643,-0.461968
6,-1.218855,1.498191,2.502182,3.030708,1.635863,-2.384735,-1.627426,0.773559,-0.497643,-0.461968
7,-1.218855,1.498191,2.367746,2.899730,1.586204,-2.566274,-1.627426,0.773559,-0.497643,-0.461968
8,-1.218855,1.498191,2.511784,3.161685,1.719809,-2.021656,-1.627426,0.773559,-0.497643,-0.461968
9,-1.090751,1.498191,1.887617,2.244844,1.039961,-2.566274,-1.627426,0.773559,-0.497643,-0.461968


In [285]:
# Target: the scaled 'mpg' column on its own, as a single-column DataFrame
y = mpg_df_scaled.loc[:, ['mpg']]

# Predictors: every remaining scaled column goes into X
X = mpg_df_scaled.drop(columns=['mpg'])


In [286]:
# Hold out 25% of the scaled rows for testing (75:25 split); random_state pins the
# shuffle so that the same split is produced on every run

train_test_parts = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_test, y_train, y_test = train_test_parts

In [287]:
# Create a fresh ordinary least squares model and fit it on the scaled training data

regression_model = LinearRegression()
regression_model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [288]:
# Show the fitted coefficient of every (scaled) independent attribute, in column order

fitted_coefs = regression_model.coef_[0]
for col_name, coef in zip(X_train.columns, fitted_coefs):
    print("The coefficient for {} is {}".format(col_name, coef))

The coefficient for cyl is -0.07808483313384415
The coefficient for disp is 0.3973657946972272
The coefficient for hp is -0.08473297726998189
The coefficient for wt is -0.8183654086190686
The coefficient for acc is 0.041210102124063966
The coefficient for yr is 0.39975875515609055
The coefficient for origin_america is -0.09961619757213232
The coefficient for origin_asia is 0.05286275347891442
The coefficient for origin_europe is 0.07124754231534776


In [289]:
# Model score on the held-out test set: R^2, the coefficient of determination
# R^2 = 1 - RSS / TSS

regression_model.score(X=X_test, y=y_test)

0.84423196365200093

In [290]:
# Mean squared error on the test set: predict y for the test rows, subtract the
# actual values, square the differences and average them.
# y_test is converted to a numpy array first so that the mean is taken over plain
# numbers and mse is a single float (np.mean on a DataFrame expression can return a
# pandas Series instead, which the later math.sqrt(mse) would have to convert implicitly).

test_residuals = regression_model.predict(X_test) - y_test.to_numpy()
mse = float(np.mean(test_residuals ** 2))

In [291]:
# The square root of the mean squared error is the root mean squared error (RMSE):
# the typical size of a prediction error. y was taken from the scaled frame here, so
# this value is in standardised (z-score) units of mpg rather than in mpg itself

import math

math.sqrt(mse)

0.37420465597832564