In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Suppress library warnings so they do not clutter the cell outputs
import warnings
warnings.filterwarnings(action='ignore')

# Read the input file (expected in the current working directory)
Car_Price = pd.read_csv(
    "CarPrice_Assignment.csv",
    encoding="ISO-8859-1",
)

## Step 1 : Reading and Understanding the Data

In [None]:
display(Car_Price.head())

In [None]:
Car_Price.shape

In [None]:
Car_Price.info()

In [None]:
Car_Price.describe()

In [None]:
# To check for rows having atleast one missing value
Car_Price.isnull().any(axis=1).sum()

In [None]:
# To check for columns having atleast one missing value
Car_Price.isnull().any(axis=0).sum()

In [None]:
# To check for Duplicate rows 
Car_Price_Check_Dup = Car_Price

In [None]:
duplicateRowsDF = Car_Price_Check_Dup[Car_Price_Check_Dup.duplicated()]

In [None]:
display(duplicateRowsDF)

## Step 2 : Visualising the Data

In [None]:
# Pairwise plots of the numeric variables
sns.pairplot(data=Car_Price)
plt.show()

In [None]:
# Correlation heatmap of the numeric variables.
# The frame still contains text columns (e.g. 'CarName', 'fueltype') at this
# point and DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so
# the input is restricted to numeric columns explicitly. Older pandas versions
# dropped the text columns silently, so the plotted matrix is unchanged there.
plt.figure(figsize = (16,5))
sns.heatmap(Car_Price.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
## Visualizing Categorical Variables
# One box plot of 'price' per categorical variable, laid out on a 5 x 2 grid
plt.figure(figsize=(20, 12))
categorical_columns = [
    'CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody',
    'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem',
]
for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(5, 2, position)
    sns.boxplot(x=column, y='price', data=Car_Price)
plt.show()

## Step 3: Data Preparation

In [None]:
# Requirement: consider only company name as the independent variable for model building
# Hence, Create a new column "CompanyName" from existing column "CarName"
Car_Price['CompanyName'] = Car_Price['CarName'].str.split(' ').str[0]

In [None]:
display(Car_Price.head(50))

In [None]:
# Drop column "CarName".
# DataFrame.drop returns a new frame, so Car_Price_Prep is an independent
# working copy: Car_Price keeps "CarName" (an alias plus an in-place drop would
# strip the column from Car_Price as well) and this cell can safely be re-run.
Car_Price_Prep = Car_Price.drop(columns=['CarName'])
display(Car_Price_Prep.head())

In [None]:
# Company Name has below typo errors: To correct these typos
# -Maxda instead of Mazda
# -Nissan instead of nissan
# -porcshce instead of porsche
# -toyouta instead of toyota
# -vokswagen, vw instead of volkswagen

Car_Price_Prep['CompanyName'] = Car_Price_Prep['CompanyName'].str.replace('maxda','mazda')

In [None]:
Car_Price_Prep['CompanyName'] = Car_Price_Prep['CompanyName'].str.replace('Nissan','nissan')

In [None]:
Car_Price_Prep['CompanyName'] = Car_Price_Prep['CompanyName'].str.replace('porcshce','porsche')

In [None]:
Car_Price_Prep['CompanyName'] = Car_Price_Prep['CompanyName'].str.replace('toyouta','toyota')

In [None]:
Car_Price_Prep['CompanyName'] = Car_Price_Prep['CompanyName'].str.replace('vokswagen','volkswagen')

In [None]:
Car_Price_Prep['CompanyName'] = Car_Price_Prep['CompanyName'].str.replace('vw','volkswagen')

In [None]:
display(Car_Price_Prep['CompanyName'].unique())

In [None]:
# Remove column 'car_ID', as it has no impact on the car price
Car_Price_Prep.drop(columns=['car_ID'], inplace=True)
display(Car_Price_Prep.head())

### To Create Dummy Variables for all Categorical Variables

In [None]:
# 1. Cat Variable : fueltype
Car_Price_Prep['fueltype'].unique()

In [None]:
# get dummies for 'fueltype'
fueltype_d = pd.get_dummies(Car_Price_Prep['fueltype'])

In [None]:
fueltype_d.head()

In [None]:
# 'fueltype' has 2 levels. Hence, will create 2-1 dummy variable. Arbitrarily choose 'gas' (Values : 0 refers to diesel and 1 refers to gas)
fueltype_d = pd.get_dummies(Car_Price_Prep['fueltype'],drop_first = True)

In [None]:
# Merge the dummy variable to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,fueltype_d], axis = 1)

In [None]:
# Check for dummy variable in dataframe.
display(Car_Price_Prep.head())

In [None]:
# To Drop "fueltype"
Car_Price_Prep.drop(['fueltype'], axis = 1, inplace = True)

In [None]:
# 2. Cat Variable : aspiration
Car_Price_Prep['aspiration'].unique()

In [None]:
# 'aspiration' has 2 levels. Hence, will create 2-1 dummy variable. Arbitrarily choose 1 of the dummy variable.
aspiration_d = pd.get_dummies(Car_Price_Prep['aspiration'],drop_first = True)

In [None]:
# Merge the dummy variable to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,aspiration_d], axis = 1)

In [None]:
# Check for dummy variable in dataframe.
display(Car_Price_Prep.head())

In [None]:
# To Drop "aspiration"
Car_Price_Prep.drop(['aspiration'], axis = 1, inplace = True)

In [None]:
# 3. Cat Variable : doornumber
Car_Price_Prep['doornumber'].unique()

In [None]:
# 'doornumber' has 2 levels. Hence, will create 2-1 dummy variable. 
doornumber_d = pd.get_dummies(Car_Price_Prep['doornumber'],drop_first = True)

In [None]:
# Merge the dummy variable to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,doornumber_d], axis = 1)

In [None]:
# Check for dummy variable in dataframe.
display(Car_Price_Prep.head())

In [None]:
# To Drop "doornumber"
Car_Price_Prep.drop(['doornumber'], axis = 1, inplace = True)

In [None]:
# 4. Cat Variable : carbody
Car_Price_Prep['carbody'].unique()

In [None]:
# 'carbody' has 5 levels. Hence, will create 5-1 dummy variable. 
carbody_d = pd.get_dummies(Car_Price_Prep['carbody'],drop_first = True)

In [None]:
# Merge the dummy variable to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,carbody_d], axis = 1)

In [None]:
# Check for dummy variable in dataframe.
display(Car_Price_Prep.head())

In [None]:
# To Drop "carbody"
Car_Price_Prep.drop(['carbody'], axis = 1, inplace = True)

In [None]:
# 5. Cat Variable : drivewheel
Car_Price_Prep['drivewheel'].unique()

In [None]:
# 'drivewheel' has 3 levels. Hence, will create 3-1 dummy variables. 
drivewheel_d = pd.get_dummies(Car_Price_Prep['drivewheel'],drop_first = True)

In [None]:
# Merge the dummy variable to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,drivewheel_d], axis = 1)

In [None]:
# Check for dummy variable in dataframe.
display(Car_Price_Prep.head())

In [None]:
# To Drop "drivewheel"
Car_Price_Prep.drop(['drivewheel'], axis = 1, inplace = True)

In [None]:
# 6. Cat Variable : enginelocation
Car_Price_Prep['enginelocation'].unique()

In [None]:
# 'enginelocation' has 2 levels. Hence, will create 2-1 dummy variables.
enginelocation_d = pd.get_dummies(Car_Price_Prep['enginelocation'],drop_first = True)

In [None]:
# Merge the dummy variable to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,enginelocation_d], axis = 1)

In [None]:
# Check for dummy variable in dataframe.
display(Car_Price_Prep.head())

In [None]:
# To Drop "enginelocation"
Car_Price_Prep.drop(['enginelocation'], axis = 1, inplace = True)

In [None]:
# 7. Cat Variable : enginetype
Car_Price_Prep['enginetype'].unique()

In [None]:
# 'enginetype' has 7 levels. Hence, will create 7-1 dummy variable. 
enginetype_d = pd.get_dummies(Car_Price_Prep['enginetype'],drop_first = True)

In [None]:
# Merge the dummy variable to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,enginetype_d], axis = 1)

In [None]:
# Check for dummy variable in dataframe.
display(Car_Price_Prep.head())

In [None]:
# To Drop "enginetype"
Car_Price_Prep.drop(['enginetype'], axis = 1, inplace = True)

In [None]:
# 8. Cat Variable : cylindernumber
Car_Price_Prep['cylindernumber'].unique()

In [None]:
# 'cylindernumber' has 7 levels. Hence, will create 7-1 dummy variables. 
cylindernumber_d = pd.get_dummies(Car_Price_Prep['cylindernumber'],drop_first = True)

In [None]:
# Merge the dummy variable to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,cylindernumber_d], axis = 1)

In [None]:
# Check for dummy variable in dataframe.
display(Car_Price_Prep.tail())

In [None]:
# To Drop "cylindernumber"
Car_Price_Prep.drop(['cylindernumber'], axis = 1, inplace = True)

In [None]:
# 9. Cat Variable : fuelsystem
Car_Price_Prep['fuelsystem'].unique()

In [None]:
# 'fuelsystem' has 8 levels. Hence, will create 8-1 dummy variables. 
fuelsystem_d = pd.get_dummies(Car_Price_Prep['fuelsystem'],drop_first = True)

In [None]:
# Merge the dummy variables to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,fuelsystem_d], axis = 1)

In [None]:
# Check for dummy variables in dataframe.
display(Car_Price_Prep.tail(5))

In [None]:
# To Drop "fuelsystem"
Car_Price_Prep.drop(['fuelsystem'], axis = 1, inplace = True)

In [None]:
# 10. Cat Variable : CompanyName
Car_Price_Prep['CompanyName'].unique()

In [None]:
# 'CompanyName' has 22 levels. Hence, will create 22-1 dummy variables. 
CompanyName_d = pd.get_dummies(Car_Price_Prep['CompanyName'],drop_first = True)

In [None]:
# Merge the dummy variables to Car_Price_Prep dataframe
Car_Price_Prep = pd.concat([Car_Price_Prep,CompanyName_d], axis = 1)

In [None]:
# Check for dummy variables in dataframe.
display(Car_Price_Prep.tail(5))

In [None]:
# To Drop "CompanyName"
Car_Price_Prep.drop(['CompanyName'], axis = 1, inplace = True)

In [None]:
display(Car_Price_Prep.head())

In [None]:
Car_Price_Prep.info()

## Step 4 : Splitting the data into Training and Testing Sets

In [None]:
# 70% of the rows go to training, 30% to testing; the fixed random_state
# makes the split reproducible
df_train, df_test = train_test_split(
    Car_Price_Prep,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)

### Rescaling the Training Set Features using "MinMax Scaling"

In [None]:
scaler = MinMaxScaler()

In [None]:
# Apply scaler() to all the columns except the 'dummy' variables

rescale_features = ['symboling','wheelbase','carlength','carwidth','carheight','curbweight','enginesize','boreratio','stroke','compressionratio','horsepower','peakrpm','citympg','highwaympg','price']

df_train[rescale_features] = scaler.fit_transform(df_train[rescale_features])

In [None]:
df_train.head()

### Creating X and Y Sets for Model Building

In [None]:
# Separate the target from the predictors without mutating df_train, so the
# cell can be re-run (pop() would remove 'price' from df_train and make a
# second execution fail with a KeyError).
y_train = df_train['price']
X_train = df_train.drop(columns=['price'])

## Step 5 : Building a Linear Model

### Applying RFE

In [None]:
# Applying RFE with the output number of the variables = 10
lm = LinearRegression()
lm.fit(X_train, y_train)

# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 accepts
# only the estimator positionally and raises a TypeError for RFE(lm, 10).
rfe = RFE(lm, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
features = X_train.columns[rfe.support_]
features

In [None]:
X_train.columns[~rfe.support_]

### Building the model using statsmodels, for detailed analysis

In [None]:
# To create X_test dataframe with RFE selected 10 features
X_train_rfe = X_train[features]

In [None]:
# To Add a constant variable 
X_train_rfe = sm.add_constant(X_train_rfe)

In [None]:
# Running the Linear Model
lm = sm.OLS(y_train,X_train_rfe).fit() 

In [None]:
display(lm.summary())

In [None]:
# To Calculate VIFs for the model
X_train_rfe_vif = X_train_rfe.drop(['const'], axis=1)

In [None]:
vif = pd.DataFrame()
X = X_train_rfe_vif
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### Feature 'twelve' is insignificant in the presence of the other variables (i.e., its p-value is 0.511 > 0.05), hence drop it.
#### Rebuild the model without "twelve".

In [None]:
X_train_rfe1 = X_train_rfe.drop(["twelve"], axis = 1)

In [None]:
# To Add a constant variable 
X_train_rfe1 = sm.add_constant(X_train_rfe1)

In [None]:
# Running the Linear Model
lm1 = sm.OLS(y_train,X_train_rfe1).fit() 

In [None]:
display(lm1.summary())

In [None]:
X_train_rfe1.columns

In [None]:
# To Calculate VIFs for the new model
X_train_rfe1_vif = X_train_rfe1.drop(['const'], axis=1)

In [None]:
vif = pd.DataFrame()
X = X_train_rfe1_vif
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### A VIF below 5 is usually expected, but multiple features are above 5. Let's drop them one by one.
#### Rebuild the model without "curbweight".

In [None]:
X_train_rfe2 = X_train_rfe1.drop(["curbweight"], axis = 1)
X_train_rfe2.columns

In [None]:
# To Add a constant variable 
X_train_rfe2 = sm.add_constant(X_train_rfe2)
X_train_rfe2.columns

In [None]:
# Running the Linear Model
lm2 = sm.OLS(y_train,X_train_rfe2).fit() 

In [None]:
display(lm2.summary())

In [None]:
# To Calculate VIFs for the new model
X_train_rfe2_vif = X_train_rfe2.drop(['const'], axis=1)

In [None]:
vif = pd.DataFrame()
X = X_train_rfe2_vif
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### A VIF below 5 is usually expected, but multiple features are above 5. Let's drop them one by one.
#### Rebuild the model without "carwidth".

In [None]:
X_train_rfe3 = X_train_rfe2.drop(["carwidth"], axis = 1)
X_train_rfe3.columns

In [None]:
# To Add a constant variable 
X_train_rfe3 = sm.add_constant(X_train_rfe3)
X_train_rfe3.columns

In [None]:
# Running the Linear Model
lm3 = sm.OLS(y_train,X_train_rfe3).fit() 

In [None]:
display(lm3.summary())

In [None]:
# To Calculate VIFs for the new model
X_train_rfe3_vif = X_train_rfe3.drop(['const'], axis=1)

In [None]:
vif = pd.DataFrame()
X = X_train_rfe3_vif
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### A VIF below 5 is usually expected, but multiple features are above 5. Let's drop them one by one.
#### Rebuild the model without "boreratio".

In [None]:
X_train_rfe4 = X_train_rfe3.drop(["boreratio"], axis = 1)
X_train_rfe4.columns

In [None]:
# To Add a constant variable 
X_train_rfe4 = sm.add_constant(X_train_rfe4)
X_train_rfe4.columns

In [None]:
# Running the Linear Model
lm4 = sm.OLS(y_train,X_train_rfe4).fit() 

In [None]:
display(lm4.summary())

In [None]:
# To Calculate VIFs for the new model
X_train_rfe4_vif = X_train_rfe4.drop(['const'], axis=1)

In [None]:
vif = pd.DataFrame()
X = X_train_rfe4_vif
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### Feature 'three' is insignificant in the presence of the other variables (i.e., its p-value is 0.338 > 0.05), hence drop it.
#### Rebuild the model without "three".

In [None]:
X_train_rfe5 = X_train_rfe4.drop(["three"], axis = 1)
X_train_rfe5.columns

In [None]:
# To Add a constant variable 
X_train_rfe5 = sm.add_constant(X_train_rfe5)
X_train_rfe5.columns

In [None]:
# Running the Linear Model
lm5 = sm.OLS(y_train,X_train_rfe5).fit() 

In [None]:
display(lm5.summary())

In [None]:
# To Calculate VIFs for the new model
X_train_rfe5_vif = X_train_rfe5.drop(['const'], axis=1)

In [None]:
vif = pd.DataFrame()
X = X_train_rfe5_vif
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### Feature 'rear' is insignificant in the presence of the other variables (i.e., its p-value is 0.236 > 0.05), hence drop it.
#### Rebuild the model without "rear".

In [None]:
X_train_rfe6 = X_train_rfe5.drop(["rear"], axis = 1)
X_train_rfe6.columns

In [None]:
# To Add a constant variable 
X_train_rfe6 = sm.add_constant(X_train_rfe6)
X_train_rfe6.columns

In [None]:
# Running the Linear Model
lm6 = sm.OLS(y_train,X_train_rfe6).fit() 

In [None]:
display(lm6.summary())

In [None]:
# To Calculate VIFs for the new model
X_train_rfe6_vif = X_train_rfe6.drop(['const'], axis=1)

In [None]:
vif = pd.DataFrame()
X = X_train_rfe6_vif
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

## Step 7 : Residual Analysis of the Train Data

In [None]:
y_train_price = lm6.predict(X_train_rfe6)

In [None]:
# To Plot the histogram of the error terms
fig = plt.figure()
sns.distplot((y_train - y_train_price), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)                  
plt.xlabel('Errors', fontsize = 18)                        

## Step 8 : Make Predictions using the Final Model

### Rescaling the Test Set Features using "MinMax Scaling"

In [None]:
rescale_features = ['symboling','wheelbase','carlength','carwidth','carheight','curbweight','enginesize','boreratio','stroke','compressionratio','horsepower','peakrpm','citympg','highwaympg','price']

df_test[rescale_features] = scaler.transform(df_test[rescale_features])

In [None]:
df_test.head()

### Creating X and Y Test Sets 

In [None]:
# Separate the target from the predictors without mutating df_test, so the
# cell can be re-run (pop() would remove 'price' from df_test and make a
# second execution fail with a KeyError).
y_test = df_test['price']
X_test = df_test.drop(columns=['price'])

In [None]:
# To make predictions using the model

X_test_new = X_test[['enginesize','rotor','bmw','porsche']]

# Adding a constant variable 
X_test_new = sm.add_constant(X_test_new)

In [None]:
y_pred = lm6.predict(X_test_new)

## Step 9 : Model Evaluation

In [None]:
# Scatter of actual vs predicted test values to understand the spread.
fig = plt.figure()
plt.scatter(x=y_test, y=y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20)
plt.xlabel('y_test', fontsize=18)
plt.ylabel('y_pred', fontsize=16)

In [None]:
# R-squared of the final model's predictions on the held-out test set
# (r2_score is imported with the other dependencies in the first cell)
r2_score(y_true=y_test, y_pred=y_pred)

#### The model evaluation results hold up well.
#### The difference between the training R2 score (0.840) and the test R2 score (0.849) is only 0.009.