In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the car-price dataset, then print its column dtypes and non-null counts.
csv_path = "../input/car-price-prediction/CarPrice_Assignment.csv"
data = pd.read_csv(csv_path)
data.info()

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# symboling: risk rating from -2 (least risky) to +3 (most risky).
# Most cars are rated 0, 1 or 2; the counts below show the distribution.
data['symboling'].value_counts()

In [None]:
# aspiration: an (internal combustion) engine property showing whether the
# oxygen intake is standard (atmospheric pressure) or turbocharged
# (pressurised oxygen intake).
data['aspiration'].value_counts()

In [None]:
# drivewheel: front-wheel, rear-wheel or four-wheel drive.
data['drivewheel'].value_counts()

In [None]:

# Target variable: price of car.
# Left panel: histogram of raw counts; right panel: density-normalised
# histogram. Both carry a KDE overlay.
#
# sns.distplot is deprecated, and whenever kde=True it normalised the
# histogram regardless of norm_hist=False, so the "Count" panel was really
# showing densities. sns.histplot with an explicit `stat` draws what each
# axis label says.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

sns.histplot(data['price'], stat='count', kde=True, ax=ax[0], color='blue')
ax[0].set_xlabel('Car Price')
ax[0].set_ylabel('Count of cars', size=12)
ax[0].set_title('Count Of Cars By Price', size=15, weight="bold")

sns.histplot(data['price'], stat='density', kde=True, ax=ax[1], color='green')
ax[1].set_xlabel('Car Price')
ax[1].set_ylabel('Relative Frequency of cars', size=12)
# Trailing semicolon suppresses the Text(...) repr of the last call.
ax[1].set_title('Density or Relative Frequency Of Cars By Price', size=15, weight="bold");


# Observations on Target Variable- Price:
 
1. The target variable price has a positive (right) skew: the majority of the cars are low priced.
2. More than 50% of the cars (around 105-107 out of a total of 205) are priced below 10,000, and close to 35% are priced between 10,000 and 20,000. So around 85% of the cars in the US market are priced between 5,000 and 20,000.
3. Based on the above observations and the graph on the right (the green KDE plot), there appear to be two distributions: one for cars priced between 5,000 and 25,000, and another for high-priced cars at 25,000 and above. (Notice the approximate bell curve from a little under 30,000 up to 45,000-50,000.)

# Data Exploration
To perform linear regression, the target variable should be linearly related to independent variables. Let's see whether that's true in this case.

In [None]:
# Keep only the float64/int64 columns for the numeric exploration below.
data_numeric=data.select_dtypes(include=['float64','int64'])

In [None]:
# symboling is really a categorical rating and car_ID is only a row
# identifier, so neither belongs among the numeric predictors.
non_predictors = ['symboling', 'car_ID']
data_numeric = data_numeric.drop(columns=non_predictors)
data_numeric.head()

In [None]:
# Pair-wise scatter plots of every numeric column against every other.
# sns.pairplot builds its own figure grid, so the earlier plt.figure(...)
# call only produced an extra empty figure and has been removed.
# With this many variables the grid is hard to read; the next cell plots
# each feature against price individually, which is easier to interpret.
sns.pairplot(data_numeric);

In [None]:
# Scatter each numeric feature against price, one figure per feature.
# The target itself is skipped (price vs. price is a trivial diagonal), and
# every plot gets its own fresh, titled axes instead of reusing numbered
# global figures through plt.figure(i).
feature_cols = [c for c in data_numeric.columns if c != 'price']
for col in feature_cols:
    scatter_fig, scatter_ax = plt.subplots()
    sns.scatterplot(x=data_numeric[col], y=data_numeric['price'], ax=scatter_ax)
    scatter_ax.set_title('price vs {}'.format(col))


* These variables appear to have a linear relation with price: carwidth, curbweight, enginesize, horsepower, boreratio and citympg.

* The other variables either have no relation with price or only a weak one. None of the variables appears to have a polynomial relation with price.

* In linear regression assumptions validation section we will check for linearity assumption in detail

# Correlation matrix


In [None]:
# Pairwise correlation between all remaining numeric columns.
corr_matrix=data_numeric.corr()
corr_matrix


In [None]:
# A heatmap is the clearest way to visualise the pairwise correlations;
# annot=True prints each coefficient inside its cell.
plt.figure(figsize=(10, 15))
sns.heatmap(data=corr_matrix, cmap="YlGnBu", annot=True)


# Useful insights from Corr Heatmap
Dependent var and indep. var's :
* Positive corr : Price highly correlated with enginesize, curbweight, horsepower, carwidth (all of these variables represent the size/weight/engine power of the car)

* Negative corr: Price is negatively correlated with the mpg variables citympg and highwaympg. This suggests that cars with high mileage tend to fall in the 'economy' category; in other words, low-priced cars mostly have high mpg.

Correlation among independent variables:

* Many independent variables are highly correlated; wheelbase, carlength, curbweight, enginesize etc. are all measures of 'size/weight', and are positively correlated

Since many independent variables are highly correlated (pairwise correlation above 0.8 for many of them), we'll have to pay attention to multicollinearity, which we will check in the assumptions validation section using the VIF score.

# Section 2: Data Cleaning: Missing values and feature data type check


In [None]:
# Re-check dtypes and non-null counts before cleaning.
data.info()

In [None]:
# Treat symboling as categorical by storing it with object dtype, then
# re-print the dtypes to confirm the change.
data['symboling'] = data['symboling'].astype(object)
data.info()

In [None]:
# Reduce CarName to the make only: the first space-separated word.
car_names = data['CarName'].apply(lambda full_name: full_name.split(" ", 1)[0])
data['CarName'] = car_names

In [None]:
# Many makes appear under several spellings (e.g. toyota / toyouta,
# porsche / porcshce), so the CarName column needs its spelling normalised.
# display() is needed here: a notebook only renders a cell's last expression
# automatically, so a bare value_counts() in the middle of the cell would
# show nothing.
display(data['CarName'].value_counts())

data.info()

In [None]:
# Map every misspelt or inconsistently-cased make onto its canonical spelling.
make_fixes = {
    "vw": "volkswagen",
    "vokswagen": "volkswagen",
    "porcshce": "porsche",
    "toyouta": "toyota",
    "Nissan": "nissan",
    "maxda": "mazda",
}
data['CarName'] = data['CarName'].replace(make_fixes)

# display() so the cleaned counts are actually rendered; a bare expression
# that is not the cell's last statement produces no output.
display(data['CarName'].value_counts())
data.info()

# Section 3. Data Preparation: feature engineering


In [None]:
# Separate the target (price) from the predictors; car_ID is dropped as well.
target_col = 'price'
y = data[target_col]
X = data.drop(columns=[target_col, "car_ID"])
X.head()


In [None]:
# Select the categorical (object-dtype) predictors; the dummy variables
# themselves are created from this subset in the next cell.
data.info()
cars_categorical = X.select_dtypes(include=['object'])
cars_categorical.head()

In [None]:
# One-hot encode the categorical columns; drop_first=True keeps k-1 dummy
# columns for a column with k levels.
cars_dummies = pd.get_dummies(cars_categorical, drop_first=True)
cars_dummies.head()


In [None]:
# List the generated dummy column names.
cars_dummies.columns


In [None]:
# Remove the original categorical columns from X; their one-hot encoded
# versions live in cars_dummies. The column labels are passed explicitly
# (rather than the DataFrame itself), and errors="ignore" keeps the cell
# safe to re-run once the columns have already been dropped.
X = X.drop(columns=cars_categorical.columns, errors="ignore")
X.head()

In [None]:
# Join the numeric predictors with the dummy columns side by side;
# pd.concat(axis=1) aligns the two frames on their row index.
# The earlier pd.merge(X, cars_dummies, on=X.index) call was removed: its
# result was overwritten on the very next statement, so it was dead code.
dfX = pd.concat([X, cars_dummies], axis=1)
dfX.head()

# Scaling the features


In [None]:
from sklearn.preprocessing import scale

# scale() is given a DataFrame but its result is wrapped back into one, so
# the column names and the row index are re-attached explicitly here.
# NOTE(review): the whole dataset is scaled before the train/test split, so
# test-set statistics influence the training features. Consider fitting a
# scaler on the training split only.
dfX_scaled = pd.DataFrame(scale(dfX), columns=dfX.columns, index=dfX.index)


In [None]:
# Descriptive statistics: central tendency, dispersion and shape of each
# column's distribution, excluding NaN values.
# NOTE(review): this summarises the unscaled frame dfX; if the goal is to
# verify the scaling step, dfX_scaled is the frame to describe. Confirm intent.
dfX.describe()

In [None]:
# Split into train and test sets (70% / 30%, fixed random_state).
from sklearn.model_selection import train_test_split

# The model-fitting cell refers to X_train / X_test (capital X), but this
# split used to bind only x_train / x_test, which raises a NameError on a
# fresh kernel. Bind the capitalised names and keep the lowercase ones as
# aliases so either spelling works.
X_train, X_test, y_train, y_test = train_test_split(
    dfX_scaled, y, train_size=0.7, test_size=0.3, random_state=100
)
x_train, x_test = X_train, X_test


# Section 4. Model Building and Feature Selection using RFE
 Since our dependent variable price looks to be linearly related to most of the independent variables we are using Linear Regression only and not other types of regression like Polynomial, Random Forest/Boosting regression etc.

In [None]:
# Baseline model: linear regression fitted on every available feature.
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

lm = LinearRegression().fit(X_train, y_train)

# Predictions on both splits, used for the fit metrics that follow.
y_pred_train = lm.predict(X_train)
y_pred_test = lm.predict(X_test)

In [None]:

# Goodness of fit: R-squared on each split.
from sklearn.metrics import r2_score

r2_train = r2_score(y_true=y_train, y_pred=y_pred_train)
r2_test = r2_score(y_true=y_test, y_pred=y_pred_test)
print('R-sqaure on train data: {}'.format(r2_train))
print('R-sqaure on test data: {}'.format(r2_test))

# Residuals (prediction minus actual) and the resulting RMSE per split.
error_train = y_pred_train - y_train
error_test = y_pred_test - y_test

rmse_train = ((error_train ** 2).mean()) ** 0.5
rmse_test = ((error_test ** 2).mean()) ** 0.5
print('RMSE on train data: {}'.format(rmse_train))
print('RMSE on test data: {}'.format(rmse_test))