In [1]:
%matplotlib inline     

import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings('ignore')

car_price_df = pd.read_csv("CarPrice_Assignment.csv") #loading data file

y = car_price_df['price'] #define target Variable

car_price_df.drop(columns= 'price', inplace = True) #dropping target Variable

In [2]:
na_df = pd.DataFrame({'column_name': car_price_df.columns,
                     'na_count': car_price_df.isnull().sum(),
                     'na_percentage': car_price_df.isnull().sum()/len(car_price_df)*100})

na_df.sort_values(by='na_percentage', ascending=False) #null Value %

Unnamed: 0,column_name,na_count,na_percentage
car_ID,car_ID,0,0.0
curbweight,curbweight,0,0.0
citympg,citympg,0,0.0
peakrpm,peakrpm,0,0.0
horsepower,horsepower,0,0.0
compressionratio,compressionratio,0,0.0
stroke,stroke,0,0.0
boreratio,boreratio,0,0.0
fuelsystem,fuelsystem,0,0.0
enginesize,enginesize,0,0.0


In [3]:
cols_to_drop = []

for col in car_price_df.columns:
    if len(car_price_df[col].unique())==1 or len(car_price_df[col].unique())==car_price_df.shape[0]:
        cols_to_drop.append(col)
        
cols_to_drop #finding unique value columns

['car_ID']

In [4]:
car_price_df.drop(columns=cols_to_drop, inplace= True) #dropping unique value columns

In [5]:
car_price_df.dtypes #data type

symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
dtype: object

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(car_price_df,y, test_size=0.2, random_state=42) 


# split the data into Train and Test


In [7]:
num_col = [col for col in car_price_df.columns if car_price_df[col].dtype=='int64' or car_price_df[col].dtype=='float64']
cat_col= [col for col in car_price_df.columns if car_price_df[col].dtype=='object'] 

#separate the Continuos and Categorical columns

In [8]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder, OneHotEncoder

minmaxscaler = MinMaxScaler()


for col in num_col:
    X_train[col]=minmaxscaler.fit_transform(np.array(X_train[col]).reshape(-1,1))
    X_test[col]=minmaxscaler.transform(np.array(X_test[col]).reshape(-1,1))


oe_train_df=pd.get_dummies(X_train[cat_col])
oe_test_df =pd.get_dummies(X_test[cat_col])

#Scalling the variables

In [9]:
X_train_oe, X_test_oe = oe_train_df.align(oe_test_df,axis=1,join='inner',fill_value=0)


In [10]:
X_train_final = pd.concat([X_train_oe, X_train[num_col]], axis=1)
X_test_final = pd.concat([X_test_oe, X_test[num_col]], axis=1)

# LinearRegression

In [11]:
from sklearn.linear_model import LinearRegression

linear = LinearRegression()
linear.fit(X_train_final,y_train)


predictions = linear.predict(X_test_final)

predictions

array([ 30304.,  22720.,  10912.,  19104.,  26080.,   5696.,  11360.,
         6816.,   5824.,   9376.,  15616.,   6304.,  14272.,   8800.,
        46464.,   8768., -17728.,  13856.,  10464.,   7616.,  10752.,
        19712.,   1664.,   3104.,   7680.,  31072.,  14880.,  11392.,
         4320.,  13856.,  27264.,   7520.,   8608.,  23296.,   8224.,
        25120.,  12992.,  14496.,   8256.,  12160.,   3808.])

In [12]:
from sklearn.metrics import r2_score, mean_squared_error

r2_score(y_test, predictions)

0.6821795111234807

In [13]:
mean_squared_error(y_test, predictions)

25090027.30985095

# Variance Inflation Factor

In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()

vif['VIF Factor'] = [variance_inflation_factor(X_train[num_col].values, i) for i in range(X_train[num_col].shape[1])]

vif['Features'] = X_train[num_col].columns

vif

Unnamed: 0,VIF Factor,Features
0,9.169963,symboling
1,43.246513,wheelbase
2,58.429157,carlength
3,24.082,carwidth
4,14.250093,carheight
5,46.125409,curbweight
6,19.749627,enginesize
7,16.053635,boreratio
8,16.084586,stroke
9,3.219615,compressionratio


# Lasso

In [15]:
from sklearn.linear_model import Lasso

def lasso(alphas):
    df =pd.DataFrame()
    df['Feature']= X_train_final.columns
    
    for alpha in alphas:
        lasso = Lasso(alpha=alpha)
        lasso.fit(X_train_final,y_train)
        col_name = 'Alpha =%f' % alpha
        df[col_name]= lasso.coef_
    return df

lasso([1.3, 0.8, -2.2]).head(25)

Unnamed: 0,Feature,Alpha =1.300000,Alpha =0.800000,Alpha =-2.200000
0,CarName_honda civic,1377.897,1515.619,2455.462476
1,CarName_mazda 626,-252.8325,-292.588,-616.230562
2,CarName_mazda glc,-1143.117,-1389.155,-2853.051968
3,CarName_mitsubishi g4,262.2061,362.9914,766.281263
4,CarName_mitsubishi mirage g4,888.8866,1002.412,1395.945857
5,CarName_mitsubishi outlander,-543.4992,-597.9303,-1032.676889
6,CarName_nissan clipper,0.0,99.57052,898.615198
7,CarName_plymouth fury iii,0.0,85.93091,1098.709071
8,CarName_saab 99e,4901.122,5062.557,5849.239506
9,CarName_toyota corolla,-284.8289,-227.7842,488.595315


# Decision Tree Regressor

In [16]:
from sklearn.tree import DecisionTreeRegressor


dtr = DecisionTreeRegressor()

dtr.fit(X_train_final,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [17]:
dtr.predict(X_test_final)

array([36880., 15998.,  8949., 12170., 33900.,  5195.,  6918.,  8358.,
        8845.,  7995., 12170.,  8058., 12170., 11248., 45400.,  6338.,
        5572., 12764.,  6989.,  9095., 10245., 14399.,  7299.,  5389.,
        7609., 36880.,  8449., 16515.,  7349., 15985., 33900.,  6229.,
        6785., 19045.,  7957., 33278., 15250., 13645.,  6575., 14869.,
        8921.])

In [18]:
print('Training Accurecy: ', dtr.score(X_train_final,y_train))

print('Test Accrecy:', dtr.score(X_test_final,y_test))

Training Accurecy:  0.9988102717463964
Test Accrecy: 0.8989594326961939


# Hyperparameter Tunning (Decision Tree)


In [20]:
from sklearn.model_selection import RandomizedSearchCV

hyp_dict = { 'max_depth': [8,10,12],
             'min_samples_split' : [9,11,13],
             'max_leaf_nodes': [8,12,16]
           }
gcv_dec = RandomizedSearchCV(estimator=dtr, param_distributions=hyp_dict, cv=5, n_iter = 24)

gcv_dec.fit(X_train_final, y_train)

print('Best Score =' , gcv_dec.best_score_)
print('Where we got best score =', gcv_dec.best_params_)

Best Score = 0.80938020989181
Where we got best score = {'min_samples_split': 13, 'max_leaf_nodes': 16, 'max_depth': 12}


# Random Forest Regressor


In [21]:
from sklearn.ensemble import RandomForestRegressor

rand_reg = RandomForestRegressor()

rand_reg.fit(X_train_final, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [22]:
rand_reg.predict(X_test_final)

array([35776.06      , 19008.71      ,  8841.36      , 13203.83083333,
       27279.655     ,  6529.24      ,  7801.72      ,  7886.27      ,
       10084.01333333,  8163.23      , 14831.08666667,  7892.72      ,
       14904.71166667, 10800.83      , 40895.5       ,  6412.37      ,
        5868.595     , 13855.48      ,  8614.54      ,  9384.24      ,
       10411.06      , 14810.        ,  7065.42      ,  5971.57      ,
        7293.61      , 35944.17      ,  9358.155     , 16915.01      ,
        7180.05      , 16678.71      , 27469.925     ,  6472.64      ,
        7970.41      , 18645.13      ,  8019.59      , 27505.035     ,
       10753.38833333, 12929.92      ,  7350.875     , 14269.97      ,
        8434.07      ])

In [23]:
print('Training Accurecy: ', rand_reg.score(X_train_final,y_train))

print('Test Accrecy:', rand_reg.score(X_test_final,y_test))

Training Accurecy:  0.9858494978166725
Test Accrecy: 0.958685173251845


# Hyperparameter Tunning (Random Forest)


In [24]:
from sklearn.model_selection import RandomizedSearchCV

hype_dict = { 'max_depth': [11,9,17],
             'min_samples_split' : [9,15,18],
             'max_leaf_nodes': [5,10,15]
           }
gcv_ran = RandomizedSearchCV(estimator=dtr, param_distributions=hype_dict, cv=4, n_iter = 24)

gcv_ran.fit(X_train_final, y_train)

print('Best Score =' , gcv_ran.best_score_)
print('Where we got best score =', gcv_ran.best_params_)

Best Score = 0.8370516487903922
Where we got best score = {'min_samples_split': 9, 'max_leaf_nodes': 15, 'max_depth': 11}


# GridSearchCV

In [25]:
from sklearn.model_selection import GridSearchCV

hyp_dict = { 'max_depth': [10,16,18],
             'min_samples_split' : [8,10,14],
             'max_leaf_nodes': [8,13,16]
           }
gcv = GridSearchCV(estimator=dtr, param_grid=hyp_dict, cv=5)

gcv.fit(X_train_final, y_train)

gcv.best_params_

{'max_depth': 10, 'max_leaf_nodes': 8, 'min_samples_split': 10}

In [26]:
gcv.best_score_

0.8152277611234899

In [27]:
gcv.predict(X_test_final)

array([34227.68181818, 18278.4516129 ,  9748.96296296, 12536.10526316,
       34227.68181818,  7178.22321429,  7178.22321429,  7178.22321429,
        9748.96296296,  7178.22321429, 12536.10526316,  7178.22321429,
       12536.10526316,  9748.96296296, 45400.        ,  7178.22321429,
        7178.22321429, 15303.66666667,  9748.96296296,  9748.96296296,
        9748.96296296, 18278.4516129 ,  7178.22321429,  7178.22321429,
        7178.22321429, 34227.68181818, 12536.10526316, 18278.4516129 ,
        7178.22321429, 18278.4516129 , 34227.68181818,  7178.22321429,
        9748.96296296, 18278.4516129 ,  7178.22321429, 34227.68181818,
       12536.10526316, 12536.10526316,  7178.22321429, 15303.66666667,
        9748.96296296])

In [28]:
gcv.score(X_test_final, y_test)

0.8763066597117881