In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False

In [37]:
df = pd.read_csv('data/Car.csv')
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [38]:
df.isnull().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

In [39]:
df.dropna(inplace=True)
df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
mileage          0
engine           0
max_power        0
torque           0
seats            0
dtype: int64

In [40]:
def remap_owner(owner):
    if owner == 'First Owner': return 1
    elif owner == 'Second Owner': return 2
    elif owner == 'Third Owner': return 3
    elif owner == 'Fourth & Above Owner': return 4
    else: return 0

In [61]:
df['transmission'] = [1 if x == 'Manual' else 0 for x in df['transmission']]
df.rename(columns={'transmission': 'is_manual'}, inplace=True)

df['owner'] = df['owner'].apply(remap_owner)

df['name'] = df['name'].apply(lambda x: x.split()[0])
df['mileage'] = df['mileage'].apply(lambda x: x.split()[0]).astype(float)
df['engine'] = df['engine'].apply(lambda x: x.split()[0]).astype(int)
df['max_power'] = df['max_power'].apply(lambda x: x.split()[0]).astype(float)

brand_dummies = pd.get_dummies(df['name'], drop_first=True, prefix='brand')
fuel_dummies = pd.get_dummies(df['fuel'], drop_first=True, prefix='fuel')
seller_dummies = pd.get_dummies(df['seller_type'], drop_first=True, prefix='seller')

df.drop(['name', 'fuel', 'seller_type', 'torque'], axis=1, inplace=True)
df = pd.concat([df, brand_dummies, fuel_dummies, seller_dummies], axis=1)

In [62]:
df.head()

Unnamed: 0,year,selling_price,km_driven,is_manual,owner,mileage,engine,max_power,seats,brand_Ashok,...,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo,fuel_Diesel,fuel_LPG,fuel_Petrol,seller_Individual,seller_Trustmark Dealer
0,2014,450000,145500,1,1,23.4,1248,74.0,5.0,0,...,0,0,0,0,0,1,0,0,1,0
1,2014,370000,120000,1,2,21.14,1498,103.52,5.0,0,...,1,0,0,0,0,1,0,0,1,0
2,2006,158000,140000,1,3,17.7,1497,78.0,5.0,0,...,0,0,0,0,0,0,0,1,1,0
3,2010,225000,127000,1,1,23.0,1396,90.0,5.0,0,...,0,0,0,0,0,1,0,0,1,0
4,2007,130000,120000,1,1,16.1,1298,88.2,5.0,0,...,0,0,0,0,0,0,0,1,1,0


In [65]:
(df.dtypes == 'object').sum()

0

In [66]:
from sklearn.model_selection import train_test_split

X = df.drop('selling_price', axis=1)
y = df['selling_price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

y_train.shape, y_test.shape

((5929,), (1977,))

In [67]:
def mape(y, y_hat): 
    y, y_hat = np.array(y), np.array(y_hat)
    return np.mean(np.abs((y - y_hat) / y)) * 100

In [69]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

lm = LinearRegression()
lm.fit(X_train, y_train)

lm_preds = lm.predict(X_test)
print(f'R2   = {r2_score(y_test, lm_preds):.2f}')
print(f'MAPE = {mape(y_test, lm_preds):.2f}')

R2   = 0.87
MAPE = 42.87


In [72]:
for i, column in enumerate(df.columns[:-1]):
    coef = np.round(lm.coef_[i], 2)
    print(f'{column:20}: {coef:12}')

year                :     43014.37
selling_price       :        -0.99
km_driven           :    -93417.99
is_manual           :    -29134.49
owner               :     -3223.42
mileage             :        51.82
engine              :      6442.18
max_power           :     -6664.42
seats               :         -0.0
brand_Ashok         :    732079.41
brand_Audi          :   2062806.87
brand_BMW           :   -413173.11
brand_Chevrolet     :    143836.25
brand_Daewoo        :   -456934.44
brand_Datsun        :   -412771.18
brand_Fiat          :   -350143.14
brand_Force         :    -351419.7
brand_Ford          :   -359997.28
brand_Honda         :    -350773.5
brand_Hyundai       :    382947.89
brand_Isuzu         :   1084736.28
brand_Jaguar        :    403502.72
brand_Jeep          :      61492.6
brand_Kia           :   2185633.63
brand_Land          :   3094325.97
brand_Lexus         :    206214.99
brand_MG            :   -327026.98
brand_Mahindra      :   -256695.95
brand_Maruti        

In [73]:
from tpot import TPOTRegressor
from sklearn.metrics import make_scorer

mape_scorer = make_scorer(mape, greater_is_better=False)

pipeline_optimizer = TPOTRegressor(
    scoring=mape_scorer,
    max_time_mins=10,
    random_state=42,
    verbosity=2,
    n_jobs=-1
)

pipeline_optimizer.fit(X_train, y_train)



HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…


Generation 1 - Current best internal CV score: -15.112509587258568

Generation 2 - Current best internal CV score: -15.112509587258568

11.19 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GradientBoostingRegressor(CombineDFs(input_matrix, input_matrix), alpha=0.8, learning_rate=0.1, loss=lad, max_depth=10, max_features=0.6000000000000001, min_samples_leaf=4, min_samples_split=5, n_estimators=100, subsample=0.8)


TPOTRegressor(max_time_mins=10, n_jobs=-1, random_state=42,
              scoring=make_scorer(mape, greater_is_better=False), verbosity=2)

In [75]:
pipeline_optimizer.fitted_pipeline_

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('functiontransformer-1',
                                                 FunctionTransformer(func=<function copy at 0x7fd819775940>)),
                                                ('functiontransformer-2',
                                                 FunctionTransformer(func=<function copy at 0x7fd819775940>))])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.8, loss='lad', max_depth=10,
                                           max_features=0.6000000000000001,
                                           min_samples_leaf=4,
                                           min_samples_split=5, random_state=42,
                                           subsample=0.8))])

In [76]:
tpot_preds = pipeline_optimizer.predict(X_test)

print(f'R2   = {r2_score(y_test, tpot_preds):.2f}')
print(f'MAPE = {mape(y_test, tpot_preds):.2f}')

R2   = 0.97
MAPE = 14.68
