In [1]:
# data stuff:
import pandas as pd
import numpy as np

# ML stuff:
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from tpot import TPOTRegressor

---
### Import data and prepare:

In [2]:
# Load the raw listings; the path is relative, so it resolves against the
# directory the notebook kernel is running in.
cars = pd.read_csv('data/cars.csv')

#### Inspect and search for missing values:

In [3]:
# Preview the raw frame to see how the unit-bearing text columns
# (mileage, engine, max_power, torque) are formatted before cleaning.
cars

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5 kmpl,1197 CC,82.85 bhp,113.7Nm@ 4000rpm,5.0
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8 kmpl,1493 CC,110 bhp,"24@ 1,900-2,750(kgm@ rpm)",5.0
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,First Owner,19.3 kmpl,1248 CC,73.9 bhp,190Nm@ 2000rpm,5.0
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,First Owner,23.57 kmpl,1396 CC,70 bhp,140Nm@ 1800-3000rpm,5.0


In [4]:
# Count the missing entries in each column.
cars.isna().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

&nbsp;

---
### Feature Cleaning and Engineering:
See chapter 21 (page 225) of "Effective Pandas" by Matt Harrison for details on the method chain below.
(https://amz.run/5mhB)

In [5]:
def tweak_cars(df):
    """Clean the raw cars frame and return a new, numeric-friendly frame.

    Steps, in order:
      1. Drop every row that has at least one missing value.
      2. Encode ``owner`` as an integer rank (1 = first owner ... 5 = test drive car).
      3. Keep only the leading token of ``engine``, ``mileage`` and ``max_power``
         (the number before the unit) and cast it to int / float / float.
      4. Reduce ``name`` to its first word.
      5. Drop the ``torque`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns ``owner``, ``engine``,
        ``mileage``, ``max_power``, ``name`` and ``torque``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    ValueError
        If a surviving row has an ``owner`` label outside the mapping below
        (it maps to NaN, which cannot be cast to int), or if a leading token
        cannot be parsed as a number.
    """
    # One explicit lookup instead of a chain of .replace() calls: chained
    # replace depends on pandas silently downcasting the object column to
    # integers, which newer pandas versions deprecate.
    owner_rank = {
        'First Owner': 1,
        'Second Owner': 2,
        'Third Owner': 3,
        'Fourth & Above Owner': 4,
        'Test Drive Car': 5,
    }

    def _first_token(col):
        # Text before the first space, e.g. '1248 CC' -> '1248'.
        return col.str.split(' ', n=1).str[0]

    return (df
            .dropna()
            .assign(owner=lambda df_: df_['owner'].map(owner_rank).astype(int),
                    engine=lambda df_: _first_token(df_['engine']).astype(int),
                    mileage=lambda df_: _first_token(df_['mileage']).astype(float),
                    max_power=lambda df_: _first_token(df_['max_power']).astype(float),
                    name=lambda df_: _first_token(df_['name'])
                   )
            .drop(columns=['torque'])
           )

In [6]:
# Run the cleaning chain on the raw frame and show the result.
cars_tweaked = tweak_cars(df=cars)
cars_tweaked

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti,2014,450000,145500,Diesel,Individual,Manual,1,23.40,1248,74.00,5.0
1,Skoda,2014,370000,120000,Diesel,Individual,Manual,2,21.14,1498,103.52,5.0
2,Honda,2006,158000,140000,Petrol,Individual,Manual,3,17.70,1497,78.00,5.0
3,Hyundai,2010,225000,127000,Diesel,Individual,Manual,1,23.00,1396,90.00,5.0
4,Maruti,2007,130000,120000,Petrol,Individual,Manual,1,16.10,1298,88.20,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,2013,320000,110000,Petrol,Individual,Manual,1,18.50,1197,82.85,5.0
8124,Hyundai,2007,135000,119000,Diesel,Individual,Manual,4,16.80,1493,110.00,5.0
8125,Maruti,2009,382000,120000,Diesel,Individual,Manual,1,19.30,1248,73.90,5.0
8126,Tata,2013,290000,25000,Diesel,Individual,Manual,1,23.57,1396,70.00,5.0


&nbsp;

#### Convert categories to dummies:

In [7]:
def get_dummies(df, to_dummy_feats):
    """Return indicator (dummy) columns for the selected features of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns.
    to_dummy_feats : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Only the generated dummy columns, named ``<feature>_<level>``; the
        first level of every feature is dropped (``drop_first=True``).
    """
    categorical_part = df[to_dummy_feats]
    return pd.get_dummies(categorical_part, drop_first=True, prefix_sep='_')

In [8]:
# Encode the categorical columns, including the brand kept in `name`.
car_dummies = get_dummies(
    df=cars_tweaked,
    to_dummy_feats=['seller_type', 'transmission', 'fuel', 'name'],
)
car_dummies

Unnamed: 0,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,fuel_Diesel,fuel_LPG,fuel_Petrol,name_Ashok,name_Audi,name_BMW,name_Chevrolet,...,name_Mercedes-Benz,name_Mitsubishi,name_Nissan,name_Opel,name_Renault,name_Skoda,name_Tata,name_Toyota,name_Volkswagen,name_Volvo
0,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8124,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8125,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8126,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


&nbsp;

#### Concatenate Dataframes:

In [9]:
# put the cleaned columns and their dummy columns side by side:
combined = pd.concat([cars_tweaked, car_dummies], axis=1)

# set target (the column the model will predict):
y = combined['selling_price']

# set final feature frame: drop the target and the text columns that are
# now represented by dummy columns (non-mutating, so `combined` stays intact):
final_df = combined.drop(
    columns=['seller_type', 'transmission', 'fuel', 'name', 'selling_price']
)
final_df

Unnamed: 0,year,km_driven,owner,mileage,engine,max_power,seats,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,...,name_Mercedes-Benz,name_Mitsubishi,name_Nissan,name_Opel,name_Renault,name_Skoda,name_Tata,name_Toyota,name_Volkswagen,name_Volvo
0,2014,145500,1,23.40,1248,74.00,5.0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2014,120000,2,21.14,1498,103.52,5.0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
2,2006,140000,3,17.70,1497,78.00,5.0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2010,127000,1,23.00,1396,90.00,5.0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,2007,120000,1,16.10,1298,88.20,5.0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,2013,110000,1,18.50,1197,82.85,5.0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
8124,2007,119000,4,16.80,1493,110.00,5.0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
8125,2009,120000,1,19.30,1248,73.90,5.0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
8126,2013,25000,1,23.57,1396,70.00,5.0,1,0,1,...,0,0,0,0,0,0,1,0,0,0


In [10]:
# List the dtype of every feature column to check that nothing non-numeric is left.
final_df.dtypes

year                              int64
km_driven                         int64
owner                             int64
mileage                         float64
engine                            int64
max_power                       float64
seats                           float64
seller_type_Individual            uint8
seller_type_Trustmark Dealer      uint8
transmission_Manual               uint8
fuel_Diesel                       uint8
fuel_LPG                          uint8
fuel_Petrol                       uint8
name_Ashok                        uint8
name_Audi                         uint8
name_BMW                          uint8
name_Chevrolet                    uint8
name_Daewoo                       uint8
name_Datsun                       uint8
name_Fiat                         uint8
name_Force                        uint8
name_Ford                         uint8
name_Honda                        uint8
name_Hyundai                      uint8
name_Isuzu                        uint8


&nbsp;

---
### Prepare for ML:

In [11]:
# split:
# hold out 25% of the rows for the final evaluation; the fixed random_state
# is passed so the split can be reproduced.
X = final_df
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

&nbsp;

#### Define Mean Absolute Percentage Error:

In [12]:
# scoring function:
def mape(y, y_hat):
    """Mean absolute percentage error of ``y_hat`` against ``y``, in percent.

    Both arguments are converted to NumPy arrays first, so lists, arrays and
    pandas Series are accepted. Each error is divided by the matching value
    of ``y``; a zero in ``y`` therefore yields inf/nan rather than an error.
    """
    actual = np.asarray(y)
    predicted = np.asarray(y_hat)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100

&nbsp;

#### Define TPOT params and fit:

In [13]:
# set scorer:
# MAPE is an error (lower is better), so greater_is_better=False is passed;
# the scorer then reports the negated value, which is why the CV scores in
# the optimisation log are negative.
mape_scorer = make_scorer(mape, greater_is_better=False)

# set pipeline:
pipeline_optimizer = TPOTRegressor(
    scoring=mape_scorer,
    random_state=42,
    max_time_mins=180,
    n_jobs=-1,
    verbosity=2,
)

# search on the training split only; X_test / y_test are kept for the final score.
pipeline_optimizer.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -15.096341906155839

Generation 2 - Current best internal CV score: -15.096341906155839

Generation 3 - Current best internal CV score: -15.096341906155839

Generation 4 - Current best internal CV score: -15.096341906155839

Generation 5 - Current best internal CV score: -15.073954813158796

Generation 6 - Current best internal CV score: -15.00818993432138

Generation 7 - Current best internal CV score: -14.900564319158144

Generation 8 - Current best internal CV score: -14.900564319158144

Generation 9 - Current best internal CV score: -14.898234950561605

Generation 10 - Current best internal CV score: -14.8108310659647

Generation 11 - Current best internal CV score: -14.665495163863351

Generation 12 - Current best internal CV score: -14.665495163863351

Generation 13 - Current best internal CV score: -14.665495163863351

Generation 14 - Current best internal CV score: -14.63748261742984

Generation 15 - Current best internal CV score

TPOTRegressor(max_time_mins=180, n_jobs=-1, random_state=42,
              scoring=make_scorer(mape, greater_is_better=False), verbosity=2)

&nbsp;

#### Obtain Best Pipeline:

In [14]:
# Inspect the pipeline the TPOT search above settled on.
pipeline_optimizer.fitted_pipeline_

Pipeline(steps=[('stackingestimator',
                 StackingEstimator(estimator=LassoLarsCV(normalize=False))),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.8, loss='huber',
                                           max_depth=10,
                                           max_features=0.6000000000000001,
                                           min_samples_leaf=4,
                                           min_samples_split=16,
                                           random_state=42, subsample=0.8))])

&nbsp;

#### Obtain Score:

In [15]:
# predict on the held-out split, then report each metric to two decimals:
tpot_preds = pipeline_optimizer.predict(X_test)

for label, metric in (('R2', r2_score), ('MAPE', mape)):
    # pad the label to four characters so the '=' signs line up
    print(f'{label:<4} = {metric(y_test, tpot_preds):.2f}')

R2   = 0.97
MAPE = 14.22


  "X does not have valid feature names, but"
