In [None]:
import numpy as np
import pandas as pd

import plotly
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.offline import iplot
import cufflinks as cf

import matplotlib.pyplot as plt
import seaborn as sns

# Plotting setup for notebook use: switch cufflinks to offline mode and
# initialise plotly's notebook integration before any figure is drawn.
cf.go_offline()
plotly.offline.init_notebook_mode()
# Set cufflinks defaults (theme 'space', offline rendering).
# NOTE(review): world_readable=True presumably only matters when sharing plots
# online — confirm it is still wanted together with offline=True.
cf.set_config_file(world_readable=True, theme='space', offline=True)

# EDA & Data Cleaning

## Data Loading

In [None]:
# Source CSV for every brand, keyed by the label used throughout the notebook.
# Keeping label and file side by side avoids two parallel lists drifting apart.
brand_files = {
    'Audi': 'audi.csv',
    'BMW': 'bmw.csv',
    'Ford': 'ford.csv',
    'Hyundi': 'hyundi.csv',
    'Mercedes-Benz': 'merc.csv',
    'Skoda': 'skoda.csv',
    'Toyota': 'toyota.csv',
    'Vauxhall': 'vauxhall.csv',
    'Volkswagen': 'vw.csv',
}

# Later cells refer to `brands` by name, so both lists are still exposed.
file_names = list(brand_files.values())
brands = list(brand_files)

# One raw DataFrame per brand.
brands_data = {}
for brand, csv_name in brand_files.items():
    brands_data[brand] = pd.read_csv('../input/used-car-dataset-ford-and-mercedes/' + csv_name)

# Print each brand's column names to spot schema differences between files.
for brand, frame in brands_data.items():
    print(f"{brand:<15}{frame.columns.tolist()}")

In [None]:
# Rename the Hyundi frame's 'tax(£)' column to 'tax', the column name the
# following cells read on every brand's frame.
brands_data["Hyundi"] = brands_data["Hyundi"].rename(columns={'tax(£)': 'tax'})

## EDA

In [None]:
# Average tax per brand, rounded to one decimal.
# The frame is built directly from the computed means: creating an integer
# column of zeros and then writing floats into it cell by cell forces a dtype
# upcast on assignment, which recent pandas versions deprecate.
mean_taxes = pd.DataFrame(
    {'AvgTax': [round(bdata['tax'].mean(), 1) for bdata in brands_data.values()]},
    index=list(brands_data),
)
mean_taxes.iplot(kind='bar', title='Average taxes', orientation='h')

In [None]:
# One price-vs-mileage scatter per brand, points coloured by car model.
for brand, frame in brands_data.items():
    fig = px.scatter(
        data_frame=frame,
        x='mileage',
        y='price',
        color='model',
        template='plotly_dark',
        title=brand,
    )
    iplot(fig)

A negative correlation between price and mileage is clearly visible. The points, colored by model, also form clusters, and every dataset contains outliers.

In [None]:
# Same price-vs-mileage view, now coloured by fuel type with marker size
# driven by the 'mpg' column.
for brand, frame in brands_data.items():
    fig = px.scatter(
        data_frame=frame,
        x='mileage',
        y='price',
        color='fuelType',
        size='mpg',
        template='plotly_dark',
        title=brand,
    )
    iplot(fig)

MPG and price both depend on fuel type.

In [None]:
# Features that take part in the correlation analysis: every column of the
# Audi frame except the three categorical (string) ones.
categorical_feats = {'model', 'transmission', 'fuelType'}
feats = [col for col in brands_data['Audi'].columns if col not in categorical_feats]
n_feats = len(feats)

# Average the per-brand correlation matrices.
# The frame is restricted to `feats` *before* calling .corr(): since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# the string columns instead.
avg_corr = sum(bdata[feats].corr() for bdata in brands_data.values()) / len(brands_data)
px.imshow(avg_corr, template='plotly_dark', zmin=-1, color_continuous_scale='inferno')

There is a strong correlation between year and mileage; mileage correlates with the price more strongly.

# Data Preparation

## Cleaning

In [None]:
def remove_outliers(b: str, feat: str, lim: int) -> None:
    """Drop the rows of brand ``b`` whose ``feat`` value exceeds ``lim``.

    The filtered frame replaces ``brands_data[b]``; nothing is returned.

    Args:
        b: Key of the brand in the module-level ``brands_data`` dict.
        feat: Name of the column to threshold.
        lim: Upper bound, inclusive — rows with ``feat <= lim`` are kept.

    Raises:
        KeyError: If ``b`` is not in ``brands_data`` or ``feat`` is not a column.
    """
    frame = brands_data[b]
    # Store an explicit copy: a bare boolean-mask selection is tracked by pandas
    # as a slice of the original, so assigning a column to it later emits
    # SettingWithCopyWarning.
    brands_data[b] = frame.loc[frame[feat] <= lim].copy()
    
# Brand-specific upper price limits; brands not listed keep all their prices.
price_caps = {
    'BMW': 80000,
    'Hyundi': 80000,
    'Skoda': 40000,
    'Vauxhall': 30000,
}
for capped_brand, price_cap in price_caps.items():
    remove_outliers(capped_brand, 'price', price_cap)

# A single mileage limit is applied to every brand.
for bname in brands:
    remove_outliers(bname, 'mileage', 200000)

## Feature Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from typing import List

# Encoding below writes into the frames it is given, so it operates on
# per-brand copies and leaves the cleaned frames in brands_data untouched.
full_data = {brand: brands_data[brand].copy() for brand in brands_data}

def encode_features(data_frame: pd.DataFrame, encoder, feats: List[str]) -> None:
    """Replace the ``feats`` columns of ``data_frame`` with their encoded values.

    ``encoder.fit_transform`` is called on the selected columns, so the encoder
    is (re)fitted on every call. ``data_frame`` is modified in place and nothing
    is returned.

    Args:
        data_frame: Frame whose columns are overwritten.
        encoder: Object exposing ``fit_transform``.
        feats: Names of the columns to encode.
    """
    encoded = encoder.fit_transform(data_frame[feats])
    data_frame[feats] = encoded

# Ordinal-encode the three categorical columns of every brand's working copy.
# The single encoder instance is refitted on each frame, so the resulting codes
# are specific to each brand.
categorical_cols = ['model', 'fuelType', 'transmission']
enc = OrdinalEncoder()
for brand_frame in full_data.values():
    encode_features(brand_frame, enc, categorical_cols)

# Model Selection

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error as mae
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, scale
from xgboost import XGBRegressor
from sklearn.svm import SVR

# Candidate regressors, all with library-default hyper-parameters.
models = {
    'Linear SVR':        SVR(kernel='linear'),
    'ElasticNet':        ElasticNet(),
    'Linear Regression': LinearRegression(),
    'XGBoost Regressor': XGBRegressor()
}

# Per-brand hold-out split, computed once and shared by every model (the split
# is deterministic, so repeating it per model was redundant work).
# The scaler is fitted on the training rows only: scaling the whole frame
# before splitting lets validation statistics leak into the training features.
# `full_data` is read but not rebound, so this cell can be re-run safely.
splits = {}
for b in brands:
    X = full_data[b].drop(columns=['price'])
    y = full_data[b]['price']
    train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=0)
    scaler = StandardScaler().fit(train_X)
    splits[b] = (scaler.transform(train_X), scaler.transform(val_X), train_y, val_y)

# Float-initialised so the rounded float scores can be stored without an
# implicit dtype upcast.
scores = pd.DataFrame(0.0, index=list(models), columns=['r2', 'MAE'])

for model_name, model in models.items():
    # mr2: R^2 averaged over brands; mmae: worst (largest) MAE over brands.
    mr2, mmae = 0., 0.
    for b in brands:
        train_X, val_X, train_y, val_y = splits[b]
        model.fit(train_X, train_y)
        pred_y = model.predict(val_X)
        mr2 += r2_score(val_y, pred_y) / len(brands)
        mmae = max(mae(val_y, pred_y), mmae)
    scores.loc[model_name, 'r2'] = round(mr2, 2)
    scores.loc[model_name, 'MAE'] = round(mmae, 0)

scores.r2.iplot(kind='bar', title='Mean Determination Coefficient')
scores.MAE.iplot(kind='bar', title='Max MAE')

**As you can see, XGBoost showed the best results.**

# Final Model

In [None]:
# Merge all brands into one frame, prefixing every model name with its brand
# (presumably so that equally named models of different brands stay distinct).
# The prefix is applied to a per-brand copy via .assign() instead of being
# written back into brands_data: mutating the source frames made this cell
# non-idempotent — every re-run stacked another prefix ("Audi Audi A1").
udata = pd.concat([
    bdata.assign(model=bname + ' ' + bdata['model'].astype(str))
    for bname, bdata in brands_data.items()
])
encode_features(udata, enc, ['model', 'fuelType', 'transmission'])

# Standardised feature matrix and target for the combined dataset.
full_X, full_y = scale(udata.drop(columns=['price'])), udata.price
full_train_X, full_test_X, full_train_y, full_test_y = train_test_split(full_X, full_y, random_state=0)

# Fit on the training part and report R^2 on the held-out part.
regressor = XGBRegressor()
regressor.fit(full_train_X, full_train_y)
full_pred_y = regressor.predict(full_test_X)
final_r2_score = r2_score(full_test_y, full_pred_y)

print(round(final_r2_score, 2))

In [None]:
# Refit the regressor on the complete encoded and scaled dataset (full_X,
# full_y) rather than only the training split used for the R^2 estimate.
regressor.fit(full_X, full_y)
def predict_car_prices(X: pd.DataFrame) -> np.ndarray:
    """Return the predictions of the module-level ``regressor`` for ``X``.

    ``X`` is handed to ``regressor.predict`` unchanged; no encoding or scaling
    happens here.

    NOTE(review): the regressor is fitted on ordinal-encoded, standardised
    features, so callers presumably have to preprocess ``X`` the same way
    first — confirm.
    """
    predicted = regressor.predict(X)
    return predicted