# Importing libraries

In [None]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Apply seaborn's default plot theme to the figures drawn in this notebook.
sns.set()

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Every brand CSV is read from the same dataset folder.
input_dir = '/kaggle/input/used-car-dataset-ford-and-mercedes'

audi_df = pd.read_csv(f'{input_dir}/audi.csv')
toyota_df = pd.read_csv(f'{input_dir}/toyota.csv')
ford_df = pd.read_csv(f'{input_dir}/ford.csv')
bmw_df = pd.read_csv(f'{input_dir}/bmw.csv')
vw_df = pd.read_csv(f'{input_dir}/vw.csv')
# The Hyundai and Mercedes files are named 'hyundi.csv' and 'merc.csv'.
hyundai_df = pd.read_csv(f'{input_dir}/hyundi.csv')
mercedez_df = pd.read_csv(f'{input_dir}/merc.csv')

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
audi_df.info()
display(audi_df.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
toyota_df.info()
display(toyota_df.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
ford_df.info()
display(ford_df.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
bmw_df.info()
display(bmw_df.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
vw_df.info()
display(vw_df.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
hyundai_df.info()
display(hyundai_df.head())

### The tax column is named `tax(£)` in the Hyundai file; let's rename it to `tax` before we concatenate

In [None]:
# Rename Hyundai's 'tax(£)' column to 'tax', the name the rest of the
# notebook uses for this column, then re-check the frame's schema.
hyundai_df = hyundai_df.rename(columns={'tax(£)': 'tax'})
hyundai_df.info()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
mercedez_df.info()
display(mercedez_df.head())

### All dataframes have the same number of columns, but the number of rows differs between brands, so we have to keep this imbalance in mind.
### I will use the manufacturer instead of the individual car models

In [None]:
# Overwrite each frame's 'model' column with its manufacturer label, so the
# column holds the brand rather than the individual car model.
for brand_label, brand_frame in (
    ('audi', audi_df),
    ('toyota', toyota_df),
    ('ford', ford_df),
    ('bmw', bmw_df),
    ('vw', vw_df),
    ('hyundai', hyundai_df),
    ('mercedez', mercedez_df),
):
    brand_frame['model'] = brand_label

In [None]:
# Stack the seven brand frames into a single frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each frame keeps its own row labels
# from read_csv and the combined index contains duplicate labels.
car_manufacturers = pd.concat(
    [audi_df, toyota_df, ford_df, bmw_df, vw_df, hyundai_df, mercedez_df],
    ignore_index=True,
)


### Sanity checks

In [None]:
# Column dtypes and non-null counts of the combined frame.
car_manufacturers.info()

In [None]:
# Number of rows per value of 'model' (which now holds the brand label).
car_manufacturers['model'].value_counts()

In [None]:
# Number of rows per transmission value.
car_manufacturers['transmission'].value_counts()

In [None]:
# Number of rows per fuel type value.
car_manufacturers['fuelType'].value_counts()

In [None]:
# Summary statistics, transposed so that each described column is a row.
car_manufacturers.describe().T

### Apparently there are cars manufactured in the year 2060; let's keep only the rows where the year is 2021 or earlier

In [None]:
# Keep only the rows whose 'year' value is 2021 or earlier.
year_is_plausible = car_manufacturers['year'] <= 2021
car_manufacturers = car_manufacturers[year_is_plausible]

In [None]:
# Re-check the summary statistics after the year filter.
car_manufacturers.describe().T

## Plotting some basic information

In [None]:
# One density plot per column on a 3x2 grid, filled row by row.
# 'price' and 'mileage' are drawn on a log-scaled axis.
fig, ax = plt.subplots(3, 2, figsize=(10, 10))
density_panels = [
    ('year', {}),
    ('price', {'log_scale': True}),
    ('mileage', {'log_scale': True}),
    ('tax', {}),
    ('mpg', {}),
    ('engineSize', {}),
]
for panel, (column, extra_kwargs) in zip(ax.flat, density_panels):
    sns.kdeplot(car_manufacturers[column], ax=panel, **extra_kwargs)
plt.tight_layout()

* Year and mileage are skewed to the right
* Price follows a fairly normal distribution (on the log scale used in the plot)
* Tax and mpg are skewed to the left
* Engine size appears multimodal because of how engines are measured (cc)

## Plotting correlation matrix

In [None]:
# Correlate the numeric columns only. The frame also holds string columns
# ('model', 'transmission', 'fuelType'); in pandas >= 2.0 DataFrame.corr()
# raises on those instead of silently dropping them.
car_manufacturers_corr = car_manufacturers.select_dtypes(include='number').corr()
# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.triu(np.ones_like(car_manufacturers_corr, dtype=bool))
sns.heatmap(car_manufacturers_corr, mask=mask, cbar=False, cmap="BuGn", linewidths=0.3)

* It looks like price is more correlated with engine size and tax

In [None]:
# Price against engine size (top) and against tax (bottom). Each panel shows
# the mean price per distinct x value with a fitted regression line.
fig, ax = plt.subplots(2, 1, figsize=(10, 10))
for panel, predictor in zip(ax, ('engineSize', 'tax')):
    sns.regplot(
        data=car_manufacturers,
        x=predictor,
        y='price',
        x_estimator=np.mean,
        ci=None,
        ax=panel,
    )
plt.tight_layout()

* In general, audi and mercedez are the brands with bigger engines and also the ones that pay more taxes

## Preparing the data in features and target value

In [None]:
# Feature matrix: every column except the target column 'price'.
X_df = car_manufacturers.drop(columns=["price"])
X_df.info()

In [None]:
# Target vector: the 'price' column; show its first five values.
y = car_manufacturers["price"]
y.iloc[:5]

In [None]:
# Categorical feature columns, given by name rather than by position so the
# list stays correct if the column order of the feature frame ever changes.
# NOTE(review): these are presumed to be the columns that the former positional
# indices [0, 2, 4] pointed at in X_df -- confirm against X_df.columns.
cat_features = ['model', 'transmission', 'fuelType']

In [None]:
# Hold out 20% of the rows for testing, then 20% of the remaining rows for
# validation. Both splits use the same fixed random_state.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_df, y, **split_options)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, **split_options)

In [None]:
# Wrap each split in a CatBoost Pool, all sharing the same categorical columns.
train_dataset, valid_dataset, test_dataset = [
    cb.Pool(features, target, cat_features=cat_features)
    for features, target in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
]

In [None]:
#Grid Search
# NOTE(review): this search is disabled (the whole cell is commented out).
# The grid below does not contain iterations=500 or learning_rate=0.15, the
# values the model cell further down uses, so those settings did not come from
# this exact grid.
#model = cb.CatBoostRegressor(loss_function='RMSE', verbose = 100)
#grid = {'iterations': [100, 150, 200],
#        'learning_rate': [0.03, 0.07, 0.1],
#        'depth': [2, 4, 6, 8],
#        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
#model.grid_search(grid, train_dataset)

Best parameters
Best so far :
* iterations : 500
* lr: 0.15
* depth: 8
* l2_leaf_reg: 0.2

In [None]:
# Regressor configured with the hyperparameters listed under "Best parameters".
catboost_params = {
    'loss_function': 'RMSE',
    'iterations': 500,
    'learning_rate': 0.15,
    'depth': 8,
    'l2_leaf_reg': 0.2,
    'logging_level': 'Silent',
}
model = cb.CatBoostRegressor(**catboost_params)

In [None]:
# Train on the training pool, using the validation pool as the evaluation set;
# plot=True requests CatBoost's training-progress plot.
model.fit(train_dataset, eval_set=valid_dataset, plot=True)

In [None]:
# Score the fitted model on the held-out test split.
model.score(X_test, y_test)

In [None]:
# Evaluate on the test split: root-mean-squared error and R^2.
pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)
print('Testing performance')
print(f'RMSE: {rmse:.2f}')
print(f'R2: {r2:.2f}')

In [None]:
# Feature importances of the fitted model, in CatBoost's prettified table form.
model.get_feature_importance(prettified=True)

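## On the test set the model's RMSE is about 2281.83 (in the dataset's price units, which are likely pounds rather than euros, given the `tax(£)` column), and R² is about 0.95, i.e. the model explains roughly 95% of the variance in price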

#### Feature Importance

In [None]:
# Horizontal bar chart of the model's feature importances.
importance_table = model.get_feature_importance(prettified=True)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Importances", y="Feature Id", data=importance_table, ax=ax)
ax.set_title('CatBoost features importance:', fontsize=16)
ax.set_xlabel('Importance', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)
plt.show()

#### Using shap values

In [None]:
# Uses the `import shap` from the notebook's import cell; the duplicate
# mid-notebook import is not needed.
explainer = shap.TreeExplainer(model)
# SHAP values are computed on the training pool and plotted against the
# matching training feature frame.
shap_values = explainer.shap_values(train_dataset)
shap.summary_plot(shap_values, X_train)