In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

## Little modification for Hyundai data

The Hyundai dataset names its tax column 'tax(£)' (with a pound sign) instead of 'tax', so we have to rename it before concatenating the datasets. We also leave out the two uncleaned files ('unclean focus.csv' and 'unclean cclass.csv').

In [None]:
# Load the Hyundai listings on their own and preview the first rows.
hyundi_csv_path = '/kaggle/input/used-car-dataset-ford-and-mercedes/hyundi.csv'
df_hyundi = pd.read_csv(hyundi_csv_path)
df_hyundi.head()

In [None]:
# Rename the Hyundai tax column to plain 'tax' and preview the result.
tax_column_fix = {'tax(£)': 'tax'}
df_hyundi = df_hyundi.rename(columns=tax_column_fix)
df_hyundi.head()

In [None]:
# Combine every cleaned brand CSV in the dataset folder into one DataFrame.
DATA_DIR = '/kaggle/input/used-car-dataset-ford-and-mercedes/'
# Hyundai is appended from df_hyundi (loaded and renamed separately); the two
# "unclean" files are left out.
SKIPPED_FILES = ['hyundi.csv', 'unclean focus.csv', 'unclean cclass.csv']

# Sort the listing so the row order does not depend on the filesystem, and only
# pick up CSV files in case the folder contains anything else.
files = sorted(
    file for file in os.listdir(DATA_DIR)
    if file.endswith('.csv') and file not in SKIPPED_FILES
)

# Read all frames first and concatenate once: calling pd.concat inside the loop
# re-copies the accumulated data on every iteration.
brand_frames = [pd.read_csv(os.path.join(DATA_DIR, file)) for file in files]

# ignore_index=True gives every row a unique label; without it each file's own
# 0..n-1 index is repeated in the combined frame.
full_data = pd.concat(brand_frames + [df_hyundi], ignore_index=True)

In [None]:
# Number of missing values in each column of the combined data.
full_data.isna().sum()

In [None]:
# Display the combined frame.
full_data

In [None]:
# Draw the missingno matrix for the combined data (trailing ';' hides the repr).
msno.matrix(full_data);

As seen from the missingno matrix, all the missing values are on the same rows. Let's find out which models have missing values and try to find why they are missing.

In [None]:
# Count, per model, the rows whose 'tax' value is missing.
full_data.loc[full_data['tax'].isna(), 'model'].value_counts()

In [None]:
# We had a car with production year 2060. Any year beyond the latest valid one
# is replaced with the rounded mean year of the remaining rows; the value is
# computed from the data rather than hard-coded.
MAX_VALID_YEAR = 2020
invalid_year = (full_data['year'] > MAX_VALID_YEAR).to_numpy()
mean_valid_year = int(round(full_data.loc[~invalid_year, 'year'].mean()))
full_data.loc[invalid_year, 'year'] = mean_valid_year

In [None]:
# Summary statistics of the numeric columns.
full_data.describe()

Only Focus and C Class models have missing values. Now we have to find out whether all of the C Class (and Focus) rows have missing values or only some of them. Some of the names in the 'model' column contain extra spaces. We're going to strip those spaces in order to use the names for conditional subsetting.

In [None]:
# Remove leading and trailing whitespace from the model names.
full_data['model'] = full_data['model'].str.strip()

In [None]:
# Number of rows for the 'C Class' and 'Focus' models.
c_class_count = (full_data['model'] == 'C Class').sum()
focus_count = (full_data['model'] == 'Focus').sum()
print(c_class_count, focus_count)

In [None]:
# Per-model count of rows with a missing 'mpg', shown as a horizontal bar chart.
rows_without_mpg = full_data[full_data['mpg'].isna()]
missing_no = rows_without_mpg['model'].value_counts()
missing_no.plot.barh()

A large share of the rows for these two models have missing values. Let's fill them with random values drawn from within one standard deviation of the mean.

In [None]:
# Fill the missing 'tax' and 'mpg' values with random integers drawn from within
# one standard deviation of each column's mean.
cols = ['tax', 'mpg']

# A seeded generator makes the imputed values identical on every run.
rng = np.random.default_rng(42)

for col in cols:
    mean_val = full_data[col].mean()
    std_val = full_data[col].std()
    missing_mask = full_data[col].isnull().to_numpy()
    nan_count = int(missing_mask.sum())
    if nan_count == 0:
        # Nothing to fill in this column.
        continue

    # The bounds are floats; convert them to integers explicitly instead of
    # relying on the sampler to truncate them. The upper bound is exclusive.
    low = int(mean_val - std_val)
    high = int(mean_val + std_val)
    rand_values = rng.integers(low, high, size=nan_count)

    # A positional boolean mask writes the values straight into the missing
    # slots, whatever the row labels are.
    full_data.loc[missing_mask, col] = rand_values

In [None]:
# Re-check the number of missing values per column after the fill.
full_data.isna().sum()

In [None]:
# Correlation heatmap of the numeric features. The frame still holds text
# columns at this point, and recent pandas versions raise in DataFrame.corr()
# instead of silently dropping them, so select the numeric columns explicitly.
numeric_features = full_data.select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), annot=True)

- People who buy a newer car have to pay higher tax than buyers of older cars.
- Engine size and year strongly affect the price of a car.
- Higher mileage means lower price.
- Mileage and year are strongly negatively correlated.

In [None]:
# 2x2 overview: price distribution, transmission counts split by fuel type, and
# price against year and against mileage for rows with year > 1995.
fig, ax = plt.subplots(2,2, figsize=(15,10))
(price_ax, transmission_ax), (year_ax, mileage_ax) = ax
after_1995 = full_data[full_data.year > 1995]

sns.histplot(full_data.price, kde=True, ax=price_ax)
sns.countplot(x='transmission', data=full_data, hue='fuelType', ax=transmission_ax)
sns.lineplot(x='year', y='price', data=after_1995, ci=None, ax=year_ax)
sns.scatterplot(x='mileage', y='price', data=after_1995, ax=mileage_ax);

The price column is positively skewed: most of the cars are priced below 50000. As expected, lower mileage means higher price, and cars with 0 mileage are the most expensive ones.

In [None]:
# Share of each fuel type within every transmission type, in percent.
fuel_counts = pd.crosstab(full_data.transmission, full_data.fuelType)
row_totals = fuel_counts.sum(axis=1)
transmission_share = fuel_counts.div(row_totals, axis=0).mul(100).round(2)

# Absolute number of cars per transmission type and its share of all cars.
transmission_share['Total'] = full_data.transmission.value_counts()
total_count = transmission_share['Total'].sum()
transmission_share['Total perc.'] = (transmission_share['Total'] / total_count * 100).round(2)

In [None]:
# Display the fuel-type shares and totals per transmission type.
transmission_share

Manual cars are dominating the market. More than half of the used cars are manual cars and the majority of them use petrol as fuel. But is there any significant price difference between them?

In [None]:
# Mean price per transmission type, sorted ascending, as horizontal bars.
mean_price_by_transmission = full_data.groupby('transmission')['price'].mean()
mean_price_by_transmission.sort_values().plot.barh();

In [None]:
# Median price per transmission type, sorted ascending, as horizontal bars.
median_price_by_transmission = full_data.groupby('transmission')['price'].median()
median_price_by_transmission.sort_values().plot.barh();

In [None]:
# Histogram of the mpg values using 50 bins.
full_data['mpg'].plot.hist(bins=50)

As we can see, most of the mpg values are under 100. An mpg value higher than 100 is unrealistic: no car can drive 100 miles (about 160 km) on a single gallon (about 4.55 liters). Let's divide the mpg values that are higher than 100 by 6.

In [None]:
def get_real_mpg(value):
    """Scale down mpg readings that are above 100.

    Values greater than 100 are divided by 6 and rounded to one decimal
    place; any other value is returned unchanged.
    """
    return round(value / 6, 1) if value > 100 else value
    

# Apply the correction to every mpg value.
full_data['mpg'] = full_data['mpg'].map(get_real_mpg)

In [None]:
# Histogram of the mpg values with the default binning.
full_data['mpg'].plot.hist()

Now let's encode the 'transmission' and 'fuelType' columns and drop the 'model' column, as it's not helpful for us. It would be a great idea to map the models to brands ('A1', 'A3', etc. to Audi, and so on), but we haven't done it here: the goal was to concatenate all datasets into one, and mapping the models to brands would have taken some extra time.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Drop the 'model' column; it is not used as a feature.
full_data = full_data.drop(columns='model')

# Fit a separate encoder for every non-numeric column and keep it, so the
# integer codes can be mapped back to the original categories later. A single
# shared encoder only remembers the last column it was fitted on.
label_encoders = {}
for col in full_data.select_dtypes(exclude='number').columns:
    le = LabelEncoder()
    full_data[col] = le.fit_transform(full_data[col])
    label_encoders[col] = le

In [None]:
# Replace the row labels with a fresh 0..n-1 index, discarding the old ones.
full_data = full_data.reset_index(drop=True)

Now let's split the data into train, validation and test sets. After that, we will scale the dataset.

In [None]:
# Shuffle the rows with a fixed seed so the split is the same on every run, then
# cut the shuffled frame into 60% train / 20% validation / 20% test.
# Plain positional slicing is used instead of np.split, which relies on the
# deprecated DataFrame.swapaxes when handed a DataFrame.
shuffled = full_data.sample(frac=1, random_state=42)
train_end = int(.6*len(full_data))
val_end = int(.8*len(full_data))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [None]:
# Row counts of the train, validation and test sets.
print(*(len(part) for part in (train, val, test)))

Before scaling the dataset, we should split them into X and y.

In [None]:
# Separate the 'price' target from the remaining feature columns in each split.
target = 'price'
X_train, y_train = train.drop(columns=target), train[target]
X_val, y_val = val.drop(columns=target), val[target]
X_test, y_test = test.drop(columns=target), test[target]

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling from the training features only, then apply that same
# transformation to the train, validation and test features.
sc = StandardScaler().fit(X_train)
scaledX_train, scaledX_val, scaledX_test = (
    sc.transform(features) for features in (X_train, X_val, X_test)
)

We've done everything for fitting a model. Now let's try some models.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Fixed random_state values make the tree-based results repeatable between
# runs, so the comparison printed below does not change from one execution to
# the next.
rfr_reg = RandomForestRegressor(min_samples_leaf = 0.01, random_state=42)
en_reg = ElasticNet(alpha=0.1, l1_ratio=0.5)
dtr_reg = DecisionTreeRegressor(random_state=42)

# Fit each baseline on the scaled training data and report its validation RMSE.
for regressor in (rfr_reg, en_reg, dtr_reg):
    regressor.fit(scaledX_train, y_train)
    y_pred = regressor.predict(scaledX_val)
    print(regressor.__class__.__name__, np.sqrt(mean_squared_error(y_val, y_pred)))

Seems like Random Forest and Decision Tree models are better than the ElasticNet. So let's tune them!

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the random forest (two sub-grids) and for the
# decision tree.
param_grid_rfr = [
    {'n_estimators': [3,10,30], 'max_features': [2,4,6]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2,3,4]},
]
param_grid_dtr = {'max_features': [2,4,6], 'min_samples_split': [0.1, 0.01]}

# Both searches use 3-fold cross-validation scored by negative MSE.
search_options = {'cv': 3, 'scoring': 'neg_mean_squared_error'}
grid_search_rfr = GridSearchCV(rfr_reg, param_grid_rfr, **search_options)
grid_search_dtr = GridSearchCV(dtr_reg, param_grid_dtr, **search_options)

# Since tree based models need no scaling, we will use the unscaled versions
grid_search_rfr.fit(X_train, y_train)

In [None]:
# Run the decision-tree grid search on the unscaled training data.
grid_search_dtr.fit(X_train, y_train)

In [None]:
# Keep the best estimator found by each grid search.
best_rfr_model, best_dtr_model = (
    search.best_estimator_ for search in (grid_search_rfr, grid_search_dtr)
)

In [None]:
# Validation RMSE of the tuned random forest, then the tuned decision tree.
for model in (best_rfr_model, best_dtr_model):
    y_val_pred = model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(val_rmse)

The Random Forest model got better, but the Decision tree model's score has worsened. Let's try a new, untuned Decision Tree model and the tuned Random Forest model to predict the test set.

In [None]:
# Fit a fresh, untuned decision tree on the unscaled training data. The fixed
# random_state makes the fitted tree, and therefore its score, repeatable.
dtr_reg_new = DecisionTreeRegressor(random_state=42)
dtr_reg_new.fit(X_train, y_train)

# Test-set RMSE of the tuned random forest, then the untuned decision tree.
for model in (best_rfr_model, dtr_reg_new):
    y_test_pred = model.predict(X_test)
    print(np.sqrt(mean_squared_error(y_test, y_test_pred)))

They are not so bad: they achieved scores similar to those on the validation set, so our models aren't overfitting. Let's try another metric called r-squared and finish our analysis.

In [None]:
from sklearn.metrics import r2_score

# Test-set R-squared of the tuned random forest, then the untuned decision tree.
for model in (best_rfr_model, dtr_reg_new):
    y_test_pred = model.predict(X_test)
    test_r2 = r2_score(y_test, y_test_pred)
    print(test_r2)

Maximum possible value is 1. Our data could be improved by adding a brand column and encoding it into labels (0,1,2...) and some work could be done on correlation problems, like the correlation between the age and mileage columns. We could also spend some time on other columns, like mpg and tax. If you like, you may try and improve the data from this notebook. See you!