# Predict the sale price for the Pre-Owned Cars

The data consists of 9 CSV files, one per car brand: Audi, BMW, Ford, Hyundai, Mercedes, Skoda, Toyota, Vauxhall and Volkswagen.

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Reading the Data

In [None]:
# Every brand file sits in the same Kaggle input directory, so build each path
# from one shared prefix.
data_dir = '/kaggle/input/used-car-dataset-ford-and-mercedes/'

audi = pd.read_csv(data_dir + 'audi.csv')
bmw = pd.read_csv(data_dir + 'bmw.csv')
ford = pd.read_csv(data_dir + 'ford.csv')
# The Hyundai file name is spelled 'hyundi' in the path used by this notebook.
hyundai = pd.read_csv(data_dir + 'hyundi.csv')
merc = pd.read_csv(data_dir + 'merc.csv')
skoda = pd.read_csv(data_dir + 'skoda.csv')
toyota = pd.read_csv(data_dir + 'toyota.csv')
vauxhall = pd.read_csv(data_dir + 'vauxhall.csv')
vw = pd.read_csv(data_dir + 'vw.csv')

## Checking the number of columns in each dataset

In [None]:
# Show the column names of each brand's dataframe, with a dashed rule between
# consecutive brands, so that naming differences between files stand out.
labelled_frames = [
    ("Audi", audi),
    ("BMW", bmw),
    ("Ford", ford),
    ("Hyundai", hyundai),
    ("Mercedes", merc),
    ("Skoda", skoda),
    ("Toyota", toyota),
    ("Vauxhall", vauxhall),
    ("VW", vw),
]
for position, (label, frame) in enumerate(labelled_frames):
    if position > 0:
        print("-" * 50)
    print("Columns in the {0} dataframe:".format(label))
    print(list(frame.columns))

1. In the Hyundai dataset, the tax column name contains a special character (the pound sign): it is called `tax(£)`. It is renamed to `tax` below, before the datasets are combined.

In [None]:
# Give the Hyundai tax column the plain name 'tax'.
hyundai = hyundai.rename(columns={'tax(£)': 'tax'})

## Adding a column make to identify the manufacturer for combining the datasets

In [None]:
# Tag every row with its manufacturer so the brand survives the concatenation.
for frame, brand in [
    (audi, 'Audi'),
    (bmw, 'BMW'),
    (ford, 'Ford'),
    (hyundai, 'Hyundai'),
    (merc, 'Mercedes'),
    (skoda, 'Skoda'),
    (toyota, 'Toyota'),
    (vauxhall, 'Vauxhall'),
    (vw, 'Volkswagen'),
]:
    frame['make'] = brand

In [None]:
# Stack the nine brand tables row-wise and renumber the index from 0 so that
# every row has a unique label.
brand_tables = [audi, bmw, ford, hyundai, merc, skoda, toyota, vauxhall, vw]
df = pd.concat(brand_tables, ignore_index=True)

In [None]:
# Summary of the combined dataframe (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first rows of the combined dataframe.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

### There are no missing values in the dataset

In [None]:
# Number of distinct values per column; low counts hint at categorical features.
df.nunique()

### From the unique-value counts we can see that the columns "model", "fuelType", "transmission" and "make" are categorical columns

# Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

### Year

From the summary statistics above, it is observed that there is at least 1 car with the year 2060

In [None]:
# Inspect the rows whose registration year is 2060.
is_year_2060 = df['year'] == 2060
df.loc[is_year_2060]

It is observed that there is 1 car, from the manufacturer Ford, whose year is 2060. Since we don't have any other information about this record, we can remove it

In [None]:
# Keep every row except those dated 2060.
df = df[df['year'] != 2060]

Also it is observed that there are some cars with the year 1970

In [None]:
# Inspect the rows whose registration year is 1970.
is_year_1970 = df['year'] == 1970
df.loc[is_year_1970]

There are only 2 cars with the year 1970. Also, at present, cars from the 1970s are generally not allowed to be used, so these 2 cars can be removed from the data.

In [None]:
# Keep every row except those dated 1970.
df = df[df['year'] != 1970]

### Engine Size

In [None]:
# Rows that report an engine size of 0.
has_zero_engine = df['engineSize'] == 0
df.loc[has_zero_engine]

There are 272 rows with engine size as 0. So this needs to be handled in the feature engineering part

### Mileage

There are some cars with mileage as 1

In [None]:
# Rows that report a mileage of exactly 1.
has_unit_mileage = df['mileage'] == 1
df.loc[has_unit_mileage]

Some of these vehicles are from the year 2020, so a mileage of 1 makes sense. However, there are also vehicles from before 2020 that have a mileage of 1

In [None]:
# How many cars report a mileage of 1 without being from 2020?
suspicious_mileage = (df['mileage'] == 1) & (df['year'] != 2020)
int(suspicious_mileage.sum())

Since there are only 37 such cars (mileage of 1 but a year other than 2020), these data points can be removed.

In [None]:
# Discard cars from 2019 or earlier that report a mileage of 1.
old_with_unit_mileage = (df['mileage'] == 1) & (df['year'] <= 2019)
df = df[~old_with_unit_mileage]

### Tax

In [None]:
# Rows that report a tax of 0.
has_zero_tax = df['tax'] == 0
df.loc[has_zero_tax]

There are nearly 6300 datapoints with tax as 0. So this should be handled in feature engineering

### Price

In [None]:
# Heatmap of pairwise correlations between the numeric features.
# The frame also holds non-numeric columns (e.g. the string-valued 'make' added
# above), and DataFrame.corr() raises on those under pandas >= 2.0, so the
# numeric columns are selected explicitly before computing the correlation.
plt.figure(figsize=(10,8))
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)

There is a strong positive correlation between year and price and a strong negative correlation between mileage and price. This makes sense, since newer cars are generally more expensive and cars with more mileage are relatively cheaper. We also notice a negative correlation between mileage and year: the newer a car is, the fewer miles it is likely to have travelled. Furthermore, we notice a positive correlation between engine size and price, as well as between engine size and tax. This follows expectation, since it is common practice for manufacturers to sell models with larger engines for a higher price than the same model with a smaller engine. As a result, due to the higher price, a larger tax payment is required, hence the positive correlation. This also explains the positive correlation between tax and price.

In [None]:
# Price against mileage, one point per car.
ax = sns.scatterplot(x='mileage', y='price', data=df)
ax.set_title('Scatter plot of Mileage against Price')

In [None]:
# Price distribution per manufacturer.
fig, ax = plt.subplots(figsize=(10,8))
sns.boxplot(data=df, x='make', y='price', ax=ax)

It is observed that Audi, BMW, Mercedes and Volkswagen cars have higher prices than average. However, some Hyundai and Skoda data points appear to be outliers, so those points can be removed

In [None]:
# Price at each whole percentile from the 91st to the 99th.
percentile_line = 'The {0}th percentile of price is {1}'
for pct in range(91, 100):
    price_at_pct = np.percentile(df['price'], pct)
    print(percentile_line.format(pct, price_at_pct))

In [None]:
# Zoom into the top percentile in steps of 0.1.
percentile_line = 'The {0}th percentile of price is {1}'
for pct in np.arange(99,100,0.1):
    price_at_pct = np.percentile(df['price'], pct)
    print(percentile_line.format(round(pct,1), price_at_pct))

In [None]:
# Hyundai cars priced above 80000.
hyundai_outliers = (df['make'] == 'Hyundai') & (df['price'] > 80000)
df.loc[hyundai_outliers]

In [None]:
# Remove Hyundai cars priced above 80000.
hyundai_outliers = (df['make'] == 'Hyundai') & (df['price'] > 80000)
df = df[~hyundai_outliers]

In [None]:
# Skoda cars priced above 80000.
skoda_outliers = (df['make'] == 'Skoda') & (df['price'] > 80000)
df.loc[skoda_outliers]

In [None]:
# Remove Skoda cars priced above 80000.
skoda_outliers = (df['make'] == 'Skoda') & (df['price'] > 80000)
df = df[~skoda_outliers]

### Fuel Type

In [None]:
# Price distribution per fuel type.
sns.boxplot(data=df, x='fuelType', y='price')

It is observed that the petrol vehicles are cheaper than the other fuel type vehicles

### Transmission

In [None]:
# Price distribution per transmission type.
sns.boxplot(data=df, x='transmission', y='price')

It is evident that Automatic and Semi-Automatic cars are priced higher than cars with manual transmission. This feature clearly has a significant influence on the price of the vehicles

### Price

In [None]:
# Distribution of price.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# sns.histplot(..., kde=True) is the suggested replacement - confirm the
# installed seaborn version before switching.
sns.distplot(df['price'])

### Year

In [None]:
# Distribution of registration year.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(df['year'])

### Mileage

In [None]:
# Distribution of mileage.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(df['mileage'])

### Tax

In [None]:
# Distribution of tax.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(df['tax'])

### MPG

In [None]:
# Distribution of mpg.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(df['mpg'])

### Engine Size

In [None]:
# Distribution of engine size.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(df['engineSize'])

It is observed that mileage and price both have right-skewed distributions

# Feature Engineering

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Preview the cleaned dataframe before building features.
df.head()

In [None]:
# Target is the sale price; every other column is a candidate feature.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# One-hot encode the four categorical columns, dropping the first level of each.
# Both transmission and fuelType can produce a dummy column called 'Other', so
# each is renamed to keep the resulting column names distinct.
X_transmission = pd.get_dummies(X['transmission'], drop_first=True).rename(
    columns={'Other':'Other Transmission'}
)
X_fuelType = pd.get_dummies(X['fuelType'], drop_first=True).rename(
    columns={'Other':'Other FuelType'}
)
X_make = pd.get_dummies(X['make'], drop_first=True)
X_model = pd.get_dummies(X['model'], drop_first=True)

# Replace the raw categorical columns with their dummy encodings; the remaining
# original columns stay first, followed by the dummy blocks in the order below.
X_without_categoricals = X.drop(columns=['transmission', 'fuelType', 'make', 'model'])
X = pd.concat(
    [X_without_categoricals, X_transmission, X_fuelType, X_make, X_model],
    axis=1,
)

In [None]:
# Hold out 33% of the rows for testing; random_state=42 fixes the shuffle.
X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Work on deep copies so the untouched splits (names ending in '_') stay
# available unchanged.
X_train, X_test, y_train, y_test = (
    part.copy(deep=True) for part in (X_train_, X_test_, y_train_, y_test_)
)

Engine Size

In [None]:
# Median engine size of the training split, used in the next step to replace
# engine sizes recorded as 0. The value is taken from the data rather than
# overwritten with a literal, so it stays correct if the data or split changes.
engineSize_median = X_train['engineSize'].median()
print('The median of Engine Size is : {0}'.format(engineSize_median))

In [None]:
# Replace engine sizes of 0 with the training median in both splits.
for split in (X_train, X_test):
    split['engineSize'] = split['engineSize'].replace(to_replace=0, value=engineSize_median)

Tax

In [None]:
# Mean tax of the training split, used in the next step to replace tax values
# recorded as 0. The computed value is kept (not overwritten with a rounded
# literal) so the imputation always matches the actual training data.
tax_mean = X_train['tax'].mean()
print('The mean of tax is {0}'.format(tax_mean))

In [None]:
# Replace tax values of 0 with the training mean in both splits.
for split in (X_train, X_test):
    split['tax'] = split['tax'].replace(to_replace=0, value=tax_mean)

Mileage

In [None]:
from scipy import stats

In [None]:
# Box-Cox transform the training mileage; with no lambda given, scipy fits the
# lambda and returns it alongside the transformed values.
mileage,mileage_lambda = stats.boxcox(X_train['mileage'])

In [None]:
# Report the lambda fitted on the training mileage. The fitted value is left
# untouched (no hard-coded override) so that the test split is transformed with
# exactly the same lambda that produced the training transform.
print('The lambda for boxcox transform of mileage is {0}'.format(mileage_lambda))

In [None]:
# Distribution of the Box-Cox-transformed training mileage.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(mileage)

In [None]:
# Apply the Box-Cox transform to the test mileage with the given lambda
# (no refitting on test data).
test_mileage = stats.boxcox(X_test['mileage'],mileage_lambda)

In [None]:
# Store the transformed mileage as a new feature column in each split.
for split, transformed in ((X_train, mileage), (X_test, test_mileage)):
    split['mileage_transformed'] = transformed

In [None]:
# Preview the training features after the imputation and mileage transform.
X_train.head()

Price

In [None]:
# Box-Cox transform the training target; scipy fits and returns the lambda.
price,price_lambda = stats.boxcox(y_train)

In [None]:
# Report the lambda fitted on the training prices. The fitted value is left
# untouched (no hard-coded override) so that the test target is transformed
# with exactly the same lambda that produced the training transform.
print('The lambda for boxcox transform of price is {0}'.format(price_lambda))

In [None]:
# Distribution of the Box-Cox-transformed training price.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(price)

In [None]:
# Apply the Box-Cox transform to the test target with the given lambda
# (no refitting on test data).
test_price = stats.boxcox(y_test,price_lambda)

In [None]:
# Transformed targets used for fitting and scoring the models.
y_train_trans, y_test_trans = price, test_price

# Model Creation

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [None]:
# The raw mileage column is superseded by 'mileage_transformed'; remove it.
X_train = X_train.drop(columns=['mileage'])
X_test = X_test.drop(columns=['mileage'])

### Linear Regression

In [None]:
# Ordinary least squares on the transformed price (fit returns the estimator).
reg = LinearRegression().fit(X_train, y_train_trans)

In [None]:
# Predictions on both splits (on the transformed-price scale).
y_train_pred, y_pred = reg.predict(X_train), reg.predict(X_test)

In [None]:
# R2 of the linear model on each split, measured on the transformed target.
train_r2 = r2_score(y_train_trans, y_train_pred)
test_r2 = r2_score(y_test_trans, y_pred)
print('The train R2 value is {0}'.format(train_r2))
print('The test R2 value is {0}'.format(test_r2))

In [None]:
!rm -rf /kaggle/working/models
!rm -rf /kaggle/working/data
!mkdir models
!mkdir data

In [None]:
# Persist the fitted linear model together with the prepared splits and
# targets; each object goes to /kaggle/working/data/<name>.pkl.
to_persist = {
    'linear_regression': reg,
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'y_train_trans': y_train_trans,
    'y_test_trans': y_test_trans,
}
for stem, obj in to_persist.items():
    with open('/kaggle/working/data/' + stem + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

### Decision Tree

In [None]:
# Grid-search a decision tree over depth and minimum split size with 5-fold CV,
# scoring by R2 on the transformed price.
reg = DecisionTreeRegressor(random_state=42)
# min_samples_split must be at least 2 when given as an integer; a value of 1 is
# rejected by scikit-learn and only produces failed fits, so the grid starts at 2.
parameters = {"max_depth": [1, 2, 5, 10, 50, 100], "min_samples_split": [2, 5, 10, 50, 100, 500]}
clf = GridSearchCV(reg, param_grid = parameters, cv = 5, scoring = 'r2', n_jobs=-1, verbose=5)
clf.fit(X_train, y_train_trans)
print(clf.best_estimator_)
# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the whole training set, so no further fit is required.
best_reg = clf.best_estimator_

In [None]:
# Predict with the tuned tree and report R2 on both splits (transformed scale).
y_train_pred, y_pred = best_reg.predict(X_train), best_reg.predict(X_test)
train_r2 = r2_score(y_train_trans, y_train_pred)
test_r2 = r2_score(y_test_trans, y_pred)
print('The train R2 value is {0}'.format(train_r2))
print('The test R2 value is {0}'.format(test_r2))

In [None]:
# Save the tuned decision tree.
with open('/kaggle/working/data/decisiontree_regression.pkl', 'wb') as model_file:
    pickle.dump(best_reg, model_file)

### Random Forest

In [None]:
# Randomised search for a random forest over depth, number of trees and the
# number of features considered per split (3-fold CV, R2 on transformed price).
reg = RandomForestRegressor(random_state=42)
parameters = {"max_depth": [2, 3, 5, 8, 10, 15, 20, 25, 30, 40, 50,100],
              "n_estimators": [10, 20, 30, 40, 50, 80, 100, 150, 200],
              # 1.0 (use all features) is what 'auto' meant for regressors;
              # 'auto' itself is deprecated and rejected by newer scikit-learn.
              "max_features": [1.0, 'sqrt']
             }
# random_state makes the sampled parameter combinations reproducible.
rcv = RandomizedSearchCV(reg, param_distributions = parameters, cv = 3, scoring = 'r2', n_jobs=-1, verbose=10, random_state=42)
rcv.fit(X_train, y_train_trans)
print(rcv.best_estimator_)
# With the default refit=True the best configuration has already been retrained
# on the whole training set, so no further fit is required.
best_reg = rcv.best_estimator_

In [None]:
# Predict with the tuned forest and report R2 on both splits (transformed scale).
y_train_pred, y_pred = best_reg.predict(X_train), best_reg.predict(X_test)
train_r2 = r2_score(y_train_trans, y_train_pred)
test_r2 = r2_score(y_test_trans, y_pred)
print('The train R2 value is {0}'.format(train_r2))
print('The test R2 value is {0}'.format(test_r2))

In [None]:
# Save the tuned random forest.
with open('/kaggle/working/data/randomforest_regression.pkl', 'wb') as model_file:
    pickle.dump(best_reg, model_file)

### XGBoost

In [None]:
from xgboost import XGBRegressor

In [None]:
# Grid-search XGBoost over tree depth and number of boosting rounds with 3-fold
# CV, scoring by R2 on the transformed price.
xgb = XGBRegressor(n_jobs = -1)
parameters = {"max_depth": [1, 5, 10, 50, 150, 200, 250], "n_estimators": [3, 9, 11, 15, 25, 50, 70, 150, 200, 250, 300]}
reg = GridSearchCV(xgb, parameters, cv = 3, scoring='r2', n_jobs=-1, verbose = 10)
reg.fit(X_train, y_train_trans)
print(reg.best_estimator_)
# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the whole training set; fitting it again would only repeat
# that (potentially expensive) training run.
best_reg = reg.best_estimator_

In [None]:
# Predict with the tuned XGBoost model and report R2 on both splits
# (transformed scale).
y_train_pred, y_pred = best_reg.predict(X_train), best_reg.predict(X_test)
train_r2 = r2_score(y_train_trans, y_train_pred)
test_r2 = r2_score(y_test_trans, y_pred)
print('The train R2 value is {0}'.format(train_r2))
print('The test R2 value is {0}'.format(test_r2))

In [None]:
# Save the tuned XGBoost model.
with open('/kaggle/working/data/xgboost_regression.pkl', 'wb') as model_file:
    pickle.dump(best_reg, model_file)

# Results

In [None]:
from prettytable import PrettyTable

In [None]:
# Header row followed by one row per model. The scores are entered by hand
# here; they are not read from the variables computed above.
results = [
    ['Model','Train R2','Test R2'],
    ['Linear Regression',0.94,0.939],
    ['Decision Tree',0.987,0.949],
    ['Random Forest',0.991,0.962],
    ['XGBoost',0.987,0.968],
]

In [None]:
# Render the results: the first entry provides the column names, the remaining
# entries are the table rows.
table = PrettyTable()
header, *model_rows = results
table.field_names = header
for row in model_rows:
    table.add_row(row)
print(table)

# Conclusion

It is observed that XGBoost gives the best R² (R-squared) value on the test data