In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    paths = [os.path.join(folder, name) for name in files]
    if paths:
        print('\n'.join(paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Packages

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 11000)
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Data

In [None]:
# Load the raw listings CSV from the Kaggle input directory into a DataFrame.
# NOTE(review): the file name in the path is spelled 'hyundi.csv' — presumably
# that is how the dataset ships it; verify before "correcting" the spelling.
brand_data_org = pd.read_csv('/kaggle/input/used-car-dataset-ford-and-mercedes/hyundi.csv')

## Copying Datasets

In [None]:
# Using a copy to play on the safe side, making sure the initial DataFrame won't be modified

In [None]:
# All exploration and cleaning below works on this copy; brand_data_org keeps
# the data exactly as it was read from disk.
brand_data = brand_data_org.copy()

# Exploring Dataset

## Brand

In [None]:
# Column names, non-null counts and dtypes of the working copy.
brand_data.info()

In [None]:
# no nulls with appropriate data types

In [None]:
# Summary statistics for every column; include='all' also summarises the
# non-numeric columns (unique / top / freq).
brand_data.describe(include='all')

In [None]:
# We have 16 unique models but most of the cars are Tucson
# 4 different transmission types but most of the cars are Manual transmission
# 4 different fuel types but most of the cars are running on Petrol
# It is strange to see a minimum mileage of 1 for used cars!
# mpg max = 256.8! It is not plausible for cars to exceed 100 miles per gallon
# It is also strange to see an engine size of zero, though this may be valid for electric cars only
# Let us explore the data in more detail, but first we have to rename the tax(£) column to tax

In [None]:
# Show the current column labels as an array (used to build the renamed list).
brand_data.columns.values

In [None]:
# Replacement column labels, in the same order as the columns of the loaded CSV.
cols = [
    'model', 'year', 'price',
    'transmission', 'mileage', 'fuelType',
    'tax', 'mpg', 'engineSize',
]

In [None]:
# Relabel all columns at once. The assignment is positional: it relies on the
# order of `cols` matching the column order of the loaded file, and it mutates
# brand_data in place.
brand_data.columns=cols

## Data Exploration

### Data Fields Distribution Graphs

In [None]:
# One histogram per numeric column on a 2x3 grid. axes.flat walks the grid row
# by row, so the panels appear in the order of the list below.
fig, axes = plt.subplots(figsize=(15,8), nrows=2, ncols=3)
numeric_columns = ['year', 'mileage', 'tax', 'mpg', 'engineSize', 'price']
for column, ax in zip(numeric_columns, axes.flat):
    sns.histplot(brand_data[column], ax=ax)
# Keep the tick labels of neighbouring panels from overlapping.
fig.tight_layout()

## Finding Correlations

In [None]:
# Correlation heatmap of the numeric columns only. The text columns (model,
# transmission, fuelType) cannot be correlated, and DataFrame.corr() raises on
# them in pandas >= 2.0, so they are excluded explicitly.
numeric_data = brand_data.select_dtypes(include='number')
fields_correlation = sns.heatmap(numeric_data.corr(), cmap="YlGnBu", annot=True)

### Correlogram for additional visualization of the relation between each pair of features

In [None]:
# Pairwise scatter plots with a fitted regression line (drawn in red) and
# semi-transparent points (alpha=0.1); histograms on the diagonal.
sns.pairplot(brand_data, kind="reg",plot_kws={'line_kws':{'color':'red'}, 'scatter_kws': {'alpha': 0.1}},diag_kind="hist")

### mileage vs mean price [grouped by year]

In [None]:
# Mean price for every (year, mileage) combination, ordered by year.
# The price column is selected *before* aggregating: averaging the whole frame
# would also try to average the text columns, which raises on pandas >= 2.0.
year_mile_avgprice = (
    brand_data.groupby(['year', 'mileage'])['price']
    .mean()
    .rename('mean_price')
    .reset_index()
    .sort_values(by='year', ascending=True)
)

In [None]:
import itertools

# Endless cycle of matplotlib marker codes so that each series gets its own
# symbol; the later line-plot cells keep drawing from this same iterator.
marker = itertools.cycle((',', '+', '.', 'o', '*','s','p','X','d','h','v','^','<','>','1','2','3','4','8','|','_','x'))
plt.figure(figsize=(17,8))
# One scatter series per distinct value of 'year', newest first.
for yr in sorted(set(year_mile_avgprice['year']), reverse=True):
    rows_for_year = year_mile_avgprice[year_mile_avgprice['year'] == yr]
    plt.scatter(rows_for_year['mileage'], rows_for_year['mean_price'], marker=next(marker), label=yr)
plt.xlabel('mileage')
plt.ylabel('mean price')
plt.legend()

## Further Exploration of Data (Categorical Features and Outliers Visualization)

### Price

In [None]:
# Box plot of price over the whole dataset; vert=0 is forwarded to matplotlib
# and lays the box out horizontally.
brand_data.boxplot(column=['price'],figsize=(15,5),patch_artist = True,notch =False,vert=0)

In [None]:
# Zoom in on the expensive tail: listings priced above 25,000.
brand_data_above25 = brand_data[brand_data['price'] > 25000]
# notch is passed as a real boolean. The string 'True' used previously only
# worked because any non-empty string is truthy (and so would 'False' be).
brand_data_above25.boxplot(column=['price'], figsize=(15,5), patch_artist=True, notch=True, vert=0)

In [None]:
# Number of listings priced above 25,000.
len(brand_data_above25['price'])

In [None]:
# now we have only 166 outliers (above 25,000). Dataset total len=4860

### Fuel Type and Mileage

In [None]:
# Horizontal box plots of mileage, one box per fuel type.
brand_data.boxplot(column=['mileage'], by=['fuelType'],figsize=(15,5),vert=0)

### Fuel Type and mpg

In [None]:
# Horizontal box plots of mpg, one box per fuel type.
brand_data.boxplot(column=['mpg'], by=['fuelType'],figsize=(15,5),patch_artist = True,notch =False,vert=0)

### mpg and transmission

In [None]:
# Horizontal box plots of mpg, one box per transmission type.
brand_data.boxplot(column=['mpg'], by=['transmission'],figsize=(15,5),patch_artist = True,notch =False,vert=0)

### mpg and engine size

In [None]:
# Box plots of mpg, one box per distinct engine size.
brand_data.boxplot(column=['mpg'], by=['engineSize'],figsize=(15,5))

### Count of Features

In [None]:
# Number of listings per category, drawn as four bar charts on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20,12))

# Listings per model / fuel type / transmission, most frequent first.
vis_models = brand_data.groupby(['model'])['price'].count().sort_values(ascending=False)
vis_ftyp = brand_data.groupby(['fuelType'])['price'].count().sort_values(ascending=False)
vis_trans = brand_data.groupby(['transmission'])['price'].count().sort_values(ascending=False)
# Engine sizes stay in ascending size order rather than being sorted by count.
vis_engS = brand_data.groupby(['engineSize'])['price'].count()

vis_models.plot(kind='bar', title='Count', ax=axes[0,0])
vis_ftyp.plot(kind='bar', title='Count', ax=axes[0,1])
vis_engS.plot(kind='bar', title='Count', ax=axes[1,0])
vis_trans.plot(kind='bar', title='Count', rot=360, ax=axes[1,1])

### mpg, fuel type and transmission

In [None]:
# mpg by engine size and fuel type for Automatic cars only, leaving out the
# 'Other' fuel type and zero engine sizes.
df = brand_data[
    (brand_data['transmission'] == 'Automatic')
    & (brand_data['fuelType'] != 'Other')
    & (brand_data['engineSize'] > 0)
]
plt.figure(figsize=(15,5))
plot01 = sns.boxplot(x='engineSize', y='mpg', hue='fuelType', notch=False, data=df, palette='bright')
# The title tells this figure apart from the otherwise identical-looking
# figures for the other transmission types.
plot01.set_title('mpg by engine size and fuel type (Automatic)')

In [None]:
# mpg by engine size and fuel type for Semi-Auto cars only, leaving out the
# 'Other' fuel type and zero engine sizes.
df = brand_data[
    (brand_data['transmission'] == 'Semi-Auto')
    & (brand_data['fuelType'] != 'Other')
    & (brand_data['engineSize'] > 0)
]
plt.figure(figsize=(15,5))
plot01 = sns.boxplot(x='engineSize', y='mpg', hue='fuelType', notch=False, data=df, palette='bright')
# The title tells this figure apart from the otherwise identical-looking
# figures for the other transmission types.
plot01.set_title('mpg by engine size and fuel type (Semi-Auto)')

In [None]:
# mpg by engine size and fuel type for Manual cars only, leaving out the
# 'Other' fuel type and zero engine sizes.
df = brand_data[
    (brand_data['transmission'] == 'Manual')
    & (brand_data['fuelType'] != 'Other')
    & (brand_data['engineSize'] > 0)
]
plt.figure(figsize=(15,5))
plot01 = sns.boxplot(x='engineSize', y='mpg', hue='fuelType', notch=False, data=df, palette='bright')
# The title tells this figure apart from the otherwise identical-looking
# figures for the other transmission types.
plot01.set_title('mpg by engine size and fuel type (Manual)')

### price and year 

In [None]:
# Box plots of price, one box per value of 'year'.
brand_data.boxplot(column=['price'], by=['year'],figsize=(15,5))

### Price and model

In [None]:
# Box plots of price, one box per model; tick labels rotated 45 degrees.
brand_data.boxplot(column=['price'], by=['model'],figsize=(15,5),rot=45)

### Mean Price vs Other Features

In [None]:
# Mean price per category, drawn as four bar charts on a 2x2 grid.
fig, axes = plt.subplots(figsize=(15,12), nrows=2, ncols=2)
# The price column is selected before averaging: calling .mean() on the whole
# grouped frame would also try to average the text columns, which raises on
# pandas >= 2.0.
vis_models = brand_data.groupby(['model'])['price'].mean().sort_values(ascending=False)
vis_trans = brand_data.groupby(['transmission'])['price'].mean().sort_values(ascending=False)
vis_ftyp = brand_data.groupby(['fuelType'])['price'].mean().sort_values(ascending=False)
vis_engS = brand_data.groupby(['engineSize'])['price'].mean().sort_values(ascending=False)
vis_models.plot(ax=axes[0,0], kind='bar', title='mean price')
vis_engS.plot(ax=axes[0,1], kind='bar', title='mean price')
vis_trans.plot(ax=axes[1,0], kind='bar', rot=360, title='mean price')
vis_ftyp.plot(ax=axes[1,1], kind='bar', rot=360, title='mean price')

## Preprocessing

### Tax feature

In [None]:
# As concluded above, tax and mpg have no correlation with price or any other feature
# The tax and mpg columns will be dropped

In [None]:
# New frame without the 'tax' column; brand_data itself is left unchanged.
brand_notax=brand_data.drop(['tax'], axis=1)

In [None]:
# New frame without the 'mpg' column as well.
brand_notaxmpg=brand_notax.drop(['mpg'], axis=1)

In [None]:
# Preview the first rows to confirm both columns are gone.
brand_notaxmpg.head()

### Mileage feature

In [None]:
## There are some near-zero entries in the mileage column, but we are concerned with used cars.
## Accordingly, we will delete any car with a recorded mileage below 50.

In [None]:
# Lowest mileage a listing may have to be kept.
mile_min = 50
# Keep the rows whose mileage is at least mile_min.
has_enough_mileage = brand_notaxmpg['mileage'] >= mile_min
brand_mile = brand_notaxmpg.loc[has_enough_mileage]

In [None]:
# Mileage distribution after the minimum-mileage filter.
sns.histplot(brand_mile['mileage'])

In [None]:
## we will try to remove the mileage outliers by keeping the records up to the 99th percentile

In [None]:
# Skewness of the mileage distribution (missing values are skipped by default).
mile_skewness = brand_mile['mileage'].skew()

In [None]:
# Trim the mileage distribution only when it is strongly skewed, and only on
# the side of its long tail.
# Note: this cell rebinds brand_mile, so re-running it on its own trims again.
if mile_skewness > 1:
    # Long right tail: keep rows strictly below the 99th percentile.
    q = brand_mile['mileage'].quantile(0.99)
    brand_mile = brand_mile[brand_mile['mileage'] < q]
elif mile_skewness < -1:
    # Long left tail: keep rows strictly above the 1st percentile.
    # The mask is built from brand_mile itself; the previous version referenced
    # an undefined name (brand_data_mile) and raised NameError on this branch.
    q = brand_mile['mileage'].quantile(0.01)
    brand_mile = brand_mile[brand_mile['mileage'] > q]

In [None]:
# Mileage distribution after the skewness-based trimming step.
sns.histplot(brand_mile['mileage'])

### Year Feature

In [None]:
# Distribution of 'year' in the mileage-filtered data.
sns.histplot(brand_mile['year'])

In [None]:
## we will try to remove the year outliers by keeping the records above the 1st percentile

In [None]:
# Start the year-cleaning stage from an independent copy of the mileage stage.
brand_year = brand_mile.copy()
# Skewness of the 'year' distribution (missing values are skipped by default).
year_skewness = brand_year['year'].skew()

In [None]:
# Trim the 'year' distribution only when it is strongly skewed, and only on the
# side of its long tail.
if year_skewness > 1:
    # Long right tail: keep rows strictly below the 99th percentile.
    q = brand_year['year'].quantile(0.99)
    brand_year = brand_year.loc[brand_year['year'].lt(q)]
elif year_skewness < -1:
    # Long left tail: keep rows strictly above the 1st percentile.
    q = brand_year['year'].quantile(0.01)
    brand_year = brand_year.loc[brand_year['year'].gt(q)]

In [None]:
# Distribution of 'year' after the skewness-based trimming step.
sns.histplot(brand_year['year'])

### Engine Size feature

In [None]:
# Engine-size distribution before removing zero-size entries.
sns.histplot(brand_year['engineSize'])

In [None]:
# Start the engine-size stage from an independent copy of the year stage.
brand_engsZ=brand_year.copy()

In [None]:
# Discard the listings whose engine size is recorded as zero.
nonzero_engine = brand_engsZ['engineSize'] != 0
brand_engsZ = brand_engsZ.loc[nonzero_engine]

In [None]:
# Engine-size distribution after removing zero-size entries.
sns.histplot(brand_engsZ['engineSize'])

### Fuel Type

In [None]:
# Fuel-type stage: an independent copy of the engine-size stage without the
# listings whose fuel type is 'Other'.
brand_fuel = brand_engsZ.loc[brand_engsZ['fuelType'] != 'Other'].copy()

In [None]:
# Listings per remaining fuel type, most frequent first.
vis_ftyp = brand_fuel.groupby(['fuelType'])['price'].count().sort_values(ascending=False)

In [None]:
# Bar chart of the per-fuel-type listing counts.
vis_ftyp.plot(kind='bar')
plt.show()

In [None]:
# Cars with fuel type 'Other' have been dropped

### Transmission

In [None]:
# Transmission stage: an independent copy of the fuel-type stage without the
# listings whose transmission is 'Other'.
brand_Trans = brand_fuel.loc[brand_fuel['transmission'] != 'Other'].copy()

In [None]:
# Listings per remaining transmission type, most frequent first.
vis_trans = brand_Trans.groupby(['transmission'])['price'].count().sort_values(ascending=False)

In [None]:
# Bar chart of the per-transmission listing counts.
vis_trans.plot(kind='bar')
plt.show()

### Model

In [None]:
# Start the model stage from an independent copy of the transmission stage.
brand_models=brand_Trans.copy()

In [None]:
# Listings per model, most frequent first.
vis_models = brand_models.groupby(['model'])['price'].count().sort_values(ascending=False)

In [None]:
# Bar chart of the per-model listing counts.
vis_models.plot(kind='bar')
plt.show()

In [None]:
# there are still some models with very few records
# I will filter out those models with only 1 record for now

In [None]:
# A model needs more than this many listings to be kept.
min_count = 1

# Number of listings per model after the previous cleaning stages.
brand_model_count = (
    brand_models.groupby('model').size().rename('counts').reset_index()
)

# Models with at most `min_count` listings are considered too rare.
# The previous version collected the models with counts < min_count, which is
# an empty set for min_count = 1, so single-record models were never removed.
rare_models = brand_model_count.loc[
    brand_model_count['counts'] <= min_count, 'model'
].tolist()

# Rebind instead of dropping in place, so the cell can be re-run safely.
brand_models = brand_models[~brand_models['model'].isin(rare_models)]

# Listings per remaining model, most frequent first.
vis_models_kept = brand_models.groupby(['model'])['price'].count().sort_values(ascending=False)

vis_models_kept.plot(kind='bar')
plt.show()

### Price

In [None]:
# Price distribution after the feature-level cleaning stages above.
sns.histplot(brand_models['price'])

In [None]:
# Horizontal, notched box plot of price after the cleaning stages above.
# notch is passed as a real boolean; the string 'True' used previously only
# worked because any non-empty string is truthy (and so would 'False' be).
brand_models.boxplot(column=['price'], figsize=(15,5), patch_artist=True, notch=True, vert=0)

In [None]:
# After this processing, no remarkable effect is noticed on the price boxplot compared to the previous one

In [None]:
# Skewness of price before the per-group outlier removal (missing values are
# skipped by default); displayed as the cell output.
price_skewness = brand_models['price'].skew()
price_skewness

In [None]:
# price is still highly skewed; we will visualize price by model and year to spot outliers more easily

In [None]:
# Box plots of price, one box per (model, year) pair; tick labels rotated 90 degrees.
brand_models.boxplot(column=['price'], by=['model','year'],figsize=(15,5),rot=90)

In [None]:
# we need to remove price outliers based on model type and year (assuming that those are the most influential features).
# for each model-year group we keep the prices within mean ± 1.5 standard deviations of that group's price distribution
# and also exclude small groups (fewer than 4 records) whose price variation is high (coefficient of variation > 0.7)

In [None]:
x = 1.5  # half-width of the accepted price band, in standard deviations
y = 0.7  # coefficient-of-variation threshold used by drop_price

# Count, mean and standard deviation of price for every (model, year) group.
# The price column is selected before aggregating: averaging the whole grouped
# frame would also try to average the text columns, which raises on
# pandas >= 2.0. One named aggregation also replaces three groupby passes and
# two merges.
model_stats2 = (
    brand_models.groupby(['model', 'year'])['price']
    .agg(cnt_price='count', mean_price='mean', std_price='std')
    .reset_index()
)

# A group with a single listing has an undefined (NaN) standard deviation;
# treat it as zero spread.
model_stats2['std_price'] = model_stats2['std_price'].fillna(0)

# Attach the statistics of its group to every listing.
models_prices = brand_models.merge(model_stats2, how='inner', on=['model', 'year'])

models_prices['var_coeff'] = models_prices['std_price'] / models_prices['mean_price']
# Despite the "95" in the column names, the band is mean -/+ x standard
# deviations, truncated to whole numbers.
models_prices['price_L95'] = (models_prices['mean_price'] - x * models_prices['std_price']).astype(int)
models_prices['price_R95'] = (models_prices['mean_price'] + x * models_prices['std_price']).astype(int)


def drop_price(models_prices):
    """Label one listing as 'drop' or 'keep'.

    Parameters
    ----------
    models_prices : pandas.Series
        A single row of the merged frame (the function is applied row-wise),
        providing 'price', 'cnt_price', 'var_coeff', 'price_L95' and
        'price_R95'.

    Returns
    -------
    str
        'drop' when the row's group has fewer than 4 listings and a
        coefficient of variation above the module-level threshold ``y``, or
        when the price lies outside [price_L95, price_R95]; otherwise 'keep'.
    """
    row = models_prices  # the argument is one row, not the whole frame
    if (row['var_coeff'] > y) and (row['cnt_price'] < 4):
        return 'drop'
    elif (row['price'] > row['price_R95']) or (row['price'] < row['price_L95']):
        return 'drop'
    else:
        return 'keep'


models_prices['drop_price'] = models_prices.apply(drop_price, axis=1)
models_prices = models_prices[models_prices['drop_price'] != 'drop']

In [None]:
# Same (model, year) box plots, now on the data left after the price filter.
models_prices.boxplot(column=['price'], by=['model','year'],figsize=(15,5),rot=90)

In [None]:
# Skewness of price after the per-group outlier removal; displayed as the cell output.
price_skewness = models_prices['price'].skew()
price_skewness

In [None]:
# Skewness decreased and, as you can see, only the outliers of specific models and years have been cleaned.

### Preprocessing Checkpoint

In [None]:
# Let us calculate the percentage of records removed out of the original dataset

In [None]:
# Share of rows (in percent) removed between the working copy and the cleaned data.
rows_before = len(brand_data)
rows_after = len(models_prices)
data_removed_perct = 100 * (rows_before - rows_after) / rows_before
data_removed_perct

In [None]:
# Data removed is high but accepted

In [None]:
# Checkpoint: an independent copy of the cleaned data.
Brand_preprocessed=models_prices.copy()

In [None]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as a column.
Brand_preprocessed=Brand_preprocessed.reset_index(drop=True)

In [None]:
# Column names, non-null counts and dtypes of the checkpoint frame.
Brand_preprocessed.info()

## Re-arranging Columns

In [None]:
# Current column labels, used to build the re-ordered list.
Brand_preprocessed.columns.values

In [None]:
# Desired column order: the target ('price') first, then the features, then the
# helper columns created during price-outlier filtering.
cols = [
    'price',
    'model', 'year', 'transmission', 'mileage', 'fuelType', 'engineSize',
    'cnt_price', 'mean_price', 'std_price',
    'var_coeff', 'price_L95', 'price_R95', 'drop_price',
]

In [None]:
# Re-order the columns as listed in `cols`. Selecting with a list already
# returns a new DataFrame, so the extra .copy() that was immediately
# overwritten has been removed.
brand_arranged = Brand_preprocessed[cols]

In [None]:
# Remove the helper columns that were only needed for price-outlier filtering.
helper_columns = ['cnt_price', 'mean_price', 'std_price',
                  'var_coeff', 'price_L95', 'price_R95', 'drop_price']
brand_arranged = brand_arranged.drop(columns=helper_columns)

In [None]:
# Summary statistics of the cleaned, re-ordered data (all columns).
brand_arranged.describe(include='all')

## Additional Graphs after Processing Data

### Year-Model Vs Mean Mileage

In [None]:
# Mean mileage for every (year, model) combination, ordered by year.
# The mileage column is selected before averaging: calling .mean() on the whole
# grouped frame would also try to average the text columns, which raises on
# pandas >= 2.0.
year_model_mmile = (
    brand_arranged.groupby(['year', 'model'])['mileage']
    .mean()
    .rename('mean_mileage')
    .reset_index()
    .sort_values(by='year', ascending=True)
)

In [None]:
plt.figure(figsize=(17,8))
# One line per model, in alphabetical order. `marker` is the cycling iterator
# created in the earlier scatter-plot cell.
for model_name in sorted(set(year_model_mmile['model'])):
    per_model = year_model_mmile[year_model_mmile['model'] == model_name]
    plt.plot(per_model['year'], per_model['mean_mileage'], marker=next(marker), label=model_name)
plt.xlabel('year')
plt.ylabel('mean mileage')
plt.legend()

In [None]:
## Generally, mileage is higher for older cars

### Year-Model vs Mean Price

In [None]:
# Mean price for every (year, model) combination, ordered by year.
# The price column is selected before averaging: calling .mean() on the whole
# grouped frame would also try to average the text columns, which raises on
# pandas >= 2.0.
year_model_mprice = (
    brand_arranged.groupby(['year', 'model'])['price']
    .mean()
    .rename('mean_price')
    .reset_index()
    .sort_values(by='year', ascending=True)
)

# One line per model, in alphabetical order. `marker` is the cycling iterator
# created in the earlier scatter-plot cell.
model_set = sorted(set(year_model_mprice['model']))
plt.figure(figsize=(17,8))
for model in model_set:
    selected_data = year_model_mprice.loc[year_model_mprice['model'] == model]
    plt.plot(selected_data['year'], selected_data['mean_price'], marker=next(marker), label=model)
plt.xlabel('year')
plt.ylabel('mean price')
plt.legend()
plt.show()

### Year-Model vs Min Price

In [None]:
# Lowest price for every (year, model) combination, ordered by year.
year_model_mnprice = (
    brand_arranged.groupby(['year', 'model'])['price']
    .min()
    .rename('min_price')
    .reset_index()
    .sort_values(by='year', ascending=True)
)

plt.figure(figsize=(17,8))
# One line per model, in alphabetical order. `marker` is the cycling iterator
# created in the earlier scatter-plot cell.
for model_name in sorted(set(year_model_mnprice['model'])):
    per_model = year_model_mnprice[year_model_mnprice['model'] == model_name]
    plt.plot(per_model['year'], per_model['min_price'], marker=next(marker), label=model_name)
plt.xlabel('year')
plt.ylabel('min price')
plt.legend()
plt.show()

### Year-Model vs Max Price

In [None]:
# Highest price for every (year, model) combination, ordered by year.
year_model_mxprice = (
    brand_arranged.groupby(['year', 'model'])['price']
    .max()
    .rename('max_price')
    .reset_index()
    .sort_values(by='year', ascending=True)
)

plt.figure(figsize=(17,8))
# One line per model, in alphabetical order. `marker` is the cycling iterator
# created in the earlier scatter-plot cell.
for model_name in sorted(set(year_model_mxprice['model'])):
    per_model = year_model_mxprice[year_model_mxprice['model'] == model_name]
    plt.plot(per_model['year'], per_model['max_price'], marker=next(marker), label=model_name)
plt.xlabel('year')
plt.ylabel('max price')
plt.legend()
plt.show()

## Dropping Unwanted Fields

In [None]:
# Final frame: the cleaned data without the 'year' column.
brand_final = brand_arranged.drop(columns=['year'])

In [None]:
# The remaining brands have been explored and preprocessed in the same manner and saved as CSV

  # https://www.kaggle.com/enginsights/vw-preprocessing-and-exploration
  # https://www.kaggle.com/enginsights/vauxhall-preprocessing-and-exploration
  # https://www.kaggle.com/enginsights/bmw-preprocessing-and-exploration
  # https://www.kaggle.com/enginsights/audi-preprocessing-and-exploration
  # https://www.kaggle.com/enginsights/mercedes-preprocessing-and-exploration
  # https://www.kaggle.com/enginsights/skoda-preprocessing-and-exploration
  # https://www.kaggle.com/enginsights/toyota-preprocessing-and-exploration
  # https://www.kaggle.com/enginsights/ford-preprocessing-and-exploration

# All CSVs of processed data of all brands will be imported to separate notebook where a regression model is applied

# Please refer to 100K UK Used Cars Model

  # https://www.kaggle.com/enginsights/100k-uk-used-cars-model