In [None]:
import pandas as pd
import numpy as np

from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and 3.8
# removed the old names; pick whichever spelling this installation provides.
_whitegrid_style = 'seaborn-v0_8-whitegrid'
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = 'seaborn-whitegrid'
plt.style.use(_whitegrid_style)

from scipy import stats

<h1 style="color:#262A53">Loading data files.</h1>
<p style="font-size:24px">The following code reads data from multiple files. It will:</p>
 
 - <p style="font-size:22px">Loop through each data file.</p>
 - <p style="font-size:22px">Read the file into a Pandas DataFrame.</p>
 - <p style="font-size:22px">Put each DataFrame into a Python dictionary.</p>

In [None]:
# Load the per-brand car datasets.

files = ['audi.csv', 'bmw.csv', 'cclass.csv','focus.csv','hyundi.csv','merc.csv','skoda.csv','toyota.csv','vauxhall.csv','vw.csv']

# Map each brand (the file name without its extension) to its DataFrame,
# tagging every row with a 'brand' column as we go.
data = {}
for file in files:
    brand_name = file.replace(".csv","")
    frame = pd.read_csv(f"../input/used-car-dataset-ford-and-mercedes/{file}")
    frame['brand'] = brand_name
    data[brand_name] = frame

# Print the first five rows of the first dataset only.
first_brand = next(iter(data))
print(first_brand)
print(data[first_brand].head())

In [None]:
# Creating single DataFrame.
# One concat over all frames instead of growing the result inside a loop:
# the loop re-copied every previously appended row on each iteration and
# started from an empty DataFrame, which newer pandas versions warn about.
# The per-file row indices are kept as they are (no ignore_index).
car_data = pd.concat(list(data.values()), axis=0)

car_data.head()

In [None]:
# Shape
car_data.shape

In [None]:
# Fold the extra 'tax(£)' column into 'tax' before dropping it, so rows that only
# carry a value under 'tax(£)' do not lose it.
# NOTE(review): assumes 'tax(£)' holds the same quantity as 'tax' under a
# different header in some of the source files -- confirm against the CSVs.
# The guard keeps the cell re-runnable: once the column is gone it is a no-op.
if 'tax(£)' in car_data.columns:
    if 'tax' in car_data.columns:
        # Positional merge via NumPy, so it does not rely on the row index being unique.
        car_data['tax'] = np.where(car_data['tax'].isna(), car_data['tax(£)'], car_data['tax'])
        car_data = car_data.drop(columns=['tax(£)'])
    else:
        car_data = car_data.rename(columns={'tax(£)': 'tax'})
car_data.head()

In [None]:
# Prints summary of Car data
car_data.describe()

In [None]:
car_data.info()

In [None]:
# we can see that there are some outliers, so let's clean our data.

def Quartile(data):

    """
    Compute the outlier fences of the 1.5 x IQR rule for a 1-D numeric sample.

    Q1 and Q3 are the 25th and 75th percentiles (midpoint interpolation) and the
    inter-quartile range is taken as Q3 - Q1, so the three figures are mutually
    consistent. Missing values (NaN) are ignored.

    Parameters
    ----------
    data : array-like of numbers (list, NumPy array, pandas Series, ...)

    Returns
    -------
    (upper_range, lower_range) : tuple of float
        upper_range = Q3 + 1.5 * IQR and lower_range = Q1 - 1.5 * IQR.
        Note the order: the upper fence comes first.
        The fences are NOT truncated to integers: truncation moves a fence by up
        to one whole unit, which discards valid rows for small-scale columns
        (e.g. a fence of 2.75 became 2, excluding every value of 2.0 or more).

    Raises
    ------
    ValueError
        If `data` contains no non-missing values.
    """

    values = np.asarray(data, dtype=float).ravel()
    values = values[~np.isnan(values)]
    if values.size == 0:
        raise ValueError("Quartile() needs at least one non-missing value")

    # first and third quartiles
    try:
        Q1, Q3 = np.percentile(values, [25, 75], method='midpoint')
    except TypeError:
        # NumPy < 1.22 only knows this keyword under its old name.
        Q1, Q3 = np.percentile(values, [25, 75], interpolation='midpoint')

    # Derive the IQR from the same quartiles rather than from a second
    # estimator that uses a different interpolation rule.
    IQR = Q3 - Q1

    upper_range = Q3 + (1.5 * IQR)
    lower_range = Q1 - (1.5 * IQR)

    return float(upper_range), float(lower_range)

In [None]:
# Let's find outliers in mileage: points drawn beyond the whiskers of the
# box plot are the outlier candidates.

plt.figure(figsize=(12, 8))

plt.boxplot(car_data['mileage'])

plt.title("Detecting Outliers")
plt.show()

In [None]:
# Keep only the rows whose mileage lies strictly between the two fences.
upperRange, lowerRange = Quartile(car_data['mileage'])
mileage_in_range = (car_data['mileage'] > lowerRange) & (car_data['mileage'] < upperRange)
car_data = car_data.loc[mileage_in_range]

In [None]:
# Box plot of engineSize to spot outlier candidates.

fig, ax = plt.subplots(figsize=(12, 8))
ax.boxplot(car_data['engineSize'])
ax.set_title("Detecting Outliers")
plt.show()

In [None]:
# Keep only the rows whose engineSize lies strictly between the two fences.
upperRange, lowerRange = Quartile(car_data['engineSize'])
engine_in_range = (car_data['engineSize'] > lowerRange) & (car_data['engineSize'] < upperRange)
car_data = car_data.loc[engine_in_range]

In [None]:
# Box plot of price to spot outlier candidates.

fig, ax = plt.subplots(figsize=(12, 8))
ax.boxplot(car_data['price'])
ax.set_title("Detecting Outliers")
plt.show()

In [None]:
# Keep only the rows whose price lies strictly between the two fences.
upperRange, lowerRange = Quartile(car_data['price'])
price_in_range = (car_data['price'] > lowerRange) & (car_data['price'] < upperRange)
car_data = car_data.loc[price_in_range]

In [None]:
# Missing data.
car_data.isnull().sum()

In [None]:
# Replace the row index with a fresh 0..n-1 range, discarding the old labels.
car_data = car_data.reset_index(drop=True)
car_data.head()

## Let's examine the tax values

In [None]:
# Average tax per transmission type.
# 'tax' is selected *before* aggregating: taking the mean of the whole frame
# also tries to average the string columns (brand, fuelType, ...), which raises
# TypeError on pandas >= 2.0 and wastes work on older versions.

GrpData = car_data.groupby('transmission')['tax'].mean().to_frame()

# visualization of the same.

GrpData.plot(kind='bar', figsize=(12, 8))

plt.xlabel("Transmission type")
plt.ylabel("Tax")
plt.title("Average Tax by Transmission type")

plt.show()

In [None]:
# Average tax per fuel type.
# 'tax' is selected *before* aggregating: taking the mean of the whole frame
# also tries to average the string columns (brand, transmission, ...), which
# raises TypeError on pandas >= 2.0 and wastes work on older versions.

fuelGrp = car_data.groupby('fuelType')['tax'].mean().to_frame()

# visualization of the same.

fuelGrp.plot(kind='bar', figsize=(12, 8))

# Axis label and title previously misspelled "fuel" as "fule".
plt.xlabel("Fuel type")
plt.ylabel("Tax")
plt.title("Average Tax by Fuel type")

plt.show()

We can see that there is no tax for electric cars.

In [None]:
# Average tax per engine size, largest engine first.
# 'tax' is selected *before* aggregating: taking the mean of the whole frame
# also tries to average the string columns (brand, fuelType, ...), which raises
# TypeError on pandas >= 2.0 and wastes work on older versions.

GrpData = car_data.groupby('engineSize')['tax'].mean().to_frame()
GrpData = GrpData.sort_index(ascending=False)

# visualization of the same.

GrpData.plot(kind='bar', figsize=(12, 8))

plt.ylim(0, 220)

plt.xlabel("Engine Size")
plt.ylabel("Tax")
plt.title("Average Tax by Size of Engine")

plt.show()

In [None]:

# Average tax for every (fuel type, transmission) combination.
# 'tax' is selected *before* aggregating: taking the mean of the whole frame
# also tries to average the remaining string columns (e.g. brand), which raises
# TypeError on pandas >= 2.0 and wastes work on older versions.
GrpData = car_data.groupby(['fuelType', 'transmission'])['tax'].mean().to_frame()
GrpData = GrpData.sort_index(ascending=False)

# visualization of the same: unstack() turns the transmission level into
# columns, giving one group of bars per fuel type.

GrpData.unstack().plot(kind='bar', figsize=(12, 8))

plt.ylim(0, 200)

plt.ylabel("Tax")
plt.title("Average Tax")

plt.show()

<h3>There is no tax for electric cars. So, let's try to fill the missing values of tax.</h3>

In [None]:
# Histogram of tax with a kernel-density overlay.
tax_grid = sns.displot(data=car_data, x='tax', kde=True, height=8, aspect=1.5)

# displot draws a single Axes here; label it directly.
tax_grid.ax.set_xlabel('Tax')
tax_grid.ax.set_title("Univariate Distribution of Tax.")
plt.show()

In [None]:
# Electric cars are expected to carry a tax of 0, so first report how many
# of them have no tax value recorded at all.

is_electric = car_data['fuelType'] == 'Electric'
missing_electric_tax = car_data.loc[is_electric, 'tax'].isnull().sum()
print(f"Missing Values for electric cars is {missing_electric_tax}")

# Fill every missing tax value with the mean of the observed tax values.
car_data['tax'] = car_data['tax'].fillna(car_data['tax'].mean())

# Number of missing tax values left.
car_data['tax'].isnull().sum()

In [None]:
# Histogram of mpg with a kernel-density overlay.
mpg_grid = sns.displot(data=car_data, x='mpg', kde=True, height=8, aspect=2)

# displot draws a single Axes here; label it directly.
mpg_grid.ax.set_xlabel('Miles per gallon')
mpg_grid.ax.set_title("Univariate Distribution of miles per gallon.")
plt.show()

The distribution is skewed to the right.

In [None]:
# Fill every missing mpg value with the median of the observed mpg values.
mpg_median = car_data['mpg'].median()
car_data['mpg'] = car_data['mpg'].fillna(mpg_median)

# Number of missing mpg values left.
car_data['mpg'].isnull().sum()

In [None]:
car_data.isnull().sum()

In [None]:
# Box plot of tax to spot outlier candidates.
fig, ax = plt.subplots(figsize=(12, 8))
ax.boxplot(car_data['tax'])
ax.set_ylabel('Tax')
ax.set_title("Boxplot for Outliers")
plt.show()

In [None]:
# Keep only the rows whose tax lies strictly between the two fences.
upperRange, lowerRange = Quartile(car_data['tax'])
tax_in_range = (car_data['tax'] > lowerRange) & (car_data['tax'] < upperRange)
car_data = car_data.loc[tax_in_range]

In [None]:
# Box plot of mpg to spot outlier candidates.
plt.figure(figsize=(12, 8))

plt.boxplot(car_data.mpg)

# The axis shows mpg, not tax (the label had been copied from the tax box plot).
plt.ylabel('Miles per gallon')
plt.title("Boxplot for Outliers")
plt.show()

In [None]:
# Keep only the rows whose mpg lies strictly between the two fences.
upperRange, lowerRange = Quartile(car_data['mpg'])
mpg_in_range = (car_data['mpg'] > lowerRange) & (car_data['mpg'] < upperRange)
car_data = car_data.loc[mpg_in_range]

In [None]:
car_data.dtypes

In [None]:
# Average price per brand, most expensive brand first.
# 'price' is selected *before* aggregating: taking the mean of the whole frame
# also tries to average the string columns (fuelType, transmission, ...), which
# raises TypeError on pandas >= 2.0 and wastes work on older versions.
Price = car_data.groupby('brand')['price'].mean().to_frame()
Price = Price.sort_values(by='price', ascending=False)

In [None]:
# Bar chart of the average price per brand.
ax = Price.plot.bar(figsize=(12, 8))

ax.set_ylim(0, 30000)
ax.set_xlabel("Brands")
ax.set_ylabel("Price")
ax.set_title("Average Price by Brand")

plt.show()

<h2 style="color:#548CA8">On average, the three most expensive brands are Mercedes, Audi, and BMW.</h2>

In [None]:
# Average price for every (fuel type, transmission) combination, highest first.
# 'price' is selected *before* aggregating: taking the mean of the whole frame
# also tries to average the remaining string columns (e.g. brand), which raises
# TypeError on pandas >= 2.0 and wastes work on older versions.
Price = car_data.groupby(['fuelType', 'transmission'])['price'].mean().to_frame()
Price = Price.sort_values(by='price', ascending=False)
Price

In [None]:
# Grouped bar chart: unstack() turns the transmission level into columns,
# giving one group of bars per fuel type.
price_by_fuel = Price.unstack()
ax = price_by_fuel.plot.bar(figsize=(12, 8))

ax.set_ylim(0, 30000)
ax.set_xlabel("Fuel")
ax.set_ylabel("Price")
ax.set_title("Average Price")

plt.show()

In [None]:
# Average price per registration year, highest average first.
# Note: despite its name, `avg_mileage` holds average *price*, not mileage.
avg_mileage = (
    car_data.groupby('year')['price']
    .mean()
    .to_frame()
    .sort_values(by='price', ascending=False)
)
avg_mileage

In [None]:

# `avg_mileage` holds the average price per year, ordered by price. Plotting it
# in that order produced a line that falls by construction, so put the years
# back in chronological order and use them as the x axis.
# (`stacked=True` was dropped: it has no meaning for a single line.)
avg_mileage['price'].sort_index().plot(kind='line', figsize=(14, 8))

plt.xlabel('Year')
plt.ylabel('Price')
# The data is average price by year, not "price and mileage by brand".
plt.title("Average Price by Year")

plt.show()

In [None]:
car_data.head()

In [None]:
# One indicator column per transmission type and per brand, appended to the
# right of the existing columns.
transmission_dummies = pd.get_dummies(car_data['transmission'])
brand_dummies = pd.get_dummies(car_data['brand'])

car_data = pd.concat([car_data, transmission_dummies, brand_dummies], axis=1)
car_data.head()

In [None]:
# Everything except the target column.
# Note: this rebinds `data`, which earlier held the dict of per-brand frames.
data = car_data.drop(columns=['price'])

# Features: every column that is not of object dtype. Target: the price.
X = data.select_dtypes(exclude='object')
y = car_data['price']

In [None]:
X.head()

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so the reported error is reproducible from run to run; without
# it, ties between equally good splits are broken at random. The seed matches
# the one used for the train/test split.
model = DecisionTreeRegressor(random_state=1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Mean absolute error on the held-out rows (lower is better).
# scikit-learn's signature is (y_true, y_pred); MAE is symmetric, so the value
# is unchanged, but the conventional order avoids surprises if the metric is swapped.
MAE = mean_absolute_error(y_test, predictions)
print(f"Mean absolute error for Decision Tree Regressor: {MAE}")

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training rows and predict the
# held-out rows. Note: this rebinds `model` and `predictions`.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# Mean absolute error on the held-out rows (lower is better).
MAE = mean_absolute_error(y_test, predictions)
print(f"Mean absolute error for Linear Regression: {MAE}")