In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of each file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Load data**

# We will take a look at the individual data files and see how many columns each one has

In [None]:
audi= pd.read_csv("../input/used-car-dataset-ford-and-mercedes/audi.csv")
audi.head()

In [None]:
bmw= pd.read_csv("../input/used-car-dataset-ford-and-mercedes/bmw.csv")
bmw.head()

In [None]:
unclean_cclass= pd.read_csv("../input/used-car-dataset-ford-and-mercedes/unclean cclass.csv")
unclean_cclass.head()

In [None]:
cclass= pd.read_csv("../input/used-car-dataset-ford-and-mercedes/cclass.csv")
cclass.head()

# Summary statistics of the numerical columns in the cclass dataset

In [None]:
cclass.describe().T

In [None]:
cclass.info()

In [None]:
focus= pd.read_csv("../input/used-car-dataset-ford-and-mercedes/focus.csv")
focus.head()

In [None]:
ford= pd.read_csv("../input/used-car-dataset-ford-and-mercedes/ford.csv")
ford.head()

# **collect all file paths**

In [None]:
path= "../input/used-car-dataset-ford-and-mercedes"
paths=[]
for file in os.listdir(path):
    paths.append(os.path.join(path, file))
print(paths)  # paths contrains the path to all the input files
    

In [None]:
paths.pop(1)  # we will remove the uncleaned data file paths from the paths list e.g. unclean cclass
paths

In [None]:
paths.pop(-2)  # we will remove the uncleaned data file paths from the paths list e.g. unclean focus
paths

In [None]:
# Read every cleaned CSV into its own DataFrame. The loop variable is called
# csv_path so it does not overwrite `path`, the dataset directory used to build `paths`.
data = [pd.read_csv(csv_path) for csv_path in paths]

# Preview one of the loaded frames instead of dumping it in full.
data[1].head()
    

# **concatenating all the cars data from data list**

In [None]:
car_data= pd.concat(data, ignore_index=True )
car_data.shape

In [None]:
car_data.head()

In [None]:
models= car_data.model.value_counts().count()  # number of different models in dataset
print(models)

# **missing values**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Boolean mask of missing cells: True (drawn as 1) where a value is null, False (0) otherwise.
null_mask = car_data.isnull()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(null_mask, yticklabels=False, ax=ax)

1. The last column (`tax(£)`) has a large number of null values, so we will remove it.
2. The `tax` and `mpg` columns have a small number of null values, so we will impute them with a suitable strategy.

In [None]:
car_data= car_data.drop(columns= "tax(£)") # removing the last column of car_data
car_data.head()

In [None]:
car_data.info()

**mpg** is miles per gallon.
To choose a strategy for imputing its missing values, we will check whether it is related to the values in the other columns.

In [None]:
# Correlate the numeric columns only: DataFrame.corr raises on recent pandas
# versions when the frame still contains non-numeric (text) columns.
numeric_corr = car_data.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Correlation of every numeric column with mpg. Non-numeric columns are excluded
# first because DataFrame.corr raises on them in recent pandas versions.
mpg = car_data.select_dtypes(include="number").corr()["mpg"]
mpg.plot()
plt.axhline(0)  # zero line separates positive from negative correlations

In [None]:
sns.scatterplot(x="mpg", y="price", data=car_data)

In [None]:
sns.pairplot(car_data)

In [None]:
!pip install evalml
import evalml

In [None]:
car_data.head()

In [None]:
x= car_data.drop(columns="price")
y= car_data.price

In [None]:
trainx, testx, trainy, testy= evalml.preprocessing.split_data(x,y, problem_type="regression", test_size=0.2, random_seed=20)

In [None]:
model1= evalml.automl.AutoMLSearch(X_train=trainx, y_train=trainy, problem_type="regression")

In [None]:
model1.search()

In [None]:
model1.best_pipeline

In [None]:
model1.rankings

# **As we can see from the rankings, the XGBoost regressor performed best on this data, so we will use this model for further study**

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

# **impute missing values**

In [None]:
si= SimpleImputer(strategy="mean")
si.fit(np.array(x["tax"]).reshape(-1,1))
x["tax"]= si.transform(np.array(x['tax']).reshape(-1,1)).astype(dtype="int")
x.tax

In [None]:
x['mpg']= si.fit_transform(np.array(x['mpg']).reshape(-1,1)).astype(dtype="int")

In [None]:
sns.heatmap(x.isnull(), yticklabels=False) 

# **categorical features**

In [None]:
x.head()

In [None]:
cat_feat= ['model', 'year', 'transmission', 'fuelType', 'engineSize', 'tax', 'mpg']
le= LabelEncoder()
for feature in cat_feat:
    x[feature]= le.fit_transform(x[feature])

In [None]:
x.head()

# **split the data**

In [None]:
# split the data into training and testing data
x_train, x_test, y_train, y_test= train_test_split(x,y,test_size=0.2, random_state=20)

In [None]:
model1.best_pipeline

In [None]:
model= XGBRegressor(n_estimators= 100, max_depth=6, min_child_weight= 1, n_jobs=-1, random_state= 20)
model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error as MAE
y_pred= model.predict(x_test)
mae= MAE(y_test, y_pred)
mae

In [None]:
from sklearn.metrics import mean_squared_error as MSE
rmse= np.sqrt(MSE(y_test, y_pred))
rmse

In [None]:
from sklearn.metrics import r2_score

r2= r2_score(y_test, y_pred)
r2

In [None]:
# 5-fold cross-validation of the model on the training split, using all cores.
score = cross_val_score(estimator=model, X=x_train, y=y_train, cv=5, n_jobs=-1)
score

In [None]:
# Score the model that was fitted on the training split against the held-out rows.
# cross_val_score would instead refit fresh copies of the model on folds of the test
# set, which neither evaluates the trained model nor keeps the test data unseen.
# For a regressor, .score returns R^2, so `score` is now a single value.
score = model.score(x_test, y_test)
score

**r2 score is above 94%**

# **hyperparameter tuning**

# **Grid search CV**

In [None]:
# Call the method: without the parentheses the cell only shows the bound method
# object instead of the model's current hyperparameters.
model.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV
params= [{"n_estimators": [50,100,200], "learning_rate": [0.1, 0.3, 0.5], "max_depth": [4,6,10]}]

reg = GridSearchCV(model, params, n_jobs=-1)

In [None]:
reg.fit(x_train, y_train)

In [None]:
reg.best_estimator_

In [None]:
best_est_1= reg.best_estimator_

In [None]:
reg.best_params_

In [None]:
# Evaluate the tuned estimator on the held-out rows. cross_val_score would refit
# fresh copies of it on folds of the test set rather than score the tuned model
# itself. For a regressor, .score returns R^2, so `score` is a single value.
score = best_est_1.score(x_test, y_test)
score