In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing
from sklearn.metrics import r2_score


import os
# Show every file available under the Kaggle input directory, one full path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


In [None]:
# Load the BMW listings from the Kaggle used-car dataset.
df = pd.read_csv('/kaggle/input/used-car-dataset-ford-and-mercedes/bmw.csv')

# Quick inspection of the raw frame. Every entry is printed as
# a title, the computed summary, and a blank spacer.
overview = [
    ("Data shape:", lambda frame: frame.shape),
    ("First 5 rows in df:", lambda frame: frame.head()),
    ("Checking if there is Null:", lambda frame: frame.isnull().sum()),
    ("df describing:", lambda frame: frame.describe()),
]
for title, summarise in overview:
    print(title)
    print(summarise(df))
    print("\n\n")

# Number of fully duplicated rows before the clean-up.
print("Checking duplicates:")
print(df.duplicated().sum())

# Keep only the first occurrence of each duplicated row.
df = df.drop_duplicates(keep='first')
print("Duplicates are removed.\n\n")


# Same count after the clean-up, to confirm the duplicates are gone.
print("Checking duplicates:")
print(df.duplicated().sum())

**EDA**

In [None]:
# Pairwise scatter plots of the features, coloured by transmission type
transmission_grid = sns.pairplot(df, hue='transmission')
plt.show()

In [None]:
# Pairwise scatter plots of the features, coloured by fuel type
fuel_type_grid = sns.pairplot(df, hue='fuelType')
plt.show()

In [None]:
# Box plots of price and mileage on one large figure, to spot extreme values
price_mileage_cols = ['price', 'mileage']
plt.figure(figsize=(40, 20))
sns.boxplot(data=df[price_mileage_cols])

In [None]:
# Box plots of tax and mpg on one large figure, to spot extreme values
tax_mpg_cols = ['tax', 'mpg']
plt.figure(figsize=(40, 20))
sns.boxplot(data=df[tax_mpg_cols])

In [None]:
# Finding outliers by Tukey's box plot method.
# Quantiles are only defined for numeric data, so the non-numeric columns are
# excluded explicitly: pandas >= 2.0 raises a TypeError when DataFrame.quantile
# is called on a frame that still contains text columns.
numeric_df = df.select_dtypes(include='number')
q1 = numeric_df.quantile(0.25)  # first quartile of each numeric column
q2 = numeric_df.quantile(0.75)  # third quartile (variable name kept from the original cell)
IQR = q2 - q1

# A row counts as an outlier when at least one of its numeric values lies
# outside the interval [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of its column.
is_outlier = ((numeric_df < (q1 - 1.5*IQR)) | (numeric_df > (q2 + 1.5*IQR))).any(axis=1)
print("Number of outliers is:")
print(int(is_outlier.sum()))

# Author's observation: Tukey's method flags almost 50% of the rows as outliers,
# which is too aggressive here, so only clearly extreme values are removed
# using hand-picked upper limits per column.
df = df[(df['tax'] <= 400) & (df['mpg'] <= 300) & (df['price'] <= 100000) & (df['mileage'] <= 200000)]


**Building regression models**

In [None]:
# One-hot encode the categorical columns and append the indicator columns to df.
# Background on dummy variables:
# https://towardsdatascience.com/the-dummys-guide-to-creating-dummy-variables-f21faddb1d40
categorical_cols = ['transmission', 'model', 'fuelType']
dummies = pd.get_dummies(df[categorical_cols])
df = pd.concat((df, dummies), axis='columns')
print(df.head())

In [None]:
# Splitting data into features and the predicted variable.
# The text columns are dropped because their dummy-encoded copies are already
# part of df; 'price' is dropped because it is the target.
features = df.drop(['transmission', 'model', 'fuelType', 'price'], axis = 1)
y = df['price']


# Splitting into train and test sets BEFORE scaling. The row assignment depends
# only on the number of rows and random_state, so it is the same split as when
# the data was scaled first.
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.4, random_state=12345)

# Fit the scaler on the training rows only, so that no statistics of the test
# rows leak into the training data; then apply the same transform to both sets.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# X is still provided as the scaled full feature matrix for any cell that reads it.
X = scaler.transform(features)


# Linear Regression
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Evaluating model. Both numbers are R2 scores expressed as a percentage:
# the validation score is multiplied by 100 as well, so that it matches the
# '%' label and is directly comparable with the train score.
print("Linear Regression:")
print('Train success rate : %',reg.score(X_train, y_train)*100)
print('Validation success rate : %',r2_score(y_test, y_pred)*100)
print("\n\n")
### NOTE (author's observation): the validation R2 score came out negative,
### i.e. the model predicts some test rows worse than a constant mean would.
### The cause has not been verified; two options were considered:
### 1) try not to remove outliers
### 2) don't use this model at all



# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(n_jobs = -1, random_state = 42).fit(X_train, y_train)
y_pred = rf_reg.predict(X_test)
# Evaluating model. Both R2 scores are shown as percentages; the validation
# score is scaled by 100 too, so that it matches its '%' label.
print("Random Forest Regression:")
print('Train success rate : %',rf_reg.score(X_train, y_train)*100)
print('Validation success rate : %',r2_score(y_test, y_pred)*100)
print("\n\n")
# Author's observation: the model looks overfitted on the train data,
# so a grid search is tried next to improve it.
# First, look at the depths of the trees in the fitted forest
# (plt.hist returns the bin counts, the bin edges and the drawn patches).
print("Tree's depths:")
print(plt.hist([est.get_depth() for est in rf_reg.estimators_]))


# Grid Search for Random Forest Regression.
# Every combination of the values below is cross-validated on the training
# set and ranked by R2; n_jobs and random_state are fixed, not searched.
from sklearn.model_selection import GridSearchCV
reg = RandomForestRegressor()
grid_values = {'n_estimators': [50, 100, 150],'max_depth': [25, 29, 30, 31], 'bootstrap': [True, False], 'n_jobs': [-1], 'random_state': [4]}
grid_reg_acc = GridSearchCV(reg, param_grid = grid_values, scoring = 'r2')
grid_reg_acc.fit(X_train, y_train)
y_pred_acc = grid_reg_acc.predict(X_test)
# Evaluating model. Both R2 scores are shown as percentages; the validation
# score is scaled by 100 too, so that it matches its '%' label.
print("Random Forest Regression with Grid Search:")
print('Train success rate : %',grid_reg_acc.score(X_train, y_train)*100)
print('Validation success rate : %',r2_score(y_test, y_pred_acc)*100)
print("\n\n")
print("Best parameters wich was choosen:")
print(grid_reg_acc.best_params_)
print("\n\n")



# Also let's look at how the R2 score depends on the max_depth of the trees.
# Both the train and the validation curve are recorded: the train score alone
# keeps growing with depth, so the validation curve is the one that shows
# where extra depth stops paying off.
depths = list(range(1, 35))
rf_scores = []      # train R2 (in %) for each depth
rf_val_scores = []  # validation R2 (in %) for each depth
for md in depths:
    rf_reg = RandomForestRegressor(n_jobs = -1, random_state = 42, max_depth = md).fit(X_train, y_train)
    y_pred = rf_reg.predict(X_test)
    rf_scores.append(rf_reg.score(X_train, y_train)*100)
    rf_val_scores.append(r2_score(y_test, y_pred)*100)
plt.figure(figsize=(40,20))
plt.plot(depths, rf_scores, label='Train R2, %')
plt.plot(depths, rf_val_scores, label='Validation R2, %')
plt.xlabel('max_depth')
plt.ylabel('R2, %')
plt.legend()
# Author's choice: max_depth = 12, to reduce calculation time while keeping a good R2 score.
rf_reg = RandomForestRegressor(n_jobs = -1, random_state = 42, max_depth = 12).fit(X_train, y_train)
y_pred = rf_reg.predict(X_test)
# Both R2 scores are shown as percentages, matching their '%' labels.
print("Random Forest Regression with max_depth = 12:")
print('Train success rate : %',rf_reg.score(X_train, y_train)*100)
print('Validation success rate : %',r2_score(y_test, y_pred)*100)
print("\n\n")





# XGBoost Regression
from xgboost import XGBRegressor
xgb_reg = XGBRegressor(max_depth = 3, learning_rate = 0.1, n_estimators = 100, verbosity = 0, random_state = 42).fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)
# Evaluating model. The train score is taken from xgb_reg itself (it was
# previously read from the random forest by mistake), and both R2 scores are
# shown as percentages, matching their '%' labels.
print("XGBoost Regression:")
print('Train success rate : %',xgb_reg.score(X_train, y_train)*100)
print('Validation success rate : %',r2_score(y_test, y_pred)*100)
print("\n\n")
# NOTE(review): the earlier remark that this model is overfitted was based on the
# random forest's train score; re-check it against the numbers printed above.




**Conclusion**
The best model is Random Forest Regression.
According to the Grid Search, the best set of parameters is {'bootstrap': True, 'max_depth': 25, 'n_estimators': 150, 'n_jobs': -1, 'random_state': 4}, but even with {'bootstrap': True, 'max_depth': 12, 'n_estimators': 100, 'n_jobs': -1, 'random_state': 42} the R2 score is almost the same. So, to save calculation time, I think it is better to use the second parameter set.