# Importing libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
# Silence every warning for the rest of the session. No category is given, so this
# also hides deprecation/future warnings raised by the plotting and ML libraries.
warnings.filterwarnings(action="ignore")
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the Mercedes used-car listings from the Kaggle input directory
# and preview the first 10 rows.
csv_path = "/kaggle/input/used-car-dataset-ford-and-mercedes/merc.csv"
df = pd.read_csv(csv_path)
df.head(10)

In [None]:
# Dimensions of the loaded frame as a (n_rows, n_columns) tuple.
df.shape

In [None]:
# Concise summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Column labels of the dataset.
df.columns

In [None]:
# How many distinct car models appear in the data.
df["model"].nunique()

* The dataset contains 27 distinct Mercedes car models.

In [None]:
# Distinct fuel-type labels present in the data.
df["fuelType"].unique()

In [None]:
# Distinct transmission labels present in the data.
df["transmission"].unique()

# Visualization

In [None]:
# Number of listings per car model.
plt.figure(figsize=(14, 6))
# Pass the column as `x=`: seaborn 0.11 deprecated positional data vectors and
# from 0.12 the only positional argument accepted by countplot is `data`.
sns.countplot(x=df["model"])
plt.xticks(rotation=45)
plt.title("Car models Countplot")
plt.ylabel("Number of Cars")
plt.xlabel("Car Models")
plt.show()

In [None]:
# Number of listings per fuel type. The column must be passed as `x=`;
# recent seaborn releases only accept `data` positionally.
sns.countplot(x=df["fuelType"])
plt.show()

In [None]:
# Number of listings per transmission type. The column must be passed as `x=`;
# recent seaborn releases only accept `data` positionally.
sns.countplot(x=df["transmission"])
plt.show()

In [None]:
# Number of listings per registration year (wide figure so every year label fits).
plt.figure(figsize=(25, 8))
# Keyword `x=` is required: recent seaborn releases only accept `data` positionally.
sns.countplot(x=df['year'])
plt.show()

In [None]:
# Number of listings per engine size.
plt.figure(figsize=(13, 8))
# Keyword `x=` is required: recent seaborn releases only accept `data` positionally.
sns.countplot(x=df['engineSize'])
plt.show()

# Encoding Columns with categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns in place, keeping ONE fitted encoder per
# column. Re-fitting a single encoder for every column (as before) discards the
# earlier mappings, so the integer codes could no longer be translated back with
# `inverse_transform` or looked up via `classes_`.
encoders = {}
for column in ("transmission", "fuelType", "model"):
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Backward compatibility: `le` still refers to the encoder fitted last ("model").
le = encoders["model"]

In [None]:
# Preview the first rows after the categorical columns were replaced by integer codes.
df.head()

# Splitting Data into Dependent and Independent variables

In [None]:
# Target: the listing price. Features: every remaining column.
y = df["price"]
X = df.drop(columns=["price"])

Use `train_test_split` to split the data into training (75%) and test (25%) sets.

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# Report the size of the training features and training target.
for split in (X_train, y_train):
    print(split.shape)

# Applying Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler learns mean/std from the TRAINING split only;
# the test split is transformed with those same statistics. Calling fit_transform on
# the test set (as before) refits the scaler on test data, which leaks information
# and puts train and test on different scales.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model Training..🚴

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 400 trees, each limited to depth 15. A fixed random_state makes
# the bootstrap sampling (and therefore the reported scores) reproducible across
# runs, consistent with the seed 42 used for the train/test split.
regressor = RandomForestRegressor(n_estimators=400, max_depth=15, random_state=42)
regressor.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score

# Score the random forest on the held-out split.
y_pred = regressor.predict(X_test)
# r2_score's signature is (y_true, y_pred); R² is not symmetric, so swapping the
# arguments reports a different (wrong) number.
r2_score(y_test, y_pred)

## XGBoost Regressor

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 500 boosting rounds, learning rate 0.05, depth-6 trees.
xgb_settings = {"n_estimators": 500, "learning_rate": 0.05, "max_depth": 6}
regressor2 = XGBRegressor(**xgb_settings)
regressor2.fit(X_train, y_train)

In [None]:
# Score the XGBoost model on the held-out split.
y_pred2 = regressor2.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred2)

## LightGBM Regressor

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor: 600 boosting rounds with trees capped at depth 7.
lgbm_settings = {"n_estimators": 600, "max_depth": 7}
regressor3 = LGBMRegressor(**lgbm_settings)
regressor3.fit(X_train, y_train)

In [None]:
# Score the LightGBM model on the held-out split.
y_pred3 = regressor3.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred3)

# Cross-validation

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# 5-fold shuffled cross-validation splitter with a fixed seed.
# StratifiedKFold preserves class proportions and is meant for classification
# targets; `price` is a regression target, so stratifying on it is either rejected
# or meaningless (nearly every price is its own "class"). Plain KFold is the
# appropriate splitter. The variable keeps its original name `skf` because later
# cells pass it as `cv=skf`.
skf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Cross-validate the LightGBM model on the training split, one score per fold.
val_scores = cross_val_score(regressor3, X_train, y_train, cv=skf)
val_scores

In [None]:
# Average score across the folds.
np.mean(val_scores)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space shared by both searches: number of trees and maximum tree depth (2..20).
params = {
    'n_estimators': (100, 200, 300, 400, 500, 600, 700, 800),
    'max_depth': np.arange(2, 21),
}

# Exhaustive search over every combination for the LightGBM model.
grid_cv = GridSearchCV(estimator=regressor3, param_grid=params, cv=skf, verbose=True, n_jobs=-1)

# Randomized search over the same space. random_state fixes which candidates are
# sampled, so `best_params_` is reproducible between runs.
# NOTE(review): this searches `regressor` (the random forest) while the grid search
# tunes `regressor3` (LightGBM) — confirm the mismatch is intentional.
random_cv = RandomizedSearchCV(
    estimator=regressor,
    cv=skf,
    param_distributions=params,
    verbose=True,
    n_jobs=-1,
    random_state=42,
)

In [None]:
%%time
# Run the randomized hyper-parameter search on the training split.
# (The %%time cell magic has to remain the first line of the cell.)
random_cv.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the randomized search.
random_cv.best_params_

In [None]:
%%time
# Run the exhaustive grid search on the training split.
# (The %%time cell magic has to remain the first line of the cell.)
grid_cv.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the grid search.
grid_cv.best_params_

# R2_scores

In [None]:
# Summary of the held-out R² of each model.
# r2_score expects (y_true, y_pred) and is not symmetric, so the true values must
# always come first. The score is a fraction, so it is multiplied by 100 before
# being printed with a percent sign.
model_predictions = (
    ("Random Forest Regressor", y_pred),
    ("XGBoost Regressor", y_pred2),
    ("LightGBM Regressor", y_pred3),
)
print("r2_scores of models")
for label, predictions in model_predictions:
    print("{} {:.2f} %".format(label, 100 * r2_score(y_test, predictions)))

In [None]:
# Show the first 20 encoded rows as a reference for building manual inputs below.
df.head(20)

<div class="alert alert-box alert-info">
Predicting by input
</div>

In [None]:
# Predict the price of one hand-written listing with the LightGBM model.
# The model was trained on standardized features, so the raw row must pass through
# the already-fitted scaler first; feeding raw values (e.g. mileage 14000) to the
# model directly yields a meaningless prediction.
# Values follow the column order of X.
sample = pd.DataFrame([[18, 2016, 1, 14000, 3, 325, 30.4, 4.0]], columns=X.columns)
regressor3.predict(sc.transform(sample))

In [None]:
# Predict the price of one hand-written listing with the random forest.
# The model was trained on standardized features, so the raw row must pass through
# the already-fitted scaler first.
# Values follow the column order of X.
sample = pd.DataFrame([[20, 2011, 2, 6000, 2, 225, 30.4, 5.0]], columns=X.columns)
regressor.predict(sc.transform(sample))

In [None]:
# Measured vs. predicted price for the random forest on the held-out split.
plt.figure(figsize=(12, 8))

# x/y must be passed by keyword: recent seaborn releases only accept `data` positionally.
sns.scatterplot(x=y_test, y=y_pred)
# Dashed diagonal = perfect prediction, spanning the full range of observed prices.
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
plt.xlabel('Measured')
plt.ylabel('Predicted')
plt.show()

<div class="alert alert-box alert-warning">
 Please UPVOTE the notebook if you find it insightful!
    
 See ya!
    </div>