In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import ExtraTreesRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [31]:
# Load the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/cardekho/car data.csv")

In [32]:
# Preview the first rows of the raw frame.
df.head()

In [33]:
# (rows, columns) of the raw frame.
df.shape

In [34]:
# Print the distinct values found in each of these columns, one array per line.
for column in ('Seller_Type', 'Transmission', 'Owner', 'Fuel_Type'):
    print(df[column].unique())

In [35]:
# Check for missing values: number of null entries in each column.
df.isnull().sum()

# Description of the dataset

In [36]:
# Summary statistics of the raw frame.
df.describe()

In [37]:
# List every column name; the next cell picks the modelling subset from these.
df.columns

In [38]:
# Keep only the columns used for modelling. The explicit .copy() makes this an
# independent frame, so the column assignments and drops performed on it later
# do not operate on a slice of `df` (which raises SettingWithCopyWarning).
final_dataset = df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven','Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()

In [39]:
# Preview the selected columns.
final_dataset.head()

In [40]:
# Reference year for deriving car age. It is a fixed literal, so the derived
# ages do not shift when the notebook is re-run in a later year.
final_dataset['Current_Year'] = 2022

In [41]:
# Confirm the Current_Year column was added.
final_dataset.head()

In [42]:
# Age of each car in years: reference year minus the Year column.
final_dataset['No_year'] = final_dataset['Current_Year'] - final_dataset['Year']

In [43]:
# Confirm the No_year column was added.
final_dataset.head()

In [44]:
# Year is now represented by No_year, so remove it. Reassigning instead of
# using inplace=True avoids mutating a frame that earlier cells have already
# displayed, and avoids in-place modification of a frame derived from `df`.
final_dataset = final_dataset.drop(columns=['Year'])

In [45]:
# Current_Year is the same constant on every row and was only needed to derive
# No_year, so remove it. Reassigning instead of using inplace=True avoids
# mutating a frame that earlier cells have already displayed.
final_dataset = final_dataset.drop(columns=['Current_Year'])

In [46]:
# Confirm Year and Current_Year are gone and No_year remains.
final_dataset.head()

In [47]:
# One-hot encode the non-numeric columns. drop_first=True omits the first
# level of each encoded column so the dummy columns are not redundant.
final_dataset = pd.get_dummies(final_dataset, drop_first = True)

In [48]:
# Summary statistics after encoding.
final_dataset.describe()

In [49]:
# Preview the encoded frame.
final_dataset.head()

# Finding the correlation

In [50]:
# Pairwise correlation matrix of the encoded columns.
final_dataset.corr()

In [51]:
# Grid of pairwise plots across the columns of the encoded frame.
sns.pairplot(final_dataset)

In [52]:
# Annotated correlation heatmap over the columns of the encoded frame.
plt.figure(figsize=(20, 20))

corrmat = final_dataset.corr()
top_corr_features = corrmat.index
feature_frame = final_dataset[top_corr_features]

# plot the heatmap (red = negative, green = positive correlation)
g = sns.heatmap(feature_frame.corr(), annot=True, cmap="RdYlGn")

In [53]:
# independent and dependent features
# Select by column name rather than by position, so the target stays correct
# even if the column order of final_dataset changes upstream.
y = final_dataset['Selling_Price']
X = final_dataset.drop(columns=['Selling_Price'])

In [54]:
# Preview the feature matrix.
X.head()

In [55]:
# Preview the target values.
y.head()

# Feature importance

In [56]:
# Fit an extra-trees ensemble on the full feature matrix; it is used below
# only to read off feature importances. The fixed random_state makes those
# importances reproducible across runs.
model = ExtraTreesRegressor(random_state=42)
model.fit(X,y)

In [57]:
# Raw importance scores; the next cell labels them with X.columns.
print(model.feature_importances_)

In [59]:
# Label each importance score with its feature name, then draw the five
# largest as a horizontal bar chart.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_five = feat_importances.nlargest(5)
top_five.plot(kind='barh')
plt.show()

# Train-Test set split

In [60]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every metric and plot computed from it) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [62]:
# (rows, columns) of the training features.
X_train.shape

In [65]:
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): the name looks like a typo of `rf_random`, and this estimator is
# not referenced by any later cell shown here (the search below is built from
# `rf`). Confirm it is unused before removing or renaming it.
rf_ranmdon = RandomForestRegressor()

In [67]:
# Hyperparameters for the randomized search CV
# Candidate forest sizes: 100 to 1200 trees inclusive, in steps of 100.
n_estimators = list(range(100, 1201, 100))
print(n_estimators)

In [70]:
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for RandomForestRegressor;
# 'auto' itself was deprecated in scikit-learn 1.1 and removed in 1.3, where it
# raises a parameter validation error.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [72]:
from sklearn.model_selection import RandomizedSearchCV

In [73]:
# Assemble the search space: one entry per hyperparameter, each mapping to the
# list of candidate values defined in the cells above.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

In [74]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune. The fixed random_state makes each
# candidate forest repeatable, so cross-validation scores do not vary between
# runs for the same hyperparameters.
rf = RandomForestRegressor(random_state=42)

In [75]:
# Randomized search over random_grid: 10 sampled parameter settings, each
# scored with 5-fold cross-validation on negative mean squared error.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [76]:
# Run the search on the training split only; the test split stays unseen.
rf_random.fit(X_train, y_train)

# Model Prediction

In [77]:
# Predict on the held-out test features with the fitted search object.
predictions = rf_random.predict(X_test)

In [78]:
# Display the predicted values.
predictions

In [79]:
# Distribution of the residuals (actual minus predicted) on the test set.
# sns.distplot has been deprecated since seaborn 0.11 and is slated for
# removal; histplot with density normalisation and a KDE overlay is the
# documented replacement (cut=3 matches distplot's KDE extent).
sns.histplot(y_test - predictions, kde=True, stat="density", kde_kws=dict(cut=3))

In [80]:
# Actual (x) versus predicted (y) values on the test set; points close to the
# diagonal are accurate predictions.
plt.scatter(y_test, predictions)

In [81]:
import pickle

# Serialise the fitted search object to the working directory. The `with`
# block guarantees the file is flushed and closed even if pickling fails;
# previously the handle was left open.
with open('random_forest_regression_model.pkl', 'wb') as file:
    # dump information to that file
    pickle.dump(rf_random, file)