In [1]:
import pandas as pd

In [2]:
# Load the CarDekho vehicle listings CSV into a DataFrame and preview the first rows.
df = pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')
df.head()

In [3]:
# (rows, columns) of the raw data
df.shape

In [4]:
# Show the distinct values of each of these columns, one line per column.
for column_name in ('Seller_Type', 'Transmission', 'Owner', 'Fuel_Type'):
    print(df[column_name].unique())

In [5]:
# Count the missing (null/NaN) entries in every column of the raw data.
df.isnull().sum()

In [6]:
# Summary statistics of the raw data
df.describe()

In [7]:
# List the available column names before choosing which ones to keep.
df.columns

In [8]:
# Keep only the columns used for modelling.
# .copy() makes final_dataset an independent frame rather than a slice of df,
# so adding or dropping columns on it later does not raise
# SettingWithCopyWarning or depend on pandas' view-vs-copy behaviour.
final_dataset = df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()

In [9]:
# Preview the selected columns.
final_dataset.head()

In [10]:
# Reference year used to turn the manufacturing year into a vehicle age.
# It is pinned to 2021 (not the wall-clock year) so reruns give the same features.
# assign() returns a new frame instead of writing a column into an existing
# (possibly sliced) frame, which avoids SettingWithCopyWarning.
final_dataset = final_dataset.assign(Current_year=2021)
final_dataset.head()

In [11]:
# Vehicle age in years: reference year minus manufacturing year.
# assign() builds a new frame rather than mutating final_dataset in place.
final_dataset = final_dataset.assign(
    no_year=final_dataset['Current_year'] - final_dataset['Year']
)
final_dataset.head()

In [12]:
# 'Year' and 'Current_year' are now summarised by 'no_year', so drop them.
# Rebinding the result (instead of inplace=True) avoids mutating a frame that
# may still be tied to a slice of df.
final_dataset = final_dataset.drop(columns=['Year', 'Current_year'])
final_dataset.head()

In [13]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each category so the dummy columns are not redundant with one another.
final_dataset = pd.get_dummies(final_dataset, drop_first=True)

In [14]:
# Preview the frame after one-hot encoding.
final_dataset.head()

In [15]:
# Pairwise correlation between the columns of the encoded frame.
final_dataset.corr()

In [16]:
import seaborn as sns

In [17]:
# Scatter-plot matrix of every pair of columns in the encoded frame.
sns.pairplot(final_dataset)

In [18]:
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
# Annotated correlation heatmap of the encoded dataset.
cormat = final_dataset.corr()
top_corr_features = cormat.index  # every column that appears in cormat
plt.figure(figsize=(20,20))
# cormat already contains the correlations for all of top_corr_features, so it
# is plotted directly instead of re-selecting those columns and calling
# .corr() a second time (which would recompute the identical matrix).
g = sns.heatmap(cormat, annot=True, cmap='RdYlGn')

In [20]:
# Re-display the first rows to check the column layout before splitting
# features from the target.
final_dataset.head()

In [21]:
# Independent (x) and dependent (y) features.
# The target is selected by name rather than by position, so the split stays
# correct even if the column order of final_dataset changes upstream.
y = final_dataset['Selling_Price']
x = final_dataset.drop(columns='Selling_Price')

In [22]:
x.head()

In [23]:
y.head()

In [24]:
# Feature importance
# Fit an extra-trees ensemble on the full feature matrix purely to rank the
# features. random_state is fixed so the reported importances are the same on
# every run.
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor(random_state=42)
model.fit(x, y)

In [25]:
# Raw importance scores, in the same order as the columns of x.
print(model.feature_importances_)

In [26]:
# Horizontal bar chart of the five highest-ranked features, labelled by column name.
feat_importances = pd.Series(data=model.feature_importances_, index=x.columns)
feat_importances.nlargest(n=5).plot.barh()
plt.show()

In [27]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state is fixed so the same rows
# land in each split on every run, which keeps downstream metrics comparable.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [28]:
# (rows, columns) of the training features
xtrain.shape

In [29]:
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): this unfitted estimator is bound to rf_random, but rf_random is
# reassigned to a RandomizedSearchCV further down before it is fitted or used,
# so this assignment appears to be dead — confirm and consider removing.
rf_random = RandomForestRegressor()

In [32]:
import numpy as np

# Candidate values for the randomized hyperparameter search.

# Number of trees in the random forest: 100, 200, ..., 1200
# (the loop variable is deliberately not called `x`, which names the feature matrix)
n_estimators = [int(value) for value in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for regressors; 'auto'
# was deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 5, 10, ..., 30
max_depth = [int(value) for value in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [33]:
# Bundle the candidate lists into the parameter grid, keyed by the
# RandomForestRegressor argument each list applies to.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [34]:
from sklearn.model_selection import RandomizedSearchCV

In [35]:
# Use the random grid to search for the best hyperparameters.
# First create the base model to tune. random_state is fixed so that each
# candidate configuration builds the same forest on every run; without it the
# search scores vary between reruns even though the search itself is seeded.
rf = RandomForestRegressor(random_state=42)

In [36]:
# Randomized search over random_grid using 4 parallel jobs.
# NOTE(review): the following cell rebinds rf_random to an otherwise identical
# search with n_jobs=1, so this object is replaced before it is fitted.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=4,
)

In [37]:
# Randomized search of parameters using 5-fold cross-validation (cv=5),
# sampling 10 parameter combinations (n_iter=10) from random_grid and scoring
# each one by negative mean squared error. Runs single-threaded (n_jobs=1).
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [38]:
# Run the randomized search on the training split.
rf_random.fit(xtrain, ytrain)

In [39]:
# Predict on the held-out test features with the fitted search object.
predictions = rf_random.predict(xtest)

In [40]:
# Display the predicted values for the test split.
predictions

In [41]:
# Distribution of the residuals (actual minus predicted) on the test split.
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws
# the equivalent normalised histogram with a density curve overlaid.
sns.histplot(ytest - predictions, kde=True, stat='density')

In [42]:
# Actual vs. predicted values on the test split; points near the diagonal are
# good predictions. Axis labels are set so the figure can be read on its own.
plt.scatter(ytest, predictions)
plt.xlabel('Actual (ytest)')
plt.ylabel('Predicted')
plt.show()

In [45]:
import pickle
# Persist the fitted search object to disk.
# The context manager guarantees the file is flushed and closed once the dump
# finishes (or fails); the original left the handle open, which can leave a
# truncated pickle on disk.
with open('random_forest_regression_model.pkl', 'wb') as file:
    # dump the fitted object to that file
    pickle.dump(rf_random, file)