In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Understanding the Dataset

In [None]:
# Read the car listings CSV into the working DataFrame
csv_path = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Show the column labels of the loaded frame
df.columns

In [None]:
# Visualising Missing Values: heatmap of the boolean null mask of the frame
plt.figure(figsize=(5, 5))
null_mask = df.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)

> No missing Data

In [None]:
# Add an 'Age' column: how old the car is, measured against the year 2020
reference_year = 2020
df['Age'] = reference_year - df['Year']

In [None]:
# Display the derived Age column
df['Age']

In [None]:
# 'Age' has been derived from 'Year', so remove the original column from the frame
df.drop(columns=['Year'], inplace=True)

# 2. Exploratory Data Analysis

**Univariate Analysis**

Categorical Features Visualisation


In [None]:
categ_cols=['Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']

# Walk the columns two at a time: each figure holds a left and a right count plot
for left_col, right_col in zip(categ_cols[0::2], categ_cols[1::2]):
    fig = plt.figure(figsize=[10,4])
    for position, column in enumerate((left_col, right_col), start=1):
        plt.subplot(1, 2, position)
        sns.countplot(x=column, data=df)

    plt.show()

Numerical features to visualise outliers

In [None]:
num_cols=['Selling_Price', 'Present_Price', 'Kms_Driven','Age']

# Walk the columns two at a time: each figure holds a left and a right box plot
for left_col, right_col in zip(num_cols[0::2], num_cols[1::2]):
    fig = plt.figure(figsize=[15,5])
    for position, column in enumerate((left_col, right_col), start=1):
        plt.subplot(1, 2, position)
        sns.boxplot(x=column, data=df)

    plt.show()

In [None]:
# Kms_Driven is continuous, so a box plot would treat every distinct mileage as its own
# category and draw one box per value. A scatter plot shows how Selling_Price varies
# with the distance driven instead.
sns.scatterplot(x='Kms_Driven', y='Selling_Price', data=df)

Visualising the probability density of a continuous variable (Kms Driven) with a KDE (Kernel Density Estimate) plot

In [None]:
# Kernel density estimate of Kms_Driven, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12,7))
sns.kdeplot(df['Kms_Driven'], label='Kms Driven', color='r', ax=ax)
ax.set_xlabel('Kms Driven')

**Bivariate Analysis**

In [None]:
# The frame still holds text columns at this point (Car_Name, Fuel_Type, ...), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so compute the
# correlation matrix on the numeric columns only.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

Selling Price is more correlated with Present Price.
Age of the car has a negative correlation with Selling Price (the selling price reduces as age increases).

# 3. Data Preparation


In [None]:
# The car name is not used as a feature for predicting the price, so remove it
df.drop(columns=['Car_Name'], inplace=True)


Handling Categorical Features

In [None]:
# Replace the categorical columns with dummy variables; drop_first omits the first
# level of each feature to avoid the dummy variable trap
df = pd.get_dummies(df, drop_first=True)
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target: the selling price; features: every remaining column
target_col = 'Selling_Price'
x = df.drop(columns=[target_col])
y = df[target_col]


In [None]:
# Hold out 30% of the rows for testing. The fixed random_state makes the split, and
# therefore every score reported further down, reproducible across reruns.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
print("x train: ",x_train.shape)
print("x test: ",x_test.shape)
print("y train: ",y_train.shape)
print("y test: ",y_test.shape)

# 4. Model Building

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
import pickle

In [None]:
# Score accumulators, one entry appended per regg_model call:
# cv - mean cross-validation score (was declared but never filled before)
# r2 - R2 score on the held-out test split
cv=[]
r2=[]
def regg_model(model,model_name):
    """Fit, evaluate and pickle a regression model.

    Uses the notebook-level x_train / y_train / x_test / y_test splits.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict`` (it is also passed to
        ``cross_val_score``).
    model_name : str
        Suffix appended directly to ``'./car-models'`` to build the output
        path (no path separator is inserted).

    Side effects
    ------------
    Appends the test R2 score to ``r2`` and the mean 5-fold cross-validation
    score to ``cv``, prints both scores, and writes the fitted model to disk.
    """
    model.fit(x_train,y_train)

    # r2-score on the held-out test split
    y_pred=model.predict(x_test)
    score=r2_score(y_test,y_pred)
    r2.append(score)

    # cross-val-score on the training split
    cross_val=cross_val_score(model,x_train,y_train,cv=5)
    cross_val_mean=cross_val.mean()
    cv.append(cross_val_mean)

    # Report this model's own score rather than the whole accumulated list
    print('R2-Score: ',score)
    print('Cross-Val-Mean-Score: ',cross_val_mean)

    # dumping model in a pickle file; the context manager closes the handle
    # so the data is flushed to disk before anything tries to read it back
    model_path='./car-models'+ model_name
    with open(model_path,'wb') as model_file:
        pickle.dump(model,model_file)
    
    

1. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model: fit, score and pickle it through regg_model
lr = LinearRegression()
regg_model(model=lr, model_name='LinearRegeression.pkl')

2. Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Fixed seed so the forest itself is reproducible
rf=RandomForestRegressor(random_state=42)

# number of trees
n_estimators=list(range(500,1000,100))
# Maximum depth of tree
max_depth=list(range(4,9,1))
# Minimum samples to split internal node
min_samples_split=list(range(4,9,1))
# Minimum samples to be a leaf node
min_samples_leaf=[1,2,3,4]
# features considered at each split: 1.0 means all features, which is what 'auto'
# meant for regressors; 'auto' itself was removed from scikit-learn and makes any
# candidate that samples it fail to fit
max_features=[1.0,'sqrt']

# Hyperparameters
params={'n_estimators':n_estimators,
       'max_depth':max_depth,
       'min_samples_split':min_samples_split,
       'min_samples_leaf':min_samples_leaf,
       'max_features':max_features}

# random_state fixes which 10 candidate combinations get sampled
rf_rs=RandomizedSearchCV(estimator=rf,param_distributions=params,cv=5,n_iter=10,
                         return_train_score=True,random_state=42)

In [None]:
# Run the random-forest hyperparameter search, score it and pickle the result
regg_model(model=rf_rs, model_name='Random_Forest.pkl')


3. Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV


# Fixed seed so the boosted model itself is reproducible
gb = GradientBoostingRegressor(random_state=42)

# Rate at which correcting is being made
learning_rate = [0.001, 0.01, 0.1, 0.2]
# Number of trees in Gradient boosting
n_estimators=list(range(500,1000,100))
# Maximum number of levels in a tree (range(4,9,4) yields the two values 4 and 8)
max_depth=list(range(4,9,4))
# Minimum number of samples required to split an internal node
min_samples_split=list(range(4,9,2))
# Minimum number of samples required to be at a leaf node.
min_samples_leaf=[1,2,5,7]
# Number of features to be considered at each split
max_features=['log2','sqrt']

# Hyperparameters dict
params = {"learning_rate":learning_rate,
              "n_estimators":n_estimators,
              "max_depth":max_depth,
              "min_samples_split":min_samples_split,
              "min_samples_leaf":min_samples_leaf,
              "max_features":max_features}

# random_state fixes which candidate combinations get sampled
gb_rs = RandomizedSearchCV(estimator = gb, param_distributions = params, random_state=42)

In [None]:
# Run the gradient-boosting hyperparameter search, score it and pickle the result
regg_model(model=gb_rs, model_name='gradient_boosting.pkl')

Load the pickled model back from disk and check that it can produce predictions

In [None]:
# Reload the pickled gradient-boosting search object; the context manager closes
# the file handle as soon as unpickling is done.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote.
with open('./car-modelsgradient_boosting.pkl','rb') as model_file:
    model=pickle.load(model_file)
type(model)

In [None]:
# Sanity check: predict the first five rows of the test split with the reloaded model
sample_rows = x_test.iloc[:5]
model.predict(sample_rows)