# Second Car Problem Analysis 

### Regression Case Study

In [None]:
import numpy as np
import pandas as pd

################# Data preprocessing
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

################# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

################# Packages for model selection
from sklearn.model_selection import cross_validate, cross_val_score, RepeatedKFold
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

################# To ignore warnings
import warnings
warnings.filterwarnings('ignore')

################# Model Deployment
import pickle

In [None]:
#this is from local machine

### Data Import

In [3]:
data = pd.read_csv("SecondCar.csv")
data.head(2)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,9.0,106001,100000
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,9.0,108556,100000


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              4340 non-null   object 
 1   year              4340 non-null   int64  
 2   km_driven         4340 non-null   int64  
 3   fuel              4337 non-null   object 
 4   seller_type       4338 non-null   object 
 5   transmission      4338 non-null   object 
 6   owner             4337 non-null   object 
 7   Rating            4336 non-null   float64
 8   ExShowroom Price  4340 non-null   int64  
 9   selling_price     4340 non-null   int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 339.2+ KB


### Missing Value Check & Treatment

##### Missing Value Check : Column wise

In [5]:
data.isna().sum()

name                0
year                0
km_driven           0
fuel                3
seller_type         2
transmission        2
owner               3
Rating              4
ExShowroom Price    0
selling_price       0
dtype: int64

##### Checking Missing Value : Row Wise

In [6]:
data[data.isnull().any(axis=1) == True]

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price
352,Chevrolet Sail 1.2 Base,2015,35000,,Individual,Manual,First Owner,,300235,260000
492,Chevrolet Sail 1.2 Base,2015,35000,Petrol,Individual,Manual,,,328139,260000
670,Tata Indigo Grand Petrol,2014,60000,Petrol,,Manual,Second Owner,15.0,365382,240000
1028,Maruti Wagon R VXI BS IV with ABS,2014,64000,Petrol,Individual,Manual,Second Owner,,436092,290000
1139,Maruti Alto LX BSIII,2007,125000,Petrol,Individual,,First Owner,9.92,453407,140000
1972,Maruti Wagon R LXI Minor,2007,50000,Petrol,,Manual,First Owner,14.0,560844,135000
2731,Chevrolet Enjoy TCDi LTZ 7 Seater,2013,33000,,Individual,Manual,Second Owner,,744569,390000
3928,Hyundai Creta 1.6 VTVT S,2015,25000,Petrol,Individual,,First Owner,12.0,1511680,850000
4068,Toyota Corolla Altis G AT,2016,50000,,Individual,Automatic,First Owner,11.0,1794249,900000
4186,Toyota Corolla Altis 1.8 VL CVT,2018,25000,Petrol,Dealer,Automatic,,11.13,2448803,1650000


### Missing Value Treatment

In [7]:
for col in data.columns:
    if data[col].dtype == "int32" or data[col].dtype == "int64" or data[col].dtype == "float32" or data[col].dtype == "float64":
        median = data[col].median()
        data[col].fillna(median, inplace = True)
    else:
        data = data.fillna(data.mode().iloc[0])

### Checking for any missing value remaining after treatment

In [8]:
data.isna().sum()

name                0
year                0
km_driven           0
fuel                0
seller_type         0
transmission        0
owner               0
Rating              0
ExShowroom Price    0
selling_price       0
dtype: int64

In [9]:
data.iloc[350:355,]

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price
350,Tata Indigo LS,2012,70000,Diesel,Individual,Manual,Second Owner,14.0,299379,220000
351,Tata Indigo CR4,2013,100000,Diesel,Individual,Manual,First Owner,13.0,300046,220000
352,Chevrolet Sail 1.2 Base,2015,35000,Diesel,Individual,Manual,First Owner,9.0,300235,260000
353,Honda Brio V MT,2012,42000,Petrol,Dealer,Manual,Second Owner,12.0,300392,249000
354,Chevrolet Beat LS,2011,60000,Petrol,Individual,Manual,Third Owner,14.0,300507,150000


### Unique Value Check for Categorical Column

In [10]:
data.nunique()

name                1491
year                  27
km_driven            770
fuel                   5
seller_type            3
transmission           2
owner                  5
Rating               566
ExShowroom Price    4331
selling_price        445
dtype: int64

In [11]:
len(data)

4340

### **Condition :**

##### 1. Unique value Count == 1 or 
##### 2. Unique value count == len(data)
##### Then delete the column

##### *Because above condition was not available hence no column is deleted*

### **Feature Engineering**

In [12]:
data["company_name"] = data["name"].str.split(" ").str[0]
data.head(2)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price,company_name
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,9.0,106001,100000,Tata
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,9.0,108556,100000,Tata


In [13]:
data.company_name.unique()

array(['Tata', 'Maruti', 'Chevrolet', 'Hyundai', 'Ford', 'Volkswagen',
       'Mahindra', 'Fiat', 'Nissan', 'Renault', 'Toyota', 'Datsun',
       'Honda', 'Skoda', 'Ambassador', 'OpelCorsa', 'Daewoo', 'Force',
       'Mercedes-Benz', 'BMW', 'Audi', 'Mitsubishi', 'Jeep', 'Isuzu',
       'Kia', 'Volvo', 'Jaguar', 'MG', 'Land'], dtype=object)

### Feature Engineering for KM

In [93]:
km_ranges = ["low", "medium", "High"]
limits = [0, 35000, 100000, 200000]
data["km_range"] = pd.cut(data["km_driven"], bins = limits, labels = km_ranges)
data.head(5)

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,Rating,selling_price,company_name,km_range,year_range,ex_range
0,2010,120000,1,1,1,2,9.0,100000,25,High,3,0
1,2012,50000,4,1,1,2,9.0,100000,25,medium,3,0
2,2011,70000,1,1,1,0,11.0,120000,25,medium,3,0
3,2010,80000,4,1,1,2,9.0,100000,18,medium,3,0
4,2010,80000,4,1,1,2,10.0,130000,3,medium,3,0


In [97]:
low_count = 0
for i in data["km_range"]:
    if i == "low":
        low_count+=1
print("low :", low_count)

medium_count = 0
for i in data["km_range"]:
    if i == "medium":
        medium_count += 1
print("Medium :", medium_count)

High_count = 0
for i in data["km_range"]:
    if i == "High":
        High_count += 1
print("High :", High_count)


low : 1159
Medium : 2461
High : 663


In [15]:
round(data.describe(),0)

Unnamed: 0,year,km_driven,Rating,ExShowroom Price,selling_price
count,4340.0,4340.0,4340.0,4340.0,4340.0
mean,2013.0,66216.0,12.0,845381.0,504127.0
std,4.0,46644.0,2.0,884841.0,578549.0
min,1992.0,1.0,9.0,106001.0,20000.0
25%,2011.0,35000.0,10.0,445390.0,208750.0
50%,2014.0,60000.0,12.0,596055.0,350000.0
75%,2016.0,90000.0,14.0,946243.0,600000.0
max,2020.0,806599.0,15.0,15538153.0,8900000.0


### Feature for Years

In [16]:
year_ranges = ["Junk", "Scrap", "Buy", "Best"]
limits = [1991, 2005, 2012, 2017, 2022]
data["year_range"] = pd.cut(data["year"], bins = limits, labels = year_ranges)

data.tail(5)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price,company_name,km_range,year_range
4335,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,11.0,9407487,4950000,BMW,low,Best
4336,BMW 5 Series 520d Luxury Line,2019,12999,Diesel,Dealer,Automatic,First Owner,9.71,9598350,4800000,BMW,low,Best
4337,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,11.0,9857238,4950000,BMW,low,Best
4338,Audi RS7 2015-2019 Sportback Performance,2016,13000,Petrol,Dealer,Automatic,First Owner,14.09,14235729,8900000,Audi,low,Buy
4339,Mercedes-Benz S-Class S 350d Connoisseurs Edition,2017,6500,Diesel,Dealer,Automatic,First Owner,9.0,15538153,8150000,Mercedes-Benz,low,Buy


### Feature for ex Showroom Price

In [17]:
ex_range = ["Affordable", "Family", "Luxary", "Premium"]
limits = [0, 500000, 1000000, 1500000, 20000000]
data["ex_range"] = pd.cut(data["ExShowroom Price"], bins = limits, labels = ex_range)
data.tail(5)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price,company_name,km_range,year_range,ex_range
4335,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,11.0,9407487,4950000,BMW,low,Best,Premium
4336,BMW 5 Series 520d Luxury Line,2019,12999,Diesel,Dealer,Automatic,First Owner,9.71,9598350,4800000,BMW,low,Best,Premium
4337,BMW X5 xDrive 30d xLine,2019,30000,Diesel,Dealer,Automatic,First Owner,11.0,9857238,4950000,BMW,low,Best,Premium
4338,Audi RS7 2015-2019 Sportback Performance,2016,13000,Petrol,Dealer,Automatic,First Owner,14.09,14235729,8900000,Audi,low,Buy,Premium
4339,Mercedes-Benz S-Class S 350d Connoisseurs Edition,2017,6500,Diesel,Dealer,Automatic,First Owner,9.0,15538153,8150000,Mercedes-Benz,low,Buy,Premium


In [18]:
data["company_name"].unique()
company_name_table = ['Tata', 'Maruti', 'Chevrolet', 'Hyundai', 'Ford', 'Volkswagen',
       'Mahindra', 'Fiat', 'Nissan', 'Renault', 'Toyota', 'Datsun',
       'Honda', 'Skoda', 'Ambassador', 'OpelCorsa', 'Daewoo', 'Force',
       'Mercedes-Benz', 'BMW', 'Audi', 'Mitsubishi', 'Jeep', 'Isuzu',
       'Kia', 'Volvo', 'Jaguar', 'MG', 'Land']

count = 0
for item in company_name_table:
    

array(['Tata', 'Maruti', 'Chevrolet', 'Hyundai', 'Ford', 'Volkswagen',
       'Mahindra', 'Fiat', 'Nissan', 'Renault', 'Toyota', 'Datsun',
       'Honda', 'Skoda', 'Ambassador', 'OpelCorsa', 'Daewoo', 'Force',
       'Mercedes-Benz', 'BMW', 'Audi', 'Mitsubishi', 'Jeep', 'Isuzu',
       'Kia', 'Volvo', 'Jaguar', 'MG', 'Land'], dtype=object)

### Label Encoding

In [19]:
data["fuel"].unique()

array(['Diesel', 'Petrol', 'CNG', 'LPG', 'Electric'], dtype=object)

In [20]:
EN = LabelEncoder()
data["fuel"] = EN.fit_transform(data["fuel"])
data["fuel"].unique()

array([1, 4, 0, 3, 2])

In [21]:
data["transmission"].unique()

array(['Manual', 'Automatic'], dtype=object)

In [22]:
data["transmission"] = EN.fit_transform(data["transmission"])
data["transmission"].unique()

array([1, 0])

In [23]:
data["name"] = EN.fit_transform(data["name"])
data["seller_type"] = EN.fit_transform(data["seller_type"])
data["owner"] = EN.fit_transform(data["owner"])
data["company_name"] = EN.fit_transform(data["company_name"])
data["km_range"] = EN.fit_transform(data["km_range"])
data["year_range"] = EN.fit_transform(data["year_range"])
data["ex_range"] = EN.fit_transform(data["ex_range"])
data.head(3)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price,company_name,km_range,year_range,ex_range
0,1228,2010,120000,1,1,1,2,9.0,106001,100000,25,0,3,0
1,1283,2012,50000,4,1,1,2,9.0,108556,100000,25,2,3,0
2,1229,2011,70000,1,1,1,0,11.0,120678,120000,25,2,3,0


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              4340 non-null   int32  
 1   year              4340 non-null   int64  
 2   km_driven         4340 non-null   int64  
 3   fuel              4340 non-null   int32  
 4   seller_type       4340 non-null   int32  
 5   transmission      4340 non-null   int32  
 6   owner             4340 non-null   int32  
 7   Rating            4340 non-null   float64
 8   ExShowroom Price  4340 non-null   int64  
 9   selling_price     4340 non-null   int64  
 10  company_name      4340 non-null   int32  
 11  km_range          4340 non-null   int32  
 12  year_range        4340 non-null   int32  
 13  ex_range          4340 non-null   int32  
dtypes: float64(1), int32(9), int64(4)
memory usage: 322.2 KB


##### If we see all data type as number then label encoding is sucessful

### Correlation : range(-1 to 1)

In [25]:
corr = data.corr()
corr.style.background_gradient(cmap = "coolwarm")

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price,company_name,km_range,year_range,ex_range
name,1.0,-0.051902,0.126203,-0.08385,0.140802,0.087785,0.032819,0.020311,-0.068402,-0.077598,0.974513,-0.016403,0.08534,-0.045323
year,-0.051902,1.0,-0.419688,-0.120528,-0.098352,-0.1438,-0.414705,-0.003809,0.304871,0.413922,-0.039724,-0.048207,-0.764325,0.36563
km_driven,0.126203,-0.419688,1.0,-0.285634,0.113689,0.120226,0.297115,-0.003163,-0.165105,-0.192289,0.131461,-0.119582,0.441331,-0.184287
fuel,-0.08385,-0.120528,-0.285634,1.0,0.038387,0.040445,-0.010301,-0.012126,-0.234674,-0.269779,-0.109394,0.041321,0.037367,-0.298527
seller_type,0.140802,-0.098352,0.113689,0.038387,1.0,0.174925,0.165681,-0.005921,-0.132745,-0.151554,0.144218,-0.060146,0.088519,-0.108782
transmission,0.087785,-0.1438,0.120226,0.040445,0.174925,1.0,0.078893,0.024621,-0.516677,-0.530205,0.110699,0.02128,0.132211,-0.401626
owner,0.032819,-0.414705,0.297115,-0.010301,0.165681,0.078893,1.0,0.007293,-0.167726,-0.20784,0.036452,-0.041118,0.37747,-0.209813
Rating,0.020311,-0.003809,-0.003163,-0.012126,-0.005921,0.024621,0.007293,1.0,-0.000455,-0.021932,0.023503,0.025625,0.020637,0.050543
ExShowroom Price,-0.068402,0.304871,-0.165105,-0.234674,-0.132745,-0.516677,-0.167726,-0.000455,1.0,0.960629,-0.087559,-0.040643,-0.32212,0.708633
selling_price,-0.077598,0.413922,-0.192289,-0.269779,-0.151554,-0.530205,-0.20784,-0.021932,0.960629,1.0,-0.096858,-0.04007,-0.385868,0.692654


In [26]:
data = data.drop(["name", "ExShowroom Price"], axis= 1)
data.head(2)

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,Rating,selling_price,company_name,km_range,year_range,ex_range
0,2010,120000,1,1,1,2,9.0,100000,25,0,3,0
1,2012,50000,4,1,1,2,9.0,100000,25,2,3,0


### Define X & Y

##### Independent Vaiables

In [27]:
X = data.drop("selling_price", axis = 1)

##### Dependent / Target Variable

In [28]:
y = data.selling_price

In [29]:
X.head(2)

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,Rating,company_name,km_range,year_range,ex_range
0,2010,120000,1,1,1,2,9.0,25,0,3,0
1,2012,50000,4,1,1,2,9.0,25,2,3,0


In [30]:
y.head(2)

0    100000
1    100000
Name: selling_price, dtype: int64

In [31]:
data.head(2)

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,Rating,selling_price,company_name,km_range,year_range,ex_range
0,2010,120000,1,1,1,2,9.0,100000,25,0,3,0
1,2012,50000,4,1,1,2,9.0,100000,25,2,3,0


### Data Scaling

In [32]:
round(X.describe(), 2)

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,Rating,company_name,km_range,year_range,ex_range
count,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0
mean,2013.09,66215.78,2.47,0.79,0.9,0.82,11.96,15.56,1.44,1.61,0.97
std,4.22,46644.1,1.51,0.46,0.3,1.23,1.91,6.81,0.76,1.1,0.93
min,1992.0,1.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0
25%,2011.0,35000.0,1.0,1.0,1.0,0.0,10.02,10.0,1.0,1.0,0.0
50%,2014.0,60000.0,1.0,1.0,1.0,0.0,12.0,18.0,2.0,1.0,1.0
75%,2016.0,90000.0,4.0,1.0,1.0,2.0,13.76,18.0,2.0,3.0,1.0
max,2020.0,806599.0,4.0,2.0,1.0,4.0,15.0,28.0,3.0,3.0,3.0


In [33]:
all_x = list(X.columns)
X[all_x] = X[all_x]/(X[all_x].max())
round(X.describe(), 2)

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,Rating,company_name,km_range,year_range,ex_range
count,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0
mean,1.0,0.08,0.62,0.4,0.9,0.21,0.8,0.56,0.48,0.54,0.32
std,0.0,0.06,0.38,0.23,0.3,0.31,0.13,0.24,0.25,0.37,0.31
min,0.99,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0
25%,1.0,0.04,0.25,0.5,1.0,0.0,0.67,0.36,0.33,0.33,0.0
50%,1.0,0.07,0.25,0.5,1.0,0.0,0.8,0.64,0.67,0.33,0.33
75%,1.0,0.11,1.0,0.5,1.0,0.5,0.92,0.64,0.67,1.0,0.33
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Dividing dataset into training and testing data

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234)
print(len(X_train))
print(len(X_test))

3038
1302


In [35]:
print(len(y_train))
print(len(y_test))

3038
1302


### Data Preprocessing ends here

### Regression

### Step 1: Polynomial Feature Creation

In [36]:
poly_reg = PolynomialFeatures(degree=2)
X_train_p = poly_reg.fit_transform(X_train)
X_test_p = poly_reg.fit_transform(X_test)

In [37]:
total_columns = len(X_train_p[0])
print(total_columns)

78


### Method 1 : Linear Regression

##### *Define Model*

In [38]:
model = LinearRegression()

##### *Fit the train data*

In [39]:
model.fit(X_train, y_train)

##### *Predict the Values*

In [40]:
y_pred = model.predict(X_test)

In [41]:
y_pred

array([1345911.81841683,  263983.41123261,  420464.68066975, ...,
       1600684.50018824,  568234.24828275,  581558.18912654])

In [42]:
y_test

3866    1200000
909      210000
1866     280000
1532     250000
1028     290000
         ...   
2595     275000
1775      50000
4019     850000
2762     610000
2890     409999
Name: selling_price, Length: 1302, dtype: int64

In [43]:
accuracy = r2_score(y_test, y_pred)*100

In [44]:
round(accuracy, 2)

62.66

### Method 1 (a) : Linear Regression with polynomial data.

##### Define Model 

In [45]:
model = LinearRegression()

##### Fit the train data

In [46]:
model.fit(X_train_p, y_train)

##### Predict the values

In [47]:
y_pred = model.predict(X_test_p)

##### Accuracy

In [48]:
accuracy = r2_score(y_test, y_pred)*100
print(round(accuracy, 2))

76.41


In [49]:
importance = model.coef_
for i, j in enumerate(importance):
    print("Feature : %d, Score: %.1f" %(i,j))

Feature : 0, Score: -0.0
Feature : 1, Score: -9068813334.0
Feature : 2, Score: -91645857.3
Feature : 3, Score: 22492888.6
Feature : 4, Score: 9862779.6
Feature : 5, Score: 10928742190986860544.0
Feature : 6, Score: 17445603.8
Feature : 7, Score: 75426993.7
Feature : 8, Score: 48025575.9
Feature : 9, Score: -26047117.8
Feature : 10, Score: -71410053.7
Feature : 11, Score: -127002315.0
Feature : 12, Score: 4591563613.9
Feature : 13, Score: 85989412.7
Feature : 14, Score: -22829970.7
Feature : 15, Score: -10769976.3
Feature : 16, Score: -2506282.5
Feature : 17, Score: -17450775.4
Feature : 18, Score: -76654278.7
Feature : 19, Score: -48466797.2
Feature : 20, Score: 25591639.1
Feature : 21, Score: 71445608.0
Feature : 22, Score: 128837154.2
Feature : 23, Score: 231692.3
Feature : 24, Score: -16780.8
Feature : 25, Score: 1381395.3
Feature : 26, Score: 3788178.0
Feature : 27, Score: 501413.1
Feature : 28, Score: 86919.9
Feature : 29, Score: 1463611.1
Feature : 30, Score: 743159.2
Feature : 3

### Method 2: Ridge Regression

##### Define Model

In [50]:
model = Ridge()

##### Fit the train data

In [51]:
model.fit(X_train_p, y_train)

##### Predict the values

In [52]:
y_pred = model.predict(X_test_p)

##### Accuracy 

In [53]:
accuracy = r2_score(y_test, y_pred)*100
print(round(accuracy, 2))

74.62


### Method 3: Lasso Regression

##### Define Model

In [54]:
model = Lasso()

##### Fit the train data

In [55]:
model.fit(X_train_p, y_train)

##### Predict the Values

In [56]:
y_pred = model.predict(X_test_p)

##### Accuracy

In [57]:
accuracy = r2_score(y_test, y_pred)*100
print(round(accuracy, 2))

75.59


### Method 4: ElasticNet

##### Define model

In [58]:
model = ElasticNet()

##### Fit the train data

In [59]:
model.fit(X_train_p, y_train)

##### Predict the values

In [60]:
y_pred = model.predict(X_test_p)

##### Accuracy

In [61]:
accuracy = r2_score(y_test, y_pred)*100
print(round(accuracy, 2))

50.17


### Method 5 : KNeighbor Regression

##### Define Model

In [62]:
model = KNeighborsRegressor()

##### Fit the Train data

In [63]:
model.fit(X_train_p, y_train)

##### Predict the values

In [64]:
y_pred = model.predict(X_test_p)

##### Accuracy

In [65]:
accuracy = r2_score(y_test, y_pred)*100
print(round(accuracy, 2))

79.71


### Method 6: Support Vector Regression 

##### Define Model

In [66]:
model = SVR()

##### Fit the train data

In [67]:
model.fit(X_train_p, y_train)

##### Predict thevalue

In [68]:
y_pred =model.predict(X_test_p)

##### Accuracy

In [69]:
accuracy = r2_score(y_test, y_pred)*100
print(round(accuracy, 2))

-6.1


### Method 7: Random Forest Regression

##### Define Model

In [70]:
model = RandomForestRegressor()

##### Fit the train data

In [71]:
model.fit(X_train_p, y_train)

##### Predict the values

In [72]:
y_pred = model.predict(X_test_p)

##### Accuracy

In [73]:
accuracy = r2_score(y_test, y_pred)*100
print(round(accuracy, 2))

86.85


### Method 8 : Decision Tree Regression

##### Define Model

In [74]:
model = DecisionTreeRegressor()

##### Fit the train data

In [75]:
model.fit(X_train_p, y_train)

##### Predict the values

In [76]:
y_pred = model.predict(X_test_p)

##### Accuracy

In [77]:
accuracy = r2_score(y_test, y_pred)*100
print(round(accuracy,2))

83.18


### Method 9: Bagging Regression 

##### Define Model

In [78]:
model = BaggingRegressor()

##### Fit the train data

In [79]:
model.fit(X_train_p, y_train)

##### Predict the values

In [80]:
y_pred = model.predict(X_test_p)

##### Accuracy

In [81]:
accuracy = r2_score(y_test, y_pred)*100
print(round(accuracy, 2))

86.46


### Deployment of most accurate model

##### Step 1: Save the model

##### Define the most accurate model

In [82]:
model = RandomForestRegressor()
model.fit(X_train_p, y_train)

filename = "SecondCar_Most_Acc_Reg_Model.sav"
pickle.dump(model, open(file = filename, mode = "wb"))


##### Step 2 : Use the Saved Model

In [83]:
model_load = pickle.load(open(file = filename, mode = "rb"))
y_pred = model_load.predict(X_test_p[15:16, ])      ## write a data preprocess
y_pred

array([550529.99])

In [84]:
X_test.iloc[15:16, ]

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,Rating,company_name,km_range,year_range,ex_range
2736,0.998515,0.148773,0.25,0.5,1.0,0.0,0.6,0.607143,0.0,0.333333,0.333333


In [85]:
data.iloc[2736]

year               2017.0
km_driven        120000.0
fuel                  1.0
seller_type           1.0
transmission          1.0
owner                 0.0
Rating                9.0
selling_price    628000.0
company_name         17.0
km_range              0.0
year_range            1.0
ex_range              1.0
Name: 2736, dtype: float64