In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np

In [2]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt

In [3]:
data = pd.read_csv("cars24.csv")
data

Unnamed: 0,year,name,model,transmission,km,ownership,fuel,registeration,price
0,2015,Maruti,Maruti Celerio VXI AMT,Automatic,7816,1st,Petrol,GJ,405199
1,2021,Hyundai,Hyundai Creta EX MT,Manual,3002,2nd,Petrol,GJ,1204199
2,2017,Datsun,Datsun Redi Go T (O),Manual,99957,1st,Petrol,GJ,223099
3,2017,Hyundai,Hyundai Elite i20 Magna Executive 1.2,Manual,43773,2nd,Petrol,GJ,515799
4,2014,Toyota,Toyota Innova 2.5 GX 8 STR BS IV,Manual,41158,2nd,Diesel,GJ,711199
...,...,...,...,...,...,...,...,...,...
4632,2018,Tata,Tata NEXON XZA+ 1.5,Automatic,80038,1st,Diesel,MH,803299
4633,2017,Maruti,Maruti S Cross ZETA SHVS,Manual,60034,1st,Diesel,MH,816299
4634,2018,Tata,Tata NEXON XZA + 1.2 PETROL A/T,Automatic,49765,1st,Petrol,MH,801899
4635,2021,Honda,Honda Jazz 1.2 ZX MT,Manual,6980,1st,Petrol,MH,982099


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4637 entries, 0 to 4636
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           4637 non-null   int64 
 1   name           4637 non-null   object
 2   model          4637 non-null   object
 3   transmission   4637 non-null   object
 4   km             4637 non-null   int64 
 5   ownership      4637 non-null   object
 6   fuel           4637 non-null   object
 7   registeration  4637 non-null   object
 8   price          4637 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 326.2+ KB


In [5]:
X = data.iloc[:,0:8]
y = data.iloc[:,8:]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
X_train

Unnamed: 0,year,name,model,transmission,km,ownership,fuel,registeration
4625,2015,Maruti,Maruti Wagon R 1.0 LXI CNG,Manual,73610,1st,Petrol + CNG,MH
4591,2019,Hyundai,Hyundai Elite i20 1.2 SPORTS PLUS VTVT,Manual,32623,1st,Petrol,MH
3679,2020,Maruti,Maruti S PRESSO VXI,Manual,29984,1st,Petrol,UP
178,2018,Volkswagen,Volkswagen Vento HIGHLINE TDI AT,Automatic,53682,1st,Diesel,GJ
4333,2019,Maruti,Maruti Celerio VXI CNG OPT,Manual,46053,1st,Petrol + CNG,MH
...,...,...,...,...,...,...,...,...
4426,2017,Maruti,Maruti Dzire VXI,Manual,32288,1st,Petrol,MH
466,2018,Honda,Honda Amaze 1.2 V CVT I VTEC,Automatic,10392,1st,Petrol,KA
3092,2021,Maruti,Maruti Alto VXI,Manual,11574,1st,Petrol,TS
3772,2017,Hyundai,Hyundai Verna FLUIDIC 1.6 SX VTVT,Manual,43085,2nd,Petrol,MH


In [8]:
ohe = OneHotEncoder(drop = "first")
ohe_values = ohe.fit_transform(X[["name","model","transmission","ownership","fuel","registeration"]])

In [9]:
column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_),["name","model","transmission","ownership","fuel","registeration"]),
                                      remainder = 'passthrough')

In [10]:
column_trans

# Models

## Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
lr = LinearRegression()

In [13]:
linear_regression_pipe = make_pipeline(column_trans,lr)

In [14]:
linear_regression_pipe.fit(X_train,y_train)

In [15]:
y_pred_linear = linear_regression_pipe.predict(X_test)
y_pred_linear

array([[1612008.30027508],
       [ 473473.3324456 ],
       [ 673736.44406937],
       ...,
       [ 545433.03991622],
       [1558713.99973892],
       [1391036.7331068 ]])

In [16]:
r2 = r2_score(y_test,y_pred_linear)
r2

0.9021687413245839

In [17]:
mse = mean_squared_error(y_test,y_pred_linear)
mse

11182006982.399065

In [18]:
rmse = sqrt(mse)
rmse

105745.00925527912

In [19]:
models = pd.DataFrame()
tempResults = pd.DataFrame({'Method':['Linear Regression'], 'r2': [r2],'mse': [mse], 'rmse':rmse })
models = pd.concat([models, tempResults])
models = models[['Method', 'r2', 'mse','rmse']]
models

Unnamed: 0,Method,r2,mse,rmse
0,Linear Regression,0.902169,11182010000.0,105745.009255


## Decision Tree

In [20]:
from sklearn.tree import DecisionTreeRegressor 

In [21]:
dt = DecisionTreeRegressor(random_state = 42)

In [22]:
dt_pipe = make_pipeline(column_trans,dt)

In [23]:
dt_pipe.fit(X_train,y_train)

In [24]:
y_pred_dt = dt_pipe.predict(X_test)
y_pred_dt

array([1596899.,  674399.,  687199., ...,  463899., 1365299.,  972099.])

In [25]:
r2 = r2_score(y_test,y_pred_dt)
r2

0.7466913486680167

In [26]:
mse = mean_squared_error(y_test,y_pred_dt)
mse

28952904687.590466

In [27]:
rmse = sqrt(mse)
rmse

170155.53087569756

In [28]:
tempResults = pd.DataFrame({'Method':['Decision Tree'], 'r2': [r2],'mse': [mse], 'rmse':rmse })
models = pd.concat([models, tempResults])
models = models[['Method', 'r2', 'mse','rmse']]
models

Unnamed: 0,Method,r2,mse,rmse
0,Linear Regression,0.902169,11182010000.0,105745.009255
0,Decision Tree,0.746691,28952900000.0,170155.530876


## Random Forest

In [29]:
from sklearn.ensemble import RandomForestRegressor

In [30]:
rf = RandomForestRegressor(n_estimators = 100, random_state = 30)

In [31]:
rf_pipe = make_pipeline(column_trans,rf)

In [32]:
rf_pipe.fit(X_train,y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [33]:
y_pred_rf = rf_pipe.predict(X_test)
y_pred_rf

array([1169132. ,  509432.5,  677557. , ...,  474594. , 1331056. ,
       1591431. ])

In [34]:
r2 = r2_score(y_test,y_pred_rf)
r2

0.8446904988917864

In [35]:
mse = mean_squared_error(y_test,y_pred_rf)
mse

17751707882.92605

In [36]:
rmse = sqrt(mse)
rmse

133235.53536097662

In [37]:
tempResults = pd.DataFrame({'Method':['Random Forest'], 'r2': [r2],'mse': [mse], 'rmse':rmse })
models = pd.concat([models, tempResults])
models = models[['Method', 'r2', 'mse','rmse']]
models

Unnamed: 0,Method,r2,mse,rmse
0,Linear Regression,0.902169,11182010000.0,105745.009255
0,Decision Tree,0.746691,28952900000.0,170155.530876
0,Random Forest,0.84469,17751710000.0,133235.535361


## KNN

In [38]:
from sklearn.neighbors import KNeighborsRegressor

In [39]:
knn = KNeighborsRegressor()

In [40]:
knn_pipe = make_pipeline(column_trans,knn)

In [41]:
knn_pipe.fit(X_train,y_train)

In [42]:
y_pred_knn = knn_pipe.predict(X_test)
y_pred_knn

array([[523779.],
       [612819.],
       [950049.],
       ...,
       [811409.],
       [845859.],
       [525679.]])

In [43]:
r2 = r2_score(y_test,y_pred_knn)
r2

-0.15758429156837295

In [44]:
mse = mean_squared_error(y_test,y_pred_knn)
mse

132310631655.86913

In [45]:
rmse = sqrt(mse)
rmse

363745.2840324794

In [46]:
tempResults = pd.DataFrame({'Method':['KNN'], 'r2': [r2],'mse': [mse], 'rmse':rmse })
models = pd.concat([models, tempResults])
models = models[['Method', 'r2', 'mse','rmse']]
models

Unnamed: 0,Method,r2,mse,rmse
0,Linear Regression,0.902169,11182010000.0,105745.009255
0,Decision Tree,0.746691,28952900000.0,170155.530876
0,Random Forest,0.84469,17751710000.0,133235.535361
0,KNN,-0.157584,132310600000.0,363745.284032


## XGBoost

In [47]:
import xgboost
from xgboost import XGBRegressor

In [48]:
xgb = XGBRegressor()

In [49]:
xgb_pipe = make_pipeline(column_trans,xgb)

In [50]:
xgb_pipe.fit(X_train,y_train)

In [51]:
y_pred_xgb = xgb_pipe.predict(X_test)
y_pred_xgb

array([1330914.8 ,  546683.9 ,  634853.2 , ...,  485188.53, 1440137.  ,
       1671786.4 ], dtype=float32)

In [52]:
r2 = r2_score(y_test,y_pred_xgb)
r2

0.8372139417821917

In [53]:
mse = mean_squared_error(y_test,y_pred_xgb)
mse

18606270268.5657

In [54]:
rmse = sqrt(mse)
rmse

136404.80295270288

In [55]:
tempResults = pd.DataFrame({'Method':['XGBoost'], 'r2': [r2],'mse': [mse], 'rmse':rmse })
models = pd.concat([models, tempResults])
models = models[['Method', 'r2', 'mse','rmse']]
models

Unnamed: 0,Method,r2,mse,rmse
0,Linear Regression,0.902169,11182010000.0,105745.009255
0,Decision Tree,0.746691,28952900000.0,170155.530876
0,Random Forest,0.84469,17751710000.0,133235.535361
0,KNN,-0.157584,132310600000.0,363745.284032
0,XGBoost,0.837214,18606270000.0,136404.802953


## LGBM

In [56]:
import lightgbm as lgb
from lightgbm import LGBMRegressor

In [57]:
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1','l2'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,  
    "max_bin": 512,
    "num_iterations": 100000,
    "force_col_wise": True
}

In [58]:
gbm = lgb.LGBMRegressor(**hyper_params)

In [59]:
lgbm_pipe = make_pipeline(column_trans,gbm)

In [60]:
lgbm_pipe.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [61]:
y_pred_lgbm = lgbm_pipe.predict(X_test)
y_pred_xgb

array([1330914.8 ,  546683.9 ,  634853.2 , ...,  485188.53, 1440137.  ,
       1671786.4 ], dtype=float32)

In [62]:
r2 = r2_score(y_test,y_pred_lgbm)
r2

0.6332365661251458

In [63]:
mse = mean_squared_error(y_test,y_pred_lgbm)
mse

41920663538.47141

In [64]:
rmse = sqrt(mse)
rmse

204745.36267879527

In [65]:
tempResults = pd.DataFrame({'Method':['Lightbgm'], 'r2': [r2],'mse': [mse], 'rmse':rmse })
models = pd.concat([models, tempResults])
models = models[['Method', 'r2', 'mse','rmse']]
models

Unnamed: 0,Method,r2,mse,rmse
0,Linear Regression,0.902169,11182010000.0,105745.009255
0,Decision Tree,0.746691,28952900000.0,170155.530876
0,Random Forest,0.84469,17751710000.0,133235.535361
0,KNN,-0.157584,132310600000.0,363745.284032
0,XGBoost,0.837214,18606270000.0,136404.802953
0,Lightbgm,0.633237,41920660000.0,204745.362679


## SVR

In [66]:
from sklearn.svm import SVR

In [67]:
svr = SVR(kernel='rbf')

In [68]:
svr_pipe = make_pipeline(column_trans,svr)

In [69]:
svr_pipe.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [70]:
y_pred_svr = svr_pipe.predict(X_test)
y_pred_svr

array([575252.70839781, 575265.14243682, 575386.46977212, ...,
       575368.44690495, 575246.91894682, 575344.79529517])

In [71]:
r2 = r2_score(y_test,y_pred_svr)
r2

-0.07663346702247664

In [72]:
mse = mean_squared_error(y_test,y_pred_svr)
mse

123058040024.5336

In [73]:
rmse = sqrt(mse)
rmse

350796.29420011496

In [74]:
tempResults = pd.DataFrame({'Method':['Support Vector Regressor'], 'r2': [r2],'mse': [mse], 'rmse':rmse })
models = pd.concat([models, tempResults])
models = models[['Method', 'r2', 'mse','rmse']]
models

Unnamed: 0,Method,r2,mse,rmse
0,Linear Regression,0.902169,11182010000.0,105745.009255
0,Decision Tree,0.746691,28952900000.0,170155.530876
0,Random Forest,0.84469,17751710000.0,133235.535361
0,KNN,-0.157584,132310600000.0,363745.284032
0,XGBoost,0.837214,18606270000.0,136404.802953
0,Lightbgm,0.633237,41920660000.0,204745.362679
0,Support Vector Regressor,-0.076633,123058000000.0,350796.2942


## Ridge

In [75]:
from sklearn.linear_model import Ridge

In [76]:
ridgeR = Ridge(alpha= 2)

In [77]:
ridgeR_pipe = make_pipeline(column_trans,ridgeR)

In [78]:
ridgeR_pipe.fit(X_train,y_train)

In [79]:
y_pred_ridgeR = svr_pipe.predict(X_test)
y_pred_ridgeR

array([575252.70839781, 575265.14243682, 575386.46977212, ...,
       575368.44690495, 575246.91894682, 575344.79529517])

In [80]:
r2 = r2_score(y_test,y_pred_ridgeR)
r2

-0.07663346702247664

In [81]:
mse = mean_squared_error(y_test,y_pred_ridgeR)
mse

123058040024.5336

In [82]:
rmse = sqrt(mse)
rmse

350796.29420011496

In [83]:
tempResults = pd.DataFrame({'Method':['Ridge'], 'r2': [r2],'mse': [mse], 'rmse':rmse })
models = pd.concat([models, tempResults])
models = models[['Method', 'r2', 'mse','rmse']]
models

Unnamed: 0,Method,r2,mse,rmse
0,Linear Regression,0.902169,11182010000.0,105745.009255
0,Decision Tree,0.746691,28952900000.0,170155.530876
0,Random Forest,0.84469,17751710000.0,133235.535361
0,KNN,-0.157584,132310600000.0,363745.284032
0,XGBoost,0.837214,18606270000.0,136404.802953
0,Lightbgm,0.633237,41920660000.0,204745.362679
0,Support Vector Regressor,-0.076633,123058000000.0,350796.2942
0,Ridge,-0.076633,123058000000.0,350796.2942


## Lasso

In [84]:
from sklearn.linear_model import Lasso

In [85]:
lasso = Lasso(alpha= 0.5)

In [86]:
lasso_pipe = make_pipeline(column_trans,lasso)

In [87]:
lasso_pipe.fit(X_train,y_train)

  model = cd_fast.sparse_enet_coordinate_descent(


In [88]:
y_pred_lasso = svr_pipe.predict(X_test)
y_pred_lasso

array([575252.70839781, 575265.14243682, 575386.46977212, ...,
       575368.44690495, 575246.91894682, 575344.79529517])

In [89]:
r2 = r2_score(y_test,y_pred_lasso)
r2

-0.07663346702247664

In [90]:
mse = mean_squared_error(y_test,y_pred_lasso)
mse

123058040024.5336

In [91]:
rmse = sqrt(mse)
rmse

350796.29420011496

In [92]:
tempResults = pd.DataFrame({'Method':['Lasso'], 'r2': [r2],'mse': [mse], 'rmse':rmse })
models = pd.concat([models, tempResults])
models = models[['Method', 'r2', 'mse','rmse']]
models

Unnamed: 0,Method,r2,mse,rmse
0,Linear Regression,0.902169,11182010000.0,105745.009255
0,Decision Tree,0.746691,28952900000.0,170155.530876
0,Random Forest,0.84469,17751710000.0,133235.535361
0,KNN,-0.157584,132310600000.0,363745.284032
0,XGBoost,0.837214,18606270000.0,136404.802953
0,Lightbgm,0.633237,41920660000.0,204745.362679
0,Support Vector Regressor,-0.076633,123058000000.0,350796.2942
0,Ridge,-0.076633,123058000000.0,350796.2942
0,Lasso,-0.076633,123058000000.0,350796.2942


Proceeding with Linear Regression for model building.

## Importing Model

In [93]:
import pickle

In [94]:
pickle.dump(linear_regression_pipe,open("modellinear.pkl","wb"))