In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# Load the imputed CarDekho listings; the first CSV column becomes the row index.
df = pd.read_csv(
    'cardekho_imputated.csv',
    index_col=[0],
)

# Preview the first few rows.
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


## Data cleaning

In [3]:
# Number of missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [4]:
# Remove the two columns that are not used as features further down.
# `df` is rebound to the result instead of being mutated in place, and
# errors='ignore' makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op rather than a KeyError.
df = df.drop(columns=['car_name', 'brand'], errors='ignore')

In [5]:
# Display the frame after car_name and brand were dropped
# (Jupyter shows only the head and tail rows of a long frame).
df

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.70,796,46.30,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.90,1197,82.00,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.00,1197,80.00,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.10,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000
...,...,...,...,...,...,...,...,...,...,...,...
19537,i10,9,10723,Dealer,Petrol,Manual,19.81,1086,68.05,5,250000
19540,Ertiga,2,18000,Dealer,Petrol,Manual,17.50,1373,91.10,7,925000
19541,Rapid,6,67000,Dealer,Diesel,Manual,21.14,1498,103.52,5,425000
19542,XUV500,5,3800000,Dealer,Diesel,Manual,16.00,2179,140.00,7,1225000


In [6]:
# List the distinct values found in the `model` column.
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [7]:
# Separate the target (`selling_price`) from the predictor columns.
from sklearn.model_selection import train_test_split

y = df['selling_price']
X = df.drop(columns=['selling_price'])

## Feature encoding and scaling

In [8]:
from sklearn.preprocessing import LabelEncoder

# Replace the car-model strings with integer codes. The fitted encoder stays
# available as `le` so the codes can be mapped back to names.
le = LabelEncoder()
X = X.assign(model=le.fit_transform(X['model']))

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Build a ColumnTransformer with two transformers:
#   * one-hot encoding (first level dropped) for the categorical columns
#   * standard scaling for every non-object column
# NOTE(review): `model` was label-encoded to integers just before this cell,
# so it is picked up by `num_features` and scaled like a numeric quantity —
# confirm that treating the codes as ordinal is intended.
onehot_columns = ['seller_type', 'fuel_type', 'transmission_type']
num_features = X.select_dtypes(exclude="object").columns

oh_transformer = OneHotEncoder(drop='first')
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features),
    ],
    remainder='passthrough',  # any column not listed above is kept unchanged
)

In [10]:
# Fit the preprocessor on the whole feature frame and replace X with the
# transformed matrix.
# NOTE(review): this happens before the train/test split, so the encoder and
# scaler statistics are learned from rows that later end up in the test set.
# Because X is overwritten, this cell also cannot be re-run on its own; X has
# to be rebuilt from `df` first. TODO confirm both are acceptable.
X=preprocessor.fit_transform(X)

In [11]:
# Wrap the transformed feature matrix in a DataFrame to inspect it.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split stable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [13]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.4/124.9 MB 13.9 MB/s eta 0:00:09
   ---------------------------------------- 0.5/124.9 MB 10.7 MB/s eta 0:00:12
   ---------------------------------------- 0.5/124.9 MB 10.7 MB/s eta 0:00:12
   ---------------------------------------- 1.2/124.9 MB 6.1 MB/s eta 0:00:21
   ---------------------------------------- 1.5/124.9 MB 5.6 MB/s eta 0:00:23
    --------------------------------------- 1.7/124.9 MB 5.2 MB/s eta 0:00:24
    --------------------------------------- 2.0/124.9 MB 5.1 MB/s eta 0:00:25
    --------------------------------------- 2.2/124.9 MB 4.9 MB/s eta 0:00:26
    --------------------------------------- 2.5/124.9 MB 4.8 MB/s eta 0:00:26
    --------------------------------------- 2.8/124.9 MB 4.7 MB/s eta 0:00:2

## Model training

In [14]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [15]:


# Baseline comparison: fit each regressor with default hyper-parameters and
# report R2, MSE, MAE and RMSE on the training and the testing split.
# Estimators with a random component get a fixed seed (the same 42 used for
# the train/test split) so that re-running the cell reproduces the numbers.
models = {
    "linear-regression": LinearRegression(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "decision tree": DecisionTreeRegressor(random_state=42),
    "random forest": RandomForestRegressor(random_state=42),
    'Adaboost': AdaBoostRegressor(random_state=42),
    'gradientboost': GradientBoostingRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

# Walk the (name, estimator) pairs directly instead of rebuilding
# list(models.values()) / list(models.keys()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions on both splits; a large train/test gap indicates over-fitting.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training-set metrics.
    r2_Score_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    rmse_train = np.sqrt(mse_train)

    # Testing-set metrics.
    r2_Score_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)

    print(model_name)

    print("Model Performance for the training dataset")
    print(f"- r2_score: {r2_Score_train:.2f}")
    print(f"- mean_squared_error: {mse_train:.2f}")
    print(f"- mean_absolute_error: {mae_train:.2f}")
    print(f"- rmse: {rmse_train:.2f}")

    print("Model Performance for the testing dataset")
    print(f"- r2_score: {r2_Score_test:.2f}")
    print(f"- mean_squared_error: {mse_test:.2f}")
    print(f"- mean_absolute_error: {mae_test:.2f}")
    print(f"- rmse: {rmse_test:.2f}")

    print('=' * 35)
    print('\n')


linear-regression
Model Performance for the training dataset
- r2_score: 0.62
- mean_squared_error: 317614523214.21
- mean_absolute_error: 269089.26
- rmse: 563573.00
Model Performance for the testing dataset
- r2_score: 0.66
- mean_squared_error: 253401740454.38
- mean_absolute_error: 281003.41
- rmse: 503390.25


ridge
Model Performance for the training dataset
- r2_score: 0.62
- mean_squared_error: 317615483996.88
- mean_absolute_error: 269042.57
- rmse: 563573.85
Model Performance for the testing dataset
- r2_score: 0.66
- mean_squared_error: 253383293702.41
- mean_absolute_error: 280947.91
- rmse: 503371.92


lasso
Model Performance for the training dataset
- r2_score: 0.62
- mean_squared_error: 317614534311.28
- mean_absolute_error: 269088.21
- rmse: 563573.01
Model Performance for the testing dataset
- r2_score: 0.66
- mean_squared_error: 253400903429.16
- mean_absolute_error: 281002.73
- rmse: 503389.42


decision tree
Model Performance for the training dataset
- r2_score: 1.00

## Hyperparameter tuning

In [23]:
# Candidate hyper-parameter values sampled by the randomised search.
# Every key must be a constructor argument of the matching estimator.

# KNeighborsRegressor
knn_params = {'n_neighbors': [2, 3, 10, 20, 40, 50]}

# RandomForestRegressor
# 'auto' is no longer accepted for max_features in current scikit-learn; for a
# regressor it meant "use all features", which the fraction 1.0 expresses.
rf_params = {'max_depth': [5, 8, 15, None, 10],
             'max_features': [5, 7, 1.0, 8],
             'min_samples_split': [2, 8, 15, 20],
             'n_estimators': [100, 200, 500, 1000]}

# AdaBoostRegressor
adaboost_params = {
    'n_estimators': [50, 60, 70, 80],
    'loss': ['linear', 'square', 'exponential']
}

# GradientBoostingRegressor
# The old 'mse' criterion was an alias of 'squared_error' and has been removed
# from scikit-learn, so only the two valid spellings remain.
gradient_params = {'loss': ['absolute_error', 'squared_error', 'huber', 'quantile'],
                   'criterion': ['friedman_mse', 'squared_error'],
                   'min_samples_split': [2, 8, 15, 20],
                   'n_estimators': [100, 200, 500],
                   'max_depth': [5, 8, 15, None, 10],
                   'learning_rate': [0.1, 0.01, 0.02, 0.03]}

# XGBRegressor
# The key has to be "learning_rate": with a space it is not a parameter the
# estimator knows, so the learning rate would never actually be tuned.
XGB_Params = {"learning_rate": [0.1, 0.01],
              "max_depth": [5, 8, 12, 20, 30],
              "n_estimators": [100, 200, 300],
              "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]
}


In [24]:
# (label, estimator, search space) triples consumed by the randomised search.
# The KNN, random-forest and AdaBoost entries are currently switched off.
randomcv_models = [
    # ('knn', KNeighborsRegressor(), knn_params),
    # ('RF', RandomForestRegressor(), rf_params),
    # ('Adaboost', AdaBoostRegressor(), adaboost_params),
    ('Gradient', GradientBoostingRegressor(), gradient_params),
    ('XGB', XGBRegressor(), XGB_Params),
]

In [25]:
from sklearn.model_selection import RandomizedSearchCV

# Run a randomised hyper-parameter search for every (name, estimator, grid)
# entry and remember the best setting per model in `model_param`.
# NOTE(review): 100 candidates x 3 folds per model, with up to 500 trees and
# unlimited depth in the grid, is expensive; lower n_iter if the search has
# to finish interactively.
model_param = {}
for name, model, params in randomcv_models:
    # Named `search` rather than `random`, which is the name of a stdlib module.
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1,
        random_state=42,  # the same candidates are sampled on every run
    )
    search.fit(X_train, y_train)
    model_param[name] = search.best_params_

for model_name in model_param:
    print(f"--------------------best params for{model_name}-----------------------")
    print(model_param[model_name])



Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [57]:


# Re-run the comparison with the two boosted models configured from the
# hyper-parameters found by the randomised search.
# The constructor calls used to contain empty keyword values
# (`loss=,criterion=,...`), which is a SyntaxError, and `n_samples`, which the
# search grid never tunes. The settings are now taken from `model_param`;
# a model whose search did not finish falls back to its defaults.
# Spaces in parameter names are replaced by underscores so that every key is
# a valid keyword argument.
tuned_params = {
    label: {key.replace(" ", "_"): value for key, value in best.items()}
    for label, best in model_param.items()
}

# Estimators with a random component get a fixed seed (the same 42 used for
# the train/test split) so that re-running the cell reproduces the numbers.
models = {
    "linear-regression": LinearRegression(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "KNN": KNeighborsRegressor(),
    "random forest": RandomForestRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'gradientboost': GradientBoostingRegressor(random_state=42, **tuned_params.get('Gradient', {})),
    'extremegradientboosting': XGBRegressor(random_state=42, **tuned_params.get('XGB', {}))
}

# Walk the (name, estimator) pairs directly instead of rebuilding
# list(models.values()) / list(models.keys()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions on both splits; a large train/test gap indicates over-fitting.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training-set metrics.
    r2_Score_train = r2_score(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    rmse_train = np.sqrt(mse_train)

    # Testing-set metrics.
    r2_Score_test = r2_score(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)

    print(model_name)

    print("Model Performance for the training dataset")
    print(f"- r2_score: {r2_Score_train:.2f}")
    print(f"- mean_squared_error: {mse_train:.2f}")
    print(f"- mean_absolute_error: {mae_train:.2f}")
    print(f"- rmse: {rmse_train:.2f}")

    print("Model Performance for the testing dataset")
    print(f"- r2_score: {r2_Score_test:.2f}")
    print(f"- mean_squared_error: {mse_test:.2f}")
    print(f"- mean_absolute_error: {mae_test:.2f}")
    print(f"- rmse: {rmse_test:.2f}")

    print('=' * 35)
    print('\n')


linear-regression
Model Performance for the training dataset
- r2_score: 0.62
- mean_squared_error: 317614523214.21
- mean_absolute_error: 269089.26
- rmse: 563573.00
Model Performance for the testing dataset
- r2_score: 0.66
- mean_squared_error: 253401740454.38
- mean_absolute_error: 281003.41
- rmse: 503390.25


ridge
Model Performance for the training dataset
- r2_score: 0.62
- mean_squared_error: 317615483996.88
- mean_absolute_error: 269042.57
- rmse: 563573.85
Model Performance for the testing dataset
- r2_score: 0.66
- mean_squared_error: 253383293702.41
- mean_absolute_error: 280947.91
- rmse: 503371.92


lasso
Model Performance for the training dataset
- r2_score: 0.62
- mean_squared_error: 317614534311.28
- mean_absolute_error: 269088.21
- rmse: 563573.01
Model Performance for the testing dataset
- r2_score: 0.66
- mean_squared_error: 253400903429.16
- mean_absolute_error: 281002.73
- rmse: 503389.42


KNN
Model Performance for the training dataset
- r2_score: 0.94
- mean_sq