In [1]:
### Using same example from Random Forest (cardekho_imputated.csv)

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import warnings

%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("../31 Random Forest ML/cardekho_imputated.csv",index_col=[0])
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [3]:
df.drop(["car_name","brand"],axis=1,inplace=True)


In [4]:
X=df.drop(["selling_price"],axis=1)
y=df["selling_price"]

In [5]:
from sklearn.preprocessing import LabelEncoder

labelEncoder=LabelEncoder()

X['model']=labelEncoder.fit_transform(X['model'])

In [6]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

cat_columns=['seller_type','transmission_type','fuel_type']
num_columns=X.select_dtypes(exclude='object').columns

oneHotEncoder=OneHotEncoder(drop='first')
standardScaler=StandardScaler()

columnTransformer=ColumnTransformer(
    [
        ("StandardScalar",standardScaler,num_columns),
        ("OneHotEncoder",oneHotEncoder,cat_columns)
    ],remainder='passthrough'
)

In [7]:
X=columnTransformer.fit_transform(X)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022,0.0,0.0,1.0,0.0,0.0,0.0,1.0
15407,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444,0.0,0.0,1.0,0.0,0.0,0.0,1.0
15408,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022,0.0,0.0,1.0,1.0,0.0,0.0,0.0
15409,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor


all_models = [
    ("Logistic Regression",LogisticRegression()),
    ("Ridge Regression",Ridge()),
    ("Lasso Regression",Lasso()),
    ("KNeighborsRegressor",KNeighborsRegressor()),
    ("DecisionTreeRegressor",DecisionTreeRegressor()),
    ("RandomForestRegressor",RandomForestRegressor()),
    ("AdaBoostRegressor",AdaBoostRegressor()),
    ("GradientBoostingRegressor", GradientBoostingRegressor()),
    ("XGBoost Regressor",XGBRegressor())
]


def print_metrics(model_name,y_test,y_pred,data_type=0):
    from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
    print(model_name)
    if(data_type==0):
        print("Metrics for Training Data")
    else:
        print("Metrics for Test Data")
    print("----------------------------")
    print(f"Mean Absolute Error is {mean_absolute_error(y_test,y_pred)}")
    print(f"R2 Score is {r2_score(y_test,y_pred)}")
    print(f"Root Mean Squared Error is {np.sqrt(mean_squared_error(y_test,y_pred))}")


def train_model(models):
    for model in models:
        model_name=model[0]
        model=model[1]
        ### For training data
        model.fit(X_train, y_train)
        y_train_pred=model.predict(X_train)
        print_metrics(model_name,y_train,y_train_pred)
        print("="*18)
        ### For test data
        y_test_pred=model.predict(X_test)
        print_metrics(model_name,y_test,y_test_pred,1)
        print("="*36)
        print("\n")
train_model(all_models)

Logistic Regression
Metrics for Training Data
----------------------------
Mean Absolute Error is 192239.29266709927
R2 Score is 0.7500661157759044
Root Mean Squared Error is 450227.8026759354
Logistic Regression
Metrics for Test Data
----------------------------
Mean Absolute Error is 210308.50632500811
R2 Score is 0.6959967017317936
Root Mean Squared Error is 478380.75544814864


Ridge Regression
Metrics for Training Data
----------------------------
Mean Absolute Error is 268059.80146883114
R2 Score is 0.6217710706848424
Root Mean Squared Error is 553856.3159709624
Ridge Regression
Metrics for Test Data
----------------------------
Mean Absolute Error is 279557.2168930275
R2 Score is 0.6645239743566809
Root Mean Squared Error is 502533.8229890289


Lasso Regression
Metrics for Training Data
----------------------------
Mean Absolute Error is 268099.2220102348
R2 Score is 0.6217719516486739
Root Mean Squared Error is 553855.6709546396
Lasso Regression
Metrics for Test Data
----------

In [12]:
## Hyperparametric tuning

from sklearn.model_selection import RandomizedSearchCV

best_parms={}

rf_params = {
    "n_estimators":[100,200,500,1000],
    "max_depth":[5,7,8,None,10,15],
    "min_samples_split":[2,8,15,20],
    "max_features":[5,7,"auto",8]
}

xgb_params = {
    "n_estimators":[100,200,300],
    "colsample_bytree":[0.5,0.8,1,0.3,0.4],
    "max_depth":[5,8,12,30,20],
    "learning_rate":[0.1,0.01]
}

tuned_models = [
    ("RFR",RandomForestRegressor(),rf_params),
    ("XGBR",XGBRegressor(),xgb_params)
]

for model_name,model,params in tuned_models:
    random=RandomizedSearchCV(model,param_distributions=params,cv=3,verbose=2,n_iter=100,n_jobs=-1)
    print(f"-------------------{model_name}-------------------")
    random.fit(X_train,y_train)
    best_parms[model_name]=random.best_params_

for param in best_parms:
    print(f"Best params for {param}")
    print(best_parms[param])

-------------------RFR-------------------
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------XGBR-------------------
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best params for RFR
{'n_estimators': 200, 'min_samples_split': 2, 'max_features': 5, 'max_depth': None}
Best params for XGBR
{'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


In [13]:
ht_models = [
    ("RandomForestRegressor",RandomForestRegressor(n_estimators=200,max_depth=None,min_samples_split=2,max_features=5)),
    ("XGBoostingRegressor",XGBRegressor(colsample_bytree=0.8,n_estimators=300,learning_rate=0.1,max_depth=5))
]

train_model(ht_models)

RandomForestRegressor
Metrics for Training Data
----------------------------
Mean Absolute Error is 39074.56463319672
R2 Score is 0.9800864903816776
Root Mean Squared Error is 127084.81031000995
RandomForestRegressor
Metrics for Test Data
----------------------------
Mean Absolute Error is 98604.34929842026
R2 Score is 0.9395434081163494
Root Mean Squared Error is 213332.18576831563


XGBoostingRegressor
Metrics for Training Data
----------------------------
Mean Absolute Error is 71974.1484375
R2 Score is 0.9853243231773376
Root Mean Squared Error is 109098.51753346606
XGBoostingRegressor
Metrics for Test Data
----------------------------
Mean Absolute Error is 100347.0078125
R2 Score is 0.8948622345924377
Root Mean Squared Error is 281328.6401630662


