In [1]:
import warnings
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 

warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv(r'C:\Users\Nitin Flavier\Desktop\Data Nexus\Data Science\ML_BootCamp\ML_Algos\Random_Forest\Dataset\updated_used_car_price.csv')
df.head()

Unnamed: 0,car_name,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [3]:
X = df.drop('selling_price',axis=1)
y = df['selling_price']

print(X.head())
print(y.head())

        car_name  vehicle_age  km_driven seller_type fuel_type  \
0    Maruti Alto            9     120000  Individual    Petrol   
1  Hyundai Grand            5      20000  Individual    Petrol   
2    Hyundai i20           11      60000  Individual    Petrol   
3    Maruti Alto            9      37000  Individual    Petrol   
4  Ford Ecosport            6      30000      Dealer    Diesel   

  transmission_type  mileage  engine  max_power  seats  
0            Manual    19.70     796      46.30      5  
1            Manual    18.90    1197      82.00      5  
2            Manual    17.00    1197      80.00      5  
3            Manual    20.92     998      67.10      5  
4            Manual    22.77    1498      98.59      5  
0    120000
1    550000
2    215000
3    226000
4    570000
Name: selling_price, dtype: int64


### Feature Encoding

We will do label encoding, 

Ordinal Misrepresentation:      
If the categorical values are not ordinal (no natural order), the numerical assignment might imply a relationship between categories that doesn't exist.  
Example: Encoding ["Dog" -> 0, "Cat" -> 1, "Rabbit" -> 2] implies an ordering, which is incorrect.   

In our case we want a relation between the feature 'model' with selling price which is ordinal

In [5]:
from sklearn.preprocessing import LabelEncoder 

le = LabelEncoder()

X['car_name'] = le.fit_transform(X['car_name'])

In [8]:
X['car_name'].value_counts()

car_name
40     906
77     890
76     781
65     778
27     757
      ... 
83       1
103      1
17       1
31       1
18       1
Name: count, Length: 121, dtype: int64

The parameter drop='first' in OneHotEncoder is used to handle the issue of multicollinearity that arises when all categories of a categorical variable are encoded into binary columns.

Including all n binary columns can introduce multicollinearity (a situation where one column can be perfectly predicted by the others) when used in linear models. Dropping one category prevents this issue while retaining the necessary information.

In [9]:
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_features = X.select_dtypes(exclude="object").columns 
onehot_columns = ['seller_type','fuel_type','transmission_type'] 

num_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformer,onehot_columns),
        ("StandardScaler",num_transformer,num_features)
    ],
    remainder="passthrough"
)

In [10]:
X = preprocessor.fit_transform(X)

In [11]:
from sklearn.model_selection import train_test_split 

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

### Model Training and Selection

In [12]:
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.linear_model import LinearRegression, Ridge, Lasso 
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [16]:
def evaluate_model(true_value,pred_value):
    mae = mean_absolute_error(true_value,pred_value)
    mse = mean_squared_error(true_value,pred_value)
    r2_square = r2_score(true_value,pred_value)
    rmse = np.sqrt(mse)

    return mae,rmse,r2_square

In [18]:
### Begin Model training 
models = {
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Linear Regression": LinearRegression(),
    "K-Neighbours Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor()
}

for key,value in models.items():
    model = value 
    model.fit(X_train,y_train)

    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    # evaluate the model 
    mae_test, rmse_test, r2_test = evaluate_model(y_test,y_test_pred)

    mae_train, rmse_train, r2_train = evaluate_model(y_train,y_train_pred) 

    print("Model: ",key)

    print(" Model Performance on Training Set: ")
    print(" - Root Mean Square Error: {:.4f}".format(rmse_train))
    print(" - Mean Absolute Error: {:.4f}".format(rmse_train))
    print(" - R2_Score: {:.4f}".format(r2_train))
    print()
    print(" Model Performance on Test Set: ")
    print(" - Root Mean Square Error: {:.4f}".format(rmse_test))
    print(" - Mean Absolute Error: {:.4f}".format(rmse_test))
    print(" - R2_Score: {:.4f}".format(r2_test))
    print()

Model:  Lasso
 Model Performance on Training Set: 
 - Root Mean Square Error: 563785.4152
 - Mean Absolute Error: 563785.4152
 - R2_Score: 0.6167

 Model Performance on Test Set: 
 - Root Mean Square Error: 502836.1750
 - Mean Absolute Error: 502836.1750
 - R2_Score: 0.6577

Model:  Ridge
 Model Performance on Training Set: 
 - Root Mean Square Error: 563786.2859
 - Mean Absolute Error: 563786.2859
 - R2_Score: 0.6167

 Model Performance on Test Set: 
 - Root Mean Square Error: 502820.1247
 - Mean Absolute Error: 502820.1247
 - R2_Score: 0.6577

Model:  Linear Regression
 Model Performance on Training Set: 
 - Root Mean Square Error: 563785.4054
 - Mean Absolute Error: 563785.4054
 - R2_Score: 0.6167

 Model Performance on Test Set: 
 - Root Mean Square Error: 502836.9470
 - Mean Absolute Error: 502836.9470
 - R2_Score: 0.6577

Model:  K-Neighbours Regressor
 Model Performance on Training Set: 
 - Root Mean Square Error: 351502.5056
 - Mean Absolute Error: 351502.5056
 - R2_Score: 0.85

### Hyper-Parameter-Tuning

In [19]:
# so we will take KNN and Random-Forest Regressor 

knn_params = {"n_neighbors": [2,3,10,20,40,50]}

rf_params = {
    "max_depth": [5,8,15,10,None],
    "max_features": [5,7,8,"auto"],
    "min_samples_split": [2,8,15,20],
    "n_estimators": [100,200,500,1000]
}

In [20]:
tunning_models = [
    ('KNN',KNeighborsRegressor(),knn_params),
    ('Random Forest', RandomForestRegressor(),rf_params)
]

In [28]:
from sklearn.model_selection import RandomizedSearchCV 

model_param = {}
best_tuned_model = {}

for name,model,params in tunning_models:
    randomcv = RandomizedSearchCV(estimator=model,param_distributions=params,cv=3,verbose=1,refit=True)
    randomcv.fit(X_train,y_train)
    best_tuned_model[name] = randomcv
    model_param[name] = randomcv.best_params_

print(model_param)
print()
for key1,dict in model_param.items():
    print(f"For the model {key1}")
    for k,v in dict.items():
        print(k,v, end="  ")
    print()
    print()

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
{'KNN': {'n_neighbors': 2}, 'Random Forest': {'n_estimators': 500, 'min_samples_split': 2, 'max_features': 8, 'max_depth': 15}}

For the model KNN
n_neighbors 2  

For the model Random Forest
n_estimators 500  min_samples_split 2  max_features 8  max_depth 15  



In [29]:
print("Tunned Model Performance: \n")

for name,tuned_model in best_tuned_model.items():

    y_test_pred = tuned_model.predict(X_test)
    y_train_pred = tuned_model.predict(X_train)

    # evaluate the model 
    mae_test, rmse_test, r2_test = evaluate_model(y_test,y_test_pred)

    mae_train, rmse_train, r2_train = evaluate_model(y_train,y_train_pred) 

    print("Model: ",key)

    print(" Model Performance on Training Set: ")
    print(" - Root Mean Square Error: {:.4f}".format(rmse_train))
    print(" - Mean Absolute Error: {:.4f}".format(rmse_train))
    print(" - R2_Score: {:.4f}".format(r2_train))
    print()
    print(" Model Performance on Test Set: ")
    print(" - Root Mean Square Error: {:.4f}".format(rmse_test))
    print(" - Mean Absolute Error: {:.4f}".format(rmse_test))
    print(" - R2_Score: {:.4f}".format(r2_test))
    print()

Tunned Model Performance: 

Model:  Random Forest Regressor
 Model Performance on Training Set: 
 - Root Mean Square Error: 219323.1679
 - Mean Absolute Error: 219323.1679
 - R2_Score: 0.9420

 Model Performance on Test Set: 
 - Root Mean Square Error: 280835.6928
 - Mean Absolute Error: 280835.6928
 - R2_Score: 0.8932

Model:  Random Forest Regressor
 Model Performance on Training Set: 
 - Root Mean Square Error: 145139.7988
 - Mean Absolute Error: 145139.7988
 - R2_Score: 0.9746

 Model Performance on Test Set: 
 - Root Mean Square Error: 222525.5174
 - Mean Absolute Error: 222525.5174
 - R2_Score: 0.9330

