## Random forest Regression Implementation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv('cardekho_imputated.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,357003.9,465401.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,854082.9,1307926.0,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,357003.9,465401.5,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


# Data Cleaning

In [4]:
## check null values 
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
min_cost_price       0
max_cost_price       0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [5]:
df.drop(columns=['car_name', 'brand'], axis=1, inplace=True)


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Alto,357003.9,465401.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Grand,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,i20,854082.9,1307926.0,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Alto,357003.9,465401.5,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ecosport,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [7]:
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [8]:
## getting all the different type of features 
num_feature=[feature for feature in df.columns if df[feature].dtype!='O']
print('Numerical Featuere:',len(num_feature))
cat_feature=[feature for feature in df.columns if df[feature].dtype=='O']
print('Categorical Featuere:',len(cat_feature))
disc_feature=[feature for feature in df.columns if len(df[feature].unique())<=25]
print('Discrete  Featuere:',len(disc_feature))
con_feature=[feature for feature in df.columns if feature not in disc_feature]
print('Continous  Featuere:',len(con_feature))

Numerical Featuere: 10
Categorical Featuere: 4
Discrete  Featuere: 5
Continous  Featuere: 9


In [9]:
## independent and dependent data
x = df.drop(['selling_price'],axis = 1)
y = df['selling_price']

In [10]:
x.head()

Unnamed: 0.1,Unnamed: 0,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,0,Alto,357003.9,465401.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,1,Grand,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,2,i20,854082.9,1307926.0,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,3,Alto,357003.9,465401.5,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,4,Ecosport,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


# Feature Encoding and Scaling


In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
x['model'] = le.fit_transform(x['model'])


In [12]:
x.head()

Unnamed: 0.1,Unnamed: 0,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,0,7,357003.9,465401.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,1,54,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,2,118,854082.9,1307926.0,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,3,7,357003.9,465401.5,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,4,38,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [13]:
len(df['seller_type'].unique()),len(df['transmission_type'].unique()),len(df['seller_type'].unique()),len(df['fuel_type'].unique())

(3, 2, 3, 5)

In [14]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Get column names safely
num_feature = x.select_dtypes(exclude="object").columns.tolist()
onehot_columns = ['seller_type', 'fuel_type', 'transission_type']

# Ensure onehot_columns exist in x
onehot_columns = [col for col in onehot_columns if col in x.columns]

# Define transformers
numeric_transformation = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

# Create column transformer
preprocessor = ColumnTransformer(
    [
        ("OnehotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformation, num_feature)
    ], 
    remainder='passthrough'
)

# Apply transformation
x_transformed = preprocessor.fit_transform(x)

In [15]:
xx=preprocessor.fit_transform(x)

In [16]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape

((12328, 13), (3083, 13))

In [17]:
x_train

Unnamed: 0.1,Unnamed: 0,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
11210,14238,108,7.124000e+06,7.124000e+06,7,70252,Dealer,Diesel,Automatic,11.20,2400,215.00,5
1347,1731,91,5.530000e+05,7.820000e+05,2,10000,Individual,Petrol,Manual,23.84,1199,84.00,5
10363,13218,17,1.010000e+07,1.010000e+07,2,6000,Dealer,Diesel,Automatic,19.00,1950,241.30,5
316,403,25,1.268000e+06,1.680000e+06,7,63000,Dealer,Petrol,Manual,17.80,1497,117.30,5
10638,13550,117,6.983200e+05,7.293333e+05,10,80292,Dealer,Petrol,Manual,20.36,1197,78.90,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,6581,42,9.267760e+05,1.149968e+06,7,127731,Dealer,Diesel,Manual,20.77,1248,88.80,7
13418,17029,95,1.012000e+06,1.608000e+06,11,59000,Dealer,Petrol,Manual,16.09,1598,103.20,5
5390,6839,100,5.453481e+05,6.917691e+05,7,20000,Individual,Petrol,Manual,20.51,998,67.04,5
860,1104,118,7.630000e+05,1.266000e+06,2,15000,Dealer,Petrol,Manual,18.60,1197,81.86,5


## Model Training and Model seletion

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score,classification_report,precision_score,recall_score,f1_score,confusion_matrix,roc_auc_score,roc_curve,mean_absolute_error,mean_squared_error,r2_score

In [19]:
## Create a Function To Evulate Model
def evualaet_model(true,predicted):
    mae=mean_squared_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_score=r2_score(true,predicted)
    return mae,rmse,r2_score

In [20]:
import pandas as pd

# Get feature names after transformation
feature_names = (
    preprocessor.get_feature_names_out()
    if hasattr(preprocessor, "get_feature_names_out")
    else None
)

# Apply transformation
x_train_transformed = preprocessor.fit_transform(x_train)
x_test_transformed = preprocessor.transform(x_test)

# Convert NumPy array to DataFrame with column names
X_train = pd.DataFrame(x_train_transformed, columns=feature_names)
X_test = pd.DataFrame(x_test_transformed, columns=feature_names)

# Check the data types
print(X_train.dtypes)

OnehotEncoder__seller_type_Individual          object
OnehotEncoder__seller_type_Trustmark Dealer    object
OnehotEncoder__fuel_type_Diesel                object
OnehotEncoder__fuel_type_Electric              object
OnehotEncoder__fuel_type_LPG                   object
OnehotEncoder__fuel_type_Petrol                object
StandardScaler__Unnamed: 0                     object
StandardScaler__model                          object
StandardScaler__min_cost_price                 object
StandardScaler__max_cost_price                 object
StandardScaler__vehicle_age                    object
StandardScaler__km_driven                      object
StandardScaler__mileage                        object
StandardScaler__engine                         object
StandardScaler__max_power                      object
StandardScaler__seats                          object
remainder__transmission_type                   object
dtype: object


In [22]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, r2_score
)

# Define models (separate classification and regression)
models = {
    "Linear Regression": LinearRegression(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Nearest Regressor": KNeighborsRegressor(),
}

# Loop through models
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train model
    model.fit(x_train, y_train)
    
    # Make Predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    print(f"\n{name} Performance:")

    # Classification models
    if name in ["Logistic Regression", "Random Forest", "Decision Tree"]:
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred, average='weighted')
        train_precision = precision_score(y_train, y_train_pred, average='weighted')
        train_recall = recall_score(y_train, y_train_pred, average='weighted')
        train_auc = roc_auc_score(y_train, model.predict_proba(x_train)[:, 1]) if hasattr(model, "predict_proba") else None

        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred, average='weighted')
        test_precision = precision_score(y_test, y_test_pred, average='weighted')
        test_recall = recall_score(y_test, y_test_pred, average='weighted')
        test_auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1]) if hasattr(model, "predict_proba") else None

        print(f"Training Accuracy: {train_accuracy:.4f}")
        print(f"Training F1 Score: {train_f1:.4f}")
        print(f"Training Precision: {train_precision:.4f}")
        print(f"Training Recall: {train_recall:.4f}")
        print(f"Training ROC AUC Score: {train_auc:.4f}" if train_auc else "ROC AUC not available")

        print("\nTest Performance:")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test F1 Score: {test_f1:.4f}")
        print(f"Test Precision: {test_precision:.4f}")
        print(f"Test Recall: {test_recall:.4f}")
        print(f"Test ROC AUC Score: {test_auc:.4f}" if test_auc else "ROC AUC not available")

    # Regression models
    else:
        train_mae = mean_absolute_error(y_train, y_train_pred)
        train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
        train_r2 = r2_score(y_train, y_train_pred)

        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
        test_r2 = r2_score(y_test, y_test_pred)

        print(f"Training MAE: {train_mae:.4f}")
        print(f"Training RMSE: {train_rmse:.4f}")
        print(f"Training R2 Score: {train_r2:.4f}")

        print("\nTest Performance:")
        print(f"Test MAE: {test_mae:.4f}")
        print(f"Test RMSE: {test_rmse:.4f}")
        print(f"Test R2 Score: {test_r2:.4f}")

    print("-" * 50)


Training Linear Regression...


ValueError: could not convert string to float: 'Dealer'