### Import Packages

In [2]:
import pandas as pd
import os
import ast
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import pickle


### Merge All the data

In [3]:
# Directory holding one workbook per city, named "<city>_cars.xlsx".
# os.path.join(..., "") appends the platform separator, so the value is still
# "Dataset\\" on Windows but also resolves on Linux/macOS.
bath = os.path.join("Dataset", "")
file_names = ['bangalore_cars.xlsx', 'chennai_cars.xlsx', 'delhi_cars.xlsx',
              'hyderabad_cars.xlsx', 'jaipur_cars.xlsx', 'kolkata_cars.xlsx']
dfs = []

for file in file_names:
    df = pd.read_excel(f"{bath}{file}")
    # The city is the part of the file name before the first underscore.
    state = file.split('_')[0]
    df['state'] = state

    dfs.append(df)

# ignore_index=True gives the merged frame a fresh 0..n-1 index.
merged_df = pd.concat(dfs, ignore_index=True)

# Uncomment to persist the merged frame to disk:
# merged_df.to_excel('merged_cars_data.xlsx', index=False)

# Nothing is written to disk while the export above is commented out, so the
# message reports what actually happened instead of claiming a file was saved.
print(f"Files merged successfully: {len(file_names)} files, {len(merged_df)} rows.")

Files merged successfully. Output saved as 'merged_cars_data.xlsx'


### Functions to Extract the Columns

In [4]:
def convert_to_dict(row):
    """Parse a Python-literal string (e.g. a dict repr) into the object it describes.

    Args:
        row: Cell value to parse, normally a string holding a Python literal.

    Returns:
        The parsed object, or None when `row` cannot be parsed (malformed text,
        non-literal content, or a non-string value such as NaN/None).
    """
    try:
        return ast.literal_eval(row)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # ast.literal_eval documents all of these for malformed input; catching
        # only ValueError let a single truncated cell abort the whole .apply().
        return None
    

def first(car_data):
    """Expand an iterable of dict rows into a DataFrame with one column per key.

    Each input row becomes exactly one output row, so the result stays
    positionally aligned with `car_data`. Rows that are not dicts (for example
    None from a failed parse) become all-NaN rows, and keys missing from a row
    are filled with NaN instead of producing columns of unequal length.

    Args:
        car_data: Iterable of dicts (or None) such as a Series of parsed cells.

    Returns:
        pandas.DataFrame whose columns are the keys in first-seen order.
    """
    records = [row if isinstance(row, dict) else {} for row in car_data]
    return pd.DataFrame(records)

def top(car_data):
    """Flatten the 'top' list of each row into one DataFrame row.

    Every row is expected to look like {'top': [{'key': ..., 'value': ...}, ...]}.
    Each 'key' becomes a column and each 'value' its entry.

    One output row is produced per input row. Rows that are None, not dicts, or
    have no usable 'top' list yield an all-NaN row, so the result stays
    positionally aligned with `car_data` (dropping such rows would shift every
    later row when the frame is concatenated column-wise).

    Args:
        car_data: Iterable of parsed dicts (or None).

    Returns:
        pandas.DataFrame with one column per distinct key.
    """
    def extract_key_value_pairs(d):
        flat_dict = {}
        if not isinstance(d, dict):
            return flat_dict
        items = d.get('top')
        if not isinstance(items, (list, tuple)):
            return flat_dict
        for item in items:
            if not isinstance(item, dict):
                continue  # best effort: skip malformed entries, keep the rest
            key = item.get('key')
            if not key:
                continue
            try:
                flat_dict[key] = item.get('value')
            except TypeError:
                # Unhashable key: cannot be used as a column name.
                continue
        return flat_dict

    return pd.DataFrame([extract_key_value_pairs(d) for d in car_data])


def data(car_data):
    """Flatten the nested 'data' sections of each row into one DataFrame row.

    Every row is expected to look like
    {'data': [{'list': [{'key': ..., 'value': ...}, ...]}, ...]}.
    All key/value pairs from every section's 'list' are merged into one flat
    record; pairs with a falsy key or value are skipped.

    One output row is produced per input row. None rows yield an all-NaN row,
    so the result stays positionally aligned with `car_data` instead of
    shifting later rows upward.

    Args:
        car_data: Iterable of parsed dicts (or None).

    Returns:
        pandas.DataFrame with one column per distinct key.
    """
    def extract_key_value_pairs(d):
        flat_dict = {}
        try:
            # Proceed only if 'data' exists in the dictionary
            if d is not None and 'data' in d:
                for section in d['data']:
                    if 'list' in section:
                        for item in section['list']:
                            key = item.get('key')  # .get avoids KeyError on partial items
                            value = item.get('value')
                            if key and value:  # keep only pairs where both are present
                                flat_dict[key] = value
        except Exception as e:
            # Best effort: report the malformed row and keep what was collected.
            print(f"Error processing row: {e}")
        return flat_dict

    # Do not filter rows out: an empty record keeps the row position intact.
    flattened_data = [extract_key_value_pairs(d) for d in car_data]
    return pd.DataFrame(flattened_data)



### Extract the Columns 

In [5]:
car_columns = ['new_car_detail', 'new_car_overview', 'new_car_feature', 'new_car_specs']

# Parse the stringified dicts. Cells that are already dicts are left alone so
# that re-running this cell does not turn parsed rows into None.
for column in car_columns:
    merged_df[column] = merged_df[column].apply(
        lambda cell: cell if isinstance(cell, dict) else convert_to_dict(cell)
    )

# Collect the expanded pieces and concatenate once at the end.
frames = []
for column in car_columns:
    w = first(merged_df[column])
    keys = w.columns

    # A 'top' key means the useful values are nested key/value pairs.
    if "top" in keys:
        w = top(merged_df[column])
    frames.append(w)

    # 'data' sections are flattened separately. q is computed per column, so a
    # column without 'data' can no longer re-append the previous column's frame.
    if "data" in keys:
        q = data(merged_df[column])
        if not q.empty:
            frames.append(q)

final_df = pd.concat(frames, axis=1, ignore_index=False)
final_df['city'] = merged_df["state"]
final_df['car_links'] = merged_df['car_links']

In [None]:
# Lift pandas' column-display limit (a global option) so every extracted
# column is visible when final_df is rendered below.
pd.set_option('display.max_columns', None)
final_df

### Data Cleaning and Null-Value Removal

In [7]:
# Import necessary libraries
import re
import numpy as np
import pandas as pd

# Work on a copy so final_df stays untouched and this cell can be re-run.
df = final_df.copy()

# Drop every column that has at least this many missing values.
SPARSE_COLUMN_MIN_NULLS = 300
df_columns = df.columns
null_counts = df.isna().sum()
# The frame can contain repeated column labels (see 'Max Power' below), so
# de-duplicate the labels: dropping the same label twice in a loop raises KeyError.
sparse_columns = list(dict.fromkeys(
    label for label, n_missing in null_counts.items()
    if n_missing >= SPARSE_COLUMN_MIN_NULLS
))
df = df.drop(columns=sparse_columns)
df_columns_1 = df.columns
print(len(df_columns),len(df_columns_1))

# 'Max Power' occurs twice, so selecting it yields a two-column frame.
# Keep only the first of the two as the single 'Max Power' column.
df_1 = pd.DataFrame(df['Max Power'])
df_1.columns = ['Max Power (bhp)', 'Max Power (bhp@rpm)']
df = df.drop(columns=["Max Power"])
df['Max Power'] = df_1['Max Power (bhp)']


# Mileage: strip the ' kmpl' unit; values given in km/kg are multiplied by
# conversion_factor before being stored as float.
# NOTE(review): 1.39 is presumably a km/kg -> kmpl factor - confirm its source.
conversion_factor = 1.39
df['Mileage'] = df['Mileage'].str.replace(' kmpl', '')
df['Mileage'] = df['Mileage'].apply(lambda x: float(x.replace(' km/kg', '')) * conversion_factor if 'km/kg' in str(x) else float(x))
df['Mileage'] = df['Mileage'].astype(float)

def extract_integer(value):
    """Return the first integer found in `value`, or None if it has no digits.

    The value is converted with str() first, so numbers and NaN are accepted.
    Digits after a decimal point are ignored ("83.1bhp" -> 83). A number
    written with thousands separators is read as a whole ("1,200 kg" -> 1200)
    rather than being cut at the first comma.

    Args:
        value: Any object; typically a string such as "3715mm" or "Feb 2015".

    Returns:
        int or None.
    """
    # First alternative: properly grouped thousands (1,200 / 12,345,678).
    # Second alternative: a plain run of digits.
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', str(value))
    if match is None:
        return None
    return int(match.group().replace(',', ''))


# Remove the thousands separators from 'km' and store the column as int.
df['km'] = df['km'].str.replace(',', '').astype(int)

# Drop these columns from the working frame (axis=1 selects columns).
df = df.drop(["priceActual","priceSaving","Year of Manufacture","Seats","Fuel Type","trendingText","owner","Ownership","Kms Driven","transmission",'Engine Displacement','Displacement','Max Torque',"car_links",'centralVariantId'], axis=1)

# Normalise insurance labels; the bare values '1' and '2' are mapped to 'Not Available'.
df['Insurance Validity'] = df['Insurance Validity'].replace({
    'Third Party insurance': 'Third Party',
    '1': 'Not Available',
    '2': 'Not Available',
})

# Trim whitespace, then collapse spelling variants of the gearbox description
# into canonical labels.
# NOTE(review): values are stripped first, so the key '5-Speed ' (trailing
# space, listed twice) can never match; the identity entries ('IVT',
# 'Single Speed', '10 Speed') are no-ops.
df['Gear Box'] = df['Gear Box'].str.strip()
df['Gear Box'] = df['Gear Box'].replace({
    '5-Speed': '5 Speed', '5-Speed`': '5 Speed', '5-Speed ': '5 Speed',
    '5 speed': '5 Speed', '5-Speed ': '5 Speed',
    'Six Speed Manual': '6 Speed', 'Six Speed': '6 Speed', '6-Speed': '6 Speed',
    '6-Speed IVT': '6 Speed IVT', 'Six Speed Automatic Transmission': '6 Speed',
    '6Speed': '6 Speed', 'Six Speed  Gearbox': '6 Speed', '6-speed': '6 Speed',
    'Five Speed': '5 Speed', 'Five Speed Manual': '5 Speed',
    'Five Speed Manual Transmission': '5 Speed', 'Five Speed Manual Transmission Gearbox': '5 Speed',
    '7-Speed': '7 Speed', '7-speed': '7 Speed',
    'Seven Speed': '7 Speed', '7-speed PDK': '7 Speed',
    '7G-DCT': '7 Speed DCT', '7-Speed DCT': '7 Speed DCT',
    '8-Speed': '8 Speed', '8-speed': '8 Speed',
    '8Speed': '8 Speed', '8-Speed DCT': '8 Speed',
    '9-Speed': '9 Speed', '9-speed': '9 Speed',
    '10-speed': '10 Speed',
    'Fully Automatic': 'Automatic', 'Single-speed transmission': 'Single Speed',
    'Single speed reduction gear': 'Single Speed', 'Single Speed': 'Single Speed',
    'SPEEDSHIFT TCT 9G': '9 Speed', '9G-TRONIC': '9 Speed',
    '9G TRONIC': '9 Speed', '9G-TRONIC automatic': '9 Speed',
    'AGS': 'Automatic', 'IVT': 'IVT', 'E-CVT': 'CVT', 'eCVT': 'CVT',
    'AMG SPEEDSHIFT DCT 8G': '8 Speed DCT', 'AMG 7-SPEED DCT': '7 Speed DCT',
    '10 Speed': '10 Speed'
})

# Engine capacity: strip the ' CC' unit and store the number as float.
df['Engine'] = df['Engine'].str.replace(' CC', '').astype(float)

# For each of these columns keep only the first integer found in the text.
for numeric_col in ['Max Power', 'Torque', 'Length', 'Width', 'Wheel Base',
                    'Height', 'Kerb Weight', 'Registration Year']:
    df[numeric_col] = df[numeric_col].apply(extract_integer)

# Missing kerb weights are replaced by the column mean.
kerb_weight = df['Kerb Weight']
df['Kerb Weight'] = kerb_weight.fillna(kerb_weight.mean())

# Trim whitespace, then collapse spelling/punctuation variants of the tyre
# description into canonical labels.
# NOTE(review): values are stripped first, so keys ending in a space
# ('Tubeless ', 'Radial ', 'Tubeless,Radial ') can never match.
# NOTE(review): 'Radial with tube' maps to 'Radial Tube' while an existing
# 'Radial Tube' maps to 'Radial' - confirm which canonical label is intended.
df['Tyre Type'] = df['Tyre Type'].str.strip()
df['Tyre Type'] = df['Tyre Type'].replace({
    'Tubeless, Radial': 'Tubeless Radial', 'Tubeless,Radial': 'Tubeless Radial', 
    'Radial, Tubless': 'Tubeless Radial', 'Radial, Tubeless': 'Tubeless Radial',
    'Tubeless Radial Tyres': 'Tubeless Radial', 'Tubless, Radial': 'Tubeless Radial',
    'Radial,Tubeless': 'Tubeless Radial', 'Tubeless,Radials': 'Tubeless Radial',
    'Tubeless, Runflat': 'Tubeless Runflat', 'Tubeless. Runflat': 'Tubeless Runflat',
    'Runflat Tyres': 'Runflat', 'Runflat Tyre': 'Runflat', 'Runflat,Radial': 'Runflat Radial',
    'Run-Flat': 'Runflat', 'Radial with tube': 'Radial Tube',
    'Radial Tubeless': 'Tubeless Radial', 'Tubeless Radials Tyre': 'Tubeless Radial',
    'Tubeless ': 'Tubeless', 'Radial ': 'Radial', 
    'tubeless tyre': 'Tubeless', 'Tubeless Tyre': 'Tubeless',
    'Tubeless Tyres': 'Tubeless', 'Radial Tyres': 'Radial',
    'Tubeless Tyres, Radial': 'Tubeless Radial', 'Tubeless,Radial ': 'Tubeless Radial',
    'Tubeless Tyres Mud Terrain': 'Tubeless Mud Terrain',
    'Tubeless Tyres All Terrain': 'Tubeless All Terrain',
    'Tubeless Radials': 'Tubeless Radial','Tubeless,Runflat': 'Tubeless Runflat',
    'Tubless,Radial': 'Tubeless Radial', 'Radial Tube': 'Radial', 'Tubeless, Radials': 'Tubeless Radial'
    
})

def clean_price(price):
    """Convert a price string such as '₹ 4.35 Lakh' into a number of rupees.

    Commas and the rupee sign are removed. A 'Lakh' suffix multiplies the
    amount by 100,000 and a 'Crore' suffix by 10,000,000; the scaled result is
    rounded to two decimals so binary floating-point noise (for example
    4.35 * 100000 == 434999.99999999994) cannot be truncated into an
    off-by-one rupee by a later integer cast.

    Args:
        price: Price text, or an already numeric / missing value.

    Returns:
        float, or None when the value is missing or cannot be parsed.
    """
    if not isinstance(price, str):
        # Missing cells arrive as NaN/None and previously raised AttributeError.
        try:
            amount = float(price)
        except (TypeError, ValueError):
            return None
        return None if amount != amount else amount  # NaN != NaN

    text = price.replace(',', '').replace('₹', '').strip()
    try:
        if 'Lakh' in text:
            return round(float(text.replace('Lakh', '').strip()) * 100000, 2)
        if 'Crore' in text:
            return round(float(text.replace('Crore', '').strip()) * 10000000, 2)
        return float(text)
    except ValueError:
        return None
# Parse the price strings; anything clean_price cannot parse becomes NaN.
df['price'] = pd.to_numeric(df['price'].apply(clean_price))

# Rows without a usable price cannot serve as training targets, and NaN cannot
# be cast to int, so drop them explicitly instead of failing in astype(int).
missing_price = df['price'].isna()
if missing_price.any():
    print(f"Dropping {int(missing_price.sum())} rows with an unparseable price")
    df = df[~missing_price].copy()

# Round before casting: astype(int) truncates, so a float artefact such as
# 434999.99999999994 would otherwise become 434999.
df['price'] = df['price'].round().astype(int)


# Columns imputed with their most frequent value.
categorical_columns = [
 "ft","bt","ownerNo","oem","model","modelYear",
 "Registration Year","Insurance Validity","Transmission",
 "Engine","Color","Engine Type","No of Cylinder", "Values per Cylinder",
 "Gear Box","Seating Capacity","Steering Type","Front Brake Type",
 "Rear Brake Type","Tyre Type","No Door Numbers","city"]

# Columns imputed with their mean.
Numerical_columns = ["it","km","Mileage","Torque","Length","Width","Height","Wheel Base","Kerb Weight","Max Power"]

for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

for col in Numerical_columns:
    df[col] = df[col].fillna(df[col].mean())

70 51


# Data Preprocessing 

In [8]:
# Nominal features: expanded into indicator columns by the one-hot encoder.
OneHotEncoding_columns = [
    'ft',
    'bt',
    'oem',
    'model',
    'Insurance Validity',
    'Transmission',
    'Color',
    'Engine Type',
    'Gear Box',
    'Steering Type',
    'Front Brake Type',
    'Rear Brake Type',
    'Tyre Type',
    'city',
    'variantName',
]

# Features replaced by integer codes with LabelEncoder.
label_encoding_columns = [
    'Engine',
    'No of Cylinder',
    'Values per Cylinder',
    'Seating Capacity',
    'No Door Numbers',
    'ownerNo',
    'modelYear',
    'Registration Year',
]

# Continuous features examined by the variance and correlation filters.
numerical_columns = [
    "it",
    "km",
    "Mileage",
    "Torque",
    "Length",
    "Width",
    "Height",
    "Wheel Base",
    "Kerb Weight",
    "Max Power",
]

# Filled in by the feature-selection cells that follow.
drop_columns = []

# Target and feature matrix.
y = df['price']
X = df.drop(columns=['price'])



## Low variance threshold

In [174]:
numerical_column = X[numerical_columns]

variance = numerical_column.var()

# A variance of exactly zero means the column holds a single constant value.
low_variance_columns = variance[variance == 0].index.tolist()

print("Columns with low variance (variance = 0):", low_variance_columns)

# Handle every constant column, not just the first, and do nothing when there
# are none (indexing [0] raised IndexError on an empty list, e.g. on re-run).
for constant_col in low_variance_columns:
    if constant_col not in drop_columns:
        drop_columns.append(constant_col)
    numerical_columns.remove(constant_col)

Columns with low variance (variance = 0): ['it']


## correlation matrix

In [None]:
# Select only numeric columns for correlation
numeric_df = X[numerical_columns]

# Calculate the pairwise correlation matrix (DataFrame.corr default method)
correlation_matrix = numeric_df.corr()

# Plot the correlation heatmap; each cell is annotated with its coefficient
# formatted to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

def correlation(dataset, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    A column is flagged when the absolute correlation between it and any
    column positioned before it in `dataset` is strictly greater than
    `threshold`. The earlier column of each such pair is not flagged by that
    pair.

    Args:
        dataset: DataFrame of numeric columns.
        threshold: Absolute correlation above which a column is flagged.

    Returns:
        set of flagged column names.
    """
    abs_corr = dataset.corr().abs()
    flagged = set()
    for position, name in enumerate(abs_corr.columns):
        # Only the lower triangle is inspected: columns left of the diagonal.
        earlier = abs_corr.iloc[position, :position]
        if (earlier > threshold).any():
            flagged.add(name)
    return flagged

numeric_df = X[numerical_columns]
# Flag each numeric column whose |correlation| with an earlier column exceeds 0.9.
corr_features = correlation(numeric_df, 0.9)

print(corr_features)
# Iterate in sorted order: set iteration order for strings varies between
# interpreter runs, which made the order of drop_columns non-reproducible.
for redundant_col in sorted(corr_features):
    if redundant_col not in drop_columns:  # avoid duplicate entries on re-run
        drop_columns.append(redundant_col)
    numerical_columns.remove(redundant_col)

## correlation with price

In [None]:
# Numeric features plus the target in one frame, to correlate them in one call.
X_corr = X[numerical_columns].copy()

X_corr['price'] = y.values

correlation_with_price = X_corr[numerical_columns + ['price']].corr()['price']

print("Correlation with price:\n", correlation_with_price)

# Features with |correlation| < 0.1 are only weakly related to price and get
# dropped. The previous message and variable name called them "highly
# correlated", which is the opposite of what the filter selects.
low_correlation_features = correlation_with_price[abs(correlation_with_price) < 0.1].index.tolist()
high_correlation_features = low_correlation_features  # legacy name, same list
print("Weakly correlated features with price (|correlation| < 0.1):", low_correlation_features)
for weak_col in low_correlation_features:
    if weak_col not in drop_columns:  # avoid duplicate entries on re-run
        drop_columns.append(weak_col)
    if weak_col in numerical_columns:
        numerical_columns.remove(weak_col)

## Chi Square Test

In [None]:
from scipy.stats import chi2_contingency

# Every encoded (non-continuous) feature takes part in the pairwise test.
categorical_columns = label_encoding_columns + OneHotEncoding_columns

# One chi-square test of independence per unordered pair of features.
chi_square_results = []
for i, col1 in enumerate(categorical_columns):
    for col2 in categorical_columns[i + 1:]:
        contingency_table = pd.crosstab(X[col1], X[col2])
        chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
        chi_square_results.append((col1, col2, chi2_stat, p_value))

chi_square_df = pd.DataFrame(
    chi_square_results,
    columns=['Feature 1', 'Feature 2', 'Chi-Square Statistic', 'p-value'],
)

# Arrange the statistics as a matrix; missing cells are filled from the transpose.
pivot_table = chi_square_df.pivot(index='Feature 1', columns='Feature 2', values='Chi-Square Statistic')
pivot_table = pivot_table.fillna(pivot_table.T)

plt.figure(figsize=(12, 10))
sns.heatmap(pivot_table, cmap="YlGnBu", annot=False, linewidths=0.5)
plt.title("Heatmap of Chi-Square Statistics Between Categorical Features")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Pairs whose statistic exceeds the threshold, strongest first.
chi_square_threshold = 10000
above_threshold = chi_square_df['Chi-Square Statistic'] > chi_square_threshold
highly_correlated = chi_square_df[above_threshold].sort_values(
    by='Chi-Square Statistic', ascending=False
)

highly_correlated

## Drop columns based on Chi Square Test

In [178]:
# Features chosen for removal after inspecting the chi-square results.
drop_ = ["Engine Type","model","variantName","Registration Year",'Rear Brake Type',"Front Brake Type",'Tyre Type']
for col in drop_:
    if col not in drop_columns:  # avoid duplicate entries on re-run
        drop_columns.append(col)
    # Take the feature out of whichever encoding list holds it. Explicit
    # membership checks replace a bare `except:` that hid every error and
    # still raised when the cell was run a second time.
    for encoding_list in (label_encoding_columns, OneHotEncoding_columns):
        if col in encoding_list:
            encoding_list.remove(col)
            break

## ANOVA test

In [179]:
import pandas as pd
from scipy import stats
from sklearn.preprocessing import LabelEncoder

# One-way ANOVA of price across the categories of each remaining feature.
# Label-encoded features come first, then one-hot features.
anova_results = []
for feature in label_encoding_columns + OneHotEncoding_columns:
    feature_values = X[feature]
    # One group of prices per distinct category of the feature.
    groups = [y[feature_values == category] for category in feature_values.unique()]
    f_stat, p_value = stats.f_oneway(*groups)
    anova_results.append((feature, f_stat))

# Rank the features from the largest F statistic to the smallest.
anova_df = (
    pd.DataFrame(anova_results, columns=['Feature', 'F_statistic'])
    .sort_values(by='F_statistic', ascending=False)
)


In [None]:
# Split the ANOVA ranking into the first 11 rows and everything after them.
# NOTE: despite its name, top_5_features holds 11 rows of anova_df, not 5.
top_5_features = anova_df.head(11)
last_features = anova_df[11:]
print(top_5_features['Feature'])
print(last_features['Feature'])

In [None]:
# Drop every feature ranked below the ANOVA cut-off.
for col in last_features['Feature']:
    print(col)
    if col not in drop_columns:  # avoid duplicate entries on re-run
        drop_columns.append(col)
    # Explicit membership checks replace a bare `except:` that hid every error
    # and still raised when the cell was run a second time.
    for encoding_list in (label_encoding_columns, OneHotEncoding_columns):
        if col in encoding_list:
            encoding_list.remove(col)
            break

## Encoding process

In [182]:
# Remove every feature rejected during selection. errors='ignore' lets the
# cell be re-run after the columns are already gone.
X = X.drop(columns=drop_columns, errors='ignore')

# Make sure the artefact directory exists before writing into it.
os.makedirs('Models', exist_ok=True)

for col in label_encoding_columns:
    # One encoder per column, fitted on that column's values and pickled so
    # the same mapping can be loaded again later.
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    # os.path.join keeps the path valid on every OS (was a hard-coded '\\').
    with open(os.path.join('Models', f'label_encoder_{col}.pkl'), 'wb') as file:
        pickle.dump(label_encoder, file)


In [183]:
# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the encoder on the training rows only; categories seen only in the test
# rows are encoded as all zeros (handle_unknown='ignore').
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_encoder.fit(X_train[OneHotEncoding_columns])
encoded_names = onehot_encoder.get_feature_names_out(OneHotEncoding_columns)

# Encoded indicator frames, labelled and aligned with the rows they came from.
X_train_encoded = pd.DataFrame(
    onehot_encoder.transform(X_train[OneHotEncoding_columns]),
    columns=encoded_names,
    index=X_train.index,
)
X_test_encoded = pd.DataFrame(
    onehot_encoder.transform(X_test[OneHotEncoding_columns]),
    columns=encoded_names,
    index=X_test.index,
)

# Swap the raw categorical columns for their indicator columns.
X_train = pd.concat([X_train.drop(columns=OneHotEncoding_columns), X_train_encoded], axis=1)
X_test = pd.concat([X_test.drop(columns=OneHotEncoding_columns), X_test_encoded], axis=1)



## Scaling

In [184]:
from sklearn.preprocessing import StandardScaler

# Learn the mean and standard deviation from the training rows only, then
# apply that same transform to both splits.
scaler = StandardScaler().fit(X_train)

# Wrap the scaled arrays back into DataFrames so column names and row index survive.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)


## Model training

In [158]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import numpy as np

# Candidate regressors; seeded where the estimator accepts a random_state.
lr_model = LinearRegression()
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)
svr_model = SVR()
knn_model = KNeighborsRegressor()
dt_model = DecisionTreeRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42, eval_metric='rmse')

models = {
    'Linear Regression': lr_model,
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model,
    'Support Vector Regressor': svr_model,
    'K-Nearest Neighbors': knn_model,
    'Decision Tree': dt_model,
    'XGBoost': xgb_model
}

# Per-fold scores and their mean, keyed by model name.
cv_scores = {}
mean_scores = {}

# 5-fold cross-validated R² on the scaled training data for each candidate.
for model_name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    cv_scores[model_name] = scores
    mean_scores[model_name] = np.mean(scores)
    print(f"{model_name} CV Scores: {scores}")
    print(f"{model_name} Mean CV R² Score: {mean_scores[model_name]}\n")

# Name of the candidate with the highest mean R² (a string, not an estimator).
best_model = max(mean_scores, key=mean_scores.get)
best_score = mean_scores[best_model]

print(f"Best model: {best_model} with mean R² score of {best_score}")

Linear Regression CV Scores: [-4.32386482e+24 -1.69427835e+25 -5.27174262e+21 -5.12399293e+23
 -4.10257173e+24]
Linear Regression Mean CV R² Score: -5.177378223685654e+24

Random Forest CV Scores: [0.61466488 0.92403906 0.87953269 0.85847013 0.87259771]
Random Forest Mean CV R² Score: 0.8298608948968041

Gradient Boosting CV Scores: [0.5796606  0.92134317 0.89102619 0.88542972 0.84811279]
Gradient Boosting Mean CV R² Score: 0.825114492976506

Support Vector Regressor CV Scores: [-0.05631644 -0.08965832 -0.06148658 -0.0781905  -0.06245647]
Support Vector Regressor Mean CV R² Score: -0.06962166335698376

K-Nearest Neighbors CV Scores: [0.5388989  0.84888337 0.65791461 0.72563647 0.6755662 ]
K-Nearest Neighbors Mean CV R² Score: 0.6893799103938031

Decision Tree CV Scores: [0.50157702 0.87822218 0.75973181 0.84444299 0.76498566]
Decision Tree Mean CV R² Score: 0.749791930349686

XGBoost CV Scores: [0.5977093  0.92810094 0.90203726 0.79699004 0.8625896 ]
XGBoost Mean CV R² Score: 0.8174854

In [61]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Base estimator for the search; no random_state is set here, so only the
# sampling of parameter combinations below is seeded.
model_xgb = XGBRegressor()

# Candidate values for each hyperparameter; RandomizedSearchCV samples
# combinations from these lists.
param_grid = {
    'n_estimators': [80, 100, 200, 300, 500],          # Number of boosting rounds
    'learning_rate': [0.01, 0.05, 0.1, 0.2],      # Learning rate
    'max_depth': [3, 5, 7,10,50,100],                   # Maximum depth of a tree
    'min_child_weight': [1, 3, 5],                # Minimum sum of instance weight(hessian) needed in a child
    'gamma': [0, 0.1, 0.5],                  # Minimum loss reduction required to make a further partition
    'subsample': [0.6, 0.8, 1.0],                 # Subsample ratio of the training instance
    'colsample_bytree': [0.6, 0.8, 1.0],          # Subsample ratio of columns when constructing each tree
    'reg_alpha': [0, 0.01, 0.1, 1],               # L1 regularization term on weights
    'reg_lambda': [0.01, 0.1, 1, 10],             # L2 regularization term on weights
}

# Use RandomizedSearchCV for hyperparameter tuning: 100 sampled combinations,
# each scored with 3-fold cross-validation on negative mean squared error.
random_search = RandomizedSearchCV(estimator=model_xgb,
                                   param_distributions=param_grid,
                                   n_iter=100,           # Number of different combinations to try
                                   scoring='neg_mean_squared_error',  # Use a suitable scoring method
                                   cv=3,                 # Cross-validation
                                   verbose=2,            # To see the progress
                                   n_jobs=-1,            # Use all processors
                                   random_state=42)      # Ensure reproducibility


In [62]:
# Run the randomized search on the scaled training data.
random_search.fit(X_train_scaled, y_train)

# Parameter combination with the best cross-validated score.
best_params = random_search.best_params_
print(f"Best parameters found: {best_params}")

# NOTE: best_model previously held the winning model's *name* (a string) from
# the cross-validation cell; from here on it is the estimator chosen by the search.
best_model = random_search.best_estimator_

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found: {'subsample': 1.0, 'reg_lambda': 10, 'reg_alpha': 0, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.2, 'gamma': 0.5, 'colsample_bytree': 0.8}


In [189]:
# Hyperparameters copied from the randomized-search result printed above.
tuned_params = dict(
    subsample=1.0,
    reg_lambda=10,
    reg_alpha=0,
    n_estimators=500,
    min_child_weight=5,
    max_depth=5,
    learning_rate=0.2,
    gamma=0.5,
    colsample_bytree=0.8,
)
model_xgb = XGBRegressor(**tuned_params)
model_xgb.fit(X_train_scaled, y_train)

# Score the tuned model on the held-out test split.
y_pred_xgb = model_xgb.predict(X_test_scaled)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBRegressor:")
for metric_label, metric_value in (("MSE:", mse_xgb), ("RMSE:", rmse_xgb), ("R2:", r2_xgb)):
    print(metric_label, metric_value)

XGBRegressor:
MSE: 373865999363.78064
RMSE: 611445.8270065964
R2: 0.8747429251670837


In [190]:
# XGBoost with library-default hyperparameters, for comparison with the tuned run.
# NOTE: this reassigns model_xgb, so when cells run in file order this
# default-parameter model is the one later pickled as model_xgb.pkl.
model_xgb = XGBRegressor()
model_xgb.fit(X_train_scaled, y_train)

# Score on the held-out test split.
y_pred_xgb = model_xgb.predict(X_test_scaled)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBRegressor : ")
print("MSE:", mse_xgb)
print("RMSE:", rmse_xgb)
print("R2:", r2_xgb)

XGBRegressor:
MSE: 188392201549.15237
RMSE: 434041.7048500666
R2: 0.9368826150894165


In [171]:
# Model 2: HistGradientBoostingRegressor
# Trained with default hyperparameters on the scaled training data.
hgb_model = HistGradientBoostingRegressor()
hgb_model.fit(X_train_scaled, y_train)

# Score on the held-out test split.
y_pred_hgb = hgb_model.predict(X_test_scaled)
mse_hgb = mean_squared_error(y_test, y_pred_hgb)
rmse_hgb = np.sqrt(mse_hgb)
r2_hgb = r2_score(y_test, y_pred_hgb)

print("HistGradientBoostingRegressor:")
print("MSE:", mse_hgb)
print("RMSE:", rmse_hgb)
print("R2:", r2_hgb)


HistGradientBoostingRegressor:
MSE: 661697067930.2336
RMSE: 813447.6430171973
R2: 0.7783103633757368


In [169]:
# Model 4: RandomForestRegressor
# 500 trees; no random_state is set, so results can vary between runs.
rf_model = RandomForestRegressor(n_estimators=500)
rf_model.fit(X_train_scaled, y_train)

# Score on the held-out test split.
y_pred_rf = rf_model.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("RandomForestRegressor:")
print("MSE:", mse_rf)
print("RMSE:", rmse_rf)
print("R2:", r2_rf)

RandomForestRegressor:
MSE: 252286232049.02133
RMSE: 502281.02895592357
R2: 0.915476060241287


In [None]:
# GradientBoostingRegressor with 500 boosting stages.
Gr_model = GradientBoostingRegressor(n_estimators=500)
Gr_model.fit(X_train_scaled, y_train)

# Score on the held-out test split. The results get their own *_gb names:
# this cell used to overwrite the random-forest variables (y_pred_rf, mse_rf,
# rmse_rf, r2_rf) and print its metrics under the "RandomForestRegressor:" heading.
y_pred_gb = Gr_model.predict(X_test_scaled)
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("GradientBoostingRegressor:")
print("MSE:", mse_gb)
print("RMSE:", rmse_gb)
print("R2:", r2_gb)

In [191]:
# Persist the fitted model and the preprocessing objects needed to reuse it.
# os.path.join keeps the paths valid on every OS (they were hard-coded with
# '\\'), and the directory is created if it does not exist yet.
os.makedirs('Models', exist_ok=True)
with open(os.path.join('Models', 'model_xgb.pkl'), 'wb') as file:
    pickle.dump(model_xgb, file)
with open(os.path.join('Models', 'scaler.pkl'), 'wb') as file:
    pickle.dump(scaler, file)
with open(os.path.join('Models', 'onehot_encoder.pkl'), 'wb') as file:
    pickle.dump(onehot_encoder, file)

In [84]:
# Load a previously exported training frame.
# os.path.join keeps the path valid on every OS (was a hard-coded '\\').
# NOTE(review): no visible cell writes this file or uses X_train_1 - confirm it is still needed.
X_train_1 = pd.read_csv(os.path.join('Dataset', 'X_train_1.csv'))

In [194]:
# Numeric features that remain after the selection cells removed entries from this list.
numerical_columns

['Mileage',
 'Torque',
 'Length',
 'Width',
 'Height',
 'Wheel Base',
 'Kerb Weight',
 'Max Power']