In [6]:
# A Few Imports Required

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
from IPython.display import HTML
from matplotlib_venn import venn2
from subprocess import check_output
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin
import tensorflow as tf

In [None]:
# Dynamically load the file path
file_path = os.path.join('sample_data/Clean_Merged_AutomobileData.csv')

# Function to load data
def load_data(file_path):
    try:
        df = pd.read_csv(file_path)
        return df
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return None
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
        return None
    except pd.errors.ParserError:
        print("Error: The file could not be parsed.")
        return None

# Step 3: Load the dataset into a pandas DataFrame
DF_DataSet = load_data(file_path)

#Feature Selection:
feature_cols = ["brand", "model", "year", "transmission", "mileage", "fuelType", "mpg", "engineSize"]
target_col = ["price"]
categorical_cols = ["brand", "model", "transmission", "fuelType"]

if DF_DataSet is not None:
    # Select relevant columns
    desired_columns = ["brand", "model", "year", "price", "transmission", "mileage", "fuelType", "mpg", "engineSize"]
    DF_DataSet = DF_DataSet[desired_columns]

# Function to preprocess data
def preprocess_data(DF_DataSet, feature_cols, target_col, categorical_cols):
    try:
        # Label encode categorical variables
        label_encoder = LabelEncoder()
        DF_DataSet['brand_encoded'] = label_encoder.fit_transform(DF_DataSet['brand'])
        DF_DataSet['model_encoded'] = label_encoder.fit_transform(DF_DataSet['model'])
        DF_DataSet['transmission_encoded'] = label_encoder.fit_transform(DF_DataSet['transmission'])
        DF_DataSet['fuelType_encoded'] = label_encoder.fit_transform(DF_DataSet['fuelType'])

        # Select relevant features for X and target for Y
        X = DF_DataSet[['brand_encoded', 'model_encoded', 'year', 'transmission_encoded', 'mileage', 'fuelType_encoded', 'mpg', 'engineSize' ]]
        Y = DF_DataSet[target_col]

        return X, Y, label_encoder
    except KeyError as e:
        print(f"Error: The specified column {e} does not exist in the dataframe.")
        return None, None, None
    except Exception as e:
        print(f"An unexpected error occurred during preprocessing: {e}")
        return None, None, None

# Preprocess data
X, Y, label_encoder = preprocess_data(DF_DataSet, feature_cols, target_col, categorical_cols)

#Splitting the data

#Function to split data for Machine Learning
def split_data1(X, Y, test_size=0.3, random_state=42):
    try:
        return train_test_split(X, Y, test_size=test_size, random_state=random_state)
    except ValueError as e:
        print(f"Error: {e}")
        return None

if X is not None and Y is not None:
        # Split data (function call)
        X_train, X_test, Y_train, Y_test = split_data1(X, Y)

In [8]:
# #Training the Random Forest Model

# #function definition
# def train_model_RFR(X_train, Y_train):
#         modelRFR = RandomForestRegressor(n_estimators = 400)
#         modelRFR.fit(X_train, Y_train)
#         return modelRFR

# #Function call
# if X_train is not None and Y_train is not None:
#     # Train model
#     modelRFR = train_model_RFR(X_train, Y_train)

#     # Save the model using joblib
#     joblib.dump(modelRFR, 'RFR_model.joblib')


# # -----------------------------------------

# # Function to split dataset for deep learning
# def split_dataset2(x, y, train_size=0.75, val_size=0.15, test_size=0.15):
#     try:

#         # Started by spliting to get the training set
#         x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=(1.0 - train_size), random_state=42)

#         # Next I split the remaining data into validation and test sets
#         val_ratio = val_size / (val_size + test_size)
#         x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=(1.0 - val_ratio), random_state=42)

#         return x_train, x_val, x_test, y_train, y_val, y_test
#     except Exception as e:
#         print(f"Error splitting the dataset: {e}")
#         return None, None, None, None, None, None

# # Function call
# x_train, x_val, x_test, y_train, y_val, y_test = split_dataset2(X, Y)

# # Scaling the feature values
# scaler = StandardScaler()
# x_train = scaler.fit_transform(x_train)
# x_val = scaler.transform(x_val)

# # Scaling the target values
# y_scaler = MinMaxScaler()
# y_train = y_scaler.fit_transform(y_train)
# y_val = y_scaler.transform(y_val)

# # Defining the MLP model for regression
# def create_mlp(input_shape):
#     model = Sequential([
#         Dense(128, activation='relu', input_shape=input_shape),
#         Dense(64, activation='relu'),
#         Dense(1)  # Linear activation (default) for regression
#     ])
#     return model

# # Creating and compile the model
# mlp_model = create_mlp((x_train.shape[1],))
# mlp_model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mean_squared_error'])

# # Define early stopping
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# # Train the model
# history_mlp = mlp_model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=20, batch_size=32, callbacks=[early_stopping])

# # Save model
# mlp_model.save('mlp_model.h5')

# # -------------------

Epoch 1/20
[1m2310/2310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.0045 - mean_squared_error: 0.0045 - val_loss: 6.2482e-04 - val_mean_squared_error: 6.2482e-04
Epoch 2/20
[1m2310/2310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 6.6032e-04 - mean_squared_error: 6.6032e-04 - val_loss: 5.0297e-04 - val_mean_squared_error: 5.0297e-04
Epoch 3/20
[1m2310/2310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 5.0457e-04 - mean_squared_error: 5.0457e-04 - val_loss: 4.6802e-04 - val_mean_squared_error: 4.6802e-04
Epoch 4/20
[1m2310/2310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 4.7818e-04 - mean_squared_error: 4.7818e-04 - val_loss: 4.4302e-04 - val_mean_squared_error: 4.4302e-04
Epoch 5/20
[1m2310/2310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 4.3370e-04 - mean_squared_error: 4.3370e-04 - val_loss: 4.1317e-04 - val_mean_squared_error: 4.1317e-04
Epoch 



In [9]:
# Training the hybrid model

# Loading the pre-trained models
modelRFR = joblib.load('RFR_model.joblib')
MLP_model = load_model('mlp_model.h5')

# Define the hybrid model
class HybridModel(BaseEstimator, RegressorMixin):
    def __init__(self, rfr_model, mlp_model, alpha=0.5):
        self.rfr_model = rfr_model
        self.mlp_model = mlp_model
        self.alpha = alpha

    def fit(self, X, y):
        self.rfr_model.fit(X, y)  # Fit the RFR model

    def predict(self, X):
        # Convert DataFrame to NumPy array before reshaping
        X_np = X.values
        mlp_X = X_np.reshape(X_np.shape[0], -1)
        rfr_pred = self.rfr_model.predict(X_np)
        mlp_pred = self.mlp_model.predict(mlp_X).flatten()
        return self.alpha * rfr_pred + (1 - self.alpha) * mlp_pred

    def get_params(self, deep=True):
        return {'alpha': self.alpha, 'rfr_model': self.rfr_model, 'mlp_model': self.mlp_model}

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

# Defining a scoring function
def scoring_func(estimator, X, y):
    predictions = estimator.predict(X)
    return -mean_squared_error(y, predictions)

# Initialising the hybrid model
hybrid_model = HybridModel(modelRFR, MLP_model)

# Setting up parameter grid for alpha
param_grid = {'alpha': [0.1, 0.3, 0.5, 0.7, 0.9]}

# Using GridSearchCV for tuning
grid_search = GridSearchCV(estimator=hybrid_model, param_grid=param_grid, scoring=scoring_func, cv=3, verbose=2)

# Performing grid search
grid_search.fit(x_train, y_train)

# Getting the best parameters and model
best_params = grid_search.best_params_
best_hybrid_model = grid_search.best_estimator_

# Printing the best parameters
# print("Best parameters found: ", best_params)

# Saving the best Hybrid model if it is not None
if best_hybrid_model is not None:
   joblib.dump(best_hybrid_model, 'best_hybrid_model.joblib')
else:
    print("The best hybrid model's .model attribute is None. Model was not saved.")


# ------------------- PREDICTION ------------------------

# Prediction with MLP Model (Single decoded output)
def predict_future_prices(model, df, future_years, feature_cols):
    try:
        # Included encoded columns in model features
        model_features = feature_cols + ['brand_encoded', 'model_encoded', 'transmission_encoded', 'fuelType_encoded']

        # Calculating the average yearly change for each numeric feature grouped by 'model_encoded'
        numeric_features = df[model_features].select_dtypes(include=[np.number]).columns

        yearly_changes = df.groupby(['brand_encoded', 'model_encoded', 'year'])[numeric_features].mean().groupby(level=[0, 1]).diff().groupby(level=[0, 1]).mean()

        # Getting all rows for each model from the original dataset
        # Include 'brand_encoded' and 'model_encoded' in the selected columns
        last_rows = df.groupby(['brand_encoded', 'model_encoded', 'year']).apply(lambda x: x[model_features].iloc[-1]).reset_index(drop=True)

        # Creating a list to store future data rows
        future_data_rows = []

        for (brand_code, model_code, year) in last_rows[['brand_encoded', 'model_encoded', 'year']].drop_duplicates().values:
            model_last_row = last_rows[(last_rows['brand_encoded'] == brand_code) & (last_rows['model_encoded'] == model_code) & (last_rows['year'] == year)].copy()

            for i, future_year in enumerate(future_years):
                new_row = model_last_row.copy()
                new_row['future_year'] = future_year  # Add future year

                # Updating numeric features based on yearly changes for each model
                for feature in numeric_features:
                    if feature != 'year':
                        new_row[feature] += yearly_changes.loc[(brand_code, model_code), feature] * (i + 1)

                future_data_rows.append(new_row)

        # Combining the future data rows
        future_data = pd.concat(future_data_rows, ignore_index=True)

        # Handling missing values - Impute only numeric columns
        imputer = SimpleImputer(strategy='mean')
        numeric_cols_to_impute = future_data.select_dtypes(include=[np.number]).columns
        future_data[numeric_cols_to_impute] = imputer.fit_transform(future_data[numeric_cols_to_impute])

        # Create DataFrame after imputation
        future_data_imputed = pd.DataFrame(future_data, columns=future_data.columns)

        # Ensuring that the order of columns matches the model's feature order, adding 'future_year' correctly
        # Creating a new list with 'future_year' appended
        all_features = list(model_features) + ['future_year']
        future_data_imputed = future_data_imputed[all_features]

        # Using only encoded features for prediction ---
        encoded_features = ['brand_encoded', 'model_encoded', 'year', 'transmission_encoded', 'mileage', 'fuelType_encoded', 'mpg', 'engineSize']
        future_prices = model.predict(future_data_imputed[encoded_features])

        future_data_imputed['Predicted Price'] = future_prices.round().astype(int)

        # Setting a minimum price (e.g., 0)
        future_data_imputed['Predicted Price'] = future_data_imputed['Predicted Price'].apply(lambda x: max(2458, x))

        # Returning the DataFrame with 'Predicted Price' before 'future_year'
        return future_data_imputed[['brand_encoded', 'model_encoded', 'year', 'Predicted Price', 'future_year']]
    except Exception as e:
        print(f"Error predicting future prices: {e}")
        return None

def main():
    # Load the Hybrid model using joblib
    Model_HYBRID = joblib.load('best_hybrid_model.joblib')

    future_years = range(2025, 2035)
    # Passing feature_cols to predict_future_prices
    predictions = predict_future_prices(Model_HYBRID, DF_DataSet, future_years, feature_cols)
    if predictions is not None:
        # print("\nPredicted Car Prices for Years 2025-2035:")
        # print(predictions)

        # Converting 'brand_encoded' and 'model_encoded' to integers before inverse transforming
        predictions['brand_encoded'] = predictions['brand_encoded'].astype(int)
        predictions['model_encoded'] = predictions['model_encoded'].astype(int)

        # Geting original labels from the DataFrame (assuming they are still available)
        # Handling potential missing values in encoded columns before creating mappings
        brand_mapping = dict(zip(DF_DataSet['brand_encoded'].dropna().unique(), DF_DataSet['brand'].dropna().unique()))
        model_mapping = dict(zip(DF_DataSet['model_encoded'].dropna().unique(), DF_DataSet['model'].dropna().unique()))

        # Mapping encoded values to original labels, handling potential missing values
        predictions['brand'] = predictions['brand_encoded'].map(brand_mapping).fillna('Unknown Brand') # Handle missing brands
        predictions['model'] = predictions['model_encoded'].map(model_mapping).fillna('Unknown Model') # Handle missing models

        # Creating a DataFrame with the predicted car prices, brand, model, and future years
        predicted_df = predictions[['brand', 'model', 'year', 'Predicted Price', 'future_year']]
        predicted_df['Predicted Price'] = predicted_df['Predicted Price'].round().astype(int)
        predicted_df['future_year'] = predicted_df['future_year'].astype(int)
        predicted_df['year'] = predicted_df['year'].astype(int)

        # print("\nPredicted DataFrame:")
        display(pd.DataFrame(predicted_df))

        # Saving the DataFrame to a CSV file
        predicted_df.to_csv('predicted_car_prices.csv', index=False)

if __name__ == "__main__":
    main()



Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END ..........................................alpha=0.1; total time= 1.4min
[CV] END ..........................................alpha=0.1; total time= 1.4min
[CV] END ..........................................alpha=0.1; total time= 1.4min
[CV] END ..........................................alpha=0.3; total time= 1.4min
[CV] END ..........................................alpha=0.3; total time= 1.4min
[CV] END ..........................................alpha=0.3; total time= 1.4min
[CV] END ..........................................alpha=0.5; total time= 1.4min
[CV] END ..........................................alpha=0.5; total time= 1.4min
[CV] END ..........................................alpha=0.5; total time= 1.5min
[CV] END ..........................................alpha=0.7; total time= 1.4min
[CV] END ..........................................alpha=0.7; total time= 1.4min
[CV] END ........................................

Unnamed: 0,brand,model,year,Predicted Price,future_year
0,Audi,A1,2010,2458,2025
1,Audi,A1,2010,2458,2026
2,Audi,A1,2010,2458,2027
3,Audi,A1,2010,2458,2028
4,Audi,A1,2010,2458,2029
...,...,...,...,...,...
16025,Volkswagen,Up,2020,2458,2030
16026,Volkswagen,Up,2020,2458,2031
16027,Volkswagen,Up,2020,2458,2032
16028,Volkswagen,Up,2020,2586,2033
