# Import

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
import time
import os, sys

currentdir = os.path.dirname(os.path.realpath("randomForest.ipynb"))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

from decisionTree import decisionTree

# Importing and Splitting data

In [2]:
modelEncoder = LabelEncoder()
transmissionEncoder = LabelEncoder()
fuelTypeEncoder = LabelEncoder()

def dataset(brand):

    file = pd.read_csv(brand, quotechar='"', skipinitialspace=True)

    for i in ['year']:
        q75,q25 = np.percentile(file.loc[:,i],[75,25])
        IQR = q75-q25
    
        max = q75+(1.5*IQR)
        min = q25-(1.5*IQR)
    
        file.loc[file[i] < min, i] = np.nan
        file.loc[file[i] > max, i] = np.nan

    file = file.dropna(axis = 0)


    modelEncoder.fit(file["model"])
    file["model"] = modelEncoder.transform(file["model"])

    transmissionEncoder.fit(file["transmission"])
    file["transmission"] = transmissionEncoder.transform(file["transmission"])

    fuelTypeEncoder.fit(file["fuelType"])
    file["fuelType"] = fuelTypeEncoder.transform(file["fuelType"])

    file = file.head(10000)
    # X = file.drop(columns = ['price'])
    # Y = file.price
    # print(file)

    X = file.drop(['price'], axis = 1).to_numpy()
    # X = file[['year', 'mileage', 'tax', 'mpg', 'engineSize']].to_numpy(dtype = 'int')
    Y = file['price'].values.reshape(-1,1)

#     print(X)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 601)
    
    return  X_train, X_test, Y_train, Y_test, file, X, Y

X_train, X_test, Y_train, Y_test, file, X, Y = dataset("../UKUsedCarDataSet/vauxhall.csv") 

# Random Forest Algorithm

In [3]:
class randomForest():

    # Initialize the model with hyperparameters
    def __init__(self, numTrees = 31, minSample = 6, maxDepth = 90, random_state = 0):
        self.numTrees = numTrees # Number of trees in the forest
        self.minSamples = minSample # Minimum number of samples required to split an internal node
        self.maxDepth = maxDepth # Maximum depth of the decision trees
        self.decisionTree = [] # List to hold decision trees of the fores
        self.random_state = random_state # Random state used to generate bootstrap samples
        
    # Function to generate bootstrap samples 
    def bootstrapSample(self, X, y, state):
        sampleNumb, featuresNumb = X.shape
        samples = np.random.RandomState(state).choice(a = sampleNumb, size = sampleNumb, replace = True)
        return X[samples], y[samples]

    # Method to fit the random forest on training data   
    def fit(self, X, y):
        if len(self.decisionTree) > 0:
            self.decisionTree= []

        num_built = 0
        # Loop over the number of trees to build
        for i in range(self.numTrees):
            
            try:
                DT = decisionTree(minSamples = self.minSamples, maxDepth = self.maxDepth) # Create a decision tree object
                X, y = self.bootstrapSample(X, y, self.random_state + i) # Generate bootstrap sample of data
                DT.fit(X, y) # Fit the decision tree on bootstrap sample
                self.decisionTree.append(DT) # Add the decision tree to the forest
                num_built += 1
                # print("NUMBER BUILT: ", num_built)

            except Exception as e:
                print("ERROR: ", e) # Handle any exception that occurs during building a decision tree
                continue
    
    # Method to predict the target values on new data using the random forest
    def predict(self, X):
        y = []
        
        # Loop over the decision trees of the forest to get their predictions
        for tree in self.decisionTree:
            y.append(tree.predict(X))
        y = np.swapaxes(a = y, axis1 = 0, axis2 = 1) 
        predictions = []

        # Loop over the predictions of each row and compute their mean as the final prediction
        for preds in y:
            predictions.append(np.mean(preds))
        return predictions

# RMSE

In [4]:
def rmse(test, pred):
    MSE = np.square(np.subtract(test, pred)).mean()
    return sqrt(MSE)

# Hold-out Validation

In [5]:
def evaluation(train, test, yTrain, yTest):  
    
    myForest = randomForest()  
    myForest.fit(X_train, Y_train) 

    y_pred = myForest.predict(test)
    error = rmse(yTest, y_pred) 
    print('The RMSE value is:', round(error, 2))

evaluation(X_train, X_test, Y_train, Y_test)

The RMSE value is: 4031.11


# Cross-Vaidation

In [None]:
from sklearn.model_selection import KFold
RMSEScores = []
kf = KFold(n_splits=5, shuffle=True, random_state=601)

for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    Y_train, Y_val = Y[train_index], Y[val_index]
    
    # Fit the model on the training data
    RF = randomForest()
    RF.fit(X_train, Y_train)
    
    # Predict on the validation data
    Y_pred = RF.predict(X_val)
    
    # Compute the RMSE score for this fold
    RMSE = np.sqrt(mean_squared_error(Y_val, Y_pred))
    RMSEScores.append(RMSE)

# Compute the average RMSE score across all folds
print("Average RMSE score:", np.mean(RMSEScores))


In [6]:
def userInput():
        chooseBrand = input("Choose your car brand: Audi, BMW, Ford, Hyundai, Mercedes, Skoda, Toyota, Vauxhall or Volkswagen \n")
    
        if chooseBrand == "Audi":
            return "../UKUsedCarDataSet/audi.csv"
        elif chooseBrand == "BMW":
            return "../UKUsedCarDataSet/bmw.csv"
        elif chooseBrand == "Ford":
            return "../UKUsedCarDataSet/ford.csv"
        elif chooseBrand == "Hyundai":
            return "../UKUsedCarDataSet/hyundi.csv"
        elif chooseBrand == "Mercedes":
            return "../UKUsedCarDataSet/merc.csv"
        elif chooseBrand == "Skoda":
            return "../UKUsedCarDataSet/skoda.csv"
        elif chooseBrand == "Toyota":
            return "../UKUsedCarDataSet/toyota.csv"
        elif chooseBrand == "Vauxhall":
            return "../UKUsedCarDataSet/vauxhall.csv"
        elif chooseBrand == "Volkswagen":
            return "../UKUsedCarDataSet/vw.csv"
        else:
            print("Invalid Car Brand")
            return
        
X_train, X_test, Y_train, Y_test, file, X, Y = dataset(userInput())
print("\n ***Training Tree Model***")
myForest = randomForest()  
myForest.fit(X_train, Y_train)

inputPred = []
entries = []

inputPred.append((modelEncoder.transform([input("\nWhat Model is your car? ")]))[0])
inputPred.append(int(input("What year is your car? ")))
inputPred.append((transmissionEncoder.transform([input("What transmission is your car? ")]))[0])
inputPred.append(int(input("How much mileage does your car have? ")))
inputPred.append((fuelTypeEncoder.transform([input("What's your car fuel type? ")]))[0])
inputPred.append(int(input("How much is your cars tax? ")))
inputPred.append(float(input("What's MPG of your car? ")))
inputPred.append(float(input("What the engine size of your car? ")))
entries.append(inputPred)

import time
print("\n ***Predicting***")
start = time.time()
y_pred = myForest.predict([inputPred])

print("\n Predicted price for your car is: £", round(y_pred[0], 2))

print("\n ***Predicted in", time.time() - start,"seconds***")


 ***Training Tree Model***

 ***Predicting***

 Predicted price for your car is: £ 49930.39

 ***Predicted in 0.004992961883544922 seconds***
