# Import

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import csv
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import plot_tree


import os, sys
currentdir = os.path.dirname(os.path.realpath("randomForest.ipynb"))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

from decisionTree import decisionTree
from sklearn.utils import resample

# Importing and Splitting data

In [None]:
# One independent label encoder per categorical column. They live at module
# level so that code outside dataset() can encode values with the same
# mapping that dataset() fitted.
modelEncoder, transmissionEncoder, fuelTypeEncoder = (LabelEncoder() for _ in range(3))

def dataset(brand, max_rows = 5000, random_state = 601):
    """Load one brand's used-car CSV and return a train/test split of it.

    Rows whose 'year' lies outside the 1.5 * IQR fences are removed, as are
    rows containing any missing value. The 'model', 'transmission' and
    'fuelType' columns are label-encoded with the module-level encoders,
    which are re-fitted on every call and therefore always describe the most
    recently loaded file.

    Parameters
    ----------
    brand : str
        Path of the CSV file to load.
    max_rows : int or None, optional
        Keep only the first `max_rows` cleaned rows (default 5000). None
        keeps every row.
    random_state : int, optional
        Seed forwarded to train_test_split (default 601).

    Returns
    -------
    X_train, X_test, Y_train, Y_test : numpy.ndarray
        Features are every column except 'price'; targets are 'price'
        reshaped to an (n, 1) column.

    Raises
    ------
    ValueError
        If `brand` is None.
    """
    if brand is None:
        raise ValueError("dataset() needs the path of a CSV file, got None")

    file = pd.read_csv(brand, quotechar='"', skipinitialspace=True)

    # Flag outliers with a boolean mask instead of writing NaN into the
    # column: assigning NaN to an integer column is an upcasting setitem,
    # which recent pandas versions deprecate.
    keep = pd.Series(True, index = file.index)
    for column in ['year']:
        q75, q25 = np.percentile(file.loc[:, column], [75, 25])
        iqr = q75 - q25

        upper = q75 + (1.5 * iqr)
        lower = q25 - (1.5 * iqr)

        keep &= ~((file[column] < lower) | (file[column] > upper))

    # copy() so the column assignments below act on an independent frame.
    file = file.loc[keep].dropna(axis = 0).copy()

    # Fit on the full cleaned file (before the row cap) so every category
    # present in the file receives a code.
    file["model"] = modelEncoder.fit_transform(file["model"])
    file["transmission"] = transmissionEncoder.fit_transform(file["transmission"])
    file["fuelType"] = fuelTypeEncoder.fit_transform(file["fuelType"])

    if max_rows is not None:
        file = file.head(max_rows)

    X = file.drop(['price'], axis = 1).to_numpy()
    Y = file['price'].values.reshape(-1,1)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = random_state)

    return  X_train, X_test, Y_train, Y_test

In [None]:
X_train, X_test, Y_train, Y_test = dataset("../UKUsedCarDataSet/audi.csv") # Use the Audi dataset as the default for the random forest analysis

# Using Scikit-Learn

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: scikit-learn's random forest with default hyperparameters and a fixed seed.
regressor = RandomForestRegressor(random_state = 601)
# Y_train is an (n, 1) column; ravel() hands scikit-learn the 1-D target it
# expects and avoids its DataConversionWarning. The fitted model is the same.
regressor.fit(X_train, Y_train.ravel())
# score() reports the coefficient of determination on the held-out split.
print("R Squared: ", regressor.score(X_test, Y_test))

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space sampled by the randomized search (each hyperparameter ranges over 2..99).
random_grid = params = {'min_samples_split': list(range(2, 100)), 'max_depth': list(range(2, 100)), 'n_estimators': list(range(2, 100))}

# Alternative, narrower search space kept for reference:
#{'min_samples_split': [3, 4, 5, 6], 'max_depth': list(range(1, 6)), 'n_estimators': list(range(1, 100))}

# 100 sampled candidates x 3 folds = 300 fits, run in parallel on all cores.
rf_random = RandomizedSearchCV(estimator = regressor, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=601, n_jobs = -1)
# Fit the random search model. ravel() passes a 1-D target so the
# column-vector warning is not emitted on every one of the fits.
rf_random.fit(X_train, Y_train.ravel())

In [None]:
# Show the hyperparameter combination the randomized search scored best.
rf_random.best_params_

In [None]:
# Refit with hard-coded hyperparameters (presumably copied from
# rf_random.best_params_ -- verify after re-running the search).
regressor2 = RandomForestRegressor(random_state = 601, n_estimators = 31, min_samples_split = 6, max_depth = 90)
# ravel() passes a 1-D target and avoids scikit-learn's column-vector warning.
regressor2.fit(X_train, Y_train.ravel())
print("R Squared: ", regressor2.score(X_test, Y_test))

# Recorded results. The triple appears to be
# n_estimators, min_samples_split, max_depth = R squared (the last row matches the call above).
#22,6,96 = 0.879540341188823
#25,6,99 = 0.8833043912108435
#31,6,90 = 0.8853793922050485

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over a narrower region: 8 x 10 x 40 = 3200 candidates.
params = {'min_samples_split': list(range(2, 10)), 'max_depth': list(range(90, 100)), 'n_estimators': list(range(10, 50))}

# 3200 candidates x 3 folds is 9600 fits, so use every core, as the
# randomized search above already does.
grid_search_cv = GridSearchCV(regressor, params, cv=3, n_jobs = -1)
# ravel() passes a 1-D target and avoids a column-vector warning on every fit.
grid_search_cv.fit(X_train, Y_train.ravel())

In [None]:
# Show the estimator, with its hyperparameters, that the grid search scored best.
grid_search_cv.best_estimator_

In [None]:
# Refit with a second hard-coded hyperparameter set.
# NOTE(review): max_depth = 5 lies outside the 90..99 range searched by the
# grid search in this notebook -- confirm where these values came from.
regressor3 = RandomForestRegressor(random_state = 601, n_estimators = 23, min_samples_split = 3, max_depth = 5)
# ravel() passes a 1-D target and avoids scikit-learn's column-vector warning.
regressor3.fit(X_train, Y_train.ravel()) #14, 5, 5 (author's note; presumably another combination that was tried)
print("R Squared: ", regressor3.score(X_test, Y_test))
# Recorded result for n_estimators = 23, min_samples_split = 3, max_depth = 5:
#23, 3, 5 = 0.8554524402174026

# Custom Random Forest Implementation

In [None]:
class randomForest():
    """Bagged ensemble of `decisionTree` regressors whose outputs are averaged.

    Every tree is trained on its own bootstrap sample (rows drawn with
    replacement) of the training data.
    """

    def __init__(self, numTrees = 31, minSample = 6, maxDepth = 90, random_state = 0):
        """Store the hyperparameters; no tree is built until fit() is called.

        numTrees: number of trees to train.
        minSample: forwarded to every decisionTree as `minSamples`.
        maxDepth: forwarded to every decisionTree as `maxDepth`.
        random_state: base seed; tree i is sampled with seed random_state + i.
        """
        self.numTrees = numTrees
        self.minSamples = minSample
        self.maxDepth = maxDepth
        self.decisionTree = []  # fitted trees, filled in by fit()
        self.random_state = random_state

    def _sample(self, X, y, state):
        """Return a bootstrap sample of (X, y) drawn with the seed `state`.

        As many rows as X contains are drawn with replacement; the same row
        indices are applied to X and y so features and targets stay aligned.
        """
        sampleNumb = X.shape[0]
        samples = np.random.RandomState(state).choice(a = sampleNumb, size = sampleNumb, replace = True)
        return X[samples], y[samples]

    def fit(self, X, y):
        """Train `numTrees` trees, replacing any previously fitted ones.

        X, y: training features and targets with one row per sample; both
        are converted to numpy arrays so array-likes are accepted.

        A tree that fails to build is reported and skipped so that one bad
        bootstrap sample does not abort the whole forest.

        Raises:
            ValueError: if X and y have different numbers of rows.
            RuntimeError: if every tree failed, since the forest would
                otherwise be left silently empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of rows")

        self.decisionTree = []
        lastError = None

        for i in range(self.numTrees):
            try:
                DT = decisionTree(minSamples = self.minSamples, maxDepth = self.maxDepth)
                # A distinct but reproducible seed per tree.
                _X, _y = self._sample(X, y, self.random_state + i)
                DT.fit(_X, _y)
                self.decisionTree.append(DT)
                print("NUMBER BUILT: ", len(self.decisionTree))
            except Exception as e:
                print("ERROR: ", e)
                lastError = e

        if not self.decisionTree and lastError is not None:
            raise RuntimeError("randomForest.fit could not build any decision tree") from lastError

    def predict(self, X):
        """Return a list holding, for each row of X, the mean of the trees' predictions.

        Assumes every tree's predict(X) returns one entry per row of X
        (TODO confirm against decisionTree).

        Raises:
            ValueError: if the forest holds no fitted tree.
        """
        if not self.decisionTree:
            raise ValueError("randomForest has no fitted trees; call fit() first")

        # Shape (trees, samples, ...) -> (samples, trees, ...), so that each
        # leading entry gathers every tree's output for one sample.
        perTree = np.asarray([tree.predict(X) for tree in self.decisionTree])
        perSample = np.swapaxes(perTree, 0, 1)
        return [np.mean(treeOutputs) for treeOutputs in perSample]

In [None]:
def rmse(h, y):
    """Return the root-mean-squared error between the values in `h` and `y`."""
    meanSquared = mean_squared_error(h, y)
    return sqrt(meanSquared)

In [None]:
# Build the custom forest with its default hyperparameters and train it on the
# current training split; fit() prints a progress line after each tree is built.
myForest = randomForest()  
myForest.fit(X_train, Y_train)

In [None]:
def userInput():
    """Ask for a car brand and return the path of that brand's CSV file.

    Matching ignores letter case and surrounding whitespace, so "audi" and
    " Audi " are both accepted.

    Returns:
        The CSV path for the chosen brand, or None (after printing
        "Invalid Car Brand") when the brand is not recognised.
    """
    brandFiles = {
        "audi": "../UKUsedCarDataSet/audi.csv",
        "bmw": "../UKUsedCarDataSet/bmw.csv",
        "ford": "../UKUsedCarDataSet/ford.csv",
        "hyundai": "../UKUsedCarDataSet/hyundi.csv",
        "mercedes": "../UKUsedCarDataSet/merc.csv",
        "skoda": "../UKUsedCarDataSet/skoda.csv",
        "toyota": "../UKUsedCarDataSet/toyota.csv",
        "vauxhall": "../UKUsedCarDataSet/vauxhall.csv",
        "volkswagen": "../UKUsedCarDataSet/vw.csv",
    }

    chooseBrand = input("Choose your car brand: Audi, BMW, Ford, Hyundai, Mercedes, Skoda, Toyota, Vauxhall or Volkswagen \n")

    brandPath = brandFiles.get(chooseBrand.strip().casefold())
    if brandPath is None:
        print("Invalid Car Brand")
    return brandPath
        
import time

# userInput() returns None for an unrecognised brand, which dataset() cannot
# load, so keep prompting until a supported brand is chosen.
brandPath = userInput()
while brandPath is None:
    brandPath = userInput()

X_train, X_test, Y_train, Y_test = dataset(brandPath)

# dataset() re-fits the label encoders on the chosen brand, so a forest
# trained on a previously loaded brand would read the encoded features with
# the wrong category codes. Retrain on the data that was just loaded.
print("\n ***Training Tree Model***")
myForest = randomForest()
myForest.fit(X_train, Y_train)

inputPred = []
entries = []

# Answers are collected in the order model, year, transmission, mileage,
# fuelType, tax, mpg, engineSize -- assumed to match the CSV column order
# once 'price' is dropped (TODO confirm against the data files).
# The encoders raise an error for a category that is absent from the loaded file.
inputPred.append((modelEncoder.transform([input("\nWhat Model is your car? ")]))[0])
inputPred.append(int(input("What year is your car? ")))
inputPred.append((transmissionEncoder.transform([input("What transmission is your car? ")]))[0])
inputPred.append(int(input("How much mileage does your car have? ")))
inputPred.append((fuelTypeEncoder.transform([input("What's your car fuel type? ")]))[0])
inputPred.append(int(input("How much is your cars tax? ")))
inputPred.append(float(input("What's MPG of your car? ")))
inputPred.append(float(input("What the engine size of your car? ")))
entries.append(inputPred)

print("\n ***Predicting***")
start = time.time()
# predict() expects a batch of rows, hence the single-row list.
y_pred = myForest.predict([inputPred])
print("\n Predicted price for your car is: £", round(y_pred[0], 2))

print("\n ***Predicted in", time.time() - start,"seconds***")

# Example inputs with their listed prices:
# RS6,2016,Semi-Auto,49050,Petrol,325,29.4,4.0 -- Price is £44,985
# BMW,5 Series,2019,Semi-Auto,4405,Petrol,145,48.7,2.0     Price = £26,000


In [None]:
def evaluation(train, test, yTrain, yTest):
    """Print the RMSE of `myForest` on a test split, plus an RMSE-based score.

    The second figure, printed as "R VALUE", is 1 - RMSE / mean(yTest): the
    error relative to the average target of the data being evaluated. It is
    not the coefficient of determination.

    Parameters:
        train, yTrain: accepted for call compatibility; not used.
        test: feature rows to predict.
        yTest: true target values for `test`.
    """
    y_pred = myForest.predict(test)
    error = rmse(yTest, y_pred)
    print('The RMSE value is:', round(error, 2))

    # Normalise by the mean of the targets actually being evaluated, so the
    # score stays meaningful whichever brand is loaded (previously the mean
    # was always re-read from the Audi file).
    mean = np.mean(yTest)

    print("R VALUE:", 1 - (error/mean))



# Evaluate the custom forest on the current train/test split.
evaluation(X_train, X_test, Y_train, Y_test)

# RMSE values recorded from earlier runs, labelled as the author noted them
# ("1000" / "5000" presumably refer to the number of rows kept by dataset() -- verify):
#rmse with 1000: 5771.8260997248035 
#rmse with 5000: 3414.0288745712533
#rmse with resample None: 2431.746126478666
#rmse with resample 601: 3414.0288745712533
#rmse with for loop: 2436.033694659025
#rmse using my method: 2436.033694659025
