# Import

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import csv
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import plot_tree

from decisionTree import DTRegressor

# Importing and Splitting data

In [37]:
modelEncoder = LabelEncoder()
transmissionEncoder = LabelEncoder()
fuelTypeEncoder = LabelEncoder()

def dataset(brand):

    file = pd.read_csv(brand, quotechar='"', skipinitialspace=True)

    modelEncoder.fit(file["model"])
    file["model"] = modelEncoder.transform(file["model"])

    transmissionEncoder.fit(file["transmission"])
    file["transmission"] = transmissionEncoder.transform(file["transmission"])

    fuelTypeEncoder.fit(file["fuelType"])
    file["fuelType"] = fuelTypeEncoder.transform(file["fuelType"])

    file = file.head(1000)
    # X = file.drop(columns = ['price'])
    # Y = file.price
    # print(file)

    X = file.drop(['price'], axis = 1).to_numpy()
    # X = file[['year', 'mileage', 'tax', 'mpg', 'engineSize']].to_numpy(dtype = 'int')
    Y = file['price'].values.reshape(-1,1)

#     print(X)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 601)
    
    return  X_train, X_test, Y_train, Y_test

In [38]:
X_train, X_test, Y_train, Y_test = dataset("UKUsedCarDataSet/audi.csv") # Use Audi dataset as default for KNN analysis

# Using Scikit-Learn

In [184]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(random_state = 601)
regressor.fit(X_train, Y_train)
print("R Squared: ", regressor.score(X_test, Y_test))

R Squared:  0.8784516562563071


In [193]:
from sklearn.model_selection import RandomizedSearchCV
random_grid = params = {'min_samples_split': list(range(2, 100)), 'max_depth': list(range(2, 100)), 'n_estimators': list(range(2, 100))}

#{'min_samples_split': [3, 4, 5, 6], 'max_depth': list(range(1, 6)), 'n_estimators': list(range(1, 100))}

rf_random = RandomizedSearchCV(estimator = regressor, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=601, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=601),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, 31, ...],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7,
                                                              8, 9, 10, 11, 12,
                                                              13, 14, 15, 16,
                                                              17, 18, 19, 20,
                                                              21, 22, 23, 24,
                                                              25, 26, 27, 28,
                      

In [194]:
rf_random.best_params_

{'n_estimators': 25, 'min_samples_split': 2, 'max_depth': 99}

In [15]:
regressor2 = RandomForestRegressor(random_state = 601, n_estimators = 31, min_samples_split = 6, max_depth = 90)
regressor2.fit(X_train, Y_train)
print("R Squared: ", regressor2.score(X_test, Y_test))

#22,6,96 = 0.879540341188823
#25,6,99 = 0.8833043912108435
#31,6,90 = 0.8853793922050485

  regressor2.fit(X_train, Y_train)


R Squared:  0.8853793922050485


In [195]:
from sklearn.model_selection import GridSearchCV
params = {'min_samples_split': list(range(2, 10)), 'max_depth': list(range(90, 100)), 'n_estimators': list(range(10, 50))}

grid_search_cv = GridSearchCV(regressor, params, cv=3)
grid_search_cv.fit(X_train, Y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=601),
             param_grid={'max_depth': [90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
                         'n_estimators': [10, 11, 12, 13, 14, 15, 16, 17, 18,
                                          19, 20, 21, 22, 23, 24, 25, 26, 27,
                                          28, 29, 30, 31, 32, 33, 34, 35, 36,
                                          37, 38, 39, ...]})

In [203]:
grid_search_cv.best_estimator_

RandomForestRegressor(max_depth=90, n_estimators=31, random_state=601)

In [11]:
regressor3 = RandomForestRegressor(random_state = 601, n_estimators = 23, min_samples_split = 3, max_depth = 5)
regressor3.fit(X_train, Y_train) #14, 5, 5
print("R Squared: ", regressor3.score(X_test, Y_test))
#23, 3, 5 = 0.8554524402174026

R Squared:  0.8554524402174026


# MyCODE

In [39]:
class forestRegression():

    def __init__(self, numTrees = 31, minSample = 6, maxDepth = 90):
        self.numTrees = numTrees
        self.minSamples = minSample
        self.maxDepth = maxDepth
        self.decisionTree = []
        
    @staticmethod
    def _sample(X, y):
        n_rows, n_cols = X.shape
        samples = np.random.RandomState(601).choice(a = n_rows, size = n_rows, replace = True)
        # samples =rnd.choice(a = n_rows, size = n_rows, replace = True)
        return X[samples], y[samples]
        
    def fit(self, X, y):
        if len(self.decisionTree) > 0:
            self.decisionTree= []
        print("HELLO")
        num_built = 0
        while num_built < self.numTrees:
            print("NUMBER BUILT: ", num_built)
            try:
                clf = DTRegressor(minSamples = self.minSamples, maxDepth = self.maxDepth) ##try 3, then 1
#                 print("GOT HEREE")
                _X, _y = self._sample(X, y)
#                 print("sample X: ", _X)
#                 print("sample y: ", _y)
                clf.fit(_X, _y)
#                 print("GOT HEREE 2")
                self.decisionTree.append(clf)
#                 print("GOT HEREE 3")
                num_built += 1
            except Exception as e:
                print("ERROR: ", e)
                continue
    
    def predict(self, X):
        y = []
        for tree in self.decisionTree:
            y.append(tree.predict(X))
        y = np.swapaxes(a = y, axis1 = 0, axis2 = 1) 
        predictions = []
        for preds in y:
            predictions.append(np.mean(preds))
        return predictions

In [None]:
def userInput():
        chooseBrand = input("Choose your car brand: Audi, BMW, Ford, Hyundai, Mercedes, Skoda, Toyota, Vauxhall or Volkswagen \n")
    
        if chooseBrand == "Audi":
            return "UKUsedCarDataSet/audi.csv"
        elif chooseBrand == "BMW":
            return "UKUsedCarDataSet/bmw.csv"
        elif chooseBrand == "Ford":
            return "UKUsedCarDataSet/ford.csv"
        elif chooseBrand == "Hyundai":
            return "UKUsedCarDataSet/hyundi.csv"
        elif chooseBrand == "Mercedes":
            return "UKUsedCarDataSet/merc.csv"
        elif chooseBrand == "Skoda":
            return "UKUsedCarDataSet/skoda.csv"
        elif chooseBrand == "Toyota":
            return "UKUsedCarDataSet/toyota.csv"
        elif chooseBrand == "Vauxhall":
            return "UKUsedCarDataSet/vauxhall.csv"
        elif chooseBrand == "Volkswagen":
            return "UKUsedCarDataSet/vw.csv"
        else:
            print("Invalid Car Brand")
            return
        
X_train, X_test, Y_train, Y_test = dataset(userInput())
print("\n ***Training Tree Model***")
myForest = forestRegression()  
myForest.fit(X_train, Y_train)

inputPred = []
entries = []

inputPred.append((modelEncoder.transform([input("\nWhat Model is your car? ")]))[0])
inputPred.append(int(input("What year is your car? ")))
inputPred.append((transmissionEncoder.transform([input("What transmission is your car? ")]))[0])
inputPred.append(int(input("How much mileage does your car have? ")))
inputPred.append((fuelTypeEncoder.transform([input("What's your car fuel type? ")]))[0])
inputPred.append(int(input("How much is your cars tax? ")))
inputPred.append(float(input("What's MPG of your car? ")))
inputPred.append(float(input("What the engine size of your car? ")))
entries.append(inputPred)

import time
print("\n ***Predicting***")
start = time.time()
y_pred = myForest.predict([inputPred])
# {0:.2f}'.format()
print("\n Predicted price for your car is: £", y_pred[0])

print("\n ***Predicted in", time.time() - start,"seconds***")

# RS6,2016,Semi-Auto,49050,Petrol,325,29.4,4.0 -- Price is £44,985
# BMW,5 Series,2019,Semi-Auto,4405,Petrol,145,48.7,2.0     Price = £26,000


In [40]:
def rmse(h, y):
  return sqrt(mean_squared_error(h, y))

In [41]:
myForest = forestRegression()  
myForest.fit(X_train, Y_train)

HELLO
NUMBER BUILT:  0
STARTED
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
diffff errorrrr:  'gain'
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREA

NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
NOT GREATE THAN 0
ENDED???? <decisionTree.Node object at 0x0000016857D14250>
NUMBER BUILT:  2
STARTED


KeyboardInterrupt: 

In [20]:
def evaluation(train, test, yTrain, yTest):    
    y_pred = myForest.predict(X_test)
    error = rmse(yTest, y_pred) 
    print('The RMSE value is:', error)
    for i in range(len(y_pred)):
        print("\nOriginal value:", yTest[i], "vs the predicted value:", y_pred[i])
        print("The difference is:", yTest[i] - y_pred[i])
evaluation(X_train, X_test, Y_train, Y_test)

The RMSE value is: 5771.8260997248035

Original value: [18498] vs the predicted value: 15023.25
The difference is: [3474.75]

Original value: [22995] vs the predicted value: 22749.0
The difference is: [246.]

Original value: [12495] vs the predicted value: 50225.0
The difference is: [-37730.]

Original value: [14500] vs the predicted value: 15247.5
The difference is: [-747.5]

Original value: [24995] vs the predicted value: 26381.0
The difference is: [-1386.]

Original value: [10200] vs the predicted value: 13446.25
The difference is: [-3246.25]

Original value: [21995] vs the predicted value: 18196.25
The difference is: [3798.75]

Original value: [16800] vs the predicted value: 20248.0
The difference is: [-3448.]

Original value: [12798] vs the predicted value: 16495.0
The difference is: [-3697.]

Original value: [20498] vs the predicted value: 34745.0
The difference is: [-14247.]

Original value: [11498] vs the predicted value: 13600.0
The difference is: [-2102.]

Original value: [48