# Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import csv
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import plot_tree

# Importing and Splitting data

In [2]:
# Module-level label encoders, one per categorical column. dataset() refits
# them on every call, so afterwards they hold the classes of the most recently
# loaded file; the user-input cell relies on that to encode typed-in values.
modelEncoder = LabelEncoder()
transmissionEncoder = LabelEncoder()
fuelTypeEncoder = LabelEncoder()

def dataset(brand):
    """Load one brand's CSV, label-encode it and return a train/test split.

    The "model", "transmission" and "fuelType" columns are replaced by integer
    codes using the module-level encoders, which are refit on the whole file.
    Only the first 5000 rows are kept afterwards. "price" is the target (as a
    column vector); every other column is a feature.

    Parameters
    ----------
    brand : path of the CSV file to read.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
    """
    frame = pd.read_csv(brand, quotechar='"', skipinitialspace=True)

    # Refit each shared encoder on this file, then swap the text for its codes.
    categorical = (
        ("model", modelEncoder),
        ("transmission", transmissionEncoder),
        ("fuelType", fuelTypeEncoder),
    )
    for column, encoder in categorical:
        frame[column] = encoder.fit(frame[column]).transform(frame[column])

    subset = frame.head(5000)
    features = subset.drop(['price'], axis = 1).to_numpy()
    target = subset['price'].values.reshape(-1, 1)

    X_train, X_test, Y_train, Y_test = train_test_split(features, target, random_state = 601)
    return X_train, X_test, Y_train, Y_test

In [3]:
X_train, X_test, Y_train, Y_test = dataset("UKUsedCarDataSet/audi.csv") # Use the Audi dataset as the default for the decision-tree analysis

# Using Scikit-Learn

In [27]:
# Baseline: scikit-learn's regression tree, scored on the held-out split.
myTree = DecisionTreeRegressor(
    max_depth = 34,
    min_samples_split = 3,
    random_state = 601,
)
myTree.fit(X_train, Y_train)
print("R Squared: ", myTree.score(X_test, Y_test))
# R squared recorded for earlier (max_depth, min_samples_split) settings:
#92, 3 = 0.8593705968859954
#5, 3 = 0.7566846373060692
#34, 3 = 0.8593705968859954

R Squared:  0.8593705968859954


# GridSearchCV to find optimal parameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Trying every integer in 2..99 for all three parameters is 98**3 (about 941k)
# candidates, each fitted 3 times by cv=3, which does not finish in practice.
# Sampling the same range with a stride keeps the search tractable; lower
# PARAM_STEP to refine around the best values once a region is identified.
PARAM_STEP = 8
params = {
    'min_samples_split': list(range(2, 100, PARAM_STEP)),
    'max_leaf_nodes': list(range(2, 100, PARAM_STEP)),
    'max_depth': list(range(2, 100, PARAM_STEP)),
}

# Search over a fresh scikit-learn estimator instead of the notebook-level
# `myTree`: that name is later rebound to the hand-written DTRegressor, which
# GridSearchCV cannot clone. The grid overrides every other tree parameter.
grid_search_cv = GridSearchCV(DecisionTreeRegressor(random_state = 601), params, cv=3, n_jobs=-1)
grid_search_cv.fit(X_train, Y_train)

In [None]:
# Show the estimator (with its parameters) that the grid search selected.
grid_search_cv.best_estimator_

# Visualising Decision Tree:

# The DataFrame built inside dataset() is local to that function, so the
# feature names are recovered here by reading only the CSV header row.
X1 = pd.read_csv("UKUsedCarDataSet/audi.csv", quotechar='"', skipinitialspace=True, nrows=0).drop(['price'], axis = 1)

# `myTree` must still be the fitted scikit-learn tree here; run this cell
# before the later cell that rebinds `myTree` to the hand-written DTRegressor.
plt.figure(figsize=(20, 10), dpi = 200)
# Pass a plain list: not every scikit-learn release accepts a pandas Index.
plot_tree(myTree, feature_names = list(X1.columns), filled=True);

# RMSE

In [17]:
def rmse(h, y):
    """Return the root of the mean squared error between ``h`` and ``y``."""
    squaredError = mean_squared_error(h, y)
    return sqrt(squaredError)

# Node Class

In [5]:
# Node class: one instance per node of the regression tree
class Node():
    """A single tree node, used both for decision nodes and for leaves.

    Attributes
    ----------
    feature, limit : column index and threshold of the split test.
    leftSide, rightSide : child nodes of a decision node.
    gain : gain recorded for the split stored in this node.
    leaf : value stored in a leaf; left as None on decision nodes.
    """

    def __init__(self, feature = None, limit = None, leftSide = None, rightSide = None, gain = None, leaf = None):
        # Value carried by a leaf.
        self.leaf = leaf

        # Description of the split.
        self.feature, self.limit = feature, limit
        self.gain = gain

        # Sub-trees below the split.
        self.leftSide, self.rightSide = leftSide, rightSide

# Decision Tree Regression Class

In [6]:
class DTRegressor():
    """Regression tree grown greedily by variance reduction.

    Parameters
    ----------
    minSamples : int
        A node is only considered for splitting when it holds at least this
        many rows.
    maxDepth : int
        A node at depth ``d`` (the root is depth 0) is only considered for
        splitting while ``d <= maxDepth``; leaves can therefore sit at depth
        ``maxDepth + 1``.

    Attributes
    ----------
    root : Node or None
        Root of the fitted tree; None until ``fit`` has been called.
    """

    def __init__(self, minSamples, maxDepth):
        self.root = None
        self.minSamples = minSamples
        self.maxDepth = maxDepth

    def infoGain(self, parent, leftNode, rightNode):
        """Return the variance of ``parent`` minus the size-weighted variance of the two children."""
        leftWeight = len(leftNode) / len(parent)
        rightWeight = len(rightNode) / len(parent)

        return np.var(parent) - (leftWeight * np.var(leftNode) + rightWeight * np.var(rightNode))

    def splitTree(self, trainingSet, feature, limit):
        """Partition the rows of ``trainingSet`` on ``row[feature] <= limit``.

        Returns ``(leftBranch, rightBranch)`` as arrays; rows that satisfy the
        test go left, all other rows go right. Row order is preserved.
        """
        trainingSet = np.asarray(trainingSet)
        if trainingSet.size == 0:
            # Nothing to partition; both branches are empty.
            return trainingSet, trainingSet

        # One vectorised comparison instead of a Python loop over every row.
        goesLeft = trainingSet[:, feature] <= limit
        return trainingSet[goesLeft], trainingSet[~goesLeft]

    def bestSplit(self, trainingSet, X):
        """Find the (feature, threshold) pair with the largest gain.

        ``trainingSet`` holds the features followed by the target in its last
        column; ``X`` is only used for its number of feature columns. Every
        distinct value of every feature is tried as a threshold.

        Returns a dict with keys "feature", "limit", "leftSide", "rightSide"
        and "gain", or an empty dict when no threshold yields two non-empty
        branches.
        """
        bestSplitt = {}
        biggestGain = -1
        # The parent's targets do not depend on the candidate split.
        parent = trainingSet[:, -1]

        for feature in range(X.shape[1]):
            thresholds = np.unique(trainingSet[:, feature])
            for j in thresholds: #j = threshold
                leftSide, rightSide = self.splitTree(trainingSet, feature, j) #splits node into 2 sub-trees
                if len(leftSide) == 0 or len(rightSide) == 0:
                    continue

                currentGain = self.infoGain(parent, leftSide[:, -1], rightSide[:, -1])
                # Strict comparison: the first split reaching a given gain wins ties.
                if currentGain > biggestGain:
                    bestSplitt = {
                        "feature": feature,
                        "limit": j,
                        "leftSide": leftSide,
                        "rightSide": rightSide,
                        "gain": currentGain,
                    }
                    biggestGain = currentGain

        return bestSplitt

    def treeBuild(self, trainingSet, currentDepth = 0):
        """Recursively build the subtree for ``trainingSet`` and return its root Node.

        A node is split only when it has at least ``minSamples`` rows, its depth
        is at most ``maxDepth`` and the best split has a strictly positive gain;
        otherwise it becomes a leaf holding the mean of its targets.
        """
        X = trainingSet[:, :-1] # everything but the last column
        Y = trainingSet[:, -1]  # only the last column

        if X.shape[0] >= self.minSamples and currentDepth <= self.maxDepth:
            bestSplitNode = self.bestSplit(trainingSet, X)

            if "gain" in bestSplitNode and bestSplitNode["gain"] > 0:
                leftTree = self.treeBuild(bestSplitNode["leftSide"], currentDepth + 1)
                rightTree = self.treeBuild(bestSplitNode["rightSide"], currentDepth + 1)
                return Node(bestSplitNode["feature"], bestSplitNode["limit"], leftTree, rightTree, bestSplitNode["gain"])

        return Node(leaf = np.mean(Y)) # leaf value is the mean target of the rows that reached it

    def predictionLoop(self, testRow, root):
        """Walk from ``root`` down to a leaf for one feature row and return the leaf value."""
        node = root
        # `is None` rather than `!= None`: an identity test cannot be affected
        # by how the stored leaf value implements comparison.
        while node.leaf is None:
            if testRow[node.feature] <= node.limit:
                node = node.leftSide
            else:
                node = node.rightSide
        return node.leaf

    def predict(self, xTest):
        """Return a list with one prediction per row of ``xTest``."""
        return [self.predictionLoop(row, self.root) for row in xTest]

    def fit(self, X, Y):
        """Build the tree from features ``X`` and targets ``Y``.

        ``Y`` may be a column vector of shape (n, 1) or a flat sequence of
        length n; it is reshaped to a column before being joined to ``X``.
        """
        X = np.asarray(X)
        Y = np.asarray(Y).reshape(len(X), -1)
        trainingSet = np.concatenate((X, Y), axis=1) #Joins training data back together
        self.root = self.treeBuild(trainingSet)

# Training the algorithm

In [8]:
# Fit the hand-written regressor with minSamples = 3 and maxDepth = 34.
# NOTE: this rebinds `myTree`, which earlier in the notebook referred to the
# scikit-learn DecisionTreeRegressor.
myTree = DTRegressor(3, 34)
myTree.fit(X_train, Y_train) #trains the model using the training set
# Figures noted while tuning (presumably RMSE per "minSamples, maxDepth" pair; confirm):
#3, 11 = 4331.929325238453
#3, 34 = 4355.8298808608215

# Evaluation

In [18]:
def evaluation(train, test, yTrain, yTest, model = None):
    """Print the RMSE of a fitted model on a test set, then every individual error.

    Parameters
    ----------
    train, yTrain : accepted for call compatibility; not used.
    test : feature rows to predict.
    yTest : true targets for ``test``, indexable row by row.
    model : fitted object with a ``predict`` method. Defaults to the
        notebook-level ``myTree``.
    """
    if model is None:
        model = myTree

    # Predict on the rows that were passed in, not on the global X_test.
    y_pred = model.predict(test)
    error = rmse(yTest, y_pred)
    print('The RMSE value is:', error)
    for actual, predicted in zip(yTest, y_pred):
        print("\nOriginal value:", actual, "vs the predicted value:", predicted)
        print("The difference is:", actual - predicted)
evaluation(X_train, X_test, Y_train, Y_test)
# RMSE values noted from earlier runs:
# Old RMSE value was 5486.187742617547
#4355.8298808608215 = 3, 34
#4331.929325238453 = 3, 11
#5050.108048448073 with entropy method
#3027.4338424903326 with 5000 values

The RMSE value is: 3027.4338424903326

Original value: [17498] vs the predicted value: 15498.0
The difference is: [2000.]

Original value: [25990] vs the predicted value: 25495.0
The difference is: [495.]

Original value: [26495] vs the predicted value: 20391.0
The difference is: [6104.]

Original value: [14791] vs the predicted value: 17490.0
The difference is: [-2699.]

Original value: [26888] vs the predicted value: 26888.0
The difference is: [0.]

Original value: [29500] vs the predicted value: 28997.5
The difference is: [502.5]

Original value: [12498] vs the predicted value: 11995.0
The difference is: [503.]

Original value: [11980] vs the predicted value: 10550.0
The difference is: [1430.]

Original value: [12898] vs the predicted value: 12998.0
The difference is: [-100.]

Original value: [20452] vs the predicted value: 15783.5
The difference is: [4668.5]

Original value: [15991] vs the predicted value: 16742.5
The difference is: [-751.5]

Original value: [12750] vs the predicte

# User Input Predictions

In [17]:
inputPred = []  # encoded feature values of the one car described by the user, filled in below
entries = []    # receives the completed inputPred row

def userInput():
    """Ask for a car brand until a supported one is entered and return its CSV path.

    Matching ignores letter case and surrounding whitespace. An unsupported
    answer prints "Invalid Car Brand" and the question is asked again, so the
    function always returns a path (never None).
    """
    brandFiles = {
        "audi": "UKUsedCarDataSet/audi.csv",
        "bmw": "UKUsedCarDataSet/bmw.csv",
        "ford": "UKUsedCarDataSet/ford.csv",
        "hyundai": "UKUsedCarDataSet/hyundi.csv",
        "mercedes": "UKUsedCarDataSet/merc.csv",
        "skoda": "UKUsedCarDataSet/skoda.csv",
        "toyota": "UKUsedCarDataSet/toyota.csv",
        "vauxhall": "UKUsedCarDataSet/vauxhall.csv",
        "volkswagen": "UKUsedCarDataSet/vw.csv",
    }

    # Loop instead of recursing: the previous recursive retry discarded its
    # result and fell through to a bare `return`, handing None to the caller.
    while True:
        chooseBrand = input("Choose your car brand: Audi, BMW, Ford, Hyundai, Mercedes, Skoda, Toyota, Vauxhall or Volkswagen \n")
        path = brandFiles.get(chooseBrand.strip().lower())
        if path is not None:
            return path
        print("Invalid Car Brand")

# Reload and re-encode the data for the brand the user picks, then retrain.
# dataset() also refits the shared encoders on that brand's file.
X_train, X_test, Y_train, Y_test = dataset(userInput())
print("\n ***Training Tree Model***")
myTree = DTRegressor(3, 93)  
myTree.fit(X_train, Y_train)

print("\n List of models:")
print(list(modelEncoder.classes_))
# The answers are appended in a fixed order, which has to match the order of
# the feature columns dataset() produced (presumably model, year, transmission,
# mileage, fuelType, tax, mpg, engineSize; verify against the CSV header).
# Text answers go through the encoders fitted above.
# NOTE(review): an answer the encoder has not seen is expected to raise; confirm and handle if needed.
inputPred.append((modelEncoder.transform([input("\nWhat Model is your car? ")]))[0])
inputPred.append(int(input("What year is your car? ")))
inputPred.append((transmissionEncoder.transform([input("What transmission is your car? ")]))[0])
inputPred.append(int(input("How much mileage does your car have? ")))
inputPred.append((fuelTypeEncoder.transform([input("What's your car fuel type? ")]))[0])
inputPred.append(int(input("How much is your cars tax? ")))
inputPred.append(float(input("What's MPG of your car? ")))
inputPred.append(float(input("What the engine size of your car? ")))
entries.append(inputPred)

import time
print("\n ***Predicting***")
start = time.time()
# predict() expects a sequence of rows, hence the single row wrapped in a list.
y_pred = myTree.predict([inputPred])
print("\n Predicted price for your car is: £", y_pred[0])

# The elapsed time covers the prediction and the print of its result.
print("\n ***Predicted in", time.time() - start,"seconds***")

# RS6,2016,Semi-Auto,49050,Petrol,325,29.4,4.0 -- Price is £44,985   old Pred = £41,233.30,  new pred = £45492.50
# BMW,5 Series,2019,Semi-Auto,4405,Petrol,145,48.7,2.0     Price = £26,000 old Pred = £27,077.49, new pred = £26500.00

Choose your car brand: Audi, BMW, Ford, Hyundai, Mercedes, Skoda, Toyota, Vauxhall or Volkswagen 
Audi

 ***Training Tree Model***

 List of models:
['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'Q2', 'Q3', 'Q5', 'Q7', 'Q8', 'R8', 'RS3', 'RS4', 'RS5', 'RS6', 'RS7', 'S3', 'S4', 'S5', 'S8', 'SQ5', 'SQ7', 'TT']

What Model is your car? RS6
What year is your car? 2016
What transmission is your car? Semi-Auto
How much mileage does your car have? 49050
What's your car fuel type? Petrol
How much is your cars tax? 325
What's MPG of your car? 29.4
What the engine size of your car? 4

 ***Predicting***

 Predicted price for your car is: £ 45492.5

 ***Predicted in 0.0007822513580322266 seconds***
