# Imports

In [1]:
import numpy as np 
import pandas as pd 
from math import sqrt
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# Linear Regression Algorithm

In [2]:
class linearRegression():
    """Linear regression trained with batch gradient descent on mean squared error.

    A bias (intercept) term is handled internally by prepending a column of
    ones to the feature matrix, so ``weights[0]`` is the intercept and
    ``weights[1:]`` are the per-feature coefficients.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the gradient on every iteration.
    iterations : int
        Number of full-batch gradient descent steps performed by ``fit``.
    """

    # Constructor to initialize the learning rate and the number of iterations
    def __init__(self, learning_rate, iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.weights = None  # Set by fit(); stays None until the model is trained

    @staticmethod
    def _with_bias(features):
        """Return ``features`` as a float array with a leading column of 1's (bias term)."""
        features = np.asarray(features, dtype=float)
        return np.insert(features, 0, 1.0, axis=1)

    # Fits the linear regression model to the given training data.
    def fit(self, X_train, Y_train):
        """Learn the weights from the training data and return ``self``.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        Y_train : array-like with n_samples values
            Flattened to 1-D before use, so a column vector of shape
            (n_samples, 1) is accepted as well.

        Raises
        ------
        ValueError
            If the number of targets differs from the number of samples.
        """
        X = self._with_bias(X_train)

        # Flatten the target: a (n, 1) column minus a (n,) prediction would
        # otherwise broadcast to an (n, n) matrix instead of n residuals.
        y = np.asarray(Y_train, dtype=float).ravel()
        n_samples = X.shape[0]
        if y.shape[0] != n_samples:
            raise ValueError(
                "X_train has %d samples but Y_train has %d values" % (n_samples, y.shape[0])
            )

        # Initialize the weights to zero
        self.weights = np.zeros(X.shape[1])

        # Performs gradient descent for the specified number of iterations
        for _ in range(self.iterations):
            residual = y - X.dot(self.weights)
            gradient = -2.0 * X.T.dot(residual) / n_samples  # d(MSE)/d(weights)
            self.weights = self.weights - self.learning_rate * gradient  # Updates the weights

        return self

    def predict(self, X_test):
        """Return the predicted target for each row of ``X_test``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None:
            raise RuntimeError("linearRegression.predict() called before fit()")
        # Adds a column of 1's to the input data for the bias term
        return self._with_bias(X_test).dot(self.weights)


# Cleaning and Splitting Data

In [3]:
# Shared preprocessing objects. dataset() re-fits all of them on every call,
# so they always reflect the most recently loaded CSV.
# One independent LabelEncoder per categorical column:
modelEncoder, transmissionEncoder, fuelTypeEncoder = (LabelEncoder() for _ in range(3))
scaler = MinMaxScaler()  # Scaler for normalizing the data


# Takes given dataset and returns split data
def dataset(brand, max_rows=10000, random_state=601):
    """Load a used-car CSV, clean and encode it, and split it for training.

    Processing steps, in order:
      1. Read the CSV.
      2. Drop rows whose 'year' lies outside the 1.5 * IQR fences.
      3. Drop rows containing any missing value.
      4. Label-encode the 'model', 'transmission' and 'fuelType' columns.
      5. Keep at most ``max_rows`` rows.
      6. Split into train/test sets and min-max scale the features.

    Side effects: re-fits the module-level ``modelEncoder``,
    ``transmissionEncoder``, ``fuelTypeEncoder`` and ``scaler``, so those
    objects describe the most recently loaded file.

    Parameters
    ----------
    brand : str
        Path of the CSV file to load.
    max_rows : int or None, default 10000
        Maximum number of cleaned rows to keep; ``None`` keeps all rows.
    random_state : int, default 601
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test, file, X, Y)`` where ``X_train`` and
        ``X_test`` are the scaled feature sets, ``Y_train`` and ``Y_test`` the
        matching prices, ``file`` the cleaned and encoded frame, and ``X`` / ``Y``
        the unscaled features and target before splitting.
    """
    file = pd.read_csv(brand, quotechar='"', skipinitialspace=True)  # Reads the dataset

    # Removes all outliers from the 'year' column using the 1.5 * IQR rule.
    # nanpercentile ignores missing years; plain percentile would return NaN
    # and silently disable the filter.
    q75, q25 = np.nanpercentile(file["year"], [75, 25])
    iqr = q75 - q25  # Interquartile range
    lower = q25 - (1.5 * iqr)
    upper = q75 + (1.5 * iqr)

    # Filter rows instead of writing NaN into the column: same rows survive,
    # but the 'year' dtype is left untouched. between() is inclusive on both
    # ends and False for missing values.
    file = file[file["year"].between(lower, upper)]

    # Removes rows with NaN values; copy so the assignments below write to an
    # independent frame.
    file = file.dropna(axis=0).copy()

    # Turns string values into numerical values using LabelEncoder
    file["model"] = modelEncoder.fit_transform(file["model"])
    file["transmission"] = transmissionEncoder.fit_transform(file["transmission"])
    file["fuelType"] = fuelTypeEncoder.fit_transform(file["fuelType"])

    # Limits dataset size (the encoders above were fitted on all cleaned rows)
    if max_rows is not None:
        file = file.head(max_rows)

    X = file.drop(columns=['price'])
    Y = file.price

    # Splits data into 75% training and 25% testing data (train_test_split default)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=random_state)

    # Normalises data using MinMaxScaler; fitted on the training split only so
    # no information from the test split leaks into the scaling.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, Y_train, Y_test, file, X, Y

# Change file name to change dataset here
(
    X_train, X_test,
    Y_train, Y_test,
    file, X, Y,
) = dataset("../UKUsedCarDataSet/vauxhall.csv")

# RMSE

In [4]:
# Calculates the root mean squared error
def rmse(test, pred):
    """Return the root mean squared error between ``test`` and ``pred``.

    Both arguments are array-likes of matching (broadcastable) shape; the
    result is a plain Python float.
    """
    residuals = np.subtract(test, pred)
    mean_squared_error = np.square(residuals).mean()
    return sqrt(mean_squared_error)

# Hold-out Validation

In [8]:
# Initialises and trains model (fit() returns the model itself, so the calls chain)
LR = linearRegression(learning_rate=0.01, iterations=1000).fit(X_train, Y_train)

# Scores the trained model on the held-out test split
y_pred = LR.predict(X_test)
error = rmse(Y_test, y_pred)
print('The RMSE value is:', round(error, 2))

The RMSE value is: 7764.08


# User Input

In [7]:
# inputPred collects the feature values typed in by the user;
# entries keeps every raw (unscaled) feature list that was entered.
inputPred, entries = [], []

def userInput():
    """Ask the user for a car brand until a known one is entered.

    Matching ignores letter case and surrounding whitespace. After an
    unrecognised brand, "Invalid Car Brand" is printed and the prompt is
    shown again.

    Returns
    -------
    str
        Relative path of the CSV file for the chosen brand.
    """
    # Keys are case-folded so the lookup is case-insensitive.
    brand_files = {
        "audi": "../UKUsedCarDataSet/audi.csv",
        "bmw": "../UKUsedCarDataSet/bmw.csv",
        "ford": "../UKUsedCarDataSet/ford.csv",
        "hyundai": "../UKUsedCarDataSet/hyundi.csv",
        "mercedes": "../UKUsedCarDataSet/merc.csv",
        "skoda": "../UKUsedCarDataSet/skoda.csv",
        "toyota": "../UKUsedCarDataSet/toyota.csv",
        "vauxhall": "../UKUsedCarDataSet/vauxhall.csv",
        "volkswagen": "../UKUsedCarDataSet/vw.csv",
    }

    # Loop rather than recurse: the answer given on a retry is the value that
    # gets returned, so the caller never receives None.
    while True:
        chooseBrand = input("Choose your car brand: Audi, BMW, Ford, Hyundai, Mercedes, Skoda, Toyota, Vauxhall or Volkswagen \n")
        path = brand_files.get(chooseBrand.strip().casefold())
        if path is not None:
            return path
        print("Invalid Car Brand")
  
# Loads the dataset for the brand chosen by the user. dataset() re-fits the
# module-level encoders and scaler, so from here on they describe this brand.
X_train, X_test, Y_train, Y_test, file, X, Y = dataset(userInput())

# Shows the model names the encoder learned, i.e. the accepted answers below
print("\n List of models:")
print(list(modelEncoder.classes_))

# Collects one value per feature. Text answers are label-encoded with the
# fitted encoders, numeric answers are parsed with int()/float().
# NOTE(review): the values are appended positionally, so this assumes the CSV's
# feature columns (everything except 'price') are ordered model, year,
# transmission, mileage, fuelType, tax, mpg, engineSize - TODO confirm per file.
# NOTE(review): no validation here - a non-numeric answer makes int()/float()
# raise ValueError, and a label the encoder was not fitted on presumably makes
# transform() raise as well; confirm and handle if needed.
inputPred.append((modelEncoder.transform([input("\nWhat Model is your car? ")]))[0])
inputPred.append(int(input("What year is your car? ")))
inputPred.append((transmissionEncoder.transform([input("What transmission is your car? ")]))[0])
inputPred.append(int(input("How much mileage does your car have? ")))
inputPred.append((fuelTypeEncoder.transform([input("What's your car fuel type? ")]))[0])
inputPred.append(int(input("How much is your cars tax? ")))
inputPred.append(float(input("What's MPG of your car? ")))
inputPred.append(float(input("What the engine size of your car? ")))
entries.append(inputPred) # Keeps the raw (unscaled) feature list
# Rebinds inputPred from the raw list to the scaled single-row input
inputPred = scaler.transform([inputPred]) # Normalises input data


print("\n ***Predicting***")
start = time.time() # Timer starts before training, so it covers fit + predict

# Initialises and trains model
LR = linearRegression( iterations = 1000, learning_rate = 0.01 ) 
LR.fit( X_train, Y_train ) 

Y_pred = LR.predict(inputPred) # Predicts price of car
print("\n Predicted price for your car is: £", round(Y_pred[0], 2)) 

print("\n ***Predicted in", time.time() - start,"seconds***") # Prints time taken to train and predict


 List of models:
['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'Q2', 'Q3', 'Q5', 'Q7', 'Q8', 'R8', 'RS3', 'RS4', 'RS5', 'RS6', 'RS7', 'S3', 'S4', 'S5', 'S8', 'SQ5', 'SQ7', 'TT']





 ***Predicting***

 Predicted price for your car is: £ 30930.91

 ***Predicted in 0.20644187927246094 seconds***
