In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stats
import sklearn.metrics

def loadDataSample(samplesize: int, populationsize: int, random_state=None):
    """Read the leading rows of 'cleaned_cars.csv' and return a random sample of them.

    Only the first ``populationsize`` rows of the CSV are parsed (this keeps
    memory use bounded), and ``samplesize`` rows are then drawn from those
    rows without replacement. The column count and row count of the sample
    are printed.

    Args:
        samplesize: Number of rows to draw.
        populationsize: Number of leading CSV rows to read and sample from.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (the default) leaves the draw
            unseeded, as before.

    Returns:
        A DataFrame holding the ``samplesize`` sampled rows.

    Raises:
        ValueError: If ``samplesize`` is larger than the number of rows that
            were actually read (the file may be shorter than
            ``populationsize``).
    """
    full = pd.read_csv('cleaned_cars.csv', nrows=populationsize)

    # Fail with an explicit message instead of pandas' generic sampling error.
    if samplesize > len(full):
        raise ValueError(
            "samplesize (" + str(samplesize) + ") exceeds the number of rows read ("
            + str(len(full)) + ")"
        )

    sample = full.sample(n=samplesize, random_state=random_state)
    del full  # release the larger frame before returning

    # Note: the column count includes every column of the CSV.
    print("Number of features: " +str(len(sample.columns)))
    print("Number of rows: "+str(len(sample)))
    print("\n")

    return sample

def loadDataSet():
    """Parse all of 'cleaned_cars.csv' and return it as a DataFrame."""
    return pd.read_csv('cleaned_cars.csv')

def splitTestTrain(data, frac=0.75, random_state=None):
    """Randomly partition ``data`` into a training frame and a test frame.

    Rows are assigned by position rather than by index label, so the two
    returned frames always form an exact partition of ``data`` even when the
    index contains duplicate labels. The sizes of both parts are printed.

    Args:
        data: DataFrame to split.
        frac: Fraction of rows placed in the training part (default 0.75).
        random_state: Optional seed forwarded to the pandas sampler so the
            split can be reproduced. ``None`` (the default) leaves the split
            unseeded.

    Returns:
        A ``(train, test)`` tuple. ``train`` holds the sampled rows in sampled
        order; ``test`` holds the remaining rows in their original order.
    """
    # Sample row *positions*; this helper Series has a unique index, so
    # dropping the chosen entries cannot remove more rows than intended.
    rowPositions = pd.Series(range(len(data)), dtype='int64')
    trainPositions = rowPositions.sample(frac=frac, random_state=random_state)
    testPositions = rowPositions.drop(trainPositions.index)

    train = data.iloc[trainPositions.to_numpy()]
    test = data.iloc[testPositions.to_numpy()]

    print("Train size = "+str(len(train)))
    print("Test size = "+str(len(test)))

    return train,test

In [6]:
# Draw 10,000 rows from the first 20,000 rows of the CSV, split them into
# train/test partitions, then release the intermediate frame.
cars = loadDataSample(10000, 20000)
train, test = splitTestTrain(cars)
del cars

# Separate the regression target ('price') from the remaining columns
# for each partition.
trainY = train['price']
trainX = train.drop(columns=['price'])

testY = test['price']
testX = test.drop(columns=['price'])

Number of features: 544
Number of rows: 10000


Train size = 7500
Test size = 2500


# Linear

In [8]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


def _fitAndReport(title, model, cv, trainX, trainY, testX, testY):
    """Fit ``model`` on the training split and print its train/test/CV errors.

    The model is fitted on ``(trainX, trainY)``, used to predict both splits,
    and then cross-validated on the training split with ``cv`` folds using the
    'neg_mean_squared_error' scorer. Because that scorer returns *negated*
    MSE, the printed cross-validation score is negative; values closer to
    zero are better.

    Args:
        title: Text placed between the '#####' / '######' banner markers.
        model: Estimator exposing ``fit`` and ``predict``.
        cv: Number of cross-validation folds; also used in the printed label
            so that the label cannot drift from the value actually used.
        trainX, trainY: Training predictors and target.
        testX, testY: Test predictors and target.

    Returns:
        ``(trainPredict, testPredict, cvScores)`` where ``cvScores`` is the
        array of per-fold scores returned by ``cross_val_score``.
    """
    model.fit(trainX, trainY)
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    cvScores = cross_val_score(model, trainX, trainY, cv=cv, scoring='neg_mean_squared_error')

    print("#####" + title + "######")
    print("Train MSE: "+str(mean_squared_error(trainY, trainPredict)))
    print("Test MSE: "+str(mean_squared_error(testY, testPredict)))
    print("K Fold Cross Validation Score (k=" + str(cv) + "): "+ str(cvScores.mean()))
    print('\n')

    return trainPredict, testPredict, cvScores


splits = (trainX, trainY, testX, testY)

##Normal linear regression
# NOTE(review): the recorded run shows a CV score of about -1.97e+20 for this
# model while train/test MSE are ~0.004; presumably some folds are numerically
# ill-conditioned (e.g. collinear columns) — TODO confirm.
reg = linear_model.LinearRegression()
trainPredict, testPredict, regCrossValScore = _fitAndReport(
    "Normal Linear Regression", reg, 10, *splits)

##Linear regression with L1 (LASSO) regulariser
lasso = linear_model.Lasso(alpha=0.1)
trainPredict, testPredict, lassoCrossValScore = _fitAndReport(
    "LASSO Linear Regression", lasso, 10, *splits)

##Linear regression with L2 (RIDGE) regulariser
ridge = linear_model.Ridge(alpha=.5)
trainPredict, testPredict, ridgeCrossValScore = _fitAndReport(
    "RIDGE Linear Regression", ridge, 10, *splits)

##Linear regression ARD prior
# Uses 5 folds rather than 10; the printed label now reports k=5 accordingly.
ard = linear_model.ARDRegression()
trainPredict, testPredict, ardCrossValScore = _fitAndReport(
    "ARD Linear Regression", ard, 5, *splits)


#####Normal Linear Regression######
Train MSE: 0.004443888554720913
Test MSE: 0.004316138499511461
K Fold Cross Validation Score (k=10)-1.973776279129402e+20


#####LASSO Linear Regression######
Train MSE: 0.0121266564685232
Test MSE: 0.011806262889681094
K Fold Cross Validation Score (k=10)-0.012130831747777845


#####RIDGE Linear Regression######
Train MSE: 0.004424609951920451
Test MSE: 0.00430793672703212
K Fold Cross Validation Score (k=10)-0.004573619453479722


#####ARD Linear Regression######
Train MSE: 0.0045082383859658
Test MSE: 0.004313486575795373
K Fold Cross Validation Score (k=10): -0.004625410120090687




# Naive Bayes

# SVM

# Random Forests

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with trees limited to depth 2 and a fixed seed, fitted on the
# training split.
randForestReg = RandomForestRegressor(random_state=0, max_depth=2)
randForestReg.fit(trainX, trainY)

# Predictions for both splits, then 5-fold cross-validation on the training
# split (the scorer returns negated MSE, so the scores are negative).
trainPredict = randForestReg.predict(trainX)
testPredict = randForestReg.predict(testX)
rForestCrossValScore = cross_val_score(
    randForestReg,
    trainX,
    trainY,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Collect the report as (label, value) pairs and print them in order.
reportLines = [
    ("Train MSE: ", mean_squared_error(trainY, trainPredict)),
    ("Test MSE: ", mean_squared_error(testY, testPredict)),
    ("K Fold Cross Validation Score (k=5): ", rForestCrossValScore.mean()),
    ("K Fold CV std dev: ", stats.stdev(rForestCrossValScore)),
]

print("#####Bagging (Random Forests) ######")
for label, value in reportLines:
    print(label + str(value))
print('\n')

#####Bagging (Random Forests) ######
Train MSE: 0.0066804739112627165
Test MSE: 0.006259156039364021
K Fold Cross Validation Score (k=5): -0.0066874192106492645
K Fold CV std dev: 0.00040405973681935316




# Neural Nets