In [1]:
#Download data: clone the "auta" dataset repository into ./auta
#(the leading `!` hands the line to the shell, so git and network access are required)
!git clone git://gonito.net/auta

Cloning into 'auta'...
remote: Wymienianie obiektów: 385, gotowe.[K
remote: Zliczanie obiektów: 100% (385/385), gotowe.[K
remote: Kompresowanie obiektów: 100% (374/374), gotowe.[K
remote: Razem 385 (delty 112), użyte ponownie 0 (delty 0), paczki użyte ponownie 0
Receiving objects: 100% (385/385), 1.93 MiB | 1.71 MiB/s, done.
Resolving deltas: 100% (112/112), done.


In [6]:
#Import libraries
import pandas as pd
import numpy as np
import math
from sklearn.metrics import mean_squared_error, accuracy_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from scipy import stats

In [3]:
#Helper functions
def AddMissingColumn(test,train):
  '''Return the rows of `test` widened to the union of `train` and `test` columns.

  `test` is stacked underneath `train`, the stacked frame is passed through
  pd.get_dummies, and only the trailing rows (the ones that came from `test`)
  are handed back. Columns that exist only in `train` show up as NaN in the
  result; columns that exist only in `test` are kept as they are.
  '''
  n_train_rows = train.shape[0]
  stacked = pd.concat([train, test], axis=0, sort=False)
  encoded = pd.get_dummies(stacked)
  # Everything after the first n_train_rows positions originated from `test`.
  return encoded.iloc[n_train_rows:]

def CrossValidation(X,Y,estimator=None,scoring='r2',n_splits=10):
  '''Print the k-fold cross-validation scores of a regressor, followed by their mean.

  Parameters
  ----------
  X : feature table passed to cross_val_score.
  Y : target values passed to cross_val_score.
  estimator : model to evaluate. When omitted, the notebook-level `regresion`
      model is used, which is what this helper always did.
  scoring : scikit-learn scoring name. Defaults to 'r2'; the previous value,
      'accuracy', is a classification metric and is not defined for
      continuous targets.
  n_splits : number of folds (10 as before).

  Returns nothing; the per-fold scores are printed on one line ("%.3f" each),
  then the mean score.
  '''
  # Local import: the notebook's import cell does not bring in model_selection,
  # so the original body failed with NameError on KFold / cross_val_score.
  from sklearn.model_selection import KFold, cross_val_score

  if estimator is None:
    estimator = regresion
  # random_state is only meaningful together with shuffle=True; recent
  # scikit-learn releases reject random_state when shuffle=False.
  k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
  score = cross_val_score(estimator, X, Y, cv=k_fold, n_jobs=1, scoring=scoring)
  for value in score:
    print("%.3f" % value, end=" ")
  print(score.mean())

In [95]:
#Load the train / dev / test splits (tab-separated, read without a header row)
folder_path = "/content/auta/"
train_columns = ['price', 'mileage', 'year', 'brand', 'engineType', 'engineCapacity']
trainData = pd.read_csv(folder_path + 'train/train.tsv', sep='\t', names=train_columns)
#The remaining files get default integer column labels at this point
devDataInput, devDataOutput, testDataInput = (
  pd.read_csv(folder_path + relative_path, sep='\t', header=None)
  for relative_path in ('dev-0/in.tsv', 'dev-0/expected.tsv', 'test-A/in.tsv')
)

In [96]:
#Peek at three randomly chosen training rows (no seed is set, so the rows differ between runs)
trainData.sample(3)

Unnamed: 0,price,mileage,year,brand,engineType,engineCapacity
2055,31900,128139,2013,Ford,diesel,1600
17020,23900,165000,2010,Ford,diesel,1800
18561,34800,307000,2006,Mercedes-Benz,diesel,3000


In [97]:
#Column dtypes and non-null counts of the training frame
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48002 entries, 0 to 48001
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   price           48002 non-null  int64 
 1   mileage         48002 non-null  int64 
 2   year            48002 non-null  int64 
 3   brand           48002 non-null  object
 4   engineType      48002 non-null  object
 5   engineCapacity  48002 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 2.2+ MB


In [98]:
#Show only the first two rows of the summary statistics (count and mean)
trainData.describe().iloc[:2]

Unnamed: 0,price,mileage,year,engineCapacity
count,48002.0,48002.0,48002.0,48002.0
mean,37803.583809,142920.244156,2008.351298,1818.395942


In [99]:
#Frames that hold model inputs; collected in one list so checks can loop over them
dataToClean = [trainData, devDataInput, testDataInput]

In [100]:
#Count the missing values in every frame (one total per frame)
for frame in dataToClean:
  missing_per_column = frame.isna().sum()
  print(missing_per_column.sum())

0
0
0


In [101]:
#Give dev and test inputs the training column names, minus the leading 'price' target column
feature_names = trainData.columns[1:]
for unlabeled_frame in (devDataInput, testDataInput):
  unlabeled_frame.columns = feature_names

In [102]:
#Numeric columns that go into the model unchanged
regresionV = ['mileage', 'year', 'engineCapacity']

#Drop training rows where any numeric feature has an absolute z-score of 3 or more
numeric_zscores = np.abs(stats.zscore(trainData[regresionV]))
trainData = trainData[(numeric_zscores < 3).all(axis=1)]

#Numeric part of each split (taken after the training rows were filtered)
idTrain, idDev, idTest = (
  frame[regresionV] for frame in (trainData, devDataInput, testDataInput)
)

In [103]:
#Categorical columns to one-hot encode; only the keys of `values` are used here
values = {'brand' : 'Opel', 'engineType': 'benzyna'}
category = [column for column in values]
#Each split is encoded on its own, so the resulting column sets can differ between splits
X_c, dev_c, test_c = (
  pd.get_dummies(frame[category], columns=category)
  for frame in (trainData, devDataInput, testDataInput)
)

In [110]:
#Add the dummy columns that only the training split produced; zero means "category absent"
dev_c, test_c = (
  AddMissingColumn(encoded, X_c).fillna(0)
  for encoded in (dev_c, test_c)
)

In [111]:
#Number of rows currently in the training frame
len(trainData)

46661

In [112]:
#Assemble the model inputs: numeric columns first, one-hot columns to their right
arrayX, arrayDevX, testDevX = (
  pd.concat(objs=[numeric_part, dummy_part], axis=1)
  for numeric_part, dummy_part in ((idTrain, X_c), (idDev, dev_c), (idTest, test_c))
)
#Regression target
Y = trainData['price']

In [113]:
#Fit a 40-tree random forest on the training features (fixed seed for repeatable results)
regresion = RandomForestRegressor(n_estimators=40, random_state=3).fit(arrayX, Y)
regresion

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=40, n_jobs=None, oob_score=False,
                      random_state=3, verbose=0, warm_start=False)

In [137]:
#One more problem to solve: the test set contains a column that never appeared in the training examples.
set(testDevX.columns).difference(arrayX.columns)

{'brand_Fabrycznie'}

In [140]:
#Align the test features with the training features: keep exactly the columns the model was
#fitted on, in the same order. This removes 'brand_Fabrycznie' (present only in the test set)
#and, unlike a hard-coded drop(), does not raise KeyError when the cell is run a second time.
testDevX = testDevX.reindex(columns=arrayX.columns, fill_value=0)

In [141]:
#Predict prices for the train, dev and test feature tables
predictedTrain, predictedDev, predictedTest = (
  regresion.predict(features) for features in (arrayX, arrayDevX, testDevX)
)

In [146]:
#Train set R^2 (score() on a regressor returns the coefficient of determination, not accuracy)
train_r2 = regresion.score(arrayX, Y)
print(train_r2)

0.975373433737474


In [147]:
#Dev set R^2 (score() on a regressor returns the coefficient of determination, not accuracy)
regresion.score(arrayDevX,devDataOutput)

0.7412624609975231

In [150]:
#Root mean squared error of the dev predictions against the expected dev prices
dev_mse = mean_squared_error(devDataOutput, predictedDev)
print(math.sqrt(dev_mse))

25604.223404608674


In [154]:
#Write the predictions rounded to whole numbers, one value per line
for out_path, predictions in (
  ('/content/auta/test-A/out.tsv', predictedTest),
  ('/content/auta/dev-0/out.tsv', predictedDev),
):
  np.savetxt(out_path, predictions, fmt='%.0f')