# Preliminary modelling

## Necessary imports and configurations

In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, max_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [2]:
DATA_PATH = '../data/transformed_clean.csv'
TEST_SIZE = 0.2

## Data Loading

In [3]:
dataset = pd.read_csv(DATA_PATH, index_col=[0])

In [4]:
dataset.head()

Unnamed: 0,region,price,year,manufacturer,model,fuel_gas,fuel_other,fuel_diesel,fuel_hybrid,fuel_electric,...,title_status,transmission_automatic,transmission_other,transmission_manual,state,posting_date_datetime,lat_sin,lat_cos,long_sin,long_cos
0,0.367246,18997,0.92623,0.243902,0.420818,1,0,0,0,0,...,1.0,1,0,0,0.68,-2.275675,-0.996488,-0.083741,-0.538527,-0.842608
1,0.367246,28997,0.95082,0.95122,0.888492,1,0,0,0,0,...,1.0,1,0,0,0.68,-2.275607,-0.996488,-0.083741,-0.538527,-0.842608
2,0.71464,18937,0.909836,0.317073,0.168476,1,0,0,0,0,...,1.0,1,0,0,0.9,-2.275018,-0.92482,-0.380405,-0.195778,-0.980648
3,0.759305,24900,0.918033,0.95122,0.816288,1,0,0,0,0,...,1.0,1,0,0,0.64,-2.274891,-0.982771,-0.184829,0.499958,-0.86605
4,0.965261,24900,0.918033,0.95122,0.816288,1,0,0,0,0,...,1.0,1,0,0,0.1,-2.274876,-0.982771,-0.184829,0.499958,-0.86605


To prevent data leakage in a time-series model I will perform train-test split based on a publication date.

In [5]:
train_size = int((1 - TEST_SIZE) * dataset.shape[0])
train_data = dataset.iloc[:train_size, :]
test_data = dataset.iloc[train_size:, :]

In [6]:
train_y, train_X = train_data['price'], train_data.drop(columns=['price', 'posting_date_datetime'])
test_y, test_X = test_data['price'], test_data.drop(columns=['price', 'posting_date_datetime'])

In [7]:
# sanity check
assert not train_X.isnull().values.any()
assert not test_X.isnull().values.any()

## Model selection

Many models can be selected as a baseline solution:
* Linear (or polynomial regression)
* Tree-based regressors (random forest, boosting and bagging models)
* Support vector regressor
* KNN regressor

To keep things simple let us use the KNN regressor.

## Model training
To ensure the reasonable quality of the baseline model we will finetune the number of neighbours considered.

In [8]:
grid = {
    'n_neighbors' : range(3, 11, 2),
}

model = GridSearchCV(KNeighborsRegressor(n_jobs=-1), grid, verbose=3)
model.fit(train_X, train_y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[CV 1/5] END .....................n_neighbors=3;, score=0.591 total time=   3.3s
[CV 2/5] END .....................n_neighbors=3;, score=0.618 total time=   3.3s
[CV 3/5] END .....................n_neighbors=3;, score=0.618 total time=   3.3s
[CV 4/5] END .....................n_neighbors=3;, score=0.595 total time=   3.3s
[CV 5/5] END .....................n_neighbors=3;, score=0.570 total time=   3.3s
[CV 1/5] END .....................n_neighbors=5;, score=0.599 total time=   3.3s
[CV 2/5] END .....................n_neighbors=5;, score=0.623 total time=   3.3s
[CV 3/5] END .....................n_neighbors=5;, score=0.620 total time=   3.3s
[CV 4/5] END .....................n_neighbors=5;, score=0.602 total time=   3.3s
[CV 5/5] END .....................n_neighbors=5;, score=0.581 total time=   3.3s
[CV 1/5] END .....................n_neighbors=7;, score=0.599 total time=   3.3s
[CV 2/5] END .....................n_neighbors=7;, score=0.619 total time=   3.4s
[CV 3/5] END ...............

## Model evaluation
We can assess the quality of obtained model based on:
* $R^2$ score
* Maximal absolute error - worst case misprediction
* MAE - average misprediction in USD

In [9]:
def regression_report(model_name, y_true, y_pred):
    print(f'{model_name} has reached the following quality metrics')
    print(f'R2 = {r2_score(y_true, y_pred)}')
    print(f'Max absolute error = {max_error(y_true, y_pred)}')
    print(f'MAE = {mean_absolute_error(y_true, y_pred)}')

In [10]:
regression_report("KNNRegressor", train_y, model.predict(train_X))

KNNRegressor has reached the following quality metrics
R2 = 0.7873142807099359
Max absolute error = 29800.8
MAE = 3440.4159275799298


In [11]:
regression_report("KNNRegressor", test_y, model.predict(test_X))

KNNRegressor has reached the following quality metrics
R2 = 0.5732693023457412
Max absolute error = 36230.0
MAE = 5033.4227696158605


## Conclusion
Analysing the regression reports above we may conclude that our naive and simplistic model can already give some reasonably good predictions.