#### Setup

In [1]:
# Import dataset class and accuracy functions
from __init__ import (PricingWizardDataset, 
                      regression_accuracy, 
                      threshold_accuracy, 
                      base_regression_pipeline, 
                      ridge_regression_pipeline,
                      RegressionNN,
                      load_model,
                      drop_helpers,
                      test,
                      set_device)

# Model loading imports
import joblib
import torch

# Import other libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Others...

In [2]:
# Load Data
data = PricingWizardDataset()

Dataset Loaded: post_preprocessing_without_dummies
	Number of Rows: 283055
	Number of Columns: 22
	Outlier Removal: True
	Train Size: 0.8
	Test Size: 0.2
	Random State: 42


In [3]:
# Current working directory
cwd = os.getcwd()

# model directory
model_dir = os.path.join(cwd, 'models', 'pickled_models')

In [4]:
# Files in model directory
os.listdir(model_dir)

['.DS_Store',
 'regression_neural_net.pkl',
 'regularized_regression.pkl',
 'base_regression.pkl']

#### Base Regression Model

In [5]:
# Reset dataset as standard
data.reset_dataset()

In [6]:
# Perform preprocessing
data.apply_function(base_regression_pipeline)


# Load model
base_regression = load_model(f'{model_dir}/base_regression.pkl')

# Split
_, X_test, _, y_test = data.stratify_train_test_split(val_size=0,
                                                      y_column='log_listing_price')

Model loaded successfully from /Users/rasmuskrebs/Documents/School/semester_3/data_mining/pricing_wizards/models/pickled_models/base_regression.pkl
Dependent variable distribution is equal across all subsets


In [7]:
# Predict
regression_prediction = base_regression.predict(drop_helpers(X_test))

# Calculate accuracy
print('Log Scale Accuracy')
regression_accuracy(regression_prediction, y_test)
threshold_accuracy(regression_prediction, y_test, p=0.1)

print('\nOriginal Scale Accuracy')
regression_accuracy(regression_prediction, y_test, return_metrics=False, scale_up=True)
threshold_accuracy(regression_prediction, y_test, p=0.1, scale_up=True)

Log Scale Accuracy
R2 Score: 0.5454074633046996
MSE: 0.43798999970075375
MAE 0.5134088438732489
RMSE 0.6618081290682019
Threshold Accuracy 0.6385154828566886

Original Scale Accuracy
R2 Score: 0.3491713283801632
MSE: 193549.2874597735
MAE 211.4213411969003
RMSE 439.9423683390513
Threshold Accuracy 0.12686580346575754


#### Regularized Regression Model

In [8]:
# Reset dataset as standard
data.reset_dataset()

# Perform preprocessing
data.apply_function(ridge_regression_pipeline)

# Load model
ridge_regression = load_model(f'{model_dir}/regularized_regression.pkl')

# Split
_, X_test, _, y_test = data.stratify_train_test_split(val_size=0,
                                                      y_column='log_listing_price')

Model loaded successfully from /Users/rasmuskrebs/Documents/School/semester_3/data_mining/pricing_wizards/models/pickled_models/regularized_regression.pkl
Dependent variable distribution is equal across all subsets


In [9]:
# Predict
reg_regression_prediction = ridge_regression.predict(drop_helpers(X_test))

# Calculate accuracy
print('Log Scale Accuracy')
regression_accuracy(regression_prediction, y_test)
threshold_accuracy(regression_prediction, y_test, p=0.1)

print('\nOriginal Scale Accuracy')
regression_accuracy(regression_prediction, y_test, scale_up=True)
threshold_accuracy(regression_prediction, y_test, p=0.1, scale_up=True)


Log Scale Accuracy
R2 Score: 0.5454074633046996
MSE: 0.43798999970075375
MAE 0.5134088438732489
RMSE 0.6618081290682019
Threshold Accuracy 0.6385154828566886

Original Scale Accuracy
R2 Score: 0.3491713283801632
MSE: 193549.2874597735
MAE 211.4213411969003
RMSE 439.9423683390513
Threshold Accuracy 0.12686580346575754


### Neural Net

In [23]:
# Reset dataset as standard
data.reset_dataset()

# Perform preprocessing
data.apply_function(ridge_regression_pipeline)

# Split data (Since validation is created on the training data, we can just exclude the argument here as it won't change test data)
_, X_test, _, y_test = data.stratify_train_test_split(val_size=0,
                                                      y_column='log_listing_price')

# Make test dataset a tensor
X_test_tensor = torch.tensor(drop_helpers(X_test).values, dtype=torch.float32)

Dependent variable distribution is equal across all subsets


In [24]:
# Load model
network = RegressionNN(input_size=drop_helpers(X_test).shape[1])

# Load weights
network.load_state_dict(torch.load(f'{model_dir}/regression_neural_net.pt'))

<All keys matched successfully>

In [25]:
# Determine device
device = set_device()

nn_prediction = test(network, X_test_tensor, device)

In [26]:
# Calculate accuracy
print('Log Scale Accuracy')
regression_accuracy(nn_prediction, y_test)
threshold_accuracy(nn_prediction, y_test, p=0.1)

print('\nOriginal Scale Accuracy')
regression_accuracy(nn_prediction, y_test, scale_up=True)
threshold_accuracy(nn_prediction, y_test, p=0.1, scale_up=True)


Log Scale Accuracy
R2 Score: 0.6489673544113241
MSE: 0.33821230206291775
MAE 0.4363567338250978
RMSE 0.5815602308126973
Threshold Accuracy 0.7065411315822013

Original Scale Accuracy
R2 Score: 0.5519765521442916
MSE: 133237.2448833896
MAE 171.23982758015052
RMSE 365.01677342745444
Threshold Accuracy 0.16957835049725317
