#### Setup

In [1]:
# Import dataset class and accuracy functions
from __init__ import (PricingWizardDataset, 
                      regression_accuracy, 
                      threshold_accuracy, 
                      base_regression_pipeline, 
                      ridge_regression_pipeline,
                      RegressionNN,
                      load_model,
                      drop_helpers,
                      test,
                      set_device)

# Model loading imports
import joblib
import torch

# Import other libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

ModuleNotFoundError: No module named '__init__'

In [2]:
# Load Data
# Instantiate the project dataset wrapper with its default arguments; the constructor
# prints a summary of what was loaded (rows, columns, split sizes, random state).
data = PricingWizardDataset()

Dataset Loaded: post_preprocessing_without_dummies
	Number of Rows: 283055
	Number of Columns: 22
	Outlier Removal: True
	Train Size: 0.8
	Test Size: 0.2
	Random State: 42


In [3]:
# The serialized models are looked up under <working directory>/models/pickled_models,
# so this assumes the notebook is started from the directory that contains `models/`.
cwd, model_subdirs = os.getcwd(), ('models', 'pickled_models')
model_dir = os.path.join(cwd, *model_subdirs)

In [4]:
# Files in model directory
# Listing the folder shows which serialized model files are available to the sections below.
os.listdir(model_dir)

['.DS_Store',
 'prediction_svr_linear.pkl',
 'regularized_regression.pkl',
 'prediction_random_forest.pkl.zip',
 'base_regression.pkl',
 'regression_neural_net.pt']

In [5]:
# Keep an independent copy of the loaded frame, taken before any of the pipelines below are applied.
# NOTE(review): `df` is not referenced again in the visible cells — confirm it is still needed.
df = data.df.copy()

#### Base Regression Model

In [6]:
# Reset dataset as standard
# Presumably restores `data` to its freshly loaded state so earlier preprocessing does not
# carry over into this section — verify against PricingWizardDataset.reset_dataset.
data.reset_dataset()

In [7]:
# Perform preprocessing
# Applies the base regression pipeline to the dataset held by `data`.
data.apply_function(base_regression_pipeline)


# Load model
# The path is built from `model_dir`; load_model prints a confirmation with the resolved path.
base_regression = load_model(f'{model_dir}/base_regression.pkl')

# Split
# Only the test features and test target are kept (the other two return values are discarded);
# the loaded model is used as-is for prediction. The target column is `log_listing_price`.
_, X_test, _, y_test = data.stratify_train_test_split(val_size=0,
                                                      y_column='log_listing_price')

Model loaded successfully from /Users/rasmuskrebs/Documents/School/semester_3/data_mining/pricing_wizards/models/pickled_models/base_regression.pkl
Dependent variable distribution is equal across all subsets


In [8]:
# Predict
# Helper columns are removed with drop_helpers before the features reach the model.
regression_prediction = base_regression.predict(drop_helpers(X_test))

# Calculate accuracy
# First block: metrics on the log-price target exactly as predicted.
# p=0.1 is presumably the relative tolerance for counting a prediction as a hit — TODO confirm.
print('Log Scale Accuracy')
regression_accuracy(regression_prediction, y_test)
threshold_accuracy(regression_prediction, y_test, p=0.1)

# Second block: same metrics with scale_up=True, reported under the "Original Scale" label
# (presumably after inverting the log transform — verify in the helper).
print('\nOriginal Scale Accuracy')
regression_accuracy(regression_prediction, y_test, return_metrics=False, scale_up=True)
threshold_accuracy(regression_prediction, y_test, p=0.1, scale_up=True)

Log Scale Accuracy
R2 Score: 0.5454074633046996
MSE: 0.43798999970075375
MAE 0.5134088438732489
RMSE 0.6618081290682019
Threshold Accuracy 0.6385154828566886

Original Scale Accuracy
R2 Score: 0.3491713283801632
MSE: 193549.2874597735
MAE 211.4213411969003
RMSE 439.9423683390513
Threshold Accuracy 0.12686580346575754


#### Regularized Regression Model

In [9]:
# Reset dataset as standard
# Presumably discards the preprocessing applied for the previous model — verify in reset_dataset.
data.reset_dataset()

# Perform preprocessing
# The ridge model uses its own pipeline, so the features differ from the base regression section.
data.apply_function(ridge_regression_pipeline)

# Load model
# load_model prints a confirmation with the resolved path.
ridge_regression = load_model(f'{model_dir}/regularized_regression.pkl')

# Split
# Only the test features and test target are kept; X_test / y_test from the previous
# section are overwritten here.
_, X_test, _, y_test = data.stratify_train_test_split(val_size=0,
                                                      y_column='log_listing_price')

Model loaded successfully from /Users/rasmuskrebs/Documents/School/semester_3/data_mining/pricing_wizards/models/pickled_models/regularized_regression.pkl
Dependent variable distribution is equal across all subsets


In [10]:
# Predict
# Helper columns are removed with drop_helpers before the features reach the model.
ridge_regression_prediction = ridge_regression.predict(drop_helpers(X_test))

# Calculate accuracy
# First block: metrics on the log-price target exactly as predicted.
# p=0.1 is presumably the relative tolerance for counting a prediction as a hit — TODO confirm.
print('Log Scale Accuracy')
regression_accuracy(ridge_regression_prediction, y_test)
threshold_accuracy(ridge_regression_prediction, y_test, p=0.1)

# Second block: same metrics with scale_up=True, reported under the "Original Scale" label.
# NOTE(review): the base regression cell also passes return_metrics=False here; this one relies
# on the default — confirm both are meant to behave the same.
print('\nOriginal Scale Accuracy')
regression_accuracy(ridge_regression_prediction, y_test, scale_up=True)
threshold_accuracy(ridge_regression_prediction, y_test, p=0.1, scale_up=True)


Log Scale Accuracy
R2 Score: 0.6060281080749564
MSE: 0.37958332989970334
MAE 0.4704520986562844
RMSE 0.6161033435225809
Threshold Accuracy 0.6783134019890127

Original Scale Accuracy
R2 Score: 0.4562210332121559
MSE: 161713.8828494586
MAE 191.055882600751
RMSE 402.1366469863927
Threshold Accuracy 0.15161364399145041


#### Neural Net

In [14]:
# Reset dataset, used during modelling and overwrites any previous changes
data.reset_dataset()

# Apply ridge regression pipeline
# The network is evaluated on the same preprocessing as the ridge model; scaling is added in the cells below.
data.apply_function(ridge_regression_pipeline)

In [15]:
# Inspecting head
# Quick look at the frame after the ridge pipeline; the displayed output shows the
# categorical columns expanded into 0/1 indicator columns.
data.head()

Unnamed: 0,classified_id,log_listing_price,condition_name,brand_name_& Other Stories,brand_name_(di)vision,brand_name_66 North,brand_name_7 DAYS ACTIVE,brand_name_A.P.C.,brand_name_AF Agger,brand_name_AMI Paris,...,subsubsubcategory_name_Wall lights,subsubsubcategory_name_Wallets,subsubsubcategory_name_Watches,subsubsubcategory_name_Weekend bags,subsubsubcategory_name_Wireless speakers,subsubsubcategory_name_Women,subsubsubcategory_name_iPhone,subsubsubcategory_name_iPhones,subsubsubcategory_name_Øreringe,brand_name_other
0,30343099,7.17012,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,30346312,5.860786,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30364278,4.795791,2,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,30406315,6.111467,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,30420441,6.398595,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Min-max scaling (this is MinMaxScaler, not standardization)
# NOTE(review): the scaler is fit on the full dataset before the train/test split below, so
# test rows influence the fitted min/max — confirm this mirrors how the network was trained.
scaler = MinMaxScaler()
X = scaler.fit_transform(drop_helpers(data.df))

In [17]:
# Assigning X to data.df
# Writes the scaled values back into the feature columns only (the columns that remain after
# drop_helpers); every other column of data.df keeps its original values.
data.df[drop_helpers(data.df).columns] = X

In [20]:
# Splitting dataset
# All four parts are returned, but only the test features and test target are used below.
X_train, X_test, y_train, y_test = data.stratify_train_test_split(val_size=0, return_splits=True, y_column='log_listing_price')

# Converting to PyTorch tensors
# Both inputs are converted to NumPy arrays first. If y_test is a pandas Series (TODO confirm),
# handing it straight to torch.tensor() makes torch walk it with integer lookups that pandas
# resolves as index labels, which is fragile on the shuffled index a split produces.
# Going through np.asarray gives the same float32 values and matches the X_test conversion.
X_test_tensor = torch.tensor(drop_helpers(X_test).to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(np.asarray(y_test), dtype=torch.float32)

Dependent variable distribution is equal across all subsets


In [21]:
# Build the network
# The input width is taken from the test feature tensor; it has to agree with the layer
# sizes stored in the checkpoint for load_state_dict to succeed.
network = RegressionNN(input_size=X_test_tensor.shape[1])

# Load weights
# map_location='cpu' deserializes the checkpoint onto the CPU regardless of the device it was
# saved from, so loading also works on machines without that accelerator. The freshly built
# network lives on the CPU anyway, and the inference cell passes the target device to `test`.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
network.load_state_dict(torch.load(f'{model_dir}/regression_neural_net.pt', map_location='cpu'))

<All keys matched successfully>

In [22]:
# Determine device
# set_device is a project helper; presumably it picks an available accelerator and falls
# back to the CPU otherwise — verify against its implementation.
device = set_device()

# Run the network on the test features; the result is scored in the next cell.
nn_prediction = test(network, X_test_tensor, device)

In [23]:
# Calculate accuracy
# Predictions are compared against y_test as returned by the split (not the y_test_tensor copy).
# First block: metrics on the log-price target exactly as predicted.
print('Log Scale Accuracy')
regression_accuracy(nn_prediction, y_test)
threshold_accuracy(nn_prediction, y_test, p=0.1)

# Second block: same metrics with scale_up=True, reported under the "Original Scale" label.
print('\nOriginal Scale Accuracy')
regression_accuracy(nn_prediction, y_test, scale_up=True)
threshold_accuracy(nn_prediction, y_test, p=0.1, scale_up=True)


Log Scale Accuracy
R2 Score: 0.6408822819519124
MSE: 0.3460020931356468
MAE 0.4392821771925936
RMSE 0.588219426010096
Threshold Accuracy 0.7047216971966579

Original Scale Accuracy
R2 Score: 0.5474373442587677
MSE: 134587.15537471496
MAE 173.7124975176537
RMSE 366.8612208652135
Threshold Accuracy 0.1768207592164067


In [31]:
# Scratch check of the natural-log transform on a single example value;
# the bare name on the last line makes the notebook display the result.
x = 124_142
log_of_x = np.log(x)
log_of_x

11.729181350680463

In [32]:
# Round-trip check: exponentiating the log recovers x up to floating-point error
# (the displayed value differs from 124142 only in the last digits).
np.exp(log_of_x)

124142.00000000006

#### Random Forest

In [None]:
# Unpack the zipped random-forest pickle into the model directory,
# next to the other serialized models, so it can be loaded below.
rf_archive = f'{model_dir}/prediction_random_forest.pkl.zip'
with zipfile.ZipFile(rf_archive, mode='r') as archive:
    archive.extractall(path=model_dir)

In [None]:
# Reset dataset as standard
# Presumably discards the scaling and pipeline applied in the neural-net section — verify in reset_dataset.
data.reset_dataset()

In [None]:
# Perform preprocessing
# The random forest is evaluated on the base regression pipeline.
data.apply_function(base_regression_pipeline)

# Load model
# Expects the .pkl produced by extracting prediction_random_forest.pkl.zip into model_dir.
random_forest = load_model(f'{model_dir}/prediction_random_forest.pkl')

# Split
# Only the test features and test target are kept; the other two return values are discarded.
_, X_test, _, y_test = data.stratify_train_test_split(val_size=0,
                                                      y_column='log_listing_price')

In [None]:
# Predict
# NOTE(review): the regression sections call drop_helpers(X_test) before predicting, whereas
# this passes X_test.values unchanged — confirm the forest was trained with the helper columns included.
rf_prediction = random_forest.predict(X_test.values)

# Calculate accuracy
# First block: metrics on the log-price target exactly as predicted.
print('Log Scale Accuracy')
regression_accuracy(rf_prediction, y_test)
threshold_accuracy(rf_prediction, y_test, p=0.1)

# Second block: same metrics with scale_up=True, reported under the "Original Scale" label.
print('\nOriginal Scale Accuracy')
regression_accuracy(rf_prediction, y_test, return_metrics=False, scale_up=True)
threshold_accuracy(rf_prediction, y_test, p=0.1, scale_up=True)

#### Support Vector Regression (Linear SVR)

In [None]:
# Reset dataset as standard
# Presumably discards the preprocessing applied for the previous model — verify in reset_dataset.
data.reset_dataset()

In [None]:
# Perform preprocessing
# The linear SVR is evaluated on the base regression pipeline.
data.apply_function(base_regression_pipeline)

# Load model
linear_svr = load_model(f'{model_dir}/prediction_svr_linear.pkl')

# Split
# Only the test features and test target are kept; the training features are discarded,
# so they are not available for fitting a scaler in the next cell.
_, X_test, _, y_test = data.stratify_train_test_split(val_size=0,
                                                      y_column='log_listing_price')

In [None]:
# StandardScaler
# NOTE(review): the scaler is fit on X_test itself, so the features are standardized with
# test-set statistics rather than with whatever scaling was used when the SVR was trained —
# confirm this matches training, or persist and reuse the training scaler.
# NOTE(review): X_test is not passed through drop_helpers here, unlike the regression
# sections — confirm the helper columns are meant to be part of the model input.
# `scaler` rebinds the name used for the MinMaxScaler in the neural-net section.
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

In [None]:
# Predict
# Uses the standardized test features produced in the previous cell.
svr_prediction = linear_svr.predict(X_test_scaled)

# Calculate accuracy
# First block: metrics on the log-price target exactly as predicted.
print('Log Scale Accuracy')
regression_accuracy(svr_prediction, y_test)
threshold_accuracy(svr_prediction, y_test, p=0.1)

# Second block: same metrics with scale_up=True, reported under the "Original Scale" label.
print('\nOriginal Scale Accuracy')
regression_accuracy(svr_prediction, y_test, return_metrics=False, scale_up=True)
threshold_accuracy(svr_prediction, y_test, p=0.1, scale_up=True)