## MAPE

In [24]:
import pandas as pd

# Read the processed auction results from disk.
df = pd.read_csv('../data/processed/auction_results_no_img.csv')

# Mean PRICE per ARTIST, one row per artist with ARTIST kept as a regular column.
artist_avg_price = df.groupby('ARTIST', as_index=False)['PRICE'].mean()

# Attach each artist's mean to every row of that artist. The right-hand PRICE
# column clashes with the existing one, so it arrives suffixed as PRICE_avg.
df = df.merge(artist_avg_price, on='ARTIST', suffixes=('', '_avg'))

# Per-row absolute percentage error of the artist mean against the actual price.
df['abs_percentage_error'] = (
    df['PRICE'].sub(df['PRICE_avg']).div(df['PRICE']).abs().mul(100)
)

# MAPE is the mean of the per-row errors (already expressed in percent).
mape_baseline = df['abs_percentage_error'].mean()

# Report the baseline score.
print(f'MAPE for baseline model: {mape_baseline:.2f}%')


MAPE for baseline model: 477.95%


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb

# Reload the processed auction results so this cell starts from the file's columns.
df = pd.read_csv('../data/processed/auction_results_no_img.csv')

# Target is PRICE; every remaining column is used as a feature.
y = df['PRICE']
X = df.drop(columns=['PRICE'])

# Hold out 20% of the rows for evaluation. The split is seeded so the reported
# MAPE is identical on every Restart & Run All instead of changing per run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50
)

# XGBoost regressor trained with the squared-error objective.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')
xg_reg.fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = xg_reg.predict(X_test)

# scikit-learn reports MAPE as a fraction; scale it to percent.
mape_xgboost = mean_absolute_percentage_error(y_test, y_pred) * 100

# Display the MAPE for the XGBoost model
print(f'MAPE for XGBoost model: {mape_xgboost:.2f}%')

# Relative improvement over the baseline. `mape_baseline` is not defined in this
# cell: it must already exist in the kernel from the baseline MAPE cell.
# NOTE(review): the baseline is scored on every row of the dataset while this
# score uses only the held-out 20% -- confirm the two numbers are comparable.
improvement_percentage = ((mape_baseline - mape_xgboost) / mape_baseline) * 100
print(f'XGBoost model improves MAPE by {improvement_percentage:.2f}% compared to the baseline.')


MAPE for XGBoost model: 292.74%
XGBoost model improves MAPE by 38.75% compared to the baseline.


## sMAPE

In [26]:
import pandas as pd

# Read the processed auction results from disk.
df = pd.read_csv('../data/processed/auction_results_no_img.csv')

# Mean PRICE per ARTIST, one row per artist with ARTIST kept as a regular column.
artist_avg_price = df.groupby('ARTIST', as_index=False)['PRICE'].mean()

# Attach each artist's mean to every row; the clashing right-hand column is
# suffixed and therefore arrives as PRICE_avg.
df = df.merge(artist_avg_price, on='ARTIST', suffixes=('', '_avg'))

# Per-row symmetric absolute percentage error, as a fraction:
#   2 * |actual - forecast| / (|actual| + |forecast|)
df['symmetric_abs_percentage_error'] = (
    (df['PRICE'] - df['PRICE_avg']).abs().mul(2)
    .div(df['PRICE'].abs() + df['PRICE_avg'].abs())
)

# sMAPE is the mean of the per-row terms, scaled from a fraction to percent.
smape_baseline = 100 * df['symmetric_abs_percentage_error'].mean()

# Report the baseline score.
print(f'sMAPE for baseline model: {smape_baseline:.2f}%')


sMAPE for baseline model: 73.64%


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np

# Function to calculate sMAPE
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each observation contributes ``2 * |pred - true| / (|pred| + |true|)``;
    the result is the mean of those terms multiplied by 100, so it lies
    between 0 and 200.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        sMAPE as a percentage.
    """
    # Work on plain arrays so the two inputs are compared by position; pandas
    # objects would otherwise be aligned on their index labels first.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(forecast) + np.abs(actual)

    # When actual and forecast are both zero the term is 0/0. The forecast is
    # exact there, so count it as zero error instead of letting NaN poison the mean.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = numerator / denominator
    terms = np.where(denominator == 0, 0.0, terms)

    return 100 * np.mean(terms)

# Reload the processed auction results so this cell starts from the file's columns.
df = pd.read_csv('../data/processed/auction_results_no_img.csv')

# Target is PRICE; every remaining column is used as a feature.
y = df['PRICE']
X = df.drop(columns=['PRICE'])

# Hold out 20% of the rows for evaluation. The split is seeded so the reported
# sMAPE is identical on every Restart & Run All instead of changing per run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50
)

# XGBoost regressor trained with the squared-error objective.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')
xg_reg.fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = xg_reg.predict(X_test)

# Score the predictions with the smape() helper defined at the top of this cell.
smape_xgboost = smape(y_test, y_pred)

# Display the sMAPE for the XGBoost model
print(f'sMAPE for XGBoost model: {smape_xgboost:.2f}%')

# Relative improvement over the baseline. `smape_baseline` is not defined in this
# cell: it must already exist in the kernel from the baseline sMAPE cell.
# NOTE(review): the baseline is scored on every row of the dataset while this
# score uses only the held-out 20% -- confirm the two numbers are comparable.
improvement_percentage = ((smape_baseline - smape_xgboost) / smape_baseline) * 100
print(f'XGBoost model improves sMAPE by {improvement_percentage:.2f}% compared to the baseline.')

sMAPE for XGBoost model: 56.99%
XGBoost model improves sMAPE by 22.61% compared to the baseline.


## MAPE on the encoded XLSX dataset

In [39]:
import pandas as pd

# Read the encoded auction results from the Excel workbook.
df = pd.read_excel('../data/processed/encoded_results_2024_05_11.xlsx')

# Mean PRICE per ARTIST, one row per artist with ARTIST kept as a regular column.
artist_avg_price = df.groupby('ARTIST', as_index=False)['PRICE'].mean()

# Attach each artist's mean to every row of that artist. The right-hand PRICE
# column clashes with the existing one, so it arrives suffixed as PRICE_avg.
df = df.merge(artist_avg_price, on='ARTIST', suffixes=('', '_avg'))

# Per-row absolute percentage error of the artist mean against the actual price.
df['abs_percentage_error'] = (
    df['PRICE'].sub(df['PRICE_avg']).div(df['PRICE']).abs().mul(100)
)

# MAPE is the mean of the per-row errors (already expressed in percent).
mape_baseline = df['abs_percentage_error'].mean()

# Report the baseline score.
print(f'MAPE for baseline model: {mape_baseline:.2f}%')


MAPE for baseline model: 416.15%


In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb

# Read the encoded auction results and discard the columns that are not used
# as model features.
df = pd.read_excel('../data/processed/encoded_results_2024_05_11.xlsx').drop(
    columns=['URL', 'ImageName', 'AUCTION DATE']
)

# Target is PRICE; every remaining column is a feature.
y = df['PRICE']
X = df.drop(columns=['PRICE'])

# Seeded split that holds out 5% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=50,
)

# XGBoost regressor trained with the squared-error objective.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')
xg_reg.fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = xg_reg.predict(X_test)

# Scale the scikit-learn MAPE score by 100 to express it in percent.
mape_xgboost = mean_absolute_percentage_error(y_test, y_pred) * 100
print(f'MAPE for XGBoost model: {mape_xgboost:.2f}%')

# Relative improvement over the baseline; `mape_baseline` is not defined in
# this cell and must already exist in the kernel.
improvement_percentage = (mape_baseline - mape_xgboost) / mape_baseline * 100
print(f'XGBoost model improves MAPE by {improvement_percentage:.2f}% compared to the baseline.')


MAPE for XGBoost model: 269.27%
XGBoost model improves MAPE by 35.29% compared to the baseline.
