In [6]:
# Loading 'optimal' model
import joblib

best_xgb = joblib.load('./optimal_model/best_xgb_model.pkl')
scaler = joblib.load('./optimal_model/scaler.pkl')

In [42]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_absolute_error

# Load the necessary files
best_xgb = joblib.load('best_xgb_model.pkl')
scaler = joblib.load('scaler.pkl')
sel = joblib.load('feature_selector.pkl')

# Load the test data
df_test = pd.read_csv('./data/portfolio_value_fair_value_data.csv')  # Ensure this file matches your test data structure
df_test = df_test.drop(columns=['Ticker'])
df_test = pd.get_dummies(df_test)
df_test = df_test.apply(pd.to_numeric, errors='coerce')
df_test = df_test.fillna(0)  # Ensure this matches your training preprocessing

# Ensure the same preprocessing and feature selection as the training data
df_test = df_test.clip(lower=1e-12, upper=1e12)
X_test = df_test.drop(columns=['Fair Value'])
y_test = df_test['Fair Value']

# Apply the same transformations as in training
skewed_feats = X_test.apply(lambda x: np.abs(x.skew())).sort_values(ascending=False)
skewed_features = skewed_feats[skewed_feats > 0.75].index
for feat in skewed_features:
    if X_test[feat].dtype in [np.float64, np.float32]:
        X_test[feat] = np.log1p(X_test[feat].clip(lower=1e-12))

X_test_scaled = scaler.transform(X_test)
X_test_selected = sel.transform(X_test_scaled)

# Make predictions
y_pred = best_xgb.predict(X_test_selected)

# Calculate the errors
errors = y_test - y_pred
abs_errors = np.abs(errors)

# Create a DataFrame to hold the results
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Error': errors,
    'Absolute Error': abs_errors
})

# Save results to a CSV file for further analysis
results_df.to_csv('test_set_errors.csv', index=False)

# Calculate and print the overall MAE for verification
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on the test set: {mae:.2f} dollars")

# Show the first few rows of the results for a quick look
print(results_df.head())


Mean Absolute Error on the test set: 283.05 dollars
         Actual   Predicted      Error  Absolute Error
0  1.700000e+02  172.890961  -2.890961        2.890961
1  1.930000e+02  179.553940  13.446060       13.446060
2  1.790000e+02  161.267471  17.732529       17.732529
3  4.350000e+02  426.080353   8.919647        8.919647
4  1.000000e-12   34.740982 -34.740982       34.740982
