- Change the path passed to `np.load` in the first cell so that it points to the test dataset (.npz) file you want to score
- Place the following .pkl files in the same folder as this .ipynb: `xgb_model.pkl` (model), `model_features.pkl`, `scaler.pkl`, `poly.pkl`
- Edit the transform_features() method if necessary

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
import joblib
import networkx as nx
import sys
import os

# Put the parent directory on the import path so that project packages living
# one level above this notebook (such as `utils`) can be imported later on.
sys.path.append(os.path.abspath(".."))

# Read the unlabeled batch and pull out its "X" array.
test = np.load("../datasets/unlabeled/fourth_batch_multi.npz")
X_test = test["X"]

# Wrap the raw array in a DataFrame and give the first three positional
# columns readable names: user, item, rating.
XX_test = pd.DataFrame(X_test).rename(
    columns={0: "user", 1: "item", 2: "rating"}
)

# Quick sanity check on how many distinct users the batch contains.
num_unique_users = XX_test["user"].nunique()
print(f"Number of unique user IDs in the test set: {num_unique_users}")

Number of unique user IDs in the test set: 1100


In [2]:
from utils.feature_transformation import aggregate_features
from sklearn.impute import SimpleImputer

# Turn the raw (user, item, rating) records into the aggregated feature table.
# NOTE(review): the second argument (0.001) looks like a contamination value, judging by the
# "Best contamination value" line printed when this cell runs -- confirm against aggregate_features.
test_features = aggregate_features(XX_test, 0.001)

# TODO: To achieve the best result, need to select the best features based on SHAP. Just uncomment this code to get the highest AUC so far.
# keep = set([
#     "user",  # Keep user/label for merging

#     # Existing SHAP-important features
#     "rare_movies_watched_pct", "gap_max", "gap_mean", "z_rating_max", "std_deviation",
#     "mean_rating_diff", "variance_movie", "sum_item_rating", "max_movie_popularity",
#     "avg_movie_popularity", "max_movie", "likes_rare", "rare_like_ratio",
#     "user_pop_percentile_std", "min_movie_popularity", "median_movie",
#     "like_dislike_ratio", "std_rating", "avg_product_vs_avg_rating",
#     "rating_changes_pct", "unknown_pct", "neutral_pct", "z_rating_skew",
#     "std_rating_diff", "dislike_pct", "like_count", "average_product",
# ])
# test_features = test_features[[col for col in test_features.columns if col in keep]]

# Order rows by user ID. The saved predictions carry no user column, so this ordering is
# what ties each prediction row back to a user (assuming the transforms below keep row order).
test_features = test_features.sort_values(by="user")

# Load the list of feature names to keep.
# NOTE: joblib.load unpickles arbitrary Python objects -- only load artifacts you trust.
model_features = joblib.load("model_features.pkl")
print(f"test_features before selecting features {test_features.shape}")

# Keep exactly the listed columns, in the listed order. Any listed feature that the
# aggregation did not produce is filled with 0s; say so loudly, because a silently
# zero-filled column usually means the feature pipeline and the model are out of sync.
missing_features = [feat for feat in model_features if feat not in test_features.columns]
if missing_features:
    print(
        f"WARNING: {len(missing_features)} model feature(s) missing from test_features, "
        f"filled with 0: {missing_features}"
    )
test_features = test_features.reindex(columns=model_features, fill_value=0)
print(f"test_features after selecting features {test_features.shape}")

# debugging
# print(test_features.columns)

# Apply the saved preprocessing objects in sequence: scaler first, then poly.
scaler = joblib.load("scaler.pkl")
poly = joblib.load("poly.pkl")

test_features_scaled = scaler.transform(test_features)
test_features_poly = poly.transform(test_features_scaled)

# Load the trained model and predict probabilities (shape: #test_users x 6)
xgb_model = joblib.load("xgb_model.pkl")
probabilities = xgb_model.predict_proba(test_features_poly)
# Kept only as a backward-compatible alias for any later cell that still reads this name;
# it previously triggered a second, identical predict_proba call.
y_pred_proba_rf = probabilities

np.savez("predictions.npz", probabilities=probabilities)
print(f"prediction shape {probabilities.shape}")

# Read the file back as a round-trip check. The arrays are materialised inside the
# `with` so the .npz handle is closed, while `test_results[...]` lookups keep working.
with np.load("predictions.npz") as saved_predictions:
    test_results = {key: saved_predictions[key] for key in saved_predictions.files}
test_results_df = pd.DataFrame(test_results["probabilities"])
test_results_df.head()

Best contamination value: 0.001
test_features before selecting features (1100, 66)
test_features after selecting features (1100, 28)
prediction shape (1100, 6)


Unnamed: 0,0,1,2,3,4,5
0,0.044892,0.012898,0.766773,0.000703,0.067188,0.107546
1,0.961272,0.000511,0.001732,9.8e-05,0.035957,0.000431
2,0.989284,0.000508,0.005014,0.003277,0.001456,0.00046
3,0.987125,0.000549,0.000168,3e-05,0.012053,7.6e-05
4,0.926955,0.002248,0.002984,0.001321,0.06511,0.001382


In [3]:
# Reload the saved probabilities. The arrays are materialised inside the `with`
# so the .npz handle is closed, while `data[...]` lookups keep working afterwards.
with np.load('predictions.npz') as saved_predictions:
    data = {key: saved_predictions[key] for key in saved_predictions.files}
predictions = data['probabilities']

# Derive the number of classes from the array instead of hard-coding 6, so the
# summary still works if the model is retrained with a different label set.
n_classes = predictions.shape[1]

# The predicted class of each row is the column with the highest probability;
# bincount tallies them in one pass, and minlength keeps never-predicted classes at 0.
predicted_classes = predictions.argmax(axis=1)
class_counts = {
    class_label: int(count)
    for class_label, count in enumerate(np.bincount(predicted_classes, minlength=n_classes))
}

print("Class instance counts:")
for class_label, count in class_counts.items():
    print(f"Class {class_label}: {count}")

Class instance counts:
Class 0: 1061
Class 1: 1
Class 2: 7
Class 3: 13
Class 4: 13
Class 5: 5
