- Change the path passed to `np.load` so that it points to the test dataset (.npz) file
- Add the following .pkl files into the same folder as this .ipynb: model (loaded in the code as `xgb_model.pkl`), model_features, scaler, poly
- Edit the transform_features() method if necessary

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
import joblib
import networkx as nx
import sys
import os

# Put the parent of the current working directory on the import path
# (presumably so the project-local `utils` package can be imported).
sys.path.append(os.path.abspath(".."))

# Open the unlabeled test batch and take the array stored under the "X" key.
test = np.load("../datasets/unlabeled/fifth_batch_multi.npz")
X_test = test["X"]

# Wrap the raw array in a DataFrame and label positional columns 0, 1 and 2.
XX_test = pd.DataFrame(X_test).rename(
    columns={0: "user", 1: "item", 2: "rating"}
)

# Quick sanity check: number of distinct values in the "user" column.
num_unique_users = XX_test["user"].nunique()
print(f"Number of unique user IDs in the test set: {num_unique_users}")

Number of unique user IDs in the test set: 2200


In [4]:
from utils.feature_transformation import aggregate_features
from sklearn.impute import SimpleImputer

# Aggregate the raw rows into a feature table and order it by user so the
# prediction rows come out in ascending "user" order.
# NOTE(review): aggregate_features is project code not visible here; this
# assumes it returns a DataFrame with a "user" column - confirm.
test_features = aggregate_features(XX_test).sort_values(by="user")

# Keep only the features the model was trained on.
# NOTE: joblib.load unpickles arbitrary Python objects - only load .pkl
# artifacts that come from a trusted source.
model_features = joblib.load("model_features.pkl")
print(f"test_features before selecting features {test_features.shape}")

# Features the model expects but this batch did not produce are filled with 0.
# Report them instead of filling silently, and add them in one reindex rather
# than inserting columns one at a time (which fragments the DataFrame).
missing_features = [
    feat for feat in model_features if feat not in test_features.columns
]
if missing_features:
    print(
        f"zero-filling {len(missing_features)} missing feature(s): "
        f"{missing_features}"
    )
test_features = test_features.reindex(columns=model_features, fill_value=0)
print(f"test_features after selecting features {test_features.shape}")

# Apply the scaler that was persisted alongside the model.
scaler = joblib.load("scaler.pkl")
test_features_scaled = scaler.transform(test_features)

# Load the trained model and predict probabilities (shape: #test_users x 6).
# predict_proba is evaluated once; the second name is only an alias kept for
# any later cell that still refers to it.
xgb_model = joblib.load("xgb_model.pkl")
probabilities = xgb_model.predict_proba(test_features_scaled)
y_pred_proba_rf = probabilities

np.savez("predictions.npz", probabilities=probabilities)
print(f"prediction shape {probabilities.shape}")

# Read the archive back to verify what was written; the context manager
# closes the underlying file once the array has been extracted.
with np.load("predictions.npz") as test_results:
    test_results_df = pd.DataFrame(test_results["probabilities"])
test_results_df.head()

test_features before selecting features (2200, 86)
test_features after selecting features (2200, 54)
prediction shape (2200, 6)


Unnamed: 0,0,1,2,3,4,5
0,0.994435,0.001443,0.000129,6.8e-05,0.00228,0.001646
1,0.959771,0.00476,0.001389,9.1e-05,0.030396,0.003592
2,0.994322,0.001265,0.000579,3.9e-05,0.002087,0.001709
3,0.999104,0.000134,5.7e-05,1.2e-05,0.000451,0.000243
4,0.986112,0.007262,0.000479,5.2e-05,0.001368,0.004726


In [None]:
# Summarise the saved predictions: label every row with its
# highest-probability class and count how many rows land in each class.
with np.load('predictions.npz') as data:  # closes the archive after reading
    predictions = data['probabilities']

# argmax along axis 1 returns the first maximal column of each row, which is
# the same choice a per-row np.argmax makes.
predicted_classes = np.argmax(predictions, axis=1)

# Always report at least classes 0-5, but widen the table when the
# probability matrix has more columns instead of failing on an unknown class.
num_classes = max(6, predictions.shape[1])
class_counts = {
    class_label: int(count)
    for class_label, count in enumerate(
        np.bincount(predicted_classes, minlength=num_classes)
    )
}

print("Class instance counts:")
for class_label, count in class_counts.items():
    print(f"Class {class_label}: {count}")

Class instance counts:
Class 0: 2109
Class 1: 18
Class 2: 4
Class 3: 43
Class 4: 18
Class 5: 8
