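- Change the path passed to `np.load` so that it points to the `.npz` test file you want to evaluate.
- Place the following `.pkl` files in the same folder as this notebook: `model.pkl`, `model_features.pkl`, `scaler.pkl` and `poly.pkl`.
- Edit the `transform_features()` function if necessary, so that it produces the features the saved model was trained on.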

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
import joblib

# Location of the labeled evaluation batch; edit this to score a different file.
TEST_DATA_PATH = "../datasets/labeled/first_batch_multi_labels.npz"

# np.load on an .npz archive returns a lazily-read NpzFile that holds the file
# open. Pull both arrays out inside a context manager so the handle is released
# as soon as the data is in memory (re-running the cell no longer leaks handles).
with np.load(TEST_DATA_PATH) as test:
    X_test = test["X"]
    y_test = test["y"]

# Convert the label array to a DataFrame with one (ID, label) pair per row
df_y_test = pd.DataFrame(y_test, columns=["ID", "label"])

# Index the labels by "ID" so they form a Series of label values
y_test_formatted = df_y_test.set_index("ID")["label"]

# Print the first few rows to verify the format
print(y_test_formatted.head())

# From here on, y_test is the ID-indexed label Series rather than the raw array
y_test = y_test_formatted

# Name the first three columns of the interaction array; rename() returns a new
# frame, so no in-place mutation is needed.
XX_test = pd.DataFrame(X_test).rename(columns={0: "user", 1: "item", 2: "rating"})

num_unique_users = XX_test["user"].nunique()
print(f"Number of unique user IDs in the test set: {num_unique_users}")

ID
0    0
1    0
2    3
3    0
4    0
Name: label, dtype: int64
Number of unique user IDs in the test set: 1100


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.preprocessing import StandardScaler

EPS = 1e-6  # Added to denominators so ratios stay finite when a count is zero.


def transform_features(X):
    """
    Aggregate per-interaction rows into one row of engineered features per user.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns 'user', 'item' and 'rating'. The frame passed
        in is not modified. Ratings are tallied as like (10), dislike (-10),
        unknown (1) and neutral (0).

    Returns
    -------
    pandas.DataFrame
        One row per distinct user, holding the 'user' column followed by:
          - review counts, mean/std rating and per-rating-value counts,
          - per-rating-value proportions,
          - gap statistics between consecutive (sorted) item IDs,
          - min / max / median / variance of item IDs,
          - item*rating product features,
          - entropy of the rating distribution,
          - popularity statistics of the rated items,
          - unique item count and diversity ratio,
          - deviation of the user's ratings from each item's mean rating,
          - rating-to-rating change statistics in item order,
          - interaction features and a quantile bin of the review count.
        NaNs in the aggregated features (e.g. the standard deviation of a
        single-row group) are replaced with 0 before the interaction features
        are derived.

    Notes
    -----
    Item-level statistics (popularity, mean rating) and the review-count bins
    are computed from X itself, so a user's feature values depend on the whole
    batch that is passed in, not only on that user's rows.
    """
    # Single working copy: every later step reads from (or adds columns to)
    # this frame, never the caller's object.
    ratings = X.copy()

    # --- AGGREGATION: counts and summary statistics per user ---
    user_features = ratings.groupby("user").agg(
        review_count=("rating", "count"),
        avg_rating=("rating", "mean"),
        std_rating=("rating", "std"),
        like_count=("rating", lambda x: (x == 10).sum()),
        dislike_count=("rating", lambda x: (x == -10).sum()),
        unknown_count=("rating", lambda x: (x == 1).sum()),
        neutral_count=("rating", lambda x: (x == 0).sum()),
    ).reset_index()

    # NOTE: graph-based features (networkx centrality, PageRank, ego density,
    # Jaccard similarity on the user-item bipartite graph) were prototyped here
    # as commented-out code and are intentionally not part of the feature set.

    # --- PROPORTIONAL FEATURES ---
    for kind in ("like", "dislike", "unknown", "neutral"):
        user_features[f"{kind}_pct"] = user_features[f"{kind}_count"] / (
            user_features["review_count"] + EPS
        )

    # --- GAP STATISTICS ---
    # Sort once by (user, item); the same ordering is reused further down for
    # the sequential rating features.
    by_user_item = ratings.sort_values(by=["user", "item"])
    # The first row of each user has no predecessor; its gap is recorded as 0.
    by_user_item["item_diff"] = by_user_item.groupby("user")["item"].diff().fillna(0)
    gap_stats = by_user_item.groupby("user")["item_diff"].agg(["mean", "std", "max", "min"])
    gap_stats.columns = ["gap_mean", "gap_std", "gap_max", "gap_min"]
    user_features = user_features.merge(gap_stats, on="user", how="left")

    # --- MIN/MAX/MEDIAN/VARIANCE OF MOVIE IDs ---
    movie_id_stats = ratings.groupby("user")["item"].agg(
        min_movie="min",
        max_movie="max",
        median_movie="median",
        variance_movie="var",
    ).reset_index()
    user_features = user_features.merge(movie_id_stats, on="user", how="left")

    # --- PRODUCT FEATURES ---
    ratings["item_rating"] = ratings["item"] * ratings["rating"]
    sum_rating = ratings.groupby("user")["rating"].sum().reset_index(name="sum_rating")
    sum_product = ratings.groupby("user")["item_rating"].sum().reset_index(name="sum_item_rating")
    user_features = user_features.merge(sum_product, on="user", how="left")
    user_features = user_features.merge(sum_rating, on="user", how="left")

    user_features["average_product"] = user_features["sum_item_rating"] / user_features["review_count"]
    user_features["product_above_zero"] = (user_features["sum_item_rating"] > 0).astype(int)
    user_features["sum_above_zero"] = (user_features["sum_rating"] > 0).astype(int)
    user_features["avg_product_vs_avg_rating"] = user_features["average_product"] / (
        user_features["avg_rating"] + EPS
    )

    # --- RATING DISTRIBUTION ENTROPY ---
    def calc_entropy(row):
        # Only strictly positive shares are passed on; scipy's entropy()
        # normalises them to sum to 1. A user with no counted ratings gets 0.
        probs = [row["like_pct"], row["dislike_pct"], row["unknown_pct"], row["neutral_pct"]]
        probs = [p for p in probs if p > 0]
        return entropy(probs) if probs else 0

    user_features["rating_entropy"] = user_features.apply(calc_entropy, axis=1)

    # --- MOVIE POPULARITY FEATURES ---
    # Popularity of an item = number of rows that mention it in this batch.
    movie_popularity = ratings.groupby("item").size().reset_index(name="movie_popularity")
    ratings_with_pop = ratings.merge(movie_popularity, on="item")
    pop_features = ratings_with_pop.groupby("user").agg(
        avg_movie_popularity=("movie_popularity", "mean"),
        std_movie_popularity=("movie_popularity", "std"),
        min_movie_popularity=("movie_popularity", "min"),
        max_movie_popularity=("movie_popularity", "max"),
    ).reset_index()

    # --- UNIQUE MOVIES AND DIVERSITY ---
    unique_items = ratings.groupby("user")["item"].nunique().reset_index()
    unique_items.columns = ["user", "unique_movies"]

    # --- DEVIATION FROM POPULATION FEATURES ---
    movie_avg_rating = ratings.groupby("item")["rating"].mean().reset_index(name="movie_avg_rating")
    ratings_with_avg = ratings.merge(movie_avg_rating, on="item")
    ratings_with_avg["rating_deviation"] = ratings_with_avg["rating"] - ratings_with_avg["movie_avg_rating"]
    ratings_with_avg["abs_rating_deviation"] = np.abs(ratings_with_avg["rating_deviation"])
    deviation_features = ratings_with_avg.groupby("user").agg(
        mean_deviation=("rating_deviation", "mean"),
        std_deviation=("rating_deviation", "std"),
        mean_abs_deviation=("abs_rating_deviation", "mean"),
        max_abs_deviation=("abs_rating_deviation", "max"),
    ).reset_index()

    # --- SEQUENTIAL PATTERN FEATURES ---
    # Rating change between consecutive rows of the same user in item order.
    transitions = by_user_item.assign(
        next_rating=by_user_item.groupby("user")["rating"].shift(-1)
    )
    transitions["rating_diff"] = transitions["next_rating"] - transitions["rating"]
    transitions["abs_rating_diff"] = np.abs(transitions["rating_diff"])
    # The last row of each user has no successor and therefore no difference.
    transitions = transitions.dropna(subset=["rating_diff"])
    sequence_features = transitions.groupby("user").agg(
        mean_rating_diff=("rating_diff", "mean"),
        std_rating_diff=("rating_diff", "std"),
        mean_abs_rating_diff=("abs_rating_diff", "mean"),
        max_abs_rating_diff=("abs_rating_diff", "max"),
        rating_changes_count=("rating_diff", lambda x: (x != 0).sum()),
    ).reset_index()
    # A user with n reviews has n - 1 transitions; align that denominator with
    # the users that actually appear in sequence_features.
    possible_changes = (
        user_features.set_index("user")["review_count"] - 1 + EPS
    ).reindex(sequence_features["user"]).values
    sequence_features["rating_changes_pct"] = sequence_features["rating_changes_count"] / possible_changes

    # --- COMBINE ALL FEATURES ---
    all_features = user_features
    for block in (pop_features, unique_items, deviation_features, sequence_features):
        all_features = all_features.merge(block, on="user", how="left")

    all_features["diversity_ratio"] = all_features["unique_movies"] / (all_features["review_count"] + EPS)
    all_features = all_features.fillna(0)

    # The StandardScaler that used to be fitted at this point produced values
    # that were never used, so it has been removed; scaling is the caller's job.

    # --- ADVANCED INTERACTION FEATURES ---
    all_features["like_dislike_ratio"] = all_features["like_count"] / (all_features["dislike_count"] + EPS)
    all_features["rating_range"] = all_features["max_abs_rating_diff"]
    all_features["popularity_vs_deviation"] = all_features["avg_movie_popularity"] * all_features["mean_abs_deviation"]
    all_features["entropy_by_count"] = all_features["rating_entropy"] * np.log1p(all_features["review_count"])

    # --- BINNING FEATURES ---
    # Quintile bins of the review count; duplicate quantile edges are dropped,
    # so fewer than five bins may result.
    all_features["review_count_bin"] = pd.qcut(
        all_features["review_count"], q=5, labels=False, duplicates="drop"
    )

    return all_features

In [None]:
# Feature engineering: one row per user, ordered by user so that row i of every
# downstream array (scaled features, probabilities) refers to the same user.
test_features = transform_features(XX_test).sort_values(by="user")

# NOTE(security): joblib.load unpickles arbitrary Python objects. Only load
# artifacts (here and below) that come from a trusted source.
# Select only important features
model_features = joblib.load("model_features.pkl")

print(f"test_features before selecting features {test_features.shape}")

# Features the model was trained on but transform_features() does not produce
# are filled with 0 so the pipeline still runs. Report them explicitly: a
# constant-zero column is very likely not what the model saw during training.
missing_features = [feat for feat in model_features if feat not in test_features.columns]
if missing_features:
    print(
        f"WARNING: {len(missing_features)} model feature(s) missing from "
        f"transform_features() output were filled with 0: {missing_features}"
    )
# Keep exactly the model's columns, in the model's order, zero-filling the gaps.
test_features = test_features.reindex(columns=list(model_features), fill_value=0)

print(f"test_features after selecting features {test_features.shape}")


print(test_features.columns)
print(test_features.head())

# Load scaler and polynomial transformer, and transform the test features
scaler = joblib.load("scaler.pkl")
poly = joblib.load("poly.pkl")
test_features_scaled = scaler.transform(test_features)
test_features_poly = poly.transform(test_features_scaled)

# Load the trained model and predict probabilities (shape: #test_users x 6)
rf_model = joblib.load("model.pkl")
# probabilities = rf_model.predict_proba(test_features_poly)
y_pred_proba_rf = rf_model.predict_proba(test_features_poly)
print(y_pred_proba_rf)


# import joblib

# # Save the predictions as an .npz file
# np.savez("predictions.npz", probabilities=probabilities)
# print(f"prediction shape {probabilities.shape}")

# # View predictions.npz
# test_results=np.load("predictions.npz")
# test_results_df = pd.DataFrame(test_results["probabilities"])
# print(test_results_df.head())

test_features before selecting features (1100, 51)
test_features after selecting features (1100, 41)
Index(['review_count', 'avg_rating', 'std_rating', 'like_count',
       'dislike_count', 'neutral_count', 'dislike_pct', 'unknown_pct',
       'neutral_pct', 'gap_mean', 'gap_max', 'gap_min', 'rating_entropy',
       'svd_1', 'svd_2', 'svd_3', 'avg_movie_popularity',
       'std_movie_popularity', 'min_movie_popularity', 'max_movie_popularity',
       'std_deviation', 'mean_abs_deviation', 'max_abs_deviation',
       'mean_rating_diff', 'std_rating_diff', 'max_abs_rating_diff',
       'rating_changes_pct', 'diversity_ratio', 'like_dislike_ratio',
       'popularity_vs_deviation', 'entropy_by_count', 'review_count_bin',
       'min_movie', 'max_movie', 'median_movie', 'variance_movie',
       'sum_item_rating', 'average_product', 'product_above_zero',
       'sum_above_zero', 'avg_product_vs_avg_rating'],
      dtype='object')
   review_count  avg_rating  std_rating  like_count  dislike_

In [None]:
from sklearn.metrics import roc_auc_score

print("RandomForest AUC Scores per Class:")

print(y_test)

# One-vs-rest AUC for every class the model knows about. Column idx of the
# probability matrix corresponds to rf_model.classes_[idx].
# NOTE(review): y_test is compared with the prediction rows positionally, so
# this assumes y_test is in the same (sorted-by-user) order as the predictions
# — TODO confirm for files whose IDs are not already sorted.
auc_per_class_rf = {}
for idx, cls in enumerate(rf_model.classes_):
    binary_true = (y_test == cls).astype(int)
    try:
        auc = roc_auc_score(binary_true, y_pred_proba_rf[:, idx])
        auc_per_class_rf[cls] = auc
        print(f"  Class {cls}: AUC = {auc:.3f}")
    except Exception as e:
        # Best effort: record the class as unavailable and keep going.
        auc_per_class_rf[cls] = None
        print(e)
        print(f"  Class {cls}: AUC could not be computed")

# Final metric: class 0 carries weight 0.5; classes 1..k share the other 0.5
# equally (each weighted 0.5 / k).
k = 5
AUC_0 = auc_per_class_rf.get(0)

# Classes 1..k whose AUC is absent or could not be computed. Previously a None
# entry made sum() raise TypeError, defeating the try/except above.
unavailable_classes = [i for i in range(1, k + 1) if auc_per_class_rf.get(i) is None]
anomaly_aucs = [auc_per_class_rf[i] for i in range(1, k + 1) if i not in unavailable_classes]

if AUC_0 is None:
    final_metric = None
    print("\nFinal Evaluation Metric could not be computed: no AUC available for class 0")
else:
    if unavailable_classes:
        print(
            f"  Note: no AUC available for class(es) {unavailable_classes}; "
            "they contribute 0 to the final metric"
        )
    final_metric = (0.5 * AUC_0) + (0.5 / k) * sum(anomaly_aucs)

    print(f"\n🏆 Final Evaluation Metric: {final_metric:.3f}")

RandomForest AUC Scores per Class:
ID
0       0
1       0
2       3
3       0
4       0
       ..
1095    0
1096    3
1097    0
1098    0
1099    0
Name: label, Length: 1100, dtype: int64
  Class 0: AUC = 0.784
  Class 1: AUC = 0.669
  Class 2: AUC = 0.709
  Class 3: AUC = 0.997
  Class 4: AUC = 0.539
  Class 5: AUC = 0.605

🏆 Final Evaluation Metric: 0.744
