In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from scipy.stats import entropy
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
import torch
import torch.nn as nn
import torch.optim as optim
import sys
import os

# Make the parent directory importable so that `utils.*` resolves from this
# notebook. The membership check keeps sys.path free of duplicate entries when
# the cell is executed more than once.
PARENT_DIR = os.path.abspath("..")
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)


SEED = 1

# The three labeled batches, in the order they are stacked below.
BATCH_PATHS = (
    "../datasets/labeled/first_batch_multi_labels.npz",
    "../datasets/labeled/second_batch_multi_labels.npz",
    "../datasets/labeled/third_batch_multi_labels.npz",
)


def load_batch(path):
    """Return the ("X", "y") arrays stored in one .npz archive.

    The archive is opened in a `with` block so its file handle is closed as
    soon as both arrays have been read into memory.
    """
    with np.load(path) as archive:
        return archive["X"], archive["y"]


(X1, y1), (X2, y2), (X3, y3) = (load_batch(path) for path in BATCH_PATHS)

# Stack the batches row-wise into a single interaction array and label array.
X = np.concatenate((X1, X2, X3), axis=0)
y = np.concatenate((y1, y2, y3), axis=0)

# Name the positional columns: X rows are (user, item, rating), y rows are
# (user, label).
XX = pd.DataFrame(X).rename(columns={0: "user", 1: "item", 2: "rating"})
yy = pd.DataFrame(y).rename(columns={0: "user", 1: "label"})

In [3]:
# Merge labels into main dataset: every rating row receives the label of its
# user, then rows are ordered by (user, item) with a fresh 0..n-1 index.
#
# Any "label" column left over from a previous execution of this cell is
# dropped first; without that, re-running the cell would merge a second time
# and produce "label_x"/"label_y" instead of a single "label" column.
# NOTE(review): the left join assumes `yy` holds at most one row per user;
# duplicated users in `yy` would multiply rating rows - confirm upstream.
XX = (
    XX.drop(columns=["label"], errors="ignore")
    .merge(yy, on="user", how="left")
    .sort_values(by=["user", "item"])
    .reset_index(drop=True)
)

print(XX.shape)
XX.head()

(524883, 4)


Unnamed: 0,user,item,rating,label
0,0,9,0,0
1,0,12,10,0
2,0,13,10,0
3,0,15,10,0
4,0,16,1,0


In [4]:
from utils.feature_transformation import aggregate_features

# Build the feature table with the project helper, then attach each user's
# label with a left join on "user".
user_features = aggregate_features(XX).merge(yy, on="user", how="left")
print(user_features.shape)
# Optional export of the feature table:
# user_features.to_csv("user_features.csv", index=False)
user_features.head()

(3300, 87)


Unnamed: 0,user,review_count,avg_rating,std_rating,like_count,dislike_count,unknown_count,neutral_count,like_pct,dislike_pct,...,item_mean_vs_user_avg,item_skew_bias,normalized_movie_popularity,popularity_skew,rating_polarity,activity_weighted_skew,switch_pct,dominant_rating,dominance_ratio,label
0,0,168,5.946429,5.253181,100,5,49,14,0.595238,0.029762,...,-3.066024,-0.437554,1.0,2282,0.565476,-2.817185,0.714286,10,0.595238,0
1,1,208,3.158654,5.890205,76,16,57,59,0.365385,0.076923,...,-0.082865,-0.454684,1.0,2333,0.288462,-1.70327,0.740385,10,0.365385,0
2,2,195,1.025641,7.750913,66,52,60,17,0.338462,0.266667,...,1.954071,-0.449163,1.0,2294,0.071795,-1.356825,0.805128,10,0.338462,3
3,3,41,1.073171,6.455193,10,7,14,10,0.243902,0.170732,...,2.567873,-0.533277,1.0,1990,0.073171,-0.522866,0.634146,1,0.341463,0
4,4,6,6.833333,4.91596,4,0,1,1,0.666667,0.0,...,-3.739258,-0.500907,1.0,1578,0.666667,-1.449192,0.833333,10,0.666667,0


In [None]:
from tqdm import tqdm
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier

# Work on a copy so that the column drops performed in the following cells do
# not alter `user_features`.
df = user_features.copy()
print(df.shape)
# Last expression of the cell: rendered as a preview of the first rows.
df.head()

In [None]:
# The user id is an identifier, not a feature, so it is removed before
# correlations are computed. Re-assigning with errors="ignore" (instead of an
# unguarded in-place drop) keeps this cell re-runnable: on a second execution
# "user" is already gone and the drop is simply a no-op.
df = df.drop(columns=["user"], errors="ignore")

# Pairwise correlations over all remaining columns (this includes "label").
correlation_matrix = df.corr()

# Explicit figure/axes so the heatmap and its title are bound to the same axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Drop one column out of every highly correlated pair.
#
# A column is flagged when its absolute correlation with any column that
# appears *before* it in `correlation_matrix` exceeds `threshold`, so the
# earlier column of each pair is the one that is kept.
threshold = 0.9

# The target must never be removed by this filter. "label" is part of the
# correlation matrix, so a feature correlated with it above the threshold
# would otherwise flag the target itself and break the feature/target split
# that follows.
TARGET_COLUMN = "label"

abs_corr = correlation_matrix.abs()

# Strictly-lower-triangle mask: row i is compared only against columns j < i,
# which examines each unordered pair once and skips the diagonal.
below_diagonal = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)

# Entries outside the mask become NaN; NaN > threshold evaluates to False, as
# do the NaN correlations of constant columns.
exceeds = abs_corr.where(below_diagonal) > threshold
flagged_rows = exceeds.any(axis=1).to_numpy()

correlated_features = set(correlation_matrix.columns[flagged_rows]) - {TARGET_COLUMN}

# Sorted so the drop order and the printed report are deterministic.
dropped_columns = sorted(correlated_features)

# errors="ignore" tolerates columns that are already absent, e.g. when this
# cell is re-run against a correlation matrix computed before the drop.
df = df.drop(columns=dropped_columns, errors="ignore")
print(f"Dropped correlated features: {dropped_columns}")

In [None]:
# Recompute the correlations on the current columns of `df` and redraw the
# heatmap.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    cmap="coolwarm",
    annot=False,
    fmt=".2f",
    ax=ax,
)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Separate the target column from the predictors.
y = df["label"]
X = df.drop(columns="label")

print(X.columns)
X.head()

In [None]:
# Print a summary of the feature matrix (columns, dtypes, non-null counts).
X.info()

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

print("🔹 Original Class Distribution:", Counter(y))

# Stratified 90/10 hold-out. Only the training part is resampled below; the
# test part keeps its original class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=SEED,
)

# Stage 1 - SMOTE: synthesise samples for every class except the majority.
# NOTE(review): SMOTE is applied to the unscaled features here (scaling only
# happens at the end of this cell), so its neighbour search is driven by the
# features with the largest numeric range - confirm this is intended.
smote = SMOTE(sampling_strategy="not majority", random_state=SEED)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
print("🔹 Distribution after SMOTE (only minority oversampled):", Counter(y_train_over))

# Stage 2 - Tomek links: clean the oversampled training set.
tomek = TomekLinks(sampling_strategy="not majority")
X_train_resampled, y_train_resampled = tomek.fit_resample(X_train_over, y_train_over)
print("🔹 Final Resampled Distribution (After applying Tomek on minority):", Counter(y_train_resampled))

# Single-step alternative, kept for reference:
# smote_tomek = SMOTETomek(random_state=SEED)
# X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

# class_distribution = Counter(y_train_resampled)
# print("🔹 Resampled Class Distribution (After SMOTE):", class_distribution)


# Standardise with statistics learned from the resampled training data only,
# then apply the same transform to the untouched test split.
scaler = StandardScaler().fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# poly = PolynomialFeatures(degree=1, interaction_only=False, include_bias=False)
# X_train_poly = poly.fit_transform(X_train_scaled)
# X_test_poly = poly.transform(X_test_scaled)


In [None]:
import optuna
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import json

# Hyperparameter search for the multi-class XGBoost model.
#
# The best parameters are cached in `params_file`; when that file exists the
# 80-trial search is skipped and the cached values are loaded instead.
params_file = "best_xgb_params.json"

if os.path.exists(params_file):
    print("Hyperparameters found.")
    with open(params_file, "r") as f:
        best_params = json.load(f)
else:
    def objective(trial):
        """Train one XGBoost candidate and return its mean one-vs-rest AUC.

        The model is fitted on the resampled, scaled training data and scored
        on `X_test_scaled` / `y_test`. For each predicted class column the
        labels are binarised (this class vs. the rest) and a ROC AUC is
        computed; the trial value is the unweighted mean over classes.

        NOTE(review): trials are scored on the same hold-out split that is
        used as the test set, so the best trial's score is an optimistic
        estimate - consider a separate validation split.
        """
        params = {
            "objective": "multi:softprob",
            "num_class": len(np.unique(y_train_resampled)),
            "eval_metric": "mlogloss",
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 9),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 5),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 0.2),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
            "random_state": SEED,
            "tree_method": "exact",
            "predictor": "cpu_predictor",
            "verbosity": 0,
            "n_jobs": 1
        }

        model = xgb.XGBClassifier(**params)
        model.fit(X_train_scaled, y_train_resampled)
        y_pred_proba = model.predict_proba(X_test_scaled)

        auc_scores = []
        for class_id in range(y_pred_proba.shape[1]):
            binary_true = (y_test == class_id).astype(int)
            try:
                auc_scores.append(roc_auc_score(binary_true, y_pred_proba[:, class_id]))
            except ValueError:
                # Only the "AUC is undefined for this class" failure is turned
                # into a score of 0 (ValueError is what roc_auc_score raises
                # when the binarised target contains a single class). Any
                # other exception now propagates instead of being silently
                # scored as 0.
                auc_scores.append(0)

        return np.mean(auc_scores)

    # Seed the sampler so the sequence of suggested parameters - and therefore
    # the cached best_params - is reproducible for a given SEED.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(objective, n_trials=80)

    # Only the values produced by trial.suggest_* are stored; the fixed
    # settings in `params` above are not part of study.best_params.
    best_params = study.best_params

    with open(params_file, "w") as f:
        json.dump(best_params, f)

    print(best_params)


In [None]:
# num_classes = len(np.unique(y))
# xgb_model = xgb.XGBClassifier(
#     **best_params,
#     objective='multi:softprob',
#     num_class=num_classes,
#     eval_metric='mlogloss',
#     random_state=SEED,
#     reg_lambda=0.1,
#     n_jobs=1,
# )

# xgb_model.fit(X_train_scaled, y_train_resampled)

# y_pred_proba_xgb = xgb_model.predict_proba(X_test_scaled)

# print("XGBoost AUC Scores per Class:")
# auc_per_class_xgb = {}
# for i in range(y_pred_proba_xgb.shape[1]):
#     binary_true = (y_test == i).astype(int)
#     try:
#         auc = roc_auc_score(binary_true, y_pred_proba_xgb[:, i])
#         auc_per_class_xgb[i] = auc
#         print(f"  Class {i}: AUC = {auc:.3f}")
#     except Exception as e:
#         auc_per_class_xgb[i] = None
#         print(f"  Class {i}: AUC could not be computed")

# k = 5
# AUC_0 = auc_per_class_xgb[0]
# anomaly_aucs = [auc_per_class_xgb[i] for i in range(1, k+1) if i in auc_per_class_xgb]

# final_metric = (0.5 * AUC_0) + (0.5 / k) * sum(anomaly_aucs)

# print(f"\nFinal Evaluation Metric: {final_metric:.3f}")

In [None]:
from collections import Counter
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb

# Train the final model on ALL labeled users (train + test splits combined),
# using the same resample -> scale -> XGBoost sequence as the tuning stage.

# Column order is recorded so it can be saved next to the model.
model_features = list(X_train.columns)

# Combine train and test
X_full = np.concatenate((X_train, X_test), axis=0)
y_full = np.concatenate((y_train, y_test), axis=0)
print("🔹 Original Class Distribution (Full Dataset):", Counter(y_full))

# Step 1: oversample every class except the majority with SMOTE.
# sampling_strategy='not majority' resamples all classes but the largest one.
smote = SMOTE(sampling_strategy='not majority', random_state=SEED)
X_over, y_over = smote.fit_resample(X_full, y_full)
print("🔹 Distribution after SMOTE (only minority oversampled):", Counter(y_over))

# Step 2: clean the oversampled set with Tomek links.
tomek = TomekLinks(sampling_strategy='not majority')
X_resampled, y_resampled = tomek.fit_resample(X_over, y_over)
print("🔹 Final Resampled Distribution (After applying Tomek on minority):", Counter(y_resampled))

# Preprocess and train the model. The scaler is fitted on a DataFrame that
# carries the feature names, not on the bare array.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pd.DataFrame(X_resampled, columns=model_features))

num_classes = len(np.unique(y_full))

# Build the constructor arguments as ONE dict. Passing `**best_params`
# together with an explicit `reg_lambda=...` keyword raises
# "TypeError: got multiple values for keyword argument 'reg_lambda'" whenever
# best_params already contains reg_lambda - which it does when it comes from
# the hyperparameter search, because reg_lambda is one of the tuned values.
# Here 0.1 acts only as a fallback for parameter files without reg_lambda; a
# tuned value takes precedence. The fixed settings listed after
# `**best_params` always win.
final_params = {
    "reg_lambda": 0.1,
    **best_params,
    "objective": "multi:softprob",
    "num_class": num_classes,
    "eval_metric": "mlogloss",
    "random_state": SEED,
    "n_jobs": 1,
}
xgb_model = xgb.XGBClassifier(**final_params)

xgb_model.fit(X_scaled, y_resampled)

In [None]:
import joblib

# Save objects: the feature order, the fitted scaler and the trained model.
ARTIFACT_DIR = "../testing"

# joblib.dump does not create missing directories, so make sure the target
# exists before writing (no-op when it is already there).
os.makedirs(ARTIFACT_DIR, exist_ok=True)

artifacts = {
    "model_features.pkl": model_features,
    "scaler.pkl": scaler,
    "xgb_model.pkl": xgb_model,
}
for filename, artifact in artifacts.items():
    # NOTE: these are pickle-based files; load them only from trusted sources.
    joblib.dump(artifact, os.path.join(ARTIFACT_DIR, filename))

# Feature Analysis

In [None]:
# import shap

# explainer = shap.TreeExplainer(xgb_model, X_train_scaled)
# shap_values = explainer(X_test_scaled[:100], check_additivity=False)

# shap_array = shap_values.values  # shape: (n_samples, n_features, n_classes)

# # Loop over classes and show SHAP summary
# for class_id in range(num_classes):
#     print(f"\n📊 SHAP Summary for Class {class_id}")
#     shap.summary_plot(
#         shap_array[:, :, class_id],  # SHAP values for this class
#         X_test_scaled[:100],
#         feature_names=model_features,
#         show=True
#     )


In [None]:
# if isinstance(X_test_scaled, np.ndarray):
#     X_test_scaled = pd.DataFrame(X_test_scaled, columns=model_features)

# for class_id in range(num_classes):
#     print(f"\n🧠 SHAP Decision Plot for Class {class_id}")
    
#     shap.decision_plot(
#         explainer.expected_value[class_id],
#         shap_array[:10, :, class_id],
#         X_test_scaled.iloc[:10],
#         feature_names=model_features,
#         show=True
#     )