In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from scipy.stats import entropy
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
import torch
import torch.nn as nn
import torch.optim as optim

# Open both labelled batches; each archive is read through its "X" and "y" keys.
data = np.load("../datasets/labeled/first_batch_multi_labels.npz")
data2 = np.load("../datasets/labeled/second_batch_multi_labels.npz")

X1, y1 = data["X"], data["y"]
X2, y2 = data2["X"], data2["y"]

# Stack the second batch underneath the first one (row-wise).
X = np.concatenate([X1, X2], axis=0)
y = np.concatenate([y1, y2], axis=0)

# Wrap the raw arrays in DataFrames and give the leading columns readable names.
XX = pd.DataFrame(X).rename(columns={0: "user", 1: "item", 2: "rating"})
yy = pd.DataFrame(y).rename(columns={0: "user", 1: "label"})

In [None]:
# Merge labels into main dataset.
# Any "label" column left over from a previous run of this cell is dropped
# first, so re-running the cell does not create "label_x"/"label_y" duplicates.
XX = XX.drop(columns=["label"], errors="ignore").merge(yy, on="user", how="left")

print(XX.shape)
XX.head()

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis, entropy
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

EPS = 1e-6  # added to denominators so ratios never divide by exactly zero


def engineer_features(XX, yy):
    """Build one row of behavioural features per user from the rating log.

    Parameters
    ----------
    XX : pandas.DataFrame
        One row per rating; the columns ``user``, ``item`` and ``rating`` are
        read. The frame is not modified by this function.
    yy : pandas.DataFrame
        Must contain ``user`` and ``label``; it is left-joined onto the
        per-user feature table.

    Returns
    -------
    pandas.DataFrame
        One row per user holding rating counts/proportions, rating entropy,
        item-popularity statistics, deviation-from-item-mean statistics,
        statistics over consecutive ratings (ordered by item id), the merged
        ``label``, a binary ``is_anomalous`` flag (``label != 0``), a few
        interaction features and a quantile bin of ``review_count``.
        Missing feature values are filled with 0 before the labels are merged.
    """
    # --- PER-USER RATING COUNTS AND MOMENTS ---
    # Rating codes 10 / -10 / 1 / 0 are counted as like / dislike / unknown /
    # neutral respectively.
    user_features = XX.groupby("user").agg(
        review_count=("rating", "count"),
        avg_rating=("rating", "mean"),
        std_rating=("rating", "std"),
        like_count=("rating", lambda x: (x == 10).sum()),
        dislike_count=("rating", lambda x: (x == -10).sum()),
        unknown_count=("rating", lambda x: (x == 1).sum()),
        neutral_count=("rating", lambda x: (x == 0).sum()),
    ).reset_index()

    # --- PROPORTIONAL FEATURES ---
    for kind in ("like", "dislike", "unknown", "neutral"):
        user_features[f"{kind}_pct"] = user_features[f"{kind}_count"] / (
            user_features["review_count"] + EPS
        )

    # --- RATING DISTRIBUTION ENTROPY ---
    pct_cols = ["like_pct", "dislike_pct", "unknown_pct", "neutral_pct"]

    def calc_entropy(row):
        # Only strictly positive shares enter the entropy; a user with no
        # rating in any of the four categories gets 0.
        probs = [p for p in row[pct_cols] if p > 0]
        return entropy(probs) if probs else 0

    user_features["rating_entropy"] = user_features.apply(calc_entropy, axis=1)

    # --- MOVIE POPULARITY FEATURES ---
    # Popularity of an item = number of rating rows that reference it.
    movie_popularity = XX.groupby("item").size().reset_index(name="movie_popularity")
    XX_with_pop = XX.merge(movie_popularity, on="item")

    pop_features = XX_with_pop.groupby("user").agg(
        avg_movie_popularity=("movie_popularity", "mean"),
        std_movie_popularity=("movie_popularity", "std"),
        min_movie_popularity=("movie_popularity", "min"),
        max_movie_popularity=("movie_popularity", "max"),
    ).reset_index()

    # Items at or below the 10th popularity percentile count as "rare".
    # The same popularity table is reused instead of recounting the items.
    threshold = movie_popularity["movie_popularity"].quantile(0.1)
    rare_movies = movie_popularity.loc[
        movie_popularity["movie_popularity"] <= threshold, "item"
    ].values

    # The indicator lives in a temporary frame so the caller's XX is left
    # untouched (no "is_rare_movie" column is added to the input).
    rare_stats = (
        XX[["user"]]
        .assign(is_rare_movie=XX["item"].isin(rare_movies).astype(int))
        .groupby("user")["is_rare_movie"]
        .mean()
        .reset_index(name="rare_movies_watched_pct")
    )
    user_features = user_features.merge(rare_stats, on="user", how="left")

    # --- UNIQUE MOVIES AND DIVERSITY ---
    unique_items = XX.groupby("user")["item"].nunique().reset_index()
    unique_items.columns = ["user", "unique_movies"]

    # --- DEVIATION FROM POPULATION FEATURES ---
    movie_avg_rating = XX.groupby("item")["rating"].mean().reset_index(name="movie_avg_rating")

    XX_with_avg = XX.merge(movie_avg_rating, on="item")
    XX_with_avg["rating_deviation"] = XX_with_avg["rating"] - XX_with_avg["movie_avg_rating"]
    XX_with_avg["abs_rating_deviation"] = XX_with_avg["rating_deviation"].abs()

    deviation_features = XX_with_avg.groupby("user").agg(
        mean_deviation=("rating_deviation", "mean"),
        std_deviation=("rating_deviation", "std"),
        mean_abs_deviation=("abs_rating_deviation", "mean"),
        max_abs_deviation=("abs_rating_deviation", "max"),
    ).reset_index()

    # --- SEQUENTIAL PATTERN FEATURES ---
    # Ratings are ordered by item id within each user; sort_values returns a
    # new frame, so the columns added below never reach the caller's XX.
    XX_sorted = XX.sort_values(["user", "item"])
    XX_sorted["next_rating"] = XX_sorted.groupby("user")["rating"].shift(-1)
    XX_sorted["rating_diff"] = XX_sorted["next_rating"] - XX_sorted["rating"]
    XX_sorted["abs_rating_diff"] = XX_sorted["rating_diff"].abs()

    # The last rating of each user has no successor; drop those rows.
    XX_sorted = XX_sorted.dropna(subset=["rating_diff"]).copy()

    # Sign of each transition: 1 (up), -1 (down) or 0 (unchanged).
    XX_sorted["rating_direction"] = np.sign(XX_sorted["rating_diff"])
    prev_direction = XX_sorted.groupby("user")["rating_direction"].shift(1)
    # A user's first transition has no predecessor (NaN after the shift). NaN
    # never compares equal, so it must be masked out explicitly; otherwise every
    # user with at least two ratings would get one spurious direction switch.
    XX_sorted["direction_switch"] = (
        XX_sorted["rating_direction"].ne(prev_direction) & prev_direction.notna()
    )
    switch_count = XX_sorted.groupby("user")["direction_switch"].sum().reset_index(name="change_direction_count")
    user_features = user_features.merge(switch_count, on="user", how="left")

    sequence_features = XX_sorted.groupby("user").agg(
        mean_rating_diff=("rating_diff", "mean"),
        std_rating_diff=("rating_diff", "std"),
        mean_abs_rating_diff=("abs_rating_diff", "mean"),
        max_abs_rating_diff=("abs_rating_diff", "max"),
        rating_changes_count=("rating_diff", lambda x: (x != 0).sum()),
    ).reset_index()

    # Share of consecutive pairs whose rating changed: a user with n ratings
    # has n - 1 consecutive pairs.
    pairs_per_user = (
        user_features.set_index("user")["review_count"] - 1 + EPS
    ).reindex(sequence_features["user"]).values
    sequence_features["rating_changes_pct"] = (
        sequence_features["rating_changes_count"] / pairs_per_user
    )

    # --- COMBINE ALL USER-LEVEL FEATURES ---
    all_features = user_features.merge(pop_features, on="user", how="left")
    all_features = all_features.merge(unique_items, on="user", how="left")
    all_features = all_features.merge(deviation_features, on="user", how="left")
    all_features = all_features.merge(sequence_features, on="user", how="left")
    # Std columns of single-rating users and sequence features of users without
    # any consecutive pair are NaN at this point; they become 0.
    all_features = all_features.fillna(0)

    # --- COMBINE WITH CLASS LABELS ---
    # Labels are merged after the fillna above, so a user missing from yy keeps
    # a NaN label (and is flagged anomalous because NaN != 0).
    all_features = all_features.merge(yy, on="user", how="left")
    all_features["is_anomalous"] = (all_features["label"] != 0).astype(int)

    # --- ADVANCED INTERACTION FEATURES ---
    all_features["like_dislike_ratio"] = all_features["like_count"] / (all_features["dislike_count"] + EPS)
    # rating_range is a plain copy of max_abs_rating_diff.
    all_features["rating_range"] = all_features["max_abs_rating_diff"]
    all_features["popularity_vs_deviation"] = all_features["avg_movie_popularity"] * all_features["mean_abs_deviation"]
    all_features["entropy_by_count"] = all_features["rating_entropy"] * np.log1p(all_features["review_count"])

    # --- BINNING FEATURES ---
    # Quintile bin of review_count; duplicate bin edges are merged.
    all_features["review_count_bin"] = pd.qcut(all_features["review_count"],
                                             q=5, labels=False, duplicates="drop")

    return all_features


user_features = engineer_features(XX, yy)

# Summary statistics of the item ids each user rated.
min_max_df = XX.groupby("user")["item"].agg(min_movie="min", max_movie="max", median_movie="median", variance_movie="var").reset_index()
user_features = user_features.merge(min_max_df, on="user", how="left")

# Per-user sums of rating and of item id * rating.
XX["item_rating"] = XX["item"] * XX["rating"]
sum_rating = XX.groupby("user")["rating"].sum().reset_index(name="sum_rating")
sum_product = XX.groupby("user")["item_rating"].sum().reset_index(name="sum_item_rating")

user_features = user_features.merge(sum_product, on="user", how="left")
user_features = user_features.merge(sum_rating, on="user", how="left")

user_features["average_product"] = user_features["sum_item_rating"] / (user_features["review_count"])

user_features["product_above_zero"] = (user_features["sum_item_rating"] > 0).astype(int)
user_features["sum_above_zero"] = (user_features["sum_rating"] > 0).astype(int)
user_features["avg_product_vs_avg_rating"] = user_features["average_product"] / (user_features["avg_rating"] + EPS)

# --- Round ratio columns to 2 decimals ---
ratio_cols = [col for col in user_features.columns if col.endswith("_ratio")]
for c in ratio_cols:
    user_features[c] = user_features[c].round(2)

# Gaps between consecutive item ids within each user (first gap is set to 0).
XX_sorted = XX.sort_values(by=['user', 'item'])
XX_sorted['item_diff'] = XX_sorted.groupby('user')['item'].diff().fillna(0)
gap_stats = XX_sorted.groupby('user')['item_diff'].agg(['mean', 'std', 'max'])
gap_stats.columns = ['gap_mean', 'gap_std', 'gap_max']
user_features = user_features.merge(gap_stats, on='user', how='left')

# Sample variance / std are NaN for a user with a single rating. The std
# columns built inside engineer_features() are zero-filled there; do the same
# for the two dispersion columns added here so no NaN reaches the resampling
# and modelling steps.
dispersion_cols = ["variance_movie", "gap_std"]
user_features[dispersion_cols] = user_features[dispersion_cols].fillna(0)

# Final preview
print(user_features.shape)
# user_features.to_csv("user_features.csv", index=False)
user_features.head()

In [None]:
from tqdm import tqdm
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier

# Modelling frame: a separate DataFrame holding every column of user_features
# except the binary "is_anomalous" flag (drop returns a new frame).
df = user_features.drop(columns="is_anomalous")
print(df.shape)
df.head()

In [None]:
# Remove the user column (when present) and correlate the remaining columns.
df = df.drop(columns=["user"], errors="ignore")
correlation_matrix = df.corr()

# Draw the correlation matrix as a heatmap without cell annotations.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=False, fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Drop every feature whose absolute correlation with an EARLIER column exceeds
# the threshold (the earlier column of each correlated pair is kept).
threshold = 0.90

# The target column is part of correlation_matrix but must not take part in the
# filter: otherwise "label" itself could be dropped (breaking the later
# feature/target split), or a feature could be dropped merely for correlating
# strongly with the target.
feature_corr = correlation_matrix.drop(
    index="label", columns="label", errors="ignore"
).abs()

correlated_features = {
    name
    for pos, name in enumerate(feature_corr.columns)
    if (feature_corr.iloc[pos, :pos] > threshold).any()
}

df = df.drop(columns=list(correlated_features), errors="ignore")
print(f"Dropped correlated features: {correlated_features}")

In [None]:
# Recompute the correlations on the current df and plot them again.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=False, fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Target vector is the "label" column; the feature matrix is everything else.
y = df["label"]
X = df.drop(columns=["label"])

print(X.columns)
X.head()

In [None]:
from imblearn.combine import SMOTETomek

print("🔹 Original Class Distribution:", Counter(y))

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=16,
)

# Resample the training part only; the test part is left as it is.
smote_tomek = SMOTETomek(random_state=16)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

print("🔹 Resampled Class Distribution (After SMOTE):", Counter(y_train_resampled))

# Both transformers are fitted on the resampled training data and then applied
# unchanged to the test data.
poly = PolynomialFeatures(degree=1, interaction_only=False, include_bias=True)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_resampled)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(scaler.transform(X_test))


In [None]:
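# NOTE: Optuna hyperparameter search for the XGBoost classifier, kept disabled
# (fully commented out). The hyperparameters hard-coded in the XGBoost cell
# below cover the same search space and were presumably taken from a run of
# this study -- TODO confirm.
# NOTE(review): the objective scores each trial on X_test_poly / y_test, so the
# selected parameters are tuned against the held-out test split.
# import optuna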
# from sklearn.metrics import roc_auc_score
# import xgboost as xgb

# def objective(trial):
#     params = {
#         "objective": "multi:softprob",
#         "num_class": len(np.unique(y_train_resampled)),
#         "eval_metric": "mlogloss",
#         "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
#         "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
#         "max_depth": trial.suggest_int("max_depth", 3, 9),
#         "min_child_weight": trial.suggest_int("min_child_weight", 1, 5),
#         "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
#         "gamma": trial.suggest_uniform("gamma", 0, 0.2),
#         "random_state": 16,
#         "n_jobs": -1
#     }

#     model = xgb.XGBClassifier(**params)
#     model.fit(X_train_poly, y_train_resampled)
#     y_pred_proba = model.predict_proba(X_test_poly)
    
#     auc_scores = []
#     for i in range(y_pred_proba.shape[1]):
#         binary_true = (y_test == i).astype(int)
#         try:
#             auc = roc_auc_score(binary_true, y_pred_proba[:, i])
#             auc_scores.append(auc)
#         except:
#             auc_scores.append(0)
    
#     return np.mean(auc_scores)

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

# best_params = study.best_params
# print("Best Parameters:", best_params)


In [None]:
import xgboost as xgb
# Initialize and train the XGBoost classifier for multi-class prediction
num_classes = len(np.unique(y))
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=num_classes,
    eval_metric='mlogloss',
    random_state=16,
    n_estimators=1000,
    learning_rate=0.0674590634280927,
    max_depth=9,
    min_child_weight=2,
    subsample=0.7807606824615063,
    colsample_bytree=0.7006957833429297,
    gamma=0.16240296667488197,
    n_jobs=1
)

xgb_model.fit(X_train_poly, y_train_resampled)

y_pred_proba_xgb = xgb_model.predict_proba(X_test_poly)

# One-vs-rest AUC per probability column; column i is scored against the
# indicator "y_test == i".
print("XGBoost AUC Scores per Class:")
auc_per_class_xgb = {}
for i in range(y_pred_proba_xgb.shape[1]):
    binary_true = (y_test == i).astype(int)
    try:
        auc = roc_auc_score(binary_true, y_pred_proba_xgb[:, i])
    except Exception as e:
        # Record the failure and report why, instead of hiding the reason.
        auc_per_class_xgb[i] = None
        print(f"  Class {i}: AUC could not be computed ({e})")
    else:
        auc_per_class_xgb[i] = auc
        print(f"  Class {i}: AUC = {auc:.3f}")

# Final metric: class 0 weighted 0.5, each of the k anomaly classes (1..k)
# weighted 0.5 / k. Classes absent from the prediction columns contribute 0.
k = 5
AUC_0 = auc_per_class_xgb.get(0)
anomaly_aucs = [auc_per_class_xgb[i] for i in range(1, k+1) if i in auc_per_class_xgb]

if AUC_0 is None or any(score is None for score in anomaly_aucs):
    # A missing per-class AUC would otherwise raise a TypeError in the
    # arithmetic below; report the metric as unavailable instead.
    final_metric = float("nan")
    print("\nFinal Evaluation Metric: not available (at least one per-class AUC could not be computed)")
else:
    final_metric = (0.5 * AUC_0) + (0.5 / k) * sum(anomaly_aucs)
    print(f"\nFinal Evaluation Metric: {final_metric:.3f}")

In [None]:
import joblib

# Persist the training column names together with the fitted scaler,
# polynomial transformer and model, one file each in the working directory.
model_features = X_train_resampled.columns.tolist()
joblib.dump(value=model_features, filename="model_features.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=poly, filename="poly.pkl")
joblib.dump(value=xgb_model, filename="model.pkl")