In [None]:
# Colab-only: mount Google Drive at /content/drive. The config, data, model and
# output paths used by the cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Now you can load your data from Google Drive. Set the file path in the data-loading cell below to the actual location of your file.

In [None]:
!pip install numpy scipy scikit-learn pyyaml
!nvidia-smi
!pip install torch==2.4.1+cu124 torchvision==0.19.1+cu124 torchaudio==2.4.1+cu124 --index-url https://download.pytorch.org/whl/cu124
import torch
print(torch.__version__)
!pip install  dgl -f https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html
import dgl

Sun Nov  9 08:45:13 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   37C    P8             12W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [None]:
# import pandas as pd

# file_path = ''  # Replace with your file path
# try:
#     df = pd.read_csv(file_path)
#     display(df.head())
# except FileNotFoundError:
#     print(f"Error: File not found at {file_path}")
# except Exception as e:
#     print(f"An error occurred: {e}")

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0


In [None]:
# =========================
# GE-GNN — Inference Script
# (checkpoint-compatible)
# =========================

import os, sys, copy, yaml, argparse, warnings, ast, time
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import scipy.sparse as sp
import scipy.io

import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import category_encoders as ce
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# -----------------------
# 0) Paths & Config
# -----------------------
CONFIG_PATH = "/content/drive/MyDrive/GE-GNN/config/amazon.yaml"
NEW_CSV     = "/content/drive/MyDrive/GE-GNN/MY_work/DataSets/sports_test_inference.csv" # This path is for inference only, not evaluation
MODEL_PATH  = "/content/drive/MyDrive/GE-GNN/result/sports_outdoors_20k_F_model_head4.pt"
OUT_DIR     = "/content/drive/MyDrive/GE-GNN/result"
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------
# 1) Load config + device
# -----------------------
with open(CONFIG_PATH, "r") as config_file:
    cfg = yaml.safe_load(config_file)

# Expose the YAML keys as attributes (args.cuda, args.head, args.n_layer, ...).
args = argparse.Namespace(**cfg)


def _parse_cuda_index(spec):
    """Turn the config's ``cuda`` entry into a device index.

    Accepts an int, a digit-only string, or a string whose last ':'-separated
    field is an integer (e.g. "cuda:1"). Anything else resolves to 0.
    """
    if isinstance(spec, int):
        return spec
    if isinstance(spec, str) and spec.isdigit():
        return int(spec)
    try:
        return int(str(spec).split(":")[-1])
    except Exception:
        return 0


cuda_idx = _parse_cuda_index(args.cuda)

if not torch.cuda.is_available():
    device = torch.device("cpu")
else:
    # Fall back to the first GPU when the configured index does not exist.
    if cuda_idx >= torch.cuda.device_count():
        print(f"CUDA index {cuda_idx} out of range. Using cuda:0")
        cuda_idx = 0
    device = torch.device(f"cuda:{cuda_idx}")
    torch.cuda.set_device(cuda_idx)

print("--- Using device:", device)

# -----------------------
# 2) Load new CSV
# -----------------------
print("\n--- Loading new data for inference ---")
# The labelled evaluation file is what gets read here; NEW_CSV defined in the
# paths section is not used by this cell.
EVAL_CSV = "/content/drive/MyDrive/GE-GNN/DataSets/Kaggle_Amazon_Reviews_CSV/Cell_Phones_and_Accessories_50k.csv"
df = pd.read_csv(EVAL_CSV)
print(f"Loaded {len(df)} rows from:\n{EVAL_CSV}")

# Untouched snapshot (including 'class') used later when exporting predictions.
df_orig = df.copy()

# Fail fast on the first column the pipeline needs but the CSV lacks.
REQUIRED_COLUMNS = (
    "helpful", "reviewTime", "reviewerID", "reviewText", "summary",
    "reviewerName", "asin", "overall", "class",
)
for col in REQUIRED_COLUMNS:
    if col in df.columns:
        continue
    raise RuntimeError(f"Required column missing in CSV: {col}")

# helpful → safe parse
def _safe_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except Exception:
            pass
    return [0, 0]

# --- Vote features -------------------------------------------------------
# Parse the raw `helpful` cell, then read element 0 as the helpful count and
# element 1 as the unhelpful count (0 when the parsed list is too short).
df["helpful"] = df["helpful"].apply(_safe_list)
df["helpful_votes"]   = df["helpful"].apply(lambda x: int(x[0]) if len(x)>=1 else 0)
df["unhelpful_votes"] = df["helpful"].apply(lambda x: int(x[1]) if len(x)>=2 else 0)
df["total_votes"] = df["helpful_votes"] + df["unhelpful_votes"]
# Share of helpful votes; defined as 0.0 when a review has no votes at all.
df["helpful_unhelpful_ratio"] = df.apply(
    lambda r: (r["helpful_votes"] / r["total_votes"]) if r["total_votes"]>0 else 0.0, axis=1
)

# Dates & gaps
# Dates that do not match "%m %d, %Y" become NaT (errors="coerce").
df["reviewTime_dt"] = pd.to_datetime(df["reviewTime"], format="%m %d, %Y", errors="coerce")
# Within each reviewer, day_gap = days since that reviewer's previous review.
# A reviewer's first review (no previous row) and NaT dates get a gap of 0.
df_sorted = df.sort_values(by=["reviewerID", "reviewTime_dt"])
df_sorted["prev_reviewTime_dt"] = df_sorted.groupby("reviewerID")["reviewTime_dt"].shift(1)
df_sorted["day_gap"] = (df_sorted["reviewTime_dt"] - df_sorted["prev_reviewTime_dt"]).dt.days.fillna(0)

# NOTE: this re-orders `df` INTO df_sorted's order (by reviewerID, then date);
# it does not restore the CSV order. Everything positional built from `df`
# below (the feature matrix X, graph node ids) follows this sorted order,
# while `df_orig` keeps the CSV order. day_gap itself is attached by index.
df = df.loc[df_sorted.index].copy()
df["day_gap"] = df_sorted["day_gap"].astype(int)
# Because missing gaps were filled with 0, this flag is also 1 for a
# reviewer's first review and for rows whose date could not be parsed.
df["same_day_indicator"] = (df["day_gap"] == 0).astype(int)

# totals by reviewer
df["reviewerID"] = df["reviewerID"].astype(str)
df["total_helpful_votes"]   = df.groupby("reviewerID")["helpful_votes"].transform("sum")
df["total_unhelpful_votes"] = df.groupby("reviewerID")["unhelpful_votes"].transform("sum")

# word count
# Computed before the fillna("") below, so a missing reviewText is counted via
# str(x) of the missing value rather than as an empty string.
df["review_word_count"] = df["reviewText"].apply(lambda x: len(str(x).split()))

# sentiment
# Download the VADER lexicon only if it is not already available locally.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
# Also computed before fillna(""): missing text is scored through str(x).
df["sentiment_score"] = df["reviewText"].apply(lambda x: analyzer.polarity_scores(str(x))["compound"])

# text nulls
df["reviewText"]   = df["reviewText"].fillna("")
df["summary"]      = df["summary"].fillna("")
df["reviewerName"] = df["reviewerName"].fillna("")

# TFIDF + SVD (same dims used earlier)
# Both transformers are fit on THIS dataframe (fit_transform), i.e. they are
# not loaded from a training run. 7 components for review text, 3 for summary.
tfidf_review = TfidfVectorizer(max_features=5000)
svd_review   = TruncatedSVD(n_components=7, random_state=42)
review_svd   = svd_review.fit_transform(tfidf_review.fit_transform(df["reviewText"]))
review_svd_df = pd.DataFrame(review_svd, columns=[f"review_text_svd_{i}" for i in range(review_svd.shape[1])])

tfidf_sum = TfidfVectorizer(max_features=2000)
svd_sum   = TruncatedSVD(n_components=3, random_state=42)
sum_svd   = svd_sum.fit_transform(tfidf_sum.fit_transform(df["summary"]))
sum_svd_df = pd.DataFrame(sum_svd, columns=[f"summary_svd_{i}" for i in range(sum_svd.shape[1])])

# Frequency encoding (re-fit for inference set — matches your previous approach)
enc_reviewer = ce.CountEncoder(cols=["reviewerID"])
enc_asin     = ce.CountEncoder(cols=["asin"])
enc_rname    = ce.CountEncoder(cols=["reviewerName"])

rev_enc  = enc_reviewer.fit_transform(df["reviewerID"]).rename(columns={"reviewerID": "reviewerID_encoded"})
asin_enc = enc_asin.fit_transform(df["asin"]).rename(columns={"asin": "asin_encoded"})
rnm_enc  = enc_rname.fit_transform(df["reviewerName"]).rename(columns={"reviewerName": "reviewerName_encoded"})

# The 11 hand-built numeric columns, in the order they enter the feature matrix.
num_feats = df[[
    "overall","helpful_votes","unhelpful_votes","helpful_unhelpful_ratio",
    "day_gap","same_day_indicator","review_word_count","sentiment_score",
    "total_helpful_votes","total_unhelpful_votes","total_votes"
]]

# Column-wise concat; every part is reset to a 0..N-1 index so the pieces are
# joined positionally (row i of X == i-th row of the re-ordered `df`).
X = pd.concat([
    num_feats.reset_index(drop=True),
    review_svd_df.reset_index(drop=True),
    sum_svd_df.reset_index(drop=True),
    rev_enc.reset_index(drop=True),
    asin_enc.reset_index(drop=True),
    rnm_enc.reset_index(drop=True),
], axis=1).astype(np.float32)

print("Feature engineering complete.")
print("Feature matrix shape:", X.shape)

# ---------------------------------------------------
# 3) Build heterograph: ('r','p','r'), ('r','s','r'), ('r','v','r')
# ---------------------------------------------------
N = len(df)
# Row label in `df` -> node id (the row's position in `df`).
idx_map = {old: i for i, old in enumerate(df.index)}

# Positional copies of the two columns compared inside the pairwise loops.
# Indexing these arrays by node id yields the same values as df.loc[label, col]
# without paying for a pandas scalar lookup on every pair.
overall_by_node    = df["overall"].to_numpy()
word_count_by_node = df["review_word_count"].to_numpy()

upu_src, upu_dst = [], []
usu_src, usu_dst = [], []
uvu_src, uvu_dst = [], []

# same product (asin) → 'p': connect every ordered pair of distinct reviews
for _, grp in df.groupby("asin"):
    nodes = [idx_map[label] for label in grp.index.tolist()]
    for i, a in enumerate(nodes):
        for j, b in enumerate(nodes):
            if i != j:
                upu_src.append(a); upu_dst.append(b)

# same user constraints
for _, grp in df.groupby("reviewerID"):
    nodes = [idx_map[label] for label in grp.index.tolist()]
    for i, a in enumerate(nodes):
        for j, b in enumerate(nodes):
            if i == j:
                continue
            # U-S-U: same star rating == simplistic sentiment proxy used during training
            if overall_by_node[a] == overall_by_node[b]:
                usu_src.append(a); usu_dst.append(b)
            # U-V-U: similar length (<=10 words)
            if abs(word_count_by_node[a] - word_count_by_node[b]) <= 10:
                uvu_src.append(a); uvu_dst.append(b)

# tensors
upu_src = torch.tensor(upu_src, dtype=torch.int32)
upu_dst = torch.tensor(upu_dst, dtype=torch.int32)
usu_src = torch.tensor(usu_src, dtype=torch.int32)
usu_dst = torch.tensor(usu_dst, dtype=torch.int32)
uvu_src = torch.tensor(uvu_src, dtype=torch.int32)
uvu_dst = torch.tensor(uvu_dst, dtype=torch.int32)

# One node type 'r' (review) and three relations between reviews.
graph_data = {
    ("r","p","r"): (upu_src, upu_dst),
    ("r","s","r"): (usu_src, usu_dst),
    ("r","v","r"): (uvu_src, uvu_dst),
}
g = dgl.heterograph(graph_data, num_nodes_dict={"r": N})
g.nodes["r"].data["feat"] = torch.from_numpy(X.values).float()

# Add a self loop per relation.
for et in g.etypes:
    g = dgl.add_self_loop(g, etype=et)

g = g.to(device)
g.nodes["r"].data["feat"] = g.nodes["r"].data["feat"].to(device)

# SAVE_INTERMEDIATE is not defined anywhere in this notebook, so on a fresh
# kernel the bare name raised NameError. Honour a value set elsewhere if one
# exists; otherwise default to not saving.
SAVE_INTERMEDIATE = globals().get("SAVE_INTERMEDIATE", False)
if SAVE_INTERMEDIATE:
    dgl_path = os.path.join(OUT_DIR, "evaluation_data.dgl")
    # Serialize a CPU copy of the graph.
    # NOTE(review): confirm whether save_graphs accepts GPU-resident graphs.
    dgl.save_graphs(dgl_path, g.to(torch.device("cpu")))
    print(f"Saved inference DGL graph to: {dgl_path}")

# --------------------------------
# 4) Model (checkpoint-compatible names!)
# --------------------------------
class RelationAware_Compat(nn.Module):
    """Edge-level relation encoder built on one shared linear projection.

    Both endpoint feature tensors go through the same ``d_lin`` layer; the
    result is ``tanh(src + dst + (src - dst))`` of the projected values.
    The attribute name ``d_lin`` is kept for checkpoint key compatibility.
    """
    # name path in ckpt: rel_aware.d_lin.*
    def __init__(self, input_dim, output_dim, dropout):
        super().__init__()
        self.d_lin = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()
        # Constructed for parity with the checkpointed module; forward() does not apply it.
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, dst):
        proj_src, proj_dst = self.d_lin(src), self.d_lin(dst)
        return self.tanh(proj_src + proj_dst + (proj_src - proj_dst))

class HLayer_Compat(nn.Module):
    # keys in ckpt: blocks.<etype>.[0|1].(…)
    # this block uses w_linear, relation_aware, atten
    def __init__(self, input_dim, output_dim, head, rel_aware, etype, dropout, if_sum=False):
        super().__init__()
        self.etype = etype
        self.head  = head
        self.hd    = output_dim
        self.if_sum = if_sum

        self.atten = nn.Linear(3*output_dim, 1)
        self.leakyrelu = nn.LeakyReLU()
        self.softmax = nn.Softmax(dim=1)
        self.w_linear = nn.Linear(input_dim, output_dim*head)
        self.relation_aware = rel_aware

    def forward(self, g, h):
        with g.local_scope():
            # store original features for edge sign
            g.nodes["r"].data["feat"] = h
            # edge relation score
            g.apply_edges(self.sign_edges, etype=self.etype)
            # projection
            h_proj = self.w_linear(h)
            g.nodes["r"].data["h"] = h_proj
            # message passing on this etype
            g.update_all(self.message, self.reduce, etype=self.etype)
            out = g.nodes["r"].data["out"]
            edge_s = g.nodes["r"].data["s"]
            if not self.if_sum:
                return edge_s, out, h_proj.view(-1, self.head*self.hd)
            else:
                return edge_s, out, h_proj.view(-1, self.head, self.hd).sum(-2)

    def message(self, edges):
        src_f  = edges.src["h"].view(-1, self.head, self.hd)
        dst_f  = edges.dst["h"].view(-1, self.head, self.hd)
        edge_s = edges.data["edge_sum"].view(-1, self.head, self.hd)
        z = torch.cat([src_f, dst_f, edge_s], dim=-1)
        alpha = self.atten(z)
        alpha = self.leakyrelu(alpha)
        return {"atten": alpha, "sf": src_f, "edge_s": edge_s}

    def reduce(self, nodes):
        alpha = nodes.mailbox["atten"]
        sf    = nodes.mailbox["sf"]
        alpha = self.softmax(alpha)
        out = torch.sum(alpha * sf, dim=1)  # (N, head, hd)
        if not self.if_sum:
            out = out.view(-1, self.head*self.hd)
            edge_s = torch.mean(nodes.mailbox["edge_s"], dim=1).view(-1, self.head*self.hd)
            return {"out": out, "s": edge_s}
        else:
            out = out.sum(dim=-2)
            edge_s = torch.sum(torch.mean(nodes.mailbox["edge_s"], dim=1), dim=-2)
            return {"out": out, "s": edge_s}

    def sign_edges(self, edges):
        src = edges.src["feat"]
        dst = edges.dst["feat"]
        edge_sum = self.relation_aware(src, dst)
        return {"edge_sum": edge_sum}

class Gate_Compat(nn.Module):
    """Learned scalar gate that blends aggregated and projected node features.

    The gate is ``sigmoid([edge_sum, out] @ beta)``; the output is
    ``gate * out + (1 - gate) * h``. ``beta`` is the only parameter and has
    ``2*output_dim`` rows when ``if_sum`` is set, ``2*head*output_dim`` otherwise.
    ``dropout`` is accepted for signature compatibility and not used.
    """
    def __init__(self, head, output_dim, dropout, if_sum=False):
        super().__init__()
        self.output_dim = output_dim
        self.head = head
        gate_rows = 2 * output_dim if if_sum else 2 * head * output_dim
        self.beta = nn.Parameter(torch.empty(gate_rows, 1))
        nn.init.xavier_normal_(self.beta, gain=1.414)
        self.sigmoid = nn.Sigmoid()

    def forward(self, edge_sum, out, h):
        gate = self.sigmoid(torch.cat([edge_sum, out], dim=1) @ self.beta)
        return gate * out + (1 - gate) * h

class MultiRelationGELayer_Compat(nn.Module):
    """One GE-GNN layer: a (HLayer, Gate) pair per relation, fused by ``lin``.

    Every relation block shares the same ``rel_aware`` instance. The gated
    outputs of all relations are concatenated on dim 1, passed through
    dropout and then through ``lin``.
    """
    # ckpt names: layers.X.lin, layers.X.rel_aware, layers.X.blocks[etype][0/1]
    def __init__(self, input_dim, output_dim, head, graph, dropout, if_sum=False):
        super().__init__()
        relation_names = list(graph.etypes)
        try:
            # An edge type literally named "homo" is excluded (first match only).
            relation_names.remove("homo")
        except ValueError:
            pass
        self.relations = relation_names
        self.if_sum = if_sum

        self.rel_aware = RelationAware_Compat(input_dim, output_dim*head, dropout)
        # Per-relation output width: heads are summed away when if_sum is set.
        fused_width = output_dim if if_sum else output_dim * head
        self.lin = nn.Linear(len(self.relations) * fused_width, fused_width)
        self.dropout = nn.Dropout(dropout)

        self.blocks = nn.ModuleDict()
        for etype in self.relations:
            self.blocks[etype] = nn.ModuleList([
                HLayer_Compat(input_dim, output_dim, head, self.rel_aware, etype, dropout, if_sum),
                Gate_Compat(head, output_dim, dropout, if_sum),
            ])

    def forward(self, g, h):
        gated = []
        for hlayer, gate in self.blocks.values():
            # hlayer yields (edge_s, out, h_proj), which is exactly gate's argument order.
            gated.append(gate(*hlayer(g, h)))
        return self.lin(self.dropout(torch.cat(gated, dim=1)))

class GE_GNN_Compat(nn.Module):
    """Stack of ``MultiRelationGELayer_Compat`` layers producing per-node logits.

    With ``args.n_layer == 1`` a single summing layer maps features straight to
    ``args.n_class``. Otherwise the stack is: an input layer, then
    ``range(1, n_layer-1)`` hidden layers, then a summing output layer.
    ReLU and dropout are applied after every layer except the last.
    """
    # ckpt top-level list name: layers
    def __init__(self, args, g):
        super().__init__()
        # detect input dim
        in_dim = self._find_features(
            g, "No node features found ('feat' or 'feature')."
        ).shape[1]

        # (input width, output width, if_sum) for each layer, in order.
        if args.n_layer == 1:
            layer_specs = [(in_dim, args.n_class, True)]
        else:
            hidden_in = args.intra_dim*args.head
            layer_specs = [(in_dim, args.intra_dim, False)]
            for _ in range(1, args.n_layer-1):
                layer_specs.append((hidden_in, args.intra_dim, False))
            layer_specs.append((hidden_in, args.n_class, True))

        self.layers = nn.ModuleList()
        for width_in, width_out, sum_heads in layer_specs:
            self.layers.append(
                MultiRelationGELayer_Compat(width_in, width_out, args.head, g, args.dropout, if_sum=sum_heads)
            )
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(args.dropout)

    @staticmethod
    def _find_features(g, error_message):
        """Return node features from ``g.nodes['r'].data['feat']`` or ``g.ndata['feature']``."""
        if "r" in g.ntypes and "feat" in g.nodes["r"].data:
            return g.nodes["r"].data["feat"]
        if "feature" in g.ndata:
            return g.ndata["feature"]
        raise RuntimeError(error_message)

    def forward(self, g):
        h = self._find_features(g, "Missing features for forward.").float()
        last = len(self.layers) - 1
        for depth, layer in enumerate(self.layers):
            h = layer(g, h)
            if depth < last:
                h = self.drop(self.relu(h))
        return h


# ---------------------------------------------------
# 5) Load model + state_dict strictly
# ---------------------------------------------------
print("\n--- Loading model ---")
# The model reads its input width and relation names from the graph `g`.
model = GE_GNN_Compat(args, g).to(device)
# NOTE(review): torch.load is called without `weights_only`; checkpoint loading
# goes through pickle, so MODEL_PATH should only ever point at a trusted file.
state = torch.load(MODEL_PATH, map_location=device)
# Use strict=False here to see *all* missing/unexpected keys first.
# NOTE(review): despite the "strictly" section title, a mismatch is only
# printed below. Execution continues, so any missing keys keep their random
# initial values and inference still runs.
missing, unexpected = model.load_state_dict(state, strict=False)

# If there are mismatches, print for visibility
if len(unexpected) > 0 or len(missing) > 0:
    print("Note: non-strict load due to mismatches.")
    if len(missing) > 0:
        print("Missing keys:", missing)
    if len(unexpected) > 0:
        print("Unexpected keys:", unexpected)
else:
    print("Model loaded successfully (strict=True would have worked).")


# Switch to eval mode so the Dropout modules are inactive during inference.
model.eval()
print("Model ready.")

# --------------------------------
# 6) Inference
# --------------------------------
with torch.no_grad():
    logits = model(g)                     # (N, n_class)
    probs  = F.softmax(logits, dim=1)     # (N, n_class)
    # Column 1 of the softmax is reported as the fraud probability.
    fraud_prob = probs[:,1].detach().cpu().numpy()
    pred_cls   = logits.argmax(1).detach().cpu().numpy()

# --------------------------------
# 7) Export Predictions
# --------------------------------
# Graph node i is the i-th row of `df`, and `df` was re-ordered by
# (reviewerID, reviewTime_dt) during feature engineering, whereas `df_orig`
# is still in CSV order. Writing the arrays positionally would attach each
# prediction to the wrong review, so they are wrapped in Series keyed by
# df.index and pandas aligns them to df_orig's rows by label.
df_out = df_orig.copy() # Start with the original dataframe that includes 'class'
df_out['predicted_fraud_class'] = pd.Series(pred_cls, index=df.index)
df_out['fraud_probability']     = pd.Series(fraud_prob, index=df.index)
# Keep the original 'class' column as is, no need to rename to 'label'

print("\nSample predictions:")
print(df_out[['reviewerID','asin','overall','predicted_fraud_class','fraud_probability', 'class']].head(10))

# Timestamped file name so repeated runs do not overwrite each other.
stamp = time.strftime("%Y%m%d_%H%M%S")
OUT_CSV = os.path.join(OUT_DIR, f"evaluation_predictions_{stamp}.csv")
df_out.to_csv(OUT_CSV, index=False)
print(f"\n✅ Inference complete. Predictions saved to:\n{OUT_CSV}")

--- Using device: cuda:0

--- Loading new data for inference ---
Loaded 100000 rows from:
/content/drive/MyDrive/GE-GNN/DataSets/Kaggle_Amazon_Reviews_CSV/Cell_Phones_and_Accessories_50k.csv
Feature engineering complete.
Feature matrix shape: (100000, 24)

--- Loading model ---
Model loaded successfully (strict=True would have worked).
Model ready.

Sample predictions:
       reviewerID        asin  overall  predicted_fraud_class  \
0  A1ZKFGF1OJAC2L  B005AOKW8Q      3.0                      1   
1  A1GGBZT79BE9VB  B006ECNCMG      1.0                      1   
2  A2MBQUAQT2B3J6  B00GTGETFG      4.0                      0   
3  A101MJE2PP14IX  B0062F2AQ4      2.0                      1   
4  A1SWJHGNJ50BBJ  B00FUXV6QO      1.0                      1   
5  A2P3Z35R396ZKL  B00ATWR0L6      3.0                      1   
6   AYZLKZ40XI1YS  B00CFVNXTC      1.0                      0   
7  A13NWKT7VUXSG7  B00B9OTEVM      3.0                      0   
8  A22R6VF3GAKA0Q  B003UVM2UC      2.0     

In [None]:
# =========================
# GE-GNN — FULL Inference Pipeline
# (checkpoint & encoder-compatible)
# =========================

import os, sys, copy, yaml, argparse, warnings, ast, time, pickle, json
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import scipy.sparse as sp
import scipy.io

import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)

import category_encoders as ce
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# -----------------------
# 0) Paths & Config
# -----------------------
CONFIG_PATH   = "/content/drive/MyDrive/GE-GNN/config/amazon.yaml"
# Use a file that HAS labels to get metrics; if not, it will just run inference
EVAL_CSV      = "/content/drive/MyDrive/GE-GNN/MY_work/DataSets/sports_test_inference.csv"
MODEL_PATH    = "/content/drive/MyDrive/GE-GNN/result/sports_outdoors_20k_F_model_head4.pt"
OUT_DIR       = "/content/drive/MyDrive/GE-GNN/result"
ARTIFACT_DIR  = "/content/drive/MyDrive/GE-GNN/artifacts"   # <— new: where encoders/mappings live
for _needed_dir in (OUT_DIR, ARTIFACT_DIR):
    os.makedirs(_needed_dir, exist_ok=True)


def _artifact(filename):
    """Absolute path of ``filename`` inside ARTIFACT_DIR."""
    return os.path.join(ARTIFACT_DIR, filename)


# Filenames for artifacts
TFIDF_REVIEW_PKL   = _artifact("tfidf_review.pkl")
SVD_REVIEW_PKL     = _artifact("svd_review.pkl")
TFIDF_SUMMARY_PKL  = _artifact("tfidf_summary.pkl")
SVD_SUMMARY_PKL    = _artifact("svd_summary.pkl")

ENC_REVIEWER_PKL   = _artifact("countenc_reviewerID.pkl")
ENC_ASIN_PKL       = _artifact("countenc_asin.pkl")
ENC_RNAME_PKL      = _artifact("countenc_reviewerName.pkl")

# Optional ID maps (not required for inference graph, but saved for reproducibility)
REVIEWER_MAP_JSON  = _artifact("reviewerID_to_index.json")
ASIN_MAP_JSON      = _artifact("asin_to_index.json")

# Behavior: if artifacts are missing, fit encoders on current data and save
SAVE_ARTIFACTS_IF_MISSING = True

# -----------------------
# 1) Load config + device
# -----------------------
with open(CONFIG_PATH, "r") as config_file:
    cfg = yaml.safe_load(config_file)

# Expose the YAML keys as attributes (args.cuda, args.head, args.n_layer, ...).
args = argparse.Namespace(**cfg)


def _parse_cuda_index(spec):
    """Turn the config's ``cuda`` entry into a device index.

    Accepts an int, a digit-only string, or a string whose last ':'-separated
    field is an integer (e.g. "cuda:1"). Anything else resolves to 0.
    """
    if isinstance(spec, int):
        return spec
    if isinstance(spec, str) and spec.isdigit():
        return int(spec)
    try:
        return int(str(spec).split(":")[-1])
    except Exception:
        return 0


cuda_idx = _parse_cuda_index(args.cuda)

if not torch.cuda.is_available():
    device = torch.device("cpu")
else:
    # Fall back to the first GPU when the configured index does not exist.
    if cuda_idx >= torch.cuda.device_count():
        print(f"CUDA index {cuda_idx} out of range. Using cuda:0")
        cuda_idx = 0
    device = torch.device(f"cuda:{cuda_idx}")
    torch.cuda.set_device(cuda_idx)

print("--- Using device:", device)

# -----------------------
# 2) Load CSV & FE
# -----------------------
print("\n--- Loading new data for inference ---")
df = pd.read_csv(EVAL_CSV)
print(f"Loaded {len(df)} rows from:\n{EVAL_CSV}")

# Untouched snapshot of the CSV as read (may include 'class').
df_orig = df.copy()  # keep the original, may include 'class'

# Ensure required columns exist (class is optional); report all absent ones at once.
needed = ["helpful", "reviewTime", "reviewerID", "reviewText", "summary", "reviewerName", "asin", "overall"]
available = set(df.columns)
missing = [name for name in needed if name not in available]
if len(missing) > 0:
    raise RuntimeError(f"Required column(s) missing in CSV: {missing}")

# parse helpful safely
def _safe_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except Exception:
            pass
    return [0, 0]

# --- Vote features -------------------------------------------------------
# Parse the raw `helpful` cell, then read element 0 as the helpful count and
# element 1 as the unhelpful count (0 when the parsed list is too short).
df["helpful"] = df["helpful"].apply(_safe_list)
df["helpful_votes"]   = df["helpful"].apply(lambda x: int(x[0]) if len(x)>=1 else 0)
df["unhelpful_votes"] = df["helpful"].apply(lambda x: int(x[1]) if len(x)>=2 else 0)
df["total_votes"] = df["helpful_votes"] + df["unhelpful_votes"]
# Share of helpful votes; defined as 0.0 when a review has no votes at all.
df["helpful_unhelpful_ratio"] = df.apply(
    lambda r: (r["helpful_votes"] / r["total_votes"]) if r["total_votes"]>0 else 0.0, axis=1
)

# time features
# Dates that do not match "%m %d, %Y" become NaT (errors="coerce").
df["reviewTime_dt"] = pd.to_datetime(df["reviewTime"], format="%m %d, %Y", errors="coerce")
# Within each reviewer, day_gap = days since that reviewer's previous review.
# A reviewer's first review (no previous row) and NaT dates get a gap of 0.
df_sorted = df.sort_values(by=["reviewerID", "reviewTime_dt"])
df_sorted["prev_reviewTime_dt"] = df_sorted.groupby("reviewerID")["reviewTime_dt"].shift(1)
df_sorted["day_gap"] = (df_sorted["reviewTime_dt"] - df_sorted["prev_reviewTime_dt"]).dt.days.fillna(0)

# NOTE: this re-orders `df` into df_sorted's order (by reviewerID, then date),
# so `df` no longer matches the row order of `df_orig`. Anything built
# positionally from `df` must be mapped back through df.index before it is
# attached to `df_orig`.
df = df.loc[df_sorted.index].copy()
df["day_gap"] = df_sorted["day_gap"].astype(int)
# Because missing gaps were filled with 0, this flag is also 1 for a
# reviewer's first review and for rows whose date could not be parsed.
df["same_day_indicator"] = (df["day_gap"] == 0).astype(int)

# totals by reviewer
df["reviewerID"] = df["reviewerID"].astype(str)
df["total_helpful_votes"]   = df.groupby("reviewerID")["helpful_votes"].transform("sum")
df["total_unhelpful_votes"] = df.groupby("reviewerID")["unhelpful_votes"].transform("sum")

# word count
# Computed before the fillna("") below, so a missing reviewText is counted via
# str(x) of the missing value rather than as an empty string.
df["review_word_count"] = df["reviewText"].apply(lambda x: len(str(x).split()))

# sentiment
# Download the VADER lexicon only if it is not already available locally.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
# Also computed before fillna(""): missing text is scored through str(x).
df["sentiment_score"] = df["reviewText"].apply(lambda x: sid.polarity_scores(str(x))["compound"])

# nulls
df["reviewText"]   = df["reviewText"].fillna("")
df["summary"]      = df["summary"].fillna("")
df["reviewerName"] = df["reviewerName"].fillna("")
# asin is cast to str (reviewerID was cast above) before encoding/grouping.
df["asin"]         = df["asin"].astype(str)

# -----------------------
# 3) Load/fit encoders
# -----------------------
def load_pickle(path):
    """Read the file at ``path`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute code stored in the file; only use this on
    artifacts this pipeline wrote itself.
    """
    with open(path, "rb") as handle:
        restored = pickle.load(handle)
    return restored

def try_load_or_fit_tfidf_svd(text_series, tfidf_path, svd_path, tfidf_kwargs, svd_kwargs):
    """Return SVD-reduced TF-IDF features, reusing pickled transformers if possible.

    When both pickle files exist they are loaded and applied with ``transform``.
    If either is absent, or loading/transforming raises, new transformers are
    fit on ``text_series`` and written to the two paths (overwriting them),
    provided ``SAVE_ARTIFACTS_IF_MISSING`` is true.

    Args:
        text_series: Documents to vectorize.
        tfidf_path: Pickle path of the TfidfVectorizer.
        svd_path: Pickle path of the TruncatedSVD.
        tfidf_kwargs: Constructor kwargs used when a new TfidfVectorizer is fit.
        svd_kwargs: Constructor kwargs used when a new TruncatedSVD is fit.

    Returns:
        ``(features, tfidf, svd, fitted_now)`` where ``fitted_now`` is False
        when the transformers came from disk and True when they were fit here.

    Raises:
        RuntimeError: Refit is needed but ``SAVE_ARTIFACTS_IF_MISSING`` is false.
    """
    if os.path.exists(tfidf_path) and os.path.exists(svd_path):
        try:
            vectorizer = load_pickle(tfidf_path)
            reducer = load_pickle(svd_path)
            reduced = reducer.transform(vectorizer.transform(text_series))
            return reduced, vectorizer, reducer, False
        except Exception as e:
            print(f"Warning: Failed to load TFIDF/SVD from disk ({e}). Will refit.")

    if not SAVE_ARTIFACTS_IF_MISSING:
        raise RuntimeError(f"Missing TFIDF/SVD artifacts: {tfidf_path}, {svd_path}")

    # Fit fresh transformers on the data at hand.
    vectorizer = TfidfVectorizer(**tfidf_kwargs)
    term_matrix = vectorizer.fit_transform(text_series)
    reducer = TruncatedSVD(**svd_kwargs)
    reduced = reducer.fit_transform(term_matrix)

    # Persist both so the next run takes the load path above.
    for target_path, fitted_obj in ((tfidf_path, vectorizer), (svd_path, reducer)):
        with open(target_path, "wb") as fh:
            pickle.dump(fitted_obj, fh)
    return reduced, vectorizer, reducer, True

def try_load_or_fit_countenc(series, pkl_path, colname):
    """Count-encode ``series``, reusing a pickled CountEncoder if possible.

    If ``pkl_path`` exists the encoder is loaded and applied with ``transform``.
    If it is absent, or loading/transforming raises, a new encoder is fit on
    ``series`` and written to ``pkl_path``, provided
    ``SAVE_ARTIFACTS_IF_MISSING`` is true.

    Returns:
        ``(encoded_df, encoder, fitted_now)``; the single output column is named
        ``f"{colname}_encoded"`` and ``fitted_now`` tells whether a fit happened.

    Raises:
        RuntimeError: Refit is needed but ``SAVE_ARTIFACTS_IF_MISSING`` is false.
    """
    renamed = {colname: f"{colname}_encoded"}

    def _as_frame():
        # transform expects a DataFrame
        return pd.DataFrame({colname: series})

    if os.path.exists(pkl_path):
        try:
            encoder = load_pickle(pkl_path)
            encoded = encoder.transform(_as_frame())
            return encoded.rename(columns=renamed), encoder, False
        except Exception as e:
            print(f"Warning: Failed to load CountEncoder from disk ({e}). Will refit.")

    if not SAVE_ARTIFACTS_IF_MISSING:
        raise RuntimeError(f"Missing CountEncoder artifact: {pkl_path}")

    encoder = ce.CountEncoder(cols=[colname], handle_unknown=0, handle_missing=0)  # handle unseen/missing as 0
    encoded = encoder.fit_transform(_as_frame())
    # Persist so the next run takes the load path above.
    with open(pkl_path, "wb") as fh:
        pickle.dump(encoder, fh)
    return encoded.rename(columns=renamed), encoder, True

print("\n--- Loading/Fitting text/vector encoders ---")
# Review text -> TF-IDF (up to 5000 terms) -> 7 SVD components.
# The fitted_* flags record whether the transformers were fit in this run
# (True) or loaded from ARTIFACT_DIR (False).
review_svd, tfidf_review, svd_review, fitted_review = try_load_or_fit_tfidf_svd(
    df["reviewText"],
    TFIDF_REVIEW_PKL, SVD_REVIEW_PKL,
    tfidf_kwargs=dict(max_features=5000),
    svd_kwargs=dict(n_components=7, random_state=42)
)
# Summary text -> TF-IDF (up to 2000 terms) -> 3 SVD components.
summary_svd, tfidf_summary, svd_summary, fitted_summary = try_load_or_fit_tfidf_svd(
    df["summary"],
    TFIDF_SUMMARY_PKL, SVD_SUMMARY_PKL,
    tfidf_kwargs=dict(max_features=2000),
    svd_kwargs=dict(n_components=3, random_state=42)
)

print("\n--- Loading/Fitting categorical encoders ---")
rev_enc_df, enc_reviewer, fitted_rev = try_load_or_fit_countenc(df["reviewerID"], ENC_REVIEWER_PKL, "reviewerID")
asin_enc_df, enc_asin, fitted_asin = try_load_or_fit_countenc(df["asin"], ENC_ASIN_PKL, "asin")
rnm_enc_df, enc_rname, fitted_rnm = try_load_or_fit_countenc(df["reviewerName"], ENC_RNAME_PKL, "reviewerName")

# Optional: save ID -> idx maps for reproducibility (not required for inference graph)
# Written only when the matching encoder was fit in this run; ids are numbered
# in order of first appearance in `df`. Failures here are reported, not fatal.
try:
    if fitted_rev:
        rev_ids = pd.Series(df["reviewerID"].unique())
        reviewer_map = {k: int(v) for v, k in enumerate(rev_ids)}
        with open(REVIEWER_MAP_JSON, "w") as f: json.dump(reviewer_map, f)
    if fitted_asin:
        asin_ids = pd.Series(df["asin"].unique())
        asin_map = {k: int(v) for v, k in enumerate(asin_ids)}
        with open(ASIN_MAP_JSON, "w") as f: json.dump(asin_map, f)
except Exception as e:
    print(f"Note: Failed saving optional ID maps: {e}")

# Assemble features
review_svd_df = pd.DataFrame(review_svd, columns=[f"review_text_svd_{i}" for i in range(review_svd.shape[1])])
summary_svd_df = pd.DataFrame(summary_svd, columns=[f"summary_svd_{i}" for i in range(summary_svd.shape[1])])
# The 11 hand-built numeric columns, in the order they enter the feature matrix.
num_feats = df[[
    "overall","helpful_votes","unhelpful_votes","helpful_unhelpful_ratio",
    "day_gap","same_day_indicator","review_word_count","sentiment_score",
    "total_helpful_votes","total_unhelpful_votes","total_votes"
]]

# Column-wise concat; every part is reset to a 0..N-1 index so the pieces are
# joined positionally (row i of X == i-th row of `df` in its current order).
X = pd.concat([
    num_feats.reset_index(drop=True),
    review_svd_df.reset_index(drop=True),
    summary_svd_df.reset_index(drop=True),
    rev_enc_df.reset_index(drop=True),
    asin_enc_df.reset_index(drop=True),
    rnm_enc_df.reset_index(drop=True),
], axis=1).astype(np.float32)

print("\nFeature engineering complete.")
print("Feature matrix shape:", X.shape)

# ---------------------------------------------------
# 4) Build heterograph: ('r','p','r'), ('r','s','r'), ('r','v','r')
# ---------------------------------------------------
N = len(df)
# Row label in `df` -> node id (the row's position in `df`).
idx_map = {old: i for i, old in enumerate(df.index)}

# Positional copies of the two columns compared inside the pairwise loops.
# Indexing these arrays by node id yields the same values as df.loc[label, col]
# without paying for a pandas scalar lookup on every pair.
overall_by_node    = df["overall"].to_numpy()
word_count_by_node = df["review_word_count"].to_numpy()

upu_src, upu_dst = [], []
usu_src, usu_dst = [], []
uvu_src, uvu_dst = [], []

# same product (asin) → 'p': connect every ordered pair of distinct reviews
for _, g_ in df.groupby("asin"):
    nodes = [idx_map[label] for label in g_.index.tolist()]
    for i, a in enumerate(nodes):
        for j, b in enumerate(nodes):
            if i != j:
                upu_src.append(a); upu_dst.append(b)

# same user constraints
for _, g_ in df.groupby("reviewerID"):
    nodes = [idx_map[label] for label in g_.index.tolist()]
    for i, a in enumerate(nodes):
        for j, b in enumerate(nodes):
            if i == j:
                continue
            # U-S-U: same overall rating proxy
            if overall_by_node[a] == overall_by_node[b]:
                usu_src.append(a); usu_dst.append(b)
            # U-V-U: similar review length (word counts differ by at most 10)
            if abs(word_count_by_node[a] - word_count_by_node[b]) <= 10:
                uvu_src.append(a); uvu_dst.append(b)

# tensors
upu_src = torch.tensor(upu_src, dtype=torch.int32)
upu_dst = torch.tensor(upu_dst, dtype=torch.int32)
usu_src = torch.tensor(usu_src, dtype=torch.int32)
usu_dst = torch.tensor(usu_dst, dtype=torch.int32)
uvu_src = torch.tensor(uvu_src, dtype=torch.int32)
uvu_dst = torch.tensor(uvu_dst, dtype=torch.int32)

# One node type 'r' (review) and three relations between reviews.
graph_data = {
    ("r","p","r"): (upu_src, upu_dst),
    ("r","s","r"): (usu_src, usu_dst),
    ("r","v","r"): (uvu_src, uvu_dst),
}
g = dgl.heterograph(graph_data, num_nodes_dict={"r": N})
g.nodes["r"].data["feat"] = torch.from_numpy(X.values).float()

# Add a self loop per relation.
for et in g.etypes:
    g = dgl.add_self_loop(g, etype=et)

g = g.to(device)
g.nodes["r"].data["feat"] = g.nodes["r"].data["feat"].to(device)

# --------------------------------
# 5) Model (checkpoint-compatible names)
# --------------------------------
class RelationAware_Compat(nn.Module):
    """Edge-feature module built from a single linear projection shared by both endpoints.

    Both endpoint feature tensors go through the same ``d_lin`` layer; the
    result is ``tanh(src + dst + (src - dst))`` on the projected values.
    Attribute names are fixed by the checkpoint layout and must not be renamed.
    """
    # ckpt path: rel_aware.d_lin.*
    def __init__(self, input_dim, output_dim, dropout):
        super().__init__()
        self.d_lin = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()
        # Constructed for interface parity; forward() does not apply it.
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, dst):
        """Return the tanh-squashed combination of the projected endpoints."""
        proj_src, proj_dst = self.d_lin(src), self.d_lin(dst)
        combined = proj_src + proj_dst + (proj_src - proj_dst)
        return self.tanh(combined)

class HLayer_Compat(nn.Module):
    """Multi-head attention aggregation over one edge type of a DGL graph.

    Node inputs are projected to ``head * output_dim`` channels. Per-edge
    features come from the injected ``rel_aware`` module. Attention scores are
    computed per head from the concatenated source, destination and edge
    features, and normalised over each node's mailbox.
    Attribute names are fixed by the checkpoint layout.
    """
    # ckpt path: blocks.<etype>.[0|1].(...)
    def __init__(self, input_dim, output_dim, head, rel_aware, etype, dropout, if_sum=False):
        super().__init__()
        self.etype, self.head, self.hd, self.if_sum = etype, head, output_dim, if_sum

        # Creation order of the parameterised layers is kept (atten, then
        # w_linear) so seeded initialisation and state_dict order are unchanged.
        self.atten = nn.Linear(3*output_dim, 1)
        self.leakyrelu = nn.LeakyReLU()
        self.softmax = nn.Softmax(dim=1)   # dim 1 of the mailbox tensors
        self.w_linear = nn.Linear(input_dim, output_dim*head)
        self.relation_aware = rel_aware

    def _per_head(self, tensor):
        """Reshape a flat tensor to (-1, head, hd)."""
        return tensor.view(-1, self.head, self.hd)

    def forward(self, g, h):
        """Return ``(edge_s, out, h_proj)`` for this layer's edge type.

        With ``if_sum`` false ``h_proj`` keeps ``head*hd`` columns; otherwise
        the heads are summed into ``hd`` columns.
        """
        projected = self.w_linear(h)
        with g.local_scope():
            g.nodes["r"].data["feat"] = h
            g.nodes["r"].data["h"] = projected
            # Edge features must exist before message passing reads them.
            g.apply_edges(self.sign_edges, etype=self.etype)
            g.update_all(self.message, self.reduce, etype=self.etype)
            aggregated = g.nodes["r"].data["out"]
            edge_summary = g.nodes["r"].data["s"]
        if self.if_sum:
            residual = self._per_head(projected).sum(-2)
        else:
            residual = projected.view(-1, self.head*self.hd)
        return edge_summary, aggregated, residual

    def message(self, edges):
        """Build per-edge messages: attention score, source features, edge features."""
        src_heads = self._per_head(edges.src["h"])
        dst_heads = self._per_head(edges.dst["h"])
        edge_heads = self._per_head(edges.data["edge_sum"])
        stacked = torch.cat([src_heads, dst_heads, edge_heads], dim=-1)
        scores = self.leakyrelu(self.atten(stacked))
        return {"atten": scores, "sf": src_heads, "edge_s": edge_heads}

    def reduce(self, nodes):
        """Attention-weighted sum of source features plus the mean edge feature."""
        weights = self.softmax(nodes.mailbox["atten"])
        weighted = torch.sum(weights * nodes.mailbox["sf"], dim=1)
        mean_edge = torch.mean(nodes.mailbox["edge_s"], dim=1)
        if self.if_sum:
            # Collapse the head axis by summation.
            return {"out": weighted.sum(dim=-2), "s": torch.sum(mean_edge, dim=-2)}
        flat_width = self.head*self.hd
        return {"out": weighted.view(-1, flat_width), "s": mean_edge.view(-1, flat_width)}

    def sign_edges(self, edges):
        """Compute the per-edge feature from the raw endpoint features."""
        return {"edge_sum": self.relation_aware(edges.src["feat"], edges.dst["feat"])}

class Gate_Compat(nn.Module):
    """Learned scalar gate that blends an aggregated output with a residual input.

    The gate is ``sigmoid([edge_sum, out] @ beta)``, and the result is
    ``gate * out + (1 - gate) * h``.
    """
    def __init__(self, head, output_dim, dropout, if_sum=False):
        super().__init__()
        self.output_dim = output_dim
        self.head = head
        # Width of each gated input: heads are already summed when if_sum is set.
        width = output_dim if if_sum else head*output_dim
        self.beta = nn.Parameter(torch.empty(2*width, 1))
        nn.init.xavier_normal_(self.beta, gain=1.414)
        self.sigmoid = nn.Sigmoid()

    def forward(self, edge_sum, out, h):
        """Blend ``out`` and ``h`` using a gate computed from ``edge_sum`` and ``out``."""
        gate_input = torch.cat([edge_sum, out], dim=1)
        gate = self.sigmoid(gate_input @ self.beta)
        return gate * out + (1 - gate) * h

class MultiRelationGELayer_Compat(nn.Module):
    """One layer that runs an attention block and a gate per relation, then fuses them.

    Each edge type of ``graph`` (except one named "homo") gets a two-element
    ``ModuleList``: index 0 is the attention layer, index 1 the gate. All
    relations share one ``rel_aware`` module. Per-relation outputs are
    concatenated, passed through dropout, and mixed by ``lin``.
    """
    # ckpt names: layers.X.lin, layers.X.rel_aware, layers.X.blocks[etype][0/1]
    def __init__(self, input_dim, output_dim, head, graph, dropout, if_sum=False):
        super().__init__()
        relation_names = list(graph.etypes)
        try:
            relation_names.remove("homo")
        except ValueError:
            pass  # no "homo" edge type present
        self.relations = relation_names
        self.if_sum = if_sum

        # Construction order (rel_aware, lin, dropout, blocks) is kept so the
        # state_dict layout and seeded initialisation stay the same.
        self.rel_aware = RelationAware_Compat(input_dim, output_dim*head, dropout)
        if if_sum:
            self.lin = nn.Linear(len(self.relations)*output_dim, output_dim)
        else:
            self.lin = nn.Linear(len(self.relations)*output_dim*head, output_dim*head)
        self.dropout = nn.Dropout(dropout)

        self.blocks = nn.ModuleDict()
        for rel in self.relations:
            attention = HLayer_Compat(input_dim, output_dim, head, self.rel_aware, rel, dropout, if_sum)
            gate = Gate_Compat(head, output_dim, dropout, if_sum)
            self.blocks[rel] = nn.ModuleList([attention, gate])

    def forward(self, g, h):
        """Run every relation block on ``(g, h)`` and fuse the gated outputs."""
        gated = [gate(*attention(g, h)) for attention, gate in self.blocks.values()]
        fused = self.dropout(torch.cat(gated, dim=1))
        return self.lin(fused)

class GE_GNN_Compat(nn.Module):
    """Stack of ``MultiRelationGELayer_Compat`` layers, with ReLU and dropout between them.

    The input width is read from the graph's node features: ``feat`` on node
    type ``r`` if present, otherwise ``g.ndata["feature"]``. The last layer
    outputs ``args.n_class`` columns with heads summed.
    """
    # top-level ckpt list name: layers
    def __init__(self, args, g):
        super().__init__()
        features = self._lookup_features(g)
        if features is None:
            raise RuntimeError("No node features found ('feat' or 'feature').")
        in_dim = features.shape[1]

        # (input width, output width, if_sum) for each layer, in order.
        if args.n_layer == 1:
            layer_specs = [(in_dim, args.n_class, True)]
        else:
            hidden = args.intra_dim*args.head
            layer_specs = [(in_dim, args.intra_dim, False)]
            for _ in range(args.n_layer - 2):
                layer_specs.append((hidden, args.intra_dim, False))
            layer_specs.append((hidden, args.n_class, True))

        self.layers = nn.ModuleList()
        for d_in, d_out, sum_heads in layer_specs:
            self.layers.append(
                MultiRelationGELayer_Compat(d_in, d_out, args.head, g, args.dropout, if_sum=sum_heads)
            )
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(args.dropout)

    @staticmethod
    def _lookup_features(g):
        """Return the node feature tensor of ``g``, or None when neither known key exists."""
        if "r" in g.ntypes and "feat" in g.nodes["r"].data:
            return g.nodes["r"].data["feat"]
        if "feature" in g.ndata:
            return g.ndata["feature"]
        return None

    def forward(self, g):
        """Run all layers on the graph's node features and return the last layer's output."""
        features = self._lookup_features(g)
        if features is None:
            raise RuntimeError("Missing features for forward.")
        h = features.float()
        last = len(self.layers) - 1
        for position, layer in enumerate(self.layers):
            h = layer(g, h)
            if position != last:
                # Non-linearity and dropout only between layers, not after the last.
                h = self.drop(self.relu(h))
        return h

# ---------------------------------------------------
# 6) Load model & run
# ---------------------------------------------------
print("\n--- Loading model ---")
model = GE_GNN_Compat(args, g).to(device)
# The checkpoint is a plain state_dict, so restrict unpickling to tensors and
# basic containers. Without weights_only=True, torch.load runs the full pickle
# machinery and a tampered checkpoint file could execute arbitrary code.
state = torch.load(MODEL_PATH, map_location=device, weights_only=True)
# Non-strict load: mismatched keys are reported below instead of raising.
# Any "Missing keys" leave those parameters at their random initial values,
# so predictions are not trustworthy when that list is non-empty.
missing, unexpected = model.load_state_dict(state, strict=False)
if missing or unexpected:
    print("Note: non-strict load due to mismatches.")
    if missing:   print("  Missing keys:", missing)
    if unexpected:print("  Unexpected keys:", unexpected)
else:
    print("Model loaded successfully (strict=True would have worked).")

# eval() disables dropout so inference is deterministic.
model.eval()
print("Model ready.")

with torch.no_grad():
    logits = model(g)                     # (N, n_class)
    probs  = F.softmax(logits, dim=1)     # (N, n_class)
    # Column 1 is reported as the fraud-class probability.
    fraud_prob = probs[:,1].detach().cpu().numpy()
    pred_cls   = logits.argmax(1).detach().cpu().numpy()

# ---------------------------------------------------
# 7) Export + optional metrics
# ---------------------------------------------------
# Predictions are attached to a copy of the original frame so that the
# engineered feature columns are not written to the exported CSV.
df_out = df_orig.copy()
df_out['predicted_fraud_class'] = pred_cls
df_out['fraud_probability']     = fraud_prob

print("\nSample predictions:")
cols_show = ['reviewerID','asin','overall','predicted_fraud_class','fraud_probability']
if 'class' in df_out.columns:
    cols_show.append('class')
print(df_out[cols_show].head(10))

stamp = time.strftime("%Y%m%d_%H%M%S")
# Make sure the target folder exists so to_csv cannot fail on a fresh Drive.
os.makedirs(OUT_DIR, exist_ok=True)
OUT_CSV = os.path.join(OUT_DIR, f"inference_predictions_{stamp}.csv")
df_out.to_csv(OUT_CSV, index=False)
print(f"\n✅ Inference complete. Predictions saved to:\n{OUT_CSV}")

# Metrics if ground-truth is available
if 'class' in df_out.columns:
    y_true = df_out['class'].astype(int).values
    y_pred = df_out['predicted_fraud_class'].astype(int).values
    y_prob = df_out['fraud_probability'].astype(float).values

    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    f1   = f1_score(y_true, y_pred, zero_division=0)
    # AUC is undefined with a single class in y_true; report chance level.
    try:
        auc  = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else 0.5
    except ValueError:
        auc = 0.5
    # Pin the label set so the matrix is always 2x2. Without it, data holding
    # only one class yields a 1x1 matrix and the 4-way unpacking below raises.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    gmean = float(np.sqrt(tpr*tnr)) if tpr>0 and tnr>0 else 0.0

    print("\n================= MODEL EVALUATION =================")
    print(f"Accuracy:      {acc:.4f}")
    print(f"Precision:     {prec:.4f}")
    print(f"Recall:        {rec:.4f}")
    print(f"F1 Score:      {f1:.4f}")
    print(f"AUC:           {auc:.4f}")
    print(f"G-Mean:        {gmean:.4f}")
    print("Confusion Matrix:")
    print(cm)
else:
    print("\n(No 'class' column found — skipped metrics.)")


--- Using device: cuda:0

--- Loading new data for inference ---
Loaded 500 rows from:
/content/drive/MyDrive/GE-GNN/MY_work/DataSets/sports_test_inference.csv

--- Loading/Fitting text/vector encoders ---

--- Loading/Fitting categorical encoders ---

Feature engineering complete.
Feature matrix shape: (500, 24)

--- Loading model ---
Model loaded successfully (strict=True would have worked).
Model ready.

Sample predictions:
       reviewerID        asin  overall  predicted_fraud_class  \
0  A140N2G9RBBP4R  B00176T9OY      3.0                      1   
1   AMWFW5H88D9KC  B004HKICCW      5.0                      0   
2   A778RZMTRYBVQ  B005CU6GBU      3.0                      0   
3  A323TVT239WQG3  B009FMNFYC      4.0                      0   
4  A1KULI5HW8BVMH  B00AZPWV7K      5.0                      0   
5  A1DKVLJ1E91FMX  B00CHKL7EE      1.0                      1   
6   AMMCJ1AA1FWHV  B006IXB73C      3.0                      0   
7   AE8WJT1P21Y39  B00CSZK708      4.0           

In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
import numpy as np

# ------------------------------
# Ensure the CSV has true labels
# ------------------------------
if "class" not in df_out.columns:
    raise RuntimeError("Your CSV must contain a 'class' column with 0/1 ground truth.")

y_true = df_out["class"].astype(int).values
y_pred = df_out["predicted_fraud_class"].astype(int).values
y_prob = df_out["fraud_probability"].astype(float).values

# Basic metrics
acc  = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec  = recall_score(y_true, y_pred, zero_division=0)
f1   = f1_score(y_true, y_pred, zero_division=0)

# AUC (must pass probabilities)
try:
    auc  = roc_auc_score(y_true, y_prob)
except ValueError:
    # Only one class present: AUC is undefined. Report chance level (0.5),
    # the same fallback the inference cell uses, so both reports agree.
    auc = 0.5

# Confusion matrix. labels=[0, 1] keeps it 2x2 even when a class is absent;
# otherwise the 4-way unpacking below raises on a 1x1 matrix.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# G-Mean like your training code
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
gmean = np.sqrt(tpr * tnr)

print("\n================= MODEL EVALUATION =================")
print(f"Accuracy:      {acc:.4f}")
print(f"Precision:     {prec:.4f}")
print(f"Recall:        {rec:.4f}")
print(f"F1 Score:      {f1:.4f}")
print(f"AUC:           {auc:.4f}")
print(f"G-Mean:        {gmean:.4f}")
print("Confusion Matrix:")
print(cm)
print("====================================================\n")

# ----------------------------------------------------
# Print details for predicted fraud and non-fraud reviews
# ----------------------------------------------------
print("\n--- Predicted Fraud Reviews (First 5) ---")
predicted_fraud_df = df_out[df_out['predicted_fraud_class'] == 1]
display(predicted_fraud_df.head())

print("\n--- Predicted Non-Fraud Reviews (First 5) ---")
predicted_non_fraud_df = df_out[df_out['predicted_fraud_class'] == 0]
display(predicted_non_fraud_df.head())


Accuracy:      0.5240
Precision:     0.5181
Recall:        0.6880
F1 Score:      0.5911
AUC:           0.5365
G-Mean:        0.4977
Confusion Matrix:
[[ 90 160]
 [ 78 172]]


--- Predicted Fraud Reviews (First 5) ---


Unnamed: 0,_id,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,category,class,predicted_fraud_class,fraud_probability
0,{'$oid': '5a132792741a2384e855ebe7'},A140N2G9RBBP4R,B00176T9OY,na,"[0, 0]",This radio was ordered to provide notification...,3.0,Midland Radio,1376870400,"08 19, 2013",Sports_and_Outdoors,0.0,1,1.0
5,{'$oid': '5a1327f5741a2384e876bf7c'},A1DKVLJ1E91FMX,B00CHKL7EE,"Pike ""Pike Hunter""","[0, 0]",Sending them back. These are 6.5 inches.,1.0,Caveat emptor!,1404950400,"07 10, 2014",Sports_and_Outdoors,0.0,1,1.0
8,{'$oid': '5a1327a8741a2384e85d2e7b'},A3SYWN34IVBN08,B002AQHM3U,Terminal Guard,"[96, 103]",Let me first say that nothing can replace the ...,5.0,Paint it Black,1249516800,"08 6, 2009",Sports_and_Outdoors,1.0,1,1.0
9,{'$oid': '5a1327db741a2384e86dc9d8'},AAINFA1I7INLL,B0068FAQIM,scisskid,"[0, 0]",Tube blew in my face at 35 psi while inflating...,2.0,"Good seller, terrible tube.",1405036800,"07 11, 2014",Sports_and_Outdoors,0.0,1,1.0
10,{'$oid': '5a1327f3741a2384e8763123'},A9Q28YTLYREO7,B00BRBC8F6,"mistermaxxx08 ""mistermaxxx08""","[0, 0]",always dug Jesus Shuttleworth from back in t...,5.0,great clutch player,1388707200,"01 3, 2014",Sports_and_Outdoors,1.0,1,1.0



--- Predicted Non-Fraud Reviews (First 5) ---


Unnamed: 0,_id,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,category,class,predicted_fraud_class,fraud_probability
1,{'$oid': '5a1327c7741a2384e867945d'},AMWFW5H88D9KC,B004HKICCW,Eric,"[0, 0]",This light far exceeded my expectations. It ho...,5.0,Long Lasting and Very Bright,1361664000,"02 24, 2013",Sports_and_Outdoors,1.0,0,2.539939e-08
2,{'$oid': '5a1327d3741a2384e86b7575'},A778RZMTRYBVQ,B005CU6GBU,"BENMESSAOUD ""Fishing Fan""","[0, 0]",I bought the hook remover since 3 months but o...,3.0,Good but not necessarily easy to use,1396656000,"04 5, 2014",Sports_and_Outdoors,0.0,0,1.466615e-08
3,{'$oid': '5a1327ec741a2384e8737ef0'},A323TVT239WQG3,B009FMNFYC,tacitworm,"[0, 0]",Great for massage therapy. Strong odor of rub...,4.0,Joe's Lacrosse Balls,1389916800,"01 17, 2014",Sports_and_Outdoors,1.0,0,2.95687e-11
4,{'$oid': '5a1327f1741a2384e8753e64'},A1KULI5HW8BVMH,B00AZPWV7K,Daniel,"[0, 0]","I use this for a dual function, first my wife ...",5.0,Daul function,1399939200,"05 13, 2014",Sports_and_Outdoors,1.0,0,1.55184e-08
6,{'$oid': '5a1327dc741a2384e86e292a'},AMMCJ1AA1FWHV,B006IXB73C,JamesTRabon,"[0, 0]",The paddles are extremely cheap on a windy day...,3.0,Ok,1397692800,"04 17, 2014",Sports_and_Outdoors,0.0,0,8.088985e-08
