In [1]:
!pip install torch-geometric 
!pip install nlpaug
import os, re, json, torch, torch.nn as nn
import numpy as np, pandas as pd
from tqdm import tqdm
from sklearn.metrics import (accuracy_score, f1_score, classification_report, 
                             confusion_matrix)
import matplotlib.pyplot as plt, seaborn as sns
from transformers import AutoTokenizer, AutoModel
from torch_geometric.data import Data
from torch_geometric.nn import GATConv


Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [2]:
# --- Inference settings ------------------------------------------------------
STEP = 512        # number of messages handed to the encoder per batch
THRESHOLD = 0.60  # minimum class-1 probability for a positive (deceptive) prediction

# --- Input locations -----------------------------------------------------------
TRAIN_PATH = "/kaggle/input/deception/train (1).jsonl"
VAL_PATH = "/kaggle/input/deception/validation.jsonl"
TEST_PATH = "/kaggle/input/deception/test.jsonl"
CHECKPOINT_PATH = "/kaggle/input/besssssst/best_model_checkpoint (2).pt"

# --- Compute device ------------------------------------------------------------
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


import torch.nn.functional as F

class DialogueActHead(nn.Module):
    """Single linear layer projecting 768-dim inputs to 6 output scores.

    NOTE(review): presumably one score per dialogue-act class, judging by the
    name -- confirm against the training code.
    """

    def __init__(self):
        super().__init__()
        # Attribute name `fc` determines the state_dict keys (fc.weight / fc.bias).
        self.fc = nn.Linear(768, 6)

    def forward(self, h):
        """Return ``self.fc`` applied to ``h`` (last dimension must be 768)."""
        scores = self.fc(h)
        return scores

class PowerEmb(nn.Module):
    """Small MLP that lifts a single scalar feature to a 32-dim vector."""

    def __init__(self):
        super().__init__()
        # Linear(1 -> 32) -> ReLU -> Linear(32 -> 32); positions inside the
        # Sequential fix the state_dict keys (mlp.0.* and mlp.2.*).
        layers = [nn.Linear(1, 32), nn.ReLU(), nn.Linear(32, 32)]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Embed ``x`` (last dimension of size 1) and return the 32-dim result."""
        embedded = self.mlp(x)
        return embedded

class GraphEncoder(nn.Module):
    """Two stacked GATConv layers, each followed by an ELU activation."""

    def __init__(self):
        super().__init__()
        # 806 input features: matches the 768 + 6 + 32 columns that MLDRM
        # concatenates before calling this encoder.
        self.g1 = GATConv(806, 128, heads=4, dropout=0.3)
        # Input width 128 * 4 lines up with g1's four 128-channel heads.
        self.g2 = GATConv(128 * 4, 128, heads=1, dropout=0.3)

    def forward(self, x, e):
        """Encode node features ``x`` using the edge index ``e``."""
        hidden = F.elu(self.g1(x, e))
        hidden = self.g2(hidden, e)
        return F.elu(hidden)

class MLDRM(nn.Module):
    """Node classifier combining cached embeddings, act scores and a power embedding.

    Args:
        u_cache: tensor of per-node embeddings whose width must be 768 (the
            input size of ``DialogueActHead``). It is stored as-is on the
            instance; when it is a plain tensor it is not part of
            ``state_dict()`` and is not moved by ``Module.to()``.
    """

    def __init__(self, u_cache):
        super().__init__()
        self.u_cache = u_cache
        self.act = DialogueActHead()
        self.power = PowerEmb()
        self.graph = GraphEncoder()
        # Dropout -> Linear(128, 64) -> ReLU -> Linear(64, 2); the positions
        # fix the state_dict keys (cls.1.* and cls.3.*).
        classifier_layers = [
            nn.Dropout(0.4),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
        ]
        self.cls = nn.Sequential(*classifier_layers)

    def forward(self, d):
        """Return 2-class logits for every node in ``d``.

        ``d`` must expose ``y`` (only its device is read), ``power_diff`` and
        ``edge_index``.
        """
        # The cache is copied to wherever the labels live on every call.
        target_device = d.y.device
        utterances = self.u_cache.to(target_device)
        # Act scores are used as fixed features: no gradient reaches self.act.
        act_scores = self.act(utterances).detach()
        power_emb = self.power(d.power_diff)
        node_feats = torch.cat([utterances, act_scores, power_emb], dim=1)
        node_repr = self.graph(node_feats, d.edge_index)
        return self.cls(node_repr)


def load_jsonl(fp, split_name):
    """Flatten a JSON-lines file of games into one DataFrame row per message.

    Each non-blank line of ``fp`` must be a JSON object with the parallel
    lists ``messages``, ``sender_labels``, ``game_score_delta``, ``speakers``,
    ``years`` and ``seasons``.

    Args:
        fp: path of the ``.jsonl`` file (read as UTF-8).
        split_name: value stored in the ``split`` column of every row.

    Returns:
        pandas.DataFrame with the columns ``message``, ``processed``,
        ``power_diff``, ``is_deceptive``, ``speaker``, ``year``, ``season``
        and ``split``. The columns are present even when no row was produced.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError / IndexError / ValueError: if a record lacks an expected
            field or holds a non-integer score delta / year.
    """
    columns = ["message", "processed", "power_diff", "is_deceptive",
               "speaker", "year", "season", "split"]
    rows = []
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(fp, encoding="utf-8") as f:
        for ln in f:
            if not ln.strip():
                # Tolerate blank lines (e.g. an extra newline at end of file).
                continue
            g = json.loads(ln)
            for i, msg in enumerate(g["messages"]):
                label = g["sender_labels"][i]
                if label == "NOANNOTATION":
                    continue
                rows.append(dict(
                    message = msg,
                    # Lower-case and drop everything that is not a word
                    # character or whitespace.
                    processed = re.sub(r"[^\w\s]", "", msg.lower()),
                    # Sign is flipped relative to the stored score delta.
                    power_diff = -int(g["game_score_delta"][i]),
                    # A truthy sender label maps to class 0, a falsy one to 1.
                    is_deceptive = 0 if label else 1,
                    speaker = g["speakers"][i],
                    year = int(g["years"][i]),
                    season = g["seasons"][i],
                    split = split_name
                ))
    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

# Step 1: the test split is always required, so any failure here propagates.
print("Loading test data...")
test_df = load_jsonl(TEST_PATH, "test")
print(f"Loaded {len(test_df)} test messages")

# Step 2: best-effort reconstruction of the train + val + test frame, in that
# order, to mirror the original training environment. If anything goes wrong
# the error is reported and `full_df` falls back to a copy of the test split.
print("Loading all data to create same full dataset as training...")
try:
    train_df = load_jsonl(TRAIN_PATH, "train")
    val_df = load_jsonl(VAL_PATH, "val")
    full_df = pd.concat((train_df, val_df, test_df), ignore_index=True)
    print(f"Full dataset: {len(full_df)} messages")
    print(f"Distribution: {full_df.split.value_counts().to_dict()}")
except Exception as err:
    print(f"Error loading all data: {err}")
    print("Continuing with test data only...")
    full_df = test_df.copy()


print(f"Loading model from {CHECKPOINT_PATH}...")
# NOTE(review): torch.load unpickles the file, so only trusted checkpoints
# should be loaded here. If the checkpoint holds nothing but tensors and plain
# containers, passing weights_only=True would be the safer call -- confirm.
checkpoint = torch.load(CHECKPOINT_PATH, map_location=device)
# Embedding matrix that was stored next to the weights at training time.
saved_cache = checkpoint["u_cache"]
saved_cache = saved_cache.to(device)

print(f"Saved cache shape: {saved_cache.shape}")
# Informational only: test embeddings are regenerated regardless of the outcome.
sizes_match = len(full_df) == saved_cache.shape[0]
if not sizes_match:
    print(f"WARNING: Dataset size ({len(full_df)}) doesn't match saved embeddings ({saved_cache.shape[0]})")
    print("Will need to regenerate embeddings for test data only")


print("Loading DistilBERT tokenizer and model...")
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# The encoder stays on the CPU (independent of `device`) and is put in eval
# mode so dropout is disabled while embedding.
bert = AutoModel.from_pretrained("distilbert-base-uncased")
bert = bert.cpu().eval()

@torch.inference_mode()
def cls_embed(batch):
    """Return the position-0 hidden state for every text in ``batch``.

    The texts are tokenized with padding, truncated to at most 96 tokens, and
    passed through the module-level ``bert`` encoder under inference mode.
    """
    encoded = tok(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=96,
    )
    hidden_states = bert(**encoded).last_hidden_state
    # Keep only the first token's vector of each sequence.
    return hidden_states[:, 0, :]

print("Generating BERT embeddings for test data...")
# Embed the preprocessed test messages STEP rows at a time, then stack the
# per-batch results along the row axis.
batch_starts = range(0, len(test_df), STEP)
chunks = [
    cls_embed(list(test_df.processed.iloc[start:start + STEP]))
    for start in tqdm(batch_starts, desc="BERT-CPU")
]
test_embeddings = torch.cat(chunks)  # (N, 768) on CPU


print("Creating graph data for test set...")
# Node-level tensors: one row per test message, in test_df row order.
power = torch.tensor(test_df.power_diff.values, dtype=torch.float32).unsqueeze(1).to(device)
labels = torch.tensor(test_df.is_deceptive.values, dtype=torch.long).to(device)

# Link consecutive messages of the same speaker in both directions.
# Index labels are used as node ids, which assumes test_df carries the default
# 0..N-1 index (as produced by load_jsonl).
# NOTE(review): grouping is by speaker only, and "season" is ordered by its raw
# value (alphabetical if it is a string, not chronological). Confirm this
# matches how the training graph was built before changing either.
edges = []
for spk, grp in test_df.groupby("speaker"):
    idx = list(grp.sort_values(["year", "season"]).index)
    for a, b in zip(idx, idx[1:]):
        edges.extend([[a, b], [b, a]])

# reshape(-1, 2) is a no-op when edges exist, but guarantees a (2, 0) edge
# index instead of a 1-D empty tensor when no speaker has two messages.
edge_index = (
    torch.tensor(edges, dtype=torch.long)
    .reshape(-1, 2)
    .t()
    .contiguous()
    .to(device)
)
test_mask = torch.ones(len(test_df), dtype=torch.bool, device=device)

# Create test-only data object
test_data = Data(
    edge_index=edge_index,
    power_diff=power,
    y=labels,
    test_mask=test_mask
)


print("Running inference with test embeddings...")
# The freshly computed test embeddings take the place of the training cache.
model = MLDRM(test_embeddings)
model = model.to(device)

# Restore the trained weights and disable dropout for evaluation.
model.load_state_dict(checkpoint["state_dict"])
model.eval()

with torch.no_grad():
    test_logits = model(test_data)
    class_probs = F.softmax(test_logits, dim=1)
    # Column 1 holds the probability of the positive (deceptive) class.
    test_probs = class_probs[:, 1].cpu().numpy()

test_true = test_data.y.cpu().numpy()
# A message is flagged only when its class-1 probability reaches THRESHOLD.
test_pred = (test_probs >= THRESHOLD).astype(int)

print("\n===== Evaluation Results =====")
acc = accuracy_score(test_true, test_pred)
f1m = f1_score(test_true, test_pred, average='macro', zero_division=0)
f1d = f1_score(test_true, test_pred, pos_label=1, zero_division=0)
print(f"Test Accuracy: {acc:.4f}")
print(f"Test Macro F1: {f1m:.4f}")
print(f"Test Deceptive F1: {f1d:.4f}")

print("\nClassification Report:")
report_text = classification_report(test_true, test_pred, zero_division=0)
print(report_text)





Using device: cuda
Loading test data...
Loaded 2741 test messages
Loading all data to create same full dataset as training...
Full dataset: 17289 messages
Distribution: {'train': 13132, 'test': 2741, 'val': 1416}
Loading model from /kaggle/input/besssssst/best_model_checkpoint (2).pt...


  checkpoint = torch.load(CHECKPOINT_PATH, map_location=device)


Saved cache shape: torch.Size([29239, 768])
Will need to regenerate embeddings for test data only
Loading DistilBERT tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Generating BERT embeddings for test data...


BERT-CPU: 100%|██████████| 6/6 [02:19<00:00, 23.22s/it]


Creating graph data for test set...
Running inference with test embeddings...

===== Evaluation Results =====
Test Accuracy: 0.8325
Test Macro F1: 0.5384
Test Deceptive F1: 0.1700

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91      2501
           1       0.15      0.20      0.17       240

    accuracy                           0.83      2741
   macro avg       0.54      0.54      0.54      2741
weighted avg       0.85      0.83      0.84      2741



# Save Results

In [3]:
# Attach the model outputs to the test frame (adds two columns in place),
# then write the whole frame to disk without the index column.
new_columns = {
    "predicted_deceptive": test_pred,
    "deception_probability": test_probs,
}
for column_name, column_values in new_columns.items():
    test_df[column_name] = column_values

test_df.to_csv("test_predictions.csv", index=False)
print("Saved predictions to test_predictions.csv")


Saved predictions to test_predictions.csv
