## Install Dependencies

In [None]:
# Install llm2vec, the package that provides the LLM2Vec class imported below.
!pip install llm2vec

In [None]:
# Install flash-attn. --no-build-isolation tells pip to build the package
# against what is already installed in this environment rather than inside a
# fresh isolated build environment.
!pip install flash-attn --no-build-isolation

In [None]:
# Install jsonlines, which is used below to read the .jsonl dataset files.
!pip install jsonlines

In [None]:
import torch
import pandas as pd
from tqdm import tqdm
from llm2vec import LLM2Vec
import jsonlines

In [None]:
# Load the LLM2Vec model from the "LLM2Vec-Sheared-LLaMA-mntp" checkpoint and
# apply the "-mntp-supervised" PEFT weights on top of it. The model is placed
# on the GPU when CUDA is available (CPU otherwise) and requested in bfloat16.
llm2vec_model = LLM2Vec.from_pretrained(
    "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp",
    peft_model_name_or_path="McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised",
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.bfloat16,
)

## Create Embeddings

In [None]:
# Message Cleaning
def soft_clean(text):
    """Return *text* on a single line with surrounding whitespace removed.

    Every line-feed character is replaced by one space (other interior
    whitespace is left untouched), then leading and trailing whitespace is
    stripped from the result.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

In [None]:
# Load dataset
def load_flattened_dataset(file_path):
    """Read a JSON Lines file of games and return one DataFrame row per message.

    Each record in the file is treated as one game whose per-message fields
    are parallel lists indexed by message position. Messages whose sender
    label equals the string ``"NOANNOTATION"`` are skipped.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``message``, ``sender_label``,
        ``receiver_label``, ``speaker``, ``receiver``, ``abs_msg_idx``,
        ``rel_msg_idx``, ``season``, ``year``, ``score``, ``score_delta``,
        ``game_id``, ``players`` and ``message_length``. ``sender_label`` is
        ``1`` when the raw sender label compares equal to ``False`` and ``0``
        otherwise; ``score_delta`` is converted to ``float``;
        ``message_length`` is ``len()`` of the message.
    """
    rows = []
    with jsonlines.open(file_path) as reader:
        for game in reader:
            for idx, message in enumerate(game["messages"]):
                raw_sender_label = game["sender_labels"][idx]
                if raw_sender_label == "NOANNOTATION":
                    continue
                # Keyword order fixes the column order of the resulting frame.
                rows.append(dict(
                    message=message,
                    # Equality (not identity) against False is kept on purpose
                    # so the encoding matches the original exactly.
                    sender_label=int(raw_sender_label == False),
                    receiver_label=game["receiver_labels"][idx],
                    speaker=game["speakers"][idx],
                    receiver=game["receivers"][idx],
                    abs_msg_idx=game["absolute_message_index"][idx],
                    rel_msg_idx=game["relative_message_index"][idx],
                    season=game["seasons"][idx],
                    year=game["years"][idx],
                    score=game["game_score"][idx],
                    score_delta=float(game["game_score_delta"][idx]),
                    # Game-level values are repeated on every message row.
                    game_id=game["game_id"],
                    players=game["players"],
                    message_length=len(message),
                ))
    return pd.DataFrame(rows)

In [None]:
def embed_messages(df, model):
    """Embed every entry of ``df["message"]`` with ``model``, one at a time.

    Args:
        df: DataFrame with a ``"message"`` column (presumably strings, as
            produced by ``load_flattened_dataset``).
        model: Encoder exposing ``encode(list_of_str)`` that returns one
            embedding per input string, such as an ``LLM2Vec`` instance.

    Returns:
        A float32 tensor built by stacking one embedding per message, in the
        order the messages appear in ``df``.

    Raises:
        RuntimeError: Raised by ``torch.stack`` when ``df`` has no rows.
    """
    embeddings = []
    for msg in tqdm(df["message"], desc="Embedding messages"):
        # Pass a one-element list so the encoder always receives exactly one
        # sentence. A bare ``str`` is itself a sequence of characters, and an
        # encoder that iterates its input (LLM2Vec.encode appears to -- TODO
        # confirm) would embed it character by character, making ``[0]`` the
        # vector of the first character rather than of the whole message.
        emb = model.encode([msg])[0]
        # as_tensor accepts tensors and arrays alike and avoids the
        # copy-construct warning torch.tensor() emits for tensor inputs.
        embeddings.append(torch.as_tensor(emb, dtype=torch.float32))
    return torch.stack(embeddings)

In [None]:
def create_embeddings(file: str, output_path: str):
    """Embed one dataset file and save the result as a ``(features, labels)`` pair.

    The file is flattened with ``load_flattened_dataset``, its messages are
    embedded with the module-level ``llm2vec_model``, and the ``sender_label``
    column is converted to a float32 tensor. Both tensors are written to
    ``output_path`` as a single tuple via ``torch.save``.

    Args:
        file: Path of the JSON Lines dataset file to embed.
        output_path: Destination path for the saved ``(features, labels)`` tuple.
    """
    frame = load_flattened_dataset(file)
    features = embed_messages(frame, llm2vec_model)
    labels = torch.tensor(frame["sender_label"].values, dtype=torch.float32)
    torch.save(obj=(features, labels), f=output_path)

In [None]:
# Paths of the test / train / validation .jsonl files in the Kaggle input
# dataset "nlp-deception".
test_file = '/kaggle/input/nlp-deception/test.jsonl'
train_file = '/kaggle/input/nlp-deception/train.jsonl'
val_file = '/kaggle/input/nlp-deception/validation.jsonl'

In [None]:
# Embed the training file and save the (embeddings, labels) pair to the Kaggle working directory.
create_embeddings(train_file, '/kaggle/working/train_embeddings.pt')

In [None]:
# Embed the validation file and save the (embeddings, labels) pair to the Kaggle working directory.
create_embeddings(val_file, '/kaggle/working/val_embeddings.pt')

In [None]:
# Embed the test file and save the (embeddings, labels) pair to the Kaggle working directory.
create_embeddings(test_file, '/kaggle/working/test_embeddings.pt')

## Inference

In [None]:
# File to embed for inference.
# NOTE(review): this is the same path as val_file (validation.jsonl), so the
# output duplicates val_embeddings.pt -- point it at the real inference input
# if that is not intended.
inf_file = '/kaggle/input/nlp-deception/validation.jsonl'

In [None]:
# Embed the inference file and save the (embeddings, labels) pair to the Kaggle working directory.
create_embeddings(inf_file, '/kaggle/working/inf_embeddings.pt')