In [None]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# === SETTINGS ===
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hub identifier of the base checkpoint: used for the tokenizer here and for
# the model skeleton that load_model() fills with fine-tuned weights.
model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Token length every text is truncated/padded to inside TextDataset.
max_length = 256
# Batch size of the DataLoader built in ensemble_predict().
batch_size = 32

# === MODEL PATHS ===
# One state-dict file per fold (fold1 .. fold5) for each level of the
# hierarchy; every level reads from its own run directory.
level1_paths = [f"/content/drive/MyDrive/FIRE/run_20250628_034630/models/level1_fold{i}.pth" for i in range(1, 6)]
level2_paths = [f"/content/drive/MyDrive/FIRE/run_20250629_090953/models/level2_fold{i}.pth" for i in range(1, 6)]
level3_paths = [f"/content/drive/MyDrive/FIRE/run_20250629_121221/models/level3_fold{i}.pth" for i in range(1, 6)]

# === DATASET ===
class TextDataset(Dataset):
    """Pre-tokenized collection of texts served as per-sample tensor dicts.

    Every text is tokenized once, at construction time, with the notebook-level
    ``tokenizer`` and truncated/padded to ``max_length`` tokens.
    """

    def __init__(self, texts):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    def __len__(self):
        # One row of input ids exists per tokenized text.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        # Slice row ``idx`` out of every encoded field and wrap it in a tensor.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

# === MODEL LOADER ===
def load_model(path, num_labels):
    """Build a classifier from the base checkpoint and load fine-tuned weights.

    Args:
        path: File holding the state dict saved for one fold.
        num_labels: Size of the classification head to instantiate.

    Returns:
        The model, moved to ``device`` and switched to evaluation mode.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    # The head created above is freshly initialised; the state dict loaded
    # here overwrites it together with the rest of the weights.
    weights = torch.load(path, map_location=device)
    classifier.load_state_dict(weights)
    return classifier.to(device).eval()

# === ENSEMBLE INFERENCE ===
def ensemble_predict(model_paths, dataset, num_labels):
    """Average softmax probabilities over several checkpoints and take the argmax.

    Args:
        model_paths: Paths of the state-dict files forming the ensemble; each
            one is loaded through ``load_model``.
        dataset: Sized dataset whose items are dicts of tensors that the model
            accepts as keyword arguments.
        num_labels: Number of classes of the classification head.

    Returns:
        1-D NumPy integer array holding one predicted class index per dataset
        item. The array is empty when the dataset is empty.

    Raises:
        ValueError: If ``model_paths`` is empty (there is nothing to average).
    """
    model_paths = list(model_paths)
    if not model_paths:
        # Dividing the summed probabilities by zero would yield NaNs whose
        # argmax is silently 0 for every row.
        raise ValueError("ensemble_predict needs at least one model path")

    n_samples = len(dataset)
    if n_samples == 0:
        # The loader would yield no batch and np.concatenate([]) would raise;
        # there is nothing to predict, so skip loading the models entirely.
        return np.empty(0, dtype=np.intp)

    loader = DataLoader(dataset, batch_size=batch_size)
    summed_probs = np.zeros((n_samples, num_labels))
    for path in model_paths:
        model = load_model(path, num_labels)
        fold_probs = []
        with torch.no_grad():
            for batch in loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                logits = model(**batch).logits
                fold_probs.append(torch.softmax(logits, dim=1).cpu().numpy())
        summed_probs += np.concatenate(fold_probs, axis=0)
        # Release this fold's weights before the next checkpoint is loaded.
        del model
        torch.cuda.empty_cache()

    avg_probs = summed_probs / len(model_paths)
    return np.argmax(avg_probs, axis=1)

# === COMMON INFERENCE FUNCTION ===
def _predict_subset(model_paths, texts, indices, num_labels, out):
    """Run one ensemble on ``texts[i]`` for ``i`` in ``indices``; store int labels in ``out``."""
    if not indices:
        # No row was routed to this level. Building an empty dataset would
        # break tokenization / batching downstream, so leave ``out`` untouched.
        return
    subset = TextDataset([texts[i] for i in indices])
    subset_preds = ensemble_predict(model_paths, subset, num_labels=num_labels)
    for i, pred in zip(indices, subset_preds):
        out[i] = int(pred)


def hierarchical_predict(df, input_texts):
    """Run the three-level cascade of ensembles over ``input_texts``.

    Every text is classified by the level-1 ensemble (3 classes). Texts whose
    level-1 prediction is 2 are passed to the level-2 ensemble (3 classes), and
    of those, texts whose level-2 prediction is 0 are passed to the level-3
    ensemble (4 classes). Positions that are not routed to a level keep the
    empty string ``""`` for that level.

    Args:
        df: Frame the texts were taken from; only its length is used, to check
            that it matches ``input_texts``.
        input_texts: List of strings, one per row of ``df``.

    Returns:
        Tuple ``(level1_preds, level2_preds, level3_preds)``: a NumPy integer
        array followed by two lists containing ``int`` labels or ``""``.

    Raises:
        ValueError: If ``df`` and ``input_texts`` differ in length.
    """
    n_rows = len(input_texts)
    if len(df) != n_rows:
        raise ValueError(
            f"df has {len(df)} rows but {n_rows} input texts were given"
        )

    level2_preds = [""] * n_rows
    level3_preds = [""] * n_rows
    if n_rows == 0:
        # Nothing to classify; avoid tokenizing an empty batch.
        return np.empty(0, dtype=np.intp), level2_preds, level3_preds

    level1_preds = ensemble_predict(level1_paths, TextDataset(input_texts), num_labels=3)

    # Level 2 only sees rows predicted as class 2 at level 1.
    idx_level2 = [i for i, p in enumerate(level1_preds) if p == 2]
    _predict_subset(level2_paths, input_texts, idx_level2, 3, level2_preds)

    # Level 3 only sees rows predicted as class 0 at level 2.
    idx_level3 = [i for i in idx_level2 if level2_preds[i] == 0]
    _predict_subset(level3_paths, input_texts, idx_level3, 4, level3_preds)

    return level1_preds, level2_preds, level3_preds

# === PER-PLATFORM PIPELINE ===
def run_platform_inference(label, input_csv, text_column, keep_columns, level_columns, output_csv):
    """Classify one platform's test CSV and write its submission CSV.

    Args:
        label: Platform name shown in the progress message.
        input_csv: Path of the test CSV to read; missing values become "".
        text_column: Column whose (stringified) values are fed to the models.
        keep_columns: Input columns copied to the output, in output order.
        level_columns: Names of the three prediction columns, in the order
            returned by ``hierarchical_predict``.
        output_csv: Path the resulting CSV is written to (without the index).

    Returns:
        The frame that was written: ``keep_columns`` followed by
        ``level_columns``.
    """
    frame = pd.read_csv(input_csv).fillna("")
    texts = frame[text_column].astype(str).tolist()
    print(f" {label} Inference...")
    predictions = hierarchical_predict(frame, texts)
    for column, values in zip(level_columns, predictions):
        frame[column] = values
    frame = frame[list(keep_columns) + list(level_columns)]
    frame.to_csv(output_csv, index=False)
    return frame


# === REDDIT ===
reddit = run_platform_inference(
    "Reddit",
    "/content/drive/MyDrive/FIRE/CRYPTO_REDDIT_TEST.csv",
    text_column="MAIN",
    keep_columns=("title", "selftext", "MAIN"),
    level_columns=("level 1", "level 2", "level 3"),
    output_csv="/content/drive/MyDrive/FIRE/crypto_test_reddit.csv",
)

# === TWITTER ===
twitter = run_platform_inference(
    "Twitter",
    "/content/drive/MyDrive/FIRE/CRYPTO_TWITTER_TEST.csv",
    text_column="Text",
    keep_columns=("Text",),
    level_columns=("Level 1", "Level 2", "Level 3"),
    output_csv="/content/drive/MyDrive/FIRE/crypto_test_tweet.csv",
)

# === YOUTUBE ===
youtube = run_platform_inference(
    "YouTube",
    "/content/drive/MyDrive/FIRE/CRYPTO_YOUTUBE_TEST.csv",
    text_column="MAIN",
    keep_columns=("comment_id", "MAIN"),
    level_columns=("Level1", "Level2", "Level3"),
    output_csv="/content/drive/MyDrive/FIRE/crypto_test_youtube.csv",
)

# Only three of the four submission CSVs are written here.
print(" All predictions complete. Zip the 4 CSVs for submission.")




🔍 Reddit Inference...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task

🔍 Twitter Inference...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task

🔍 YouTube Inference...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task

✅ All predictions complete. Zip the 4 CSVs for submission.


In [None]:
import os
import pandas as pd
import torch
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# === Paths
# CSV with the QnA test rows read below.
TEST_PATH = "/content/drive/MyDrive/FIRE/CRYPTO_QnA_TEST.csv"
# Directory passed to from_pretrained() for both the tokenizer and the model.
# NOTE(review): this directory shares its run id (run_20250629_121221) with the
# level-3 checkpoints used in the previous cell - confirm it really holds the
# QnA relevance model.
MODEL_DIR = "/content/drive/MyDrive/FIRE/run_20250629_121221/fold_outputs/fold_2/best_model"
OUTPUT_PATH = "/content/drive/MyDrive/FIRE/crypto_test_qna.csv"  # Final output path

# === Load Test Data
# Missing cells are replaced by empty strings so they do not end up as "nan"
# when the fields are formatted into the combined input text.
df = pd.read_csv(TEST_PATH).fillna("")

# === Combine fields for input
def combine_text(row):
    """Format the title, selftext, MAIN and comment_body fields of ``row``
    into one string, separated by `` [SEP] ``."""
    fields = ("title", "selftext", "MAIN", "comment_body")
    template = " [SEP] ".join("{}" for _ in fields)
    return template.format(*(row[name] for name in fields))
# One combined model-input string per test row.
df["combined"] = df.apply(combine_text, axis="columns")

# === Load Model & Tokenizer
# Tokenizer and weights both come from the same fine-tuned directory; the
# model runs on the GPU when one is available and is put in evaluation mode.
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_DIR)
model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_DIR)
model = model.to("cuda" if torch.cuda.is_available() else "cpu").eval()

# === Dataset Class
class QnADataset(Dataset):
    """Dataset that tokenizes one text at a time, when the item is requested.

    Args:
        texts: Sequence of input strings.
        tokenizer: Callable tokenizer returning a mapping of batched tensors.
        max_len: Length each text is truncated/padded to.
    """

    def __init__(self, texts, tokenizer, max_len=512):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # drop it so the DataLoader can stack samples itself.
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        return item

# === DataLoader
dataset = QnADataset(texts=df["combined"].tolist(), tokenizer=tokenizer)
loader = DataLoader(dataset, batch_size=32)

# === Inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
preds = []

with torch.no_grad():
    for batch in tqdm(loader, desc="Predicting relevance"):
        # Only the ids and the attention mask are forwarded to the model.
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        preds.extend(outputs.logits.argmax(dim=1).cpu().tolist())

# === Save to correct CSV format
# Attach the predicted class, discard the helper column and fix the column
# order expected in the output file.
final_cols = ["title", "selftext", "MAIN", "comment_body", "relevance"]
df["relevance"] = preds
df = df.drop(columns=["combined"])[final_cols]

df.to_csv(OUTPUT_PATH, index=False)
print(f"Saved QnA predictions to: {OUTPUT_PATH}")


🔍 Predicting relevance: 100%|██████████| 198/198 [02:43<00:00,  1.21it/s]


✅ Saved QnA predictions to: /content/drive/MyDrive/FIRE/crypto_test_qna.csv


In [None]:
import zipfile

# Archive member name -> location of the CSV on Drive.
csv_files = {
    "crypto_test_reddit.csv": "/content/drive/MyDrive/FIRE/crypto_test_reddit.csv",
    "crypto_test_tweet.csv": "/content/drive/MyDrive/FIRE/crypto_test_tweet.csv",
    "crypto_test_youtube.csv": "/content/drive/MyDrive/FIRE/crypto_test_youtube.csv",
    "crypto_test_qna.csv": "/content/drive/MyDrive/FIRE/crypto_test_qna.csv"
}

# Where the submission archive is written.
zip_path = "/content/drive/MyDrive/FIRE/rushikannan.zip"

# Store every CSV under its bare file name, i.e. at the root of the archive.
with zipfile.ZipFile(zip_path, mode="w") as archive:
    for member_name in csv_files:
        archive.write(csv_files[member_name], arcname=member_name)

print(f" Created ZIP for submission at: {zip_path}")


✅ Created ZIP for submission at: /content/drive/MyDrive/FIRE/rushikannan.zip
