In [None]:
# Hugging Face Hub identifier of the pre-trained model; the tokenizer and the
# classification model below are both created from it.
checkpoint = "microsoft/deberta-v3-small"

In [None]:
import nltk

# Directory that receives the downloaded NLTK resources.
nltk_path = "./venv/nltk_data"

# Fetch every NLTK resource this notebook asks for, in a fixed order.
for resource_id in (
    "wordnet",
    "stopwords",
    "omw-1.4",
    "averaged_perceptron_tagger_eng",
    "punkt_tab",
):
    nltk.download(resource_id, download_dir=nltk_path)

In [None]:
import pandas as pd
import torch

from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification

from models.data_augmentation import synonym_replacement
from models.preprocessing import preprocess
from models.data_sampling import class_weighting 


In [None]:
# Pick the compute device: use the GPU ("cuda") when PyTorch can see one,
# otherwise fall back to the CPU.
# On Colab, a GPU is enabled via Edit -> Notebook settings -> Hardware accelerator -> "GPU".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# CSV file holding the paragraphs to run predictions on.
# NOTE(review): the file name says "test" while the variables say "dev" -- confirm
# which split is intended.
official_dev_dataset_path = "../dataset/dpm_pcl_test.csv"
official_dev_dataset = pd.read_csv(filepath_or_buffer=official_dev_dataset_path)

In [None]:
# Collapse the label column to two classes: values 0 and 1 become class 0 and
# every other value (including a missing one) becomes class 1.
# NOTE(review): this mapping is not idempotent -- running the cell twice turns
# every label into 0 -- so run it once per fresh load of the CSV.
official_dev_dataset["label"] = official_dev_dataset["label"].apply(
    lambda x: 0 if x in (0, 1) else 1
)

# Replace missing paragraphs with an empty string so downstream text handling
# always receives a str. The mask must be built from the "text" column: after
# the mapping above the "label" column never contains NaN, so a mask built
# from "label" would never select any row.
official_dev_dataset.loc[official_dev_dataset["text"].isna(), "text"] = ""

# Drop the metadata columns that are not used for prediction.
official_dev_dataset = official_dev_dataset.drop(
    columns=["par_id", "art_id", "keyword", "country", "orig_label"]
)

In [None]:
# Tokenizer belonging to the selected checkpoint, capped at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)


def tokenize_function(example):
    """Tokenize the "text" field of an example (or batch of examples).

    Every sequence is truncated to, and padded up to, exactly 512 tokens.

    Args:
        example: Mapping with a "text" entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for ``example["text"]``.
    """
    return tokenizer(
        example["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )


# Collator used by the DataLoader to assemble batches.
# NOTE(review): since tokenize_function already pads everything to 512 tokens,
# the collator's dynamic padding has nothing left to pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Run the project's preprocessing step on the dataframe, keeping stop words.
preprocessed_official_dev_dataset = preprocess(official_dev_dataset, remove_stopwords=False)

# Convert into a Dataset object holding only the two columns needed downstream.
# Selecting several columns requires a list inside the indexer
# (df[["text", "label"]]); df["text", "label"] looks up one tuple-named column
# and raises KeyError.
# preserve_index=False keeps the dataframe index out of the Dataset, in case
# preprocessing left a non-default index behind (it would otherwise be added
# as an extra column).
raw_preprocessed_official_dev_dataset = Dataset.from_pandas(
    preprocessed_official_dev_dataset[["text", "label"]],
    preserve_index=False,
)

# Tokenize in batches and drop the raw text, which the model does not consume.
tokenized_preprocessed_official_dev_dataset = raw_preprocessed_official_dev_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

In [None]:
def load_model(model, filename):
    """Load saved weights from ``./models/saved_models`` into ``model`` in place.

    Args:
        model: A ``torch.nn.Module`` whose architecture matches the checkpoint.
        filename: Name of the checkpoint file inside the saved-models
            directory. A leading "/" is tolerated and ignored.

    Returns:
        None. ``model`` is updated in place.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
        KeyError: If the checkpoint has no "model_state_dict" entry.
    """
    model_dir = "./models/saved_models"
    # Join directory and file name with an explicit separator; plain string
    # concatenation would yield "./models/saved_models<filename>".
    checkpoint_path = f"{model_dir}/{filename.lstrip('/')}"
    # map_location="cpu" lets a checkpoint saved on a GPU be read on a CPU-only
    # machine; load_state_dict then copies the values into the model's
    # existing parameters, whichever device they are on.
    saved_model = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(saved_model["model_state_dict"])

def get_and_save_predictions(model, dataloader, filename):
    """Run ``model`` over ``dataloader`` and write one predicted class per line.

    The model is switched to evaluation mode and gradients are disabled. Each
    batch is moved to the device that holds the model's parameters, so the
    function does not depend on any notebook-level ``device`` variable.

    Args:
        model: A ``torch.nn.Module`` whose forward pass accepts the batch
            entries as keyword arguments and returns an object with a
            ``logits`` attribute.
        dataloader: Iterable yielding dicts that map argument names to tensors.
        filename: Path of the text file to write; it is overwritten.

    Returns:
        None. The predictions are written to ``filename``.
    """
    print("Getting predictions...")

    # Use the device of the model's own weights; fall back to the CPU for a
    # model that has no parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            # Move the batch next to the model before the forward pass.
            batch = {k: v.to(target_device) for k, v in batch.items()}
            logits = model(**batch).logits
            # The predicted class is the index of the largest logit.
            predictions.extend(logits.argmax(dim=-1).tolist())

    print("Saving predictions...")

    # Save predictions as a .txt file with one prediction per line.
    with open(filename, "w") as f:
        f.writelines(f"{prediction}\n" for prediction in predictions)

In [None]:
# Wrap the tokenized dataset in a DataLoader that yields batches of 8 examples,
# assembled by the padding collator.
official_dev_dataloader = DataLoader(
    tokenized_preprocessed_official_dev_dataset,
    batch_size=8,
    collate_fn=data_collator,
)

# Instantiate the pre-trained checkpoint with a two-class classification head
# and place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = model.to(device)

# Overwrite the freshly initialised weights with the saved checkpoint.
load_model(model, "deberta_with_preprocessing_synonym_replacement_and_class_weighting.pth")

# Predict every example and write the results to dev.txt, one per line.
get_and_save_predictions(model, official_dev_dataloader, "dev.txt")