In [1]:
# Which pre-trained model to use.
# This identifier is passed to both AutoTokenizer.from_pretrained and
# AutoModelForSequenceClassification.from_pretrained in later cells, so the
# tokenizer and the model are always loaded from the same checkpoint name.
checkpoint = "microsoft/deberta-v3-small"

In [2]:
import nltk

# NLTK resources are downloaded into this project-local directory.
nltk_path = "./venv/nltk_data"

# Register the download directory on NLTK's resource search path. Without this,
# lookups only succeed if the directory happens to coincide with one of NLTK's
# default search locations. Guarded so re-running the cell does not add duplicates.
if nltk_path not in nltk.data.path:
    nltk.data.path.append(nltk_path)

# Resources fetched up front (presumably required by models.preprocessing,
# which is imported in a later cell -- confirm against that module).
nltk_packages = [
    "wordnet",
    "stopwords",
    "omw-1.4",
    "averaged_perceptron_tagger_eng",
    "punkt_tab",
]

for nltk_package in nltk_packages:
    nltk.download(nltk_package, download_dir=nltk_path)

[nltk_data] Downloading package wordnet to ./venv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to ./venv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to ./venv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     ./venv/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to ./venv/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
import pandas as pd
import torch

from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification

from models.preprocessing import preprocess


In [4]:
# Pick the compute device: the GPU when CUDA is available, the CPU otherwise.
# The printed value is "cuda" on a GPU runtime and "cpu" on a CPU-only one.
# On Colab, enable the GPU via Edit -> Notebook settings -> Hardware accelerator -> "GPU".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
# Official dev split: a CSV read with its own header row.
official_dev_dataset_path = "./dataset/dpm_pcl_test.csv"
official_dev_dataset = pd.read_csv(official_dev_dataset_path)

# Test split: a tab-separated file for which the column names are supplied here.
test_dataset_path = "./dataset/original_datasets/task4_test.tsv"
test_columns = ["par_id", "art_id", "keyword", "country", "text"]
test_dataset = pd.read_csv(test_dataset_path, sep="\t", names=test_columns)

In [6]:
# Binarise the labels: an orig_label of 0 or 1 becomes class 0, anything else class 1.
official_dev_dataset["label"] = (
    ~official_dev_dataset["orig_label"].isin([0, 1])
).astype("int64")

# Replace missing paragraphs with empty strings in both splits.
for frame in (official_dev_dataset, test_dataset):
    frame.loc[frame["text"].isna(), "text"] = ""

# Drop the metadata columns (and, for the dev split, the original label that
# has just been replaced by "label").
metadata_columns = ["par_id", "art_id", "keyword", "country"]
official_dev_dataset = official_dev_dataset.drop(columns=metadata_columns + ["orig_label"])
test_dataset = test_dataset.drop(columns=metadata_columns)

official_dev_dataset.head()

Unnamed: 0,text,label
0,"His present "" chambers "" may be quite humble ,...",1
1,Krueger recently harnessed that creativity to ...,1
2,10:41am - Parents of children who died must ge...,1
3,When some people feel causing problem for some...,1
4,We are alarmed to learn of your recently circu...,1


In [7]:
# Tokenizer belonging to the chosen checkpoint, capped at 512 tokens per sequence.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)


def tokenize_function(example):
    """Tokenize the "text" field of an example (or a batch of examples).

    Sequences longer than 512 tokens are truncated. No padding is applied here:
    padding every example to 512 tokens up front would make the batch-level
    collator below redundant and force every forward pass to process
    full-length inputs. Padding is left to ``data_collator`` instead.

    Args:
        example: Mapping with a "text" entry holding a string or a list of strings.

    Returns:
        The tokenizer's encoding for the given text.
    """
    return tokenizer(example["text"], truncation=True, max_length=512)


# Pads the examples of each batch to a common length at collation time.
data_collator = DataCollatorWithPadding(tokenizer)



In [12]:
# Run the project's preprocessing on both splits, passing remove_stopwords=False.
preprocessed_official_dev_dataset = preprocess.preprocess_dataset(
    official_dev_dataset, remove_stopwords=False
)
preprocessed_test_dataset = preprocess.preprocess_dataset(
    test_dataset, remove_stopwords=False
)

# Wrap the needed columns in Dataset objects; only the dev split carries a "label" column.
dev_columns = ["text", "label"]
raw_preprocessed_official_dev_dataset = Dataset.from_pandas(
    preprocessed_official_dev_dataset[dev_columns]
)
raw_preprocessed_test_dataset = Dataset.from_pandas(
    preprocessed_test_dataset[["text"]]
)

# Tokenize in batches; the raw "text" column is removed from the mapped datasets.
tokenize_options = {"batched": True, "remove_columns": ["text"]}
tokenized_preprocessed_official_dev_dataset = raw_preprocessed_official_dev_dataset.map(
    tokenize_function, **tokenize_options
)
tokenized_preprocessed_test_dataset = raw_preprocessed_test_dataset.map(
    tokenize_function, **tokenize_options
)

Map:   0%|          | 0/2094 [00:00<?, ? examples/s]

Map:   0%|          | 0/3832 [00:00<?, ? examples/s]

In [15]:
# Load the model's state
def load_model(model, filename, model_dir="./models/saved_models/"):
    """Load saved weights from a checkpoint file into ``model`` in place.

    Args:
        model: Module whose ``load_state_dict`` receives the stored weights.
        filename: Name of the checkpoint file inside ``model_dir``.
        model_dir: Directory containing the checkpoint. Defaults to the
            directory this notebook has always used.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
        KeyError: If the checkpoint has no "model_state_dict" entry.
    """
    import os  # local import keeps this cell self-contained

    checkpoint_path = os.path.join(model_dir, filename)

    # map_location="cpu" lets a checkpoint saved on a GPU be opened on a
    # CPU-only machine; load_state_dict then copies the values into the
    # model's existing parameters on whatever device they live on.
    # NOTE(review): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    saved_model = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(saved_model["model_state_dict"])

def get_and_save_predictions(model, dataloader, filename):
    """Run ``model`` over ``dataloader`` and write the predicted class ids to a file.

    Args:
        model: Module that is called with each batch unpacked as keyword
            arguments and returns an object exposing a ``logits`` attribute.
        dataloader: Iterable of batches, each a dict whose values support ``.to(device)``.
        filename: Path of the text file to (over)write. It receives one
            prediction per line, in the order the dataloader yields them.
    """
    print("Getting predictions...")

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level ``device`` variable defined in another cell.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            # Move batch to the model's device
            batch = {k: v.to(target_device) for k, v in batch.items()}

            # Forward pass; the predicted class is the argmax over the last dimension.
            logits = model(**batch).logits
            # .tolist() yields plain Python ints, so no NumPy round-trip is needed.
            predictions.extend(logits.argmax(dim=-1).tolist())

    print("Saving predictions...")

    # Save predictions as a .txt file with one prediction per line
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(f"{prediction}\n" for prediction in predictions)

In [16]:
# Build a two-label sequence classifier from the pre-trained checkpoint and
# place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

# Replace its weights with the ones stored in the saved checkpoint file.
load_model(model, "deberta_with_preprocessing_synonym_replacement_and_class_weighting.pth")

# Batch the tokenized splits; both loaders share the same settings.
loader_options = {"batch_size": 8, "collate_fn": data_collator}
official_dev_dataloader = DataLoader(
    tokenized_preprocessed_official_dev_dataset, **loader_options
)
test_dataloader = DataLoader(
    tokenized_preprocessed_test_dataset, **loader_options
)

# Write one predictions file per split: dev.txt first, then test.txt.
for split_dataloader, predictions_file in (
    (official_dev_dataloader, "dev.txt"),
    (test_dataloader, "test.txt"),
):
    get_and_save_predictions(model, split_dataloader, predictions_file)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Getting predictions...
Saving predictions...
Getting predictions...
Saving predictions...
