In [31]:
# required libraries
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

In [None]:
# CSV file holding the questions to classify
# (the predictions are later written back to this same file)
file_path = './sample.csv'

# location of the saved model used to produce the predictions
model_path = './roberta_master2'

In [32]:
# read the input CSV into a dataframe, decoding the file as latin-1
data = pd.read_csv(
    file_path,
    encoding='latin-1',
)
# leave the dataframe as the last expression so the notebook renders it
data

Unnamed: 0,prompt
0,"1. Q: For you, and here's one for me. Ok, now ..."
1,"3. Q: Adrian, what's your whole name?"
2,"5. Q: Wow, Adrian [middle name and last name]?"
3,"7. Q: Ok, if I make a mistake I want you to ma..."
4,"9. Q: Ok thank you, how old are you?"
...,...
94,189. Q: Put a mark.
95,"191. Q: Ok, somewhere else?"
96,193. Q: Does she hit you someplace else?
97,"195. Q: Ok, someplace else?"


In [33]:
# build a plain list of strings from the first column of the dataframe
# NOTE(review): the column is selected by position, not by name; presumably
# this is the 'prompt' column -- confirm against the input file
questions = [str(entry) for entry in data.iloc[:, 0]]

In [34]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [35]:
# Tokenizer for the 'roberta-base' checkpoint
# NOTE(review): assumes the saved model was trained with this same tokenizer -- confirm
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Encode every question in a single call: sequences are padded to a common
# length, truncated at 512 tokens, and returned as PyTorch tensors
tokenized_questions = tokenizer(
    questions,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

# Pair each row of input ids with its attention mask ...
question_dataset = TensorDataset(
    tokenized_questions['input_ids'],
    tokenized_questions['attention_mask'],
)

# ... and serve them in their original order, two questions per batch
question_loader = DataLoader(
    question_dataset,
    sampler=SequentialSampler(question_dataset),
    batch_size=2,
)

In [36]:
# load the model
# map_location=device remaps the stored tensors onto the device selected
# earlier, so a checkpoint saved from a GPU session still loads when this
# notebook falls back to the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only point model_path at a checkpoint you trust. On PyTorch releases
# where torch.load defaults to weights_only=True, loading a whole pickled model
# may additionally need weights_only=False -- TODO confirm for the installed version.
model = torch.load(model_path, map_location=device)
model.to(device)
# switch to evaluation mode before running inference
model.eval()

# make predictions
# pred_labels collects one tensor of predicted class indices per batch
pred_labels = []
with torch.no_grad():  # inference only, no gradients are needed
    for batch in question_loader:
        # each batch is (input_ids, attention_mask); move both to the model's device
        input_ids, attention_mask = (t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        # outputs[0] is presumably the logits; take the highest-scoring class per row
        preds = torch.argmax(outputs[0], dim=1)
        pred_labels.append(preds)

In [37]:
# flatten the per-batch prediction tensors into one flat list of labels
pred_labels = [
    label
    for batch_preds in pred_labels
    for label in batch_preds.tolist()
]

In [39]:
# add predictions column to the dataframe
# (stores pred_labels in the 'Predictions' column of data; the dataframe is
# modified in place)
data['Predictions'] = pred_labels

In [40]:
# write the final dataframe back to the file
# NOTE: this overwrites the input CSV. The file was read as latin-1, so it is
# written with the same encoding; the default (UTF-8) would change the bytes of
# any non-ASCII character and the next latin-1 read would mis-decode them.
data.to_csv(file_path, index=False, encoding='latin-1')