# Inference

## Setup: imports

In [3]:
import pandas as pd
import numpy as np
import random

import torch
import warnings

from tqdm import tqdm

from ydata_profiling import ProfileReport

from torch.nn import BCEWithLogitsLoss
from transformers import RobertaTokenizerFast, \
RobertaModel, Trainer, TrainingArguments,EvalPrediction, TrainerCallback

from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaClassificationHead
from torch.utils.data import DataLoader

from skmultilearn.model_selection import iterative_train_test_split
%matplotlib inline

## Check CUDA and GPU availability

In [4]:
# Report the CUDA toolkit version this torch build was compiled against.
print('Torch cuda version: ', torch.version.cuda)
# Report whether a CUDA device is actually usable. The cuDNN flag
# (torch.backends.cudnn.enabled) is a settable switch, not a GPU check,
# so it must not be used to answer "is CUDA enabled?".
print('Torch cuda is enabled: ', torch.cuda.is_available())

# Select the compute device exactly once, as a torch.device, so every later
# cell sees the same object type (str(device) is 'cuda' or 'cpu').
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Request 'high' float32 matmul precision before any model work is done.
torch.set_float32_matmul_precision('high')

if device.type == 'cuda':
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')

Torch cuda version:  11.8
Torch cuda is enabled:  True
Using device: cuda
Device name: NVIDIA GeForce RTX 3060 Laptop GPU


## Load the trained model

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Directory that holds the saved checkpoint
model_path = 'roberta_trainer'

# Restore the sequence-classification model and place it on the selected device
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

# Restore the tokenizer from the same directory (requires that it was saved there)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [6]:
# Read the test split from disk into a DataFrame, then preview it
title_test = pd.read_csv(filepath_or_buffer='test_dataset.csv')

title_test

Unnamed: 0,Id,Title,Labels
0,0,Vice President / Director of Systems Engineering,"[0, 0, 0, 0, 0, 1]"
1,3,CTO/Executive Director of Technology Services,"[1, 1, 0, 0, 0, 0]"
2,6,"Chief Information Officer, Platform Services","[1, 0, 0, 0, 0, 0]"
3,8,Chief Information Systems Officer,"[1, 0, 0, 0, 0, 0]"
4,10,"Vice President, Chief Information Security Off...","[1, 0, 0, 0, 0, 0]"
...,...,...,...
441,2004,"Paraplanning, Operations Manager","[0, 0, 0, 1, 0, 0]"
442,2006,Group Finance Reporting Manager,"[0, 0, 0, 1, 0, 0]"
443,2012,Indirect Tax Technology Manager,"[0, 0, 0, 1, 0, 0]"
444,2016,Manager Manufacturing Engineering,"[0, 0, 0, 1, 0, 0]"


## Test dataset class

In [7]:
# Define a dataset class that tokenizes the test titles on demand
class Data_Processing_test():
    """Map-style dataset that tokenizes one text per item for inference.

    Parameters
    ----------
    tokenizer : callable
        Tokenizer invoked as ``tokenizer(text, **kwargs)``; it must return a
        mapping with ``'input_ids'`` and ``'attention_mask'`` entries that
        carry a leading batch dimension of size one.
    id_column : object with ``.tolist()``
        Identifiers, one per text (e.g. a pandas Series).
    text_column : object with ``.tolist()``
        Texts to tokenize (e.g. a pandas Series).
    max_length : int, default 512
        Length every sequence is padded / truncated to.
    """

    def __init__(self, tokenizer, id_column, text_column, max_length=512):
        # Keep the tokenizer on the instance so items are always encoded with
        # the tokenizer passed in, not with whatever global happens to be
        # named `tokenizer` when the item is fetched.
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Store the texts as a plain list
        self.text_column = text_column.tolist()

        # Store the ids as a plain list
        self.id_column = id_column.tolist()

    def __getitem__(self, index):
        """Return the tokenized text and its id for position ``index``.

        Returns a dict with keys ``'input_ids'``, ``'attention_mask'`` and
        ``'id_'``; the first two have the batch dimension stripped.
        """
        # Collapse any run of whitespace into a single space
        comment_text = " ".join(str(self.text_column[index]).split())

        inputs = self.tokenizer(comment_text,
                                add_special_tokens=True,
                                max_length=self.max_length,
                                padding='max_length',
                                return_attention_mask=True,
                                truncation=True,
                                return_tensors='pt')

        # Index 0 drops the size-one batch dimension added by return_tensors
        return {'input_ids': inputs['input_ids'][0],
                'attention_mask': inputs['attention_mask'][0],
                'id_': self.id_column[index]}

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text_column)

In [8]:
batch_size = 64
# Tokenizer used to encode the test titles
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base',
                                          padding = 'max_length',
                                          truncation=True, 
                                          max_length = 512)
# Wrap the test ids and titles in the dataset class
test_data_pred =  Data_Processing_test(tokenizer,
                                       title_test['Id'], 
                                       title_test['Title'])

# Batch the test data for inference. shuffle=False keeps the rows in the same
# order as test_dataset.csv, so the saved outputs are reproducible run to run.
dataloaders_dict = {'test': DataLoader(test_data_pred,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       num_workers=2)}



## Save embeddings and predictions

In [9]:
def prediction_and_embeddings(threshold=0.5,
                              label_columns=('Chief Officer', 'Director',
                                             'Individual Contributor/Staff',
                                             'Manager', 'Owner', 'Vice President'),
                              embeddings_file='test_embeddings.csv',
                              predictions_file='test_predictions.csv',
                              binary_predictions_file='test_binary_predictions.csv'):
    """Run the model over the test loader and save embeddings and predictions.

    Uses the notebook-level ``model``, ``dataloaders_dict['test']`` and
    ``device``. Three CSV files are written:

    * ``embeddings_file`` - one row per id with the mean of the last hidden
      layer over the real (non-padding) tokens.
    * ``predictions_file`` - one row per id with the sigmoid probability of
      each label.
    * ``binary_predictions_file`` - same layout, with each probability
      replaced by the integer 1 if it is above ``threshold``, else 0.

    Parameters
    ----------
    threshold : float, default 0.5
        Probability above which a label counts as predicted.
    label_columns : sequence of str
        Column names for the model outputs, in output order.
    embeddings_file, predictions_file, binary_predictions_file : str
        Output paths.

    Raises
    ------
    ValueError
        If the test loader yields no batches.
    """
    label_columns = list(label_columns)
    id_batches, embedding_batches, prob_batches = [], [], []

    # Make sure dropout and similar layers are in inference mode.
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloaders_dict['test'], desc="Predicting"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Feed the sequences to the model, specifying the attention mask
            outputs = model(input_ids, attention_mask=attention_mask,
                            output_hidden_states=True)

            # Mean-pool the last hidden layer over real tokens only. Every
            # sequence is padded to a fixed length, so an unmasked mean would
            # be dominated by padding positions.
            last_layer = outputs.hidden_states[-1]
            token_mask = attention_mask.unsqueeze(-1).to(last_layer.dtype)
            token_counts = token_mask.sum(dim=1).clamp(min=1)  # avoid 0-division
            pooled = (last_layer * token_mask).sum(dim=1) / token_counts

            id_batches.append(np.asarray(batch['id_']))
            embedding_batches.append(pooled.cpu().numpy())
            # Sigmoid turns the per-label logits into independent probabilities
            prob_batches.append(torch.sigmoid(outputs.logits).cpu().numpy())

    if not id_batches:
        raise ValueError("The test dataloader produced no batches; nothing to save")

    ids = np.concatenate(id_batches)
    embeddings = np.concatenate(embedding_batches)
    probs = np.concatenate(prob_batches)

    # Embeddings: ids as the index, one column per hidden dimension
    pd.DataFrame(embeddings, index=ids).to_csv(embeddings_file, index_label='id')

    # Probabilities: 'id' first, then one column per label
    prediction_df = pd.DataFrame(probs, columns=label_columns)
    prediction_df.insert(0, 'id', ids)
    prediction_df.to_csv(predictions_file, index=False)

    # Binary decisions: build a fresh integer frame so the CSV holds 0/1
    # rather than 0.0/1.0 left over from the float probability columns.
    binary_predictions_df = (prediction_df[label_columns] > threshold).astype(int)
    binary_predictions_df.insert(0, 'id', ids)
    binary_predictions_df.to_csv(binary_predictions_file, index=False)

prediction_and_embeddings()

Predicting: 100%|██████████| 7/7 [00:05<00:00,  1.20it/s]
