In [None]:
import os
# Point the Hugging Face home, hub cache, assets cache and token path at
# locations under "models/huggingface". The paths are relative, so they
# resolve against the kernel's current working directory.
# NOTE(review): these assignments sit before the `transformers` imports below,
# presumably because the Hugging Face libraries read these variables when they
# are imported -- confirm before reordering this cell.
os.environ["HF_HOME"] = "models/huggingface"
os.environ["HF_HUB_CACHE"] = "models/huggingface/hub"
os.environ["HF_ASSETS_CACHE"] = "models/huggingface/assets"
os.environ["HF_TOKEN_PATH"] = "models/huggingface/token"


import torch
import pandas as pd
from transformers.modeling_utils import PreTrainedModel
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers import AutoTokenizer, AutoModelForSequenceClassification

!nvidia-smi

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Checkpoints that make up the ensemble; each entry gets its own tokenizer and model.
model_names = [
    "tabularisai/multilingual-sentiment-analysis",  # fast
    # Optional second ensemble member (slow):
    # "nlptown/bert-base-multilingual-uncased-sentiment"
]

# One tokenizer and one model per checkpoint, kept in the same order as `model_names`.
tokenizers = list(map(AutoTokenizer.from_pretrained, model_names))
models = [
    AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    for checkpoint in model_names
]

# Class index -> human-readable sentiment label.
sentiment_map = dict(
    enumerate(["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"])
)

In [None]:
def predict_sentiment(models: list[PreTrainedModel], tokenizers: list[PreTrainedTokenizerFast], texts: list[str]):
    """
    Classify each text into one of five sentiment labels with a model ensemble.

    Every (model, tokenizer) pair scores the whole list of texts in one forward
    pass. The per-model softmax probabilities are averaged and the class with
    the highest mean probability is mapped to its label.

    Args:
        models: Sequence-classification models. Each must produce exactly five
            logits per text (one per entry of the label map below).
        tokenizers: Tokenizers paired with ``models`` by position.
        texts: Texts to classify (any sequence of strings). Inputs longer than
            512 tokens are truncated by the tokenizer.

    Returns:
        A list with one label per input text, in input order. An empty
        ``texts`` yields an empty list.

    Raises:
        ValueError: If ``models`` is empty or its length differs from
            ``tokenizers``.
    """
    sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}

    # zip() would silently drop the unmatched tail and leave all-zero rows in
    # the score tensor, which skews the ensemble mean -- fail loudly instead.
    if len(models) != len(tokenizers):
        raise ValueError(
            f"models and tokenizers must have the same length, got {len(models)} and {len(tokenizers)}"
        )
    # Averaging over zero models yields NaN and an arbitrary label for every text.
    if not models:
        raise ValueError("at least one model is required")

    # Materialise once so any sequence of strings (tuple, Series slice, ...) works.
    texts = list(texts)
    if not texts:
        return []

    # Probabilities per (text, model, class); kept on the CPU so that models
    # living on different devices can contribute to the same ensemble.
    scores = torch.zeros(len(texts), len(models), len(sentiment_map))

    for model_idx, (model, tokenizer) in enumerate(zip(models, tokenizers)):
        # Move the encoded batch to wherever this model lives instead of
        # relying on a notebook-level `device` variable.
        inputs = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            logits = model(**inputs).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)  # (num_texts, num_sentiments)

        scores[:, model_idx, :] = probabilities.to(scores.device)

    # Mean over the model axis, then the most likely class per text.
    predicted_indices = scores.mean(dim=1).argmax(dim=-1)  # (num_texts,)

    # A single tolist() avoids one .item() round trip per text.
    return [sentiment_map[index] for index in predicted_indices.tolist()]

def run_batch(models: list[PreTrainedModel], tokenizers: list[PreTrainedTokenizerFast], texts: list[str], batch_size: int = 64):
    """
    Classify ``texts`` in consecutive slices of ``batch_size`` items.

    Each slice is passed to ``predict_sentiment`` and its labels are written
    back to the same positions of the result list, so the output lines up with
    the input order.

    Args:
        models: Models forwarded unchanged to ``predict_sentiment``.
        tokenizers: Tokenizers forwarded unchanged to ``predict_sentiment``.
        texts: Texts to classify.
        batch_size: Number of texts handed to ``predict_sentiment`` per call.

    Returns:
        A list of labels, one per input text.
    """
    labels = [None] * len(texts)

    for start in range(0, len(texts), batch_size):
        window = slice(start, start + batch_size)

        # Score this slice and store its labels at the matching positions.
        labels[window] = predict_sentiment(models, tokenizers, texts[window])

        # Optional: release cached CUDA memory between batches.
        if device == "cuda":
            torch.cuda.empty_cache()

    return labels

In [None]:
def predict_sentiment_with_chunking(models: list[PreTrainedModel],
                                    tokenizers: list[PreTrainedTokenizerFast],
                                    texts: list[str],
                                    chunk_size: int = 512):
    """
    Predict sentiment for a list of texts, splitting long texts into chunks.

    Each text is tokenized into consecutive chunks of at most ``chunk_size``
    tokens. The tokenizer builds the chunks, so every chunk carries its own
    special tokens and is a well-formed model input. Each chunk is scored
    separately; the chunk probabilities are averaged per text, then averaged
    across models, and the most likely class is mapped to its label.

    Args:
        models: Sequence-classification models. Each must produce exactly five
            logits per input (one per entry of the label map below).
        tokenizers: Fast tokenizers paired with ``models`` by position.
        texts: Texts to classify (any sequence of strings).
        chunk_size: Maximum number of tokens per chunk, special tokens included.

    Returns:
        A list with one label per input text, in input order.

    Raises:
        ValueError: If ``models`` is empty or its length differs from
            ``tokenizers``.
        TypeError: If a tokenizer reports that it is not a fast tokenizer.
    """
    # NOTE(review): this function is defined again in a later cell; after
    # "Run All" the later definition is the one in effect. Keep a single copy.
    sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}

    # zip() would silently drop the unmatched tail and leave all-zero rows in
    # the score tensor, which skews the ensemble mean -- fail loudly instead.
    if len(models) != len(tokenizers):
        raise ValueError(
            f"models and tokenizers must have the same length, got {len(models)} and {len(tokenizers)}"
        )
    # Averaging over zero models yields NaN and an arbitrary label for every text.
    if not models:
        raise ValueError("at least one model is required")

    texts = list(texts)

    # Probabilities per (text, model, class); kept on the CPU so that models
    # living on different devices can contribute to the same ensemble.
    scores = torch.zeros(len(texts), len(models), len(sentiment_map))

    for model_idx, (model, tokenizer) in enumerate(zip(models, tokenizers)):
        # Returning every chunk as its own row is a fast-tokenizer feature;
        # without it the tail of a long text would be dropped unnoticed.
        if not getattr(tokenizer, "is_fast", True):
            raise TypeError("predict_sentiment_with_chunking requires a fast tokenizer")

        for text_idx, text in enumerate(texts):
            # Let the tokenizer do the splitting: one row per chunk, each with
            # its own special tokens and at most `chunk_size` tokens long.
            encoded = tokenizer(
                [text],
                truncation=True,
                max_length=chunk_size,
                return_overflowing_tokens=True,
            )

            chunk_total = torch.zeros(len(sentiment_map))
            num_chunks = 0

            # Chunks are scored one at a time so peak memory does not grow
            # with the length of the text.
            for chunk_ids in encoded["input_ids"]:
                input_ids = torch.tensor([chunk_ids], device=model.device)
                chunk_inputs = {
                    "input_ids": input_ids,
                    # No padding is used, so every position is a real token.
                    "attention_mask": torch.ones_like(input_ids),
                }

                with torch.no_grad():
                    logits = model(**chunk_inputs).logits
                    probabilities = torch.nn.functional.softmax(logits, dim=-1)  # (1, num_sentiments)

                chunk_total += probabilities[0].float().to(chunk_total.device)
                num_chunks += 1

            # Unweighted mean over the chunks of this text.
            scores[text_idx, model_idx, :] = chunk_total / num_chunks

    # Mean over the model axis, then the most likely class per text.
    predicted_indices = scores.mean(dim=1).argmax(dim=-1)  # (num_texts,)

    # A single tolist() avoids one .item() round trip per text.
    return [sentiment_map[index] for index in predicted_indices.tolist()]


def run_batch_with_chunking(models: list[PreTrainedModel],
                            tokenizers: list[PreTrainedTokenizerFast],
                            texts: list[str],
                            batch_size: int = 64,
                            chunk_size: int = 512):
    """
    Classify ``texts`` batch by batch using the chunking predictor.

    The texts are walked in steps of ``batch_size``; every slice is handed to
    ``predict_sentiment_with_chunking`` together with ``chunk_size`` and the
    returned labels are stored at the slice's positions in the result list.

    Args:
        models: Models forwarded unchanged to the predictor.
        tokenizers: Tokenizers forwarded unchanged to the predictor.
        texts: Texts to classify.
        batch_size: Number of texts handed to the predictor per call.
        chunk_size: Chunk size forwarded to the predictor.

    Returns:
        A list of labels, one per input text.
    """
    results = [None] * len(texts)

    for offset in range(0, len(texts), batch_size):
        end = offset + batch_size

        # Score the current slice and write its labels back in place.
        results[offset:end] = predict_sentiment_with_chunking(
            models, tokenizers, texts[offset:end], chunk_size
        )

        # Optional: release cached CUDA memory between batches.
        if device == "cuda":
            torch.cuda.empty_cache()

    return results


In [None]:
def predict_sentiment_with_chunking(models: list[PreTrainedModel],
                                    tokenizers: list[PreTrainedTokenizerFast],
                                    texts: list[str],
                                    chunk_size: int = 512):
    """
    Predict sentiment for a list of texts, splitting long texts into chunks.

    Each text is tokenized into consecutive chunks of at most ``chunk_size``
    tokens. The tokenizer builds the chunks, so every chunk carries its own
    special tokens and is a well-formed model input. Each chunk is scored
    separately; the chunk probabilities are averaged per text, then averaged
    across models, and the most likely class is mapped to its label.

    Args:
        models: Sequence-classification models. Each must produce exactly five
            logits per input (one per entry of the label map below).
        tokenizers: Fast tokenizers paired with ``models`` by position.
        texts: Texts to classify (any sequence of strings).
        chunk_size: Maximum number of tokens per chunk, special tokens included.

    Returns:
        A list with one label per input text, in input order.

    Raises:
        ValueError: If ``models`` is empty or its length differs from
            ``tokenizers``.
        TypeError: If a tokenizer reports that it is not a fast tokenizer.
    """
    # NOTE(review): this cell redefines a function that an earlier cell already
    # defines; after "Run All" this definition is the one in effect. Keep a
    # single copy.
    sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}

    # zip() would silently drop the unmatched tail and leave all-zero rows in
    # the score tensor, which skews the ensemble mean -- fail loudly instead.
    if len(models) != len(tokenizers):
        raise ValueError(
            f"models and tokenizers must have the same length, got {len(models)} and {len(tokenizers)}"
        )
    # Averaging over zero models yields NaN and an arbitrary label for every text.
    if not models:
        raise ValueError("at least one model is required")

    texts = list(texts)

    # Probabilities per (text, model, class); kept on the CPU so that models
    # living on different devices can contribute to the same ensemble.
    scores = torch.zeros(len(texts), len(models), len(sentiment_map))

    for model_idx, (model, tokenizer) in enumerate(zip(models, tokenizers)):
        # Returning every chunk as its own row is a fast-tokenizer feature;
        # without it the tail of a long text would be dropped unnoticed.
        if not getattr(tokenizer, "is_fast", True):
            raise TypeError("predict_sentiment_with_chunking requires a fast tokenizer")

        for text_idx, text in enumerate(texts):
            # Let the tokenizer do the splitting: one row per chunk, each with
            # its own special tokens and at most `chunk_size` tokens long.
            encoded = tokenizer(
                [text],
                truncation=True,
                max_length=chunk_size,
                return_overflowing_tokens=True,
            )

            chunk_total = torch.zeros(len(sentiment_map))
            num_chunks = 0

            # Chunks are scored one at a time so peak memory does not grow
            # with the length of the text.
            for chunk_ids in encoded["input_ids"]:
                input_ids = torch.tensor([chunk_ids], device=model.device)
                chunk_inputs = {
                    "input_ids": input_ids,
                    # No padding is used, so every position is a real token.
                    "attention_mask": torch.ones_like(input_ids),
                }

                with torch.no_grad():
                    logits = model(**chunk_inputs).logits
                    probabilities = torch.nn.functional.softmax(logits, dim=-1)  # (1, num_sentiments)

                chunk_total += probabilities[0].float().to(chunk_total.device)
                num_chunks += 1

            # Unweighted mean over the chunks of this text.
            scores[text_idx, model_idx, :] = chunk_total / num_chunks

    # Mean over the model axis, then the most likely class per text.
    predicted_indices = scores.mean(dim=1).argmax(dim=-1)  # (num_texts,)

    # A single tolist() avoids one .item() round trip per text.
    return [sentiment_map[index] for index in predicted_indices.tolist()]
