In [None]:
import os
# Point every Hugging Face cache/token location at a project-local folder.
# This runs before the `transformers` imports below so the library sees these values.
os.environ.update({
    "HF_HOME": "models/huggingface",
    "HF_HUB_CACHE": "models/huggingface/hub",
    "HF_ASSETS_CACHE": "models/huggingface/assets",
    "HF_TOKEN_PATH": "models/huggingface/token",
})


import torch
import pandas as pd
from transformers.modeling_utils import PreTrainedModel
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers import AutoTokenizer, AutoModelForSequenceClassification

!nvidia-smi

In [None]:
# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Checkpoint used for both the tokenizer and the classification head.
model_name = "tabularisai/multilingual-sentiment-analysis"

# NOTE: despite the plural names, each of these is a single object (one tokenizer, one model).
tokenizers = AutoTokenizer.from_pretrained(model_name)
models = AutoModelForSequenceClassification.from_pretrained(model_name)
models = models.to(device)

# Class index -> human-readable sentiment label.
sentiment_map = dict(enumerate([
    "Very Negative",
    "Negative",
    "Neutral",
    "Positive",
    "Very Positive",
]))

In [None]:
# Smoke-test the single-record predictor on a mostly positive sample that ends with two "BAD" tokens.
# NOTE(review): predict_sentiment_single_record is defined in a later cell, so on a fresh
# kernel that definition cell has to be executed before this one.
w, x, y, z = predict_sentiment_single_record(models, tokenizers, "happy yipee HURRAH happy yayyyy thank you yayyyyy happy happy happy happy yayyyy thank you yayyyyy BAD BAD")
# Display, in order: predicted label, class probabilities, top chunk indices, character spans.
w, x, y , z

In [3]:
def predict_sentiment_single_record(
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizerFast",
        text: str,
        chunk_size: int = 10,
        top_k: int = 2,
        device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
) -> tuple[str, torch.Tensor, list[int], list[tuple[int, int]]]:
    """
    Predict the sentiment of one text by scoring fixed-size token chunks.

    The text is tokenized once (no truncation), the token sequence is cut into
    consecutive chunks of ``chunk_size`` tokens (the last chunk is padded and
    masked), all chunks are scored in a single forward pass, and the chunk
    logits are averaged to obtain the final class distribution. The chunks
    that assign the highest probability to the predicted class are reported
    together with their character spans in ``text``.

    Because the chunks are slices of a single encoding, any special tokens the
    tokenizer adds appear only in the first and/or last chunk.

    Args:
        model (PreTrainedModel): Sequence-classification model. It is called as
            ``model(input_ids=..., attention_mask=...)`` and must return an
            object with a ``logits`` attribute of shape (num_chunks, 5).
        tokenizer (PreTrainedTokenizerFast): Fast tokenizer for the model
            (offset mapping is required).
        text (str): Input text to analyze.
        chunk_size (int): Number of tokens per chunk. Clamped to the length of
            the tokenized text when the text is shorter. Default is 10.
        top_k (int): Maximum number of contributing chunks to return. Fewer
            are returned when the text produces fewer chunks. Default is 2.
        device (torch.device): Device the chunk tensors are placed on; it must
            match the device the model lives on. Default is "cuda" if
            available, otherwise "cpu".

    Returns:
        predicted_sentiment (str): Predicted sentiment label.
        probabilities (torch.Tensor): Probabilities for each sentiment class.
        contribute_chunk_indices (List[int]): Indices of the top contributing
            chunks, best first (at most ``top_k`` entries).
        contribute_chunk_text_positions (List[Tuple[int, int]]): ``(start, end)``
            character offsets in ``text`` for each returned chunk; ``(0, 0)``
            when a chunk has no token with a non-zero offset.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, ``top_k`` is negative,
            or the tokenizer produces no tokens for ``text``.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if top_k < 0:
        raise ValueError(f"top_k must be >= 0, got {top_k}")

    sentiment_map = {
        0: "Very Negative",
        1: "Negative",
        2: "Neutral",
        3: "Positive",
        4: "Very Positive"
    }

    # Tokenize once, keeping the full sequence and the character offsets.
    # Calling the tokenizer directly is the supported replacement for encode_plus.
    encoding = tokenizer(
        text,
        truncation=False,
        padding=False,
        return_tensors="pt",
        return_offsets_mapping=True,
    )
    # Drop the batch axis: ids/mask become (seq_len,), offsets (seq_len, 2).
    input_ids = encoding["input_ids"].squeeze(0)
    attention_mask = encoding["attention_mask"].squeeze(0)
    offset_mapping = encoding["offset_mapping"].squeeze(0)

    seq_len = input_ids.shape[0]
    if seq_len == 0:
        raise ValueError("tokenizer produced no tokens for the given text")

    # A text shorter than one chunk is scored as a single, unpadded chunk.
    chunk_size = min(chunk_size, seq_len)
    total_chunks = (seq_len + chunk_size - 1) // chunk_size

    # Pad the flat sequence up to a whole number of chunks, then fold it into
    # a (total_chunks, chunk_size) batch in one step.
    pad_len = total_chunks * chunk_size - seq_len
    if pad_len:
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            # Padded positions are masked out, so the concrete id is irrelevant.
            pad_token_id = 0
        input_ids = torch.cat([input_ids, input_ids.new_full((pad_len,), pad_token_id)])
        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros(pad_len)])

    chunk_inputs = {
        "input_ids": input_ids.reshape(total_chunks, chunk_size).to(device=device, dtype=torch.long),
        "attention_mask": attention_mask.reshape(total_chunks, chunk_size).to(device=device, dtype=torch.long),
    }

    # Score every chunk in a single forward pass.
    with torch.no_grad():
        outputs = model(**chunk_inputs)
    logits = outputs.logits

    # Average logits across chunks and compute probabilities.
    probabilities = torch.nn.functional.softmax(logits.mean(dim=0), dim=-1)
    predicted_label_idx = int(probabilities.argmax().item())
    predicted_sentiment = sentiment_map[predicted_label_idx]

    # Rank chunks by the probability they individually give the predicted class.
    # topk raises when k exceeds the number of chunks, so clamp it.
    chunk_probs = torch.nn.functional.softmax(logits, dim=-1)[:, predicted_label_idx]
    k = min(top_k, total_chunks)
    contribute_chunk_indices = chunk_probs.topk(k).indices.tolist()

    # Map each selected chunk back to a character span of the original text.
    contribute_chunk_text_positions: list[tuple[int, int]] = []
    for idx in contribute_chunk_indices:
        chunk_offsets = offset_mapping[idx * chunk_size: (idx + 1) * chunk_size]
        # Ignore (0, 0) offsets (typically special tokens); the offsets come
        # from the unpadded encoding, so they contain no padding entries.
        has_span = (chunk_offsets != 0).any(dim=1)
        if has_span.any():
            spans = chunk_offsets[has_span]
            contribute_chunk_text_positions.append((spans[0, 0].item(), spans[-1, 1].item()))
        else:
            # No token in this chunk maps to text; report an empty span.
            contribute_chunk_text_positions.append((0, 0))

    return predicted_sentiment, probabilities, contribute_chunk_indices, contribute_chunk_text_positions

In [None]:
# Tokenize a small batch, padding every item to the longest one.
# `tokenizers` is bound to a single tokenizer in the loading cell, so it is called directly
# instead of being indexed; calling it replaces the deprecated batch_encode_plus.
# max_length was dropped: it has no effect with truncation disabled and longest-item padding.
tokenizers(["hi how are you", "i am fine", "thanks", "bye"], truncation=False, padding=True, return_tensors="pt")

In [None]:
# Encode a short sample without truncation and move the resulting tensors to `device`.
# `tokenizers` is bound to a single tokenizer in the loading cell, so it is called directly
# instead of being indexed; calling it replaces the deprecated encode_plus.
inputs = tokenizers("hi hi hi hi hi hi", truncation=False, return_tensors="pt").to(device)
# Drop the batch axis so the ids and the mask are 1-D and can be sliced into chunks.
chunk_input_ids = inputs['input_ids'].squeeze(0)
chunk_attention_mask = inputs['attention_mask'].squeeze(0)
# Compare an all-ones mask with the tokenizer's mask (ones_like already lives on the ids' device).
torch.ones_like(chunk_input_ids), chunk_attention_mask

In [None]:
# Manually slice chunk number `i` (width `chunk_size`) out of the token ids and the
# attention mask that the previous cell stored in chunk_input_ids / chunk_attention_mask.
i = 1
chunk_size = 3
chunk_input_ids[i * chunk_size: (i + 1) * chunk_size], chunk_attention_mask[i * chunk_size: (i + 1) * chunk_size]

In [5]:
# Load the dataset from CSV; later cells read its `text` column.
df = pd.read_csv("data/data-1735829992.csv")  # pass encoding="utf-8" explicitly if decoding ever fails
# df.info()  # uncomment to inspect columns and dtypes

In [6]:
# Take the `text` value of the first row after sorting ascending by string length,
# i.e. the shortest text in the dataset.
text = (
    df.sort_values(by="text", key=lambda series: series.str.len())
    .iloc[0]["text"]
)

In [None]:
# Tokenize the first two texts as one padded batch, truncating each to at most 512 tokens.
# `tokenizers` is bound to a single tokenizer in the loading cell, so it is called directly
# instead of being indexed.
inputs = tokenizers(df.text.iloc[:2].to_list(), return_tensors="pt", truncation=True, padding=True, max_length=512)
# Show which tensors the tokenizer produced.
inputs.keys()

In [None]:
# Run chunked batch inference over the first 512 texts.
# NOTE(review): run_batch_with_chunking is not defined in the visible cells — confirm it is
# defined before this cell runs on a fresh kernel.
run_batch_with_chunking(models, tokenizers, df.text.iloc[:512].to_list())

In [None]:
# Display `x`; presumably the class-probability tensor unpacked from the
# single-record prediction cell — verify it has not been rebound since.
x

In [None]:
# Run batch inference on rows 4-13 of the dataset in batches of 2.
# NOTE(review): run_batch is not defined in the visible cells — confirm it is defined
# before this cell runs on a fresh kernel.
run_batch(models, tokenizers, df.text.iloc[4:14].to_list(), batch_size=2)

In [None]:
# Run batch inference on the first 500 texts; the positional 64 is presumably the batch size
# (the sibling call passes batch_size by keyword) — TODO confirm against run_batch's signature.
# NOTE(review): run_batch is not defined in the visible cells.
run_batch(models, tokenizers, df.text.iloc[:500].to_list(), 64)

In [None]:
# Sanity check on one clearly positive and one clearly negative sentence.
# NOTE(review): predict_sentiment is not defined in the visible cells — confirm it is defined
# before this cell runs on a fresh kernel.
predict_sentiment(models, tokenizers, ["Hi, I am a very happy person", "I am a very sad person"])