In [None]:
import os
# Point the Hugging Face home, hub cache, assets cache and token path at
# project-local directories. This runs before `transformers` is imported below.
os.environ.update({
    "HF_HOME": "models/huggingface",
    "HF_HUB_CACHE": "models/huggingface/hub",
    "HF_ASSETS_CACHE": "models/huggingface/assets",
    "HF_TOKEN_PATH": "models/huggingface/token",
})


import torch
import pandas as pd
from transformers.modeling_utils import PreTrainedModel
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Shell escape: run `nvidia-smi` and show its report in the cell output
# (requires the command to be available on PATH).
!nvidia-smi

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Checkpoint identifier handed to both `from_pretrained` loaders.
model_name = "tabularisai/multilingual-sentiment-analysis"

# NOTE: despite the plural names, each of these holds a single object.
tokenizers = AutoTokenizer.from_pretrained(model_name)
models = AutoModelForSequenceClassification.from_pretrained(model_name)
models = models.to(device)

# Class index -> human-readable label, for indices 0 through 4.
sentiment_map = dict(enumerate(
    ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"]
))

In [None]:
# Sanity check: a 3x12 tensor of zeros placed on the selected device.
torch.zeros((3, 12)).to(device=device)

In [None]:
# Try the single-record predictor on a sample and unpack its four results.
# NOTE(review): predict_sentiment_single_record is defined in a cell further
# down, so on a fresh kernel that cell has to be executed before this one.
w, x, y, z = predict_sentiment_single_record(
    model=models,
    tokenizer=tokenizers,
    text="happy yipee HURRAH happy yayyyy thank you yayyyyy happy happy happy happy yayyyy thank you yayyyyy BAD BAD",
)
(w, x, y, z)

In [None]:
# The name was referenced here without being imported anywhere in the visible
# notebook, which raises NameError on a fresh kernel.
# NOTE(review): presumably torch's sequence-padding helper was meant — confirm.
from torch.nn.utils.rnn import pad_sequence

pad_sequence

In [20]:
def predict_sentiment_single_record(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizerFast,
    text: str,
    chunk_size: int = 10,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    top_k: int = 2,
    verbose: bool = False,
):
    """Classify one text by scoring fixed-size token chunks and averaging them.

    The text is encoded once without truncation, the token ids are cut into
    rows of ``chunk_size`` tokens (the last row is padded), and all rows are
    sent through ``model`` as a single batch. The per-chunk logits are averaged
    before the softmax to obtain the overall prediction.

    Args:
        model: Callable accepting ``input_ids`` / ``attention_mask`` keyword
            tensors and returning an object with a ``logits`` attribute of
            shape ``(num_chunks, num_classes)``. It must already live on
            ``device``.
        tokenizer: Tokenizer used to encode ``text`` and to decode the chunks.
        text: The text to classify.
        chunk_size: Maximum number of tokens per chunk. Shrunk to the length of
            the encoded text when the text is shorter.
        device: Device on which the input tensors are created.
        top_k: Maximum number of contributing chunks to report. Fewer are
            returned when the text yields fewer chunks.
        verbose: When true, print the chunked input tensors for debugging.

    Returns:
        A 4-tuple ``(sentiment_class, probabilities, contribute_indices,
        contribute_texts)``: the predicted label, the softmax over the averaged
        logits, the indices of the chunks with the highest probability for the
        predicted class (best first), and those chunks decoded back to text.

    Raises:
        ValueError: If ``chunk_size`` or ``top_k`` is smaller than 1, or if
            ``text`` encodes to zero tokens.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}

    # Encode the whole text in one go; chunking happens on the token ids.
    # NOTE(review): the chunks are slices of this single encoding, so any
    # special tokens added by the tokenizer stay wherever they fall in the flat
    # sequence — confirm that this is acceptable for the model.
    encoded = tokenizer(text, truncation=False, return_tensors="pt", padding=False).to(device)
    input_ids = encoded["input_ids"].squeeze(0)
    attention_mask = encoded["attention_mask"].squeeze(0)

    num_tokens = len(input_ids)
    if num_tokens == 0:
        raise ValueError("text produced no tokens; nothing to classify")

    # A text shorter than one chunk becomes a single, unpadded chunk.
    chunk_size = min(chunk_size, num_tokens)
    num_chunks = (num_tokens + chunk_size - 1) // chunk_size  # ceiling division

    # Pad with the tokenizer's own pad id; fall back to 0 when it has none.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = 0

    # Fill flat, pre-padded buffers and view them as (num_chunks, chunk_size):
    # only the tail of the last row remains padding with a zero attention mask.
    padded_length = num_chunks * chunk_size
    flat_ids = torch.full((padded_length,), pad_token_id, dtype=torch.long, device=device)
    flat_mask = torch.zeros(padded_length, dtype=torch.long, device=device)
    flat_ids[:num_tokens] = input_ids
    flat_mask[:num_tokens] = attention_mask
    chunk_inputs = {
        "input_ids": flat_ids.view(num_chunks, chunk_size),
        "attention_mask": flat_mask.view(num_chunks, chunk_size),
    }

    if verbose:
        print(chunk_inputs)

    with torch.no_grad():
        outputs = model(**chunk_inputs)
    chunk_logits = outputs.logits

    # Average the logits before the softmax to aggregate across chunks.
    avg_logits = chunk_logits.mean(dim=0)
    probabilities = torch.nn.functional.softmax(avg_logits, dim=-1)
    sentiment_index = probabilities.argmax().item()
    sentiment_class = sentiment_map[sentiment_index]

    # Rank the chunks by their own probability for the predicted class.
    # topk raises when k exceeds the number of rows, so clamp it.
    chunk_probs = torch.nn.functional.softmax(chunk_logits, dim=-1)[:, sentiment_index]
    contribute_indices = chunk_probs.topk(min(top_k, num_chunks)).indices.tolist()

    # Decode the winning chunks from the padded tensor.
    contribute_texts = [
        tokenizer.decode(chunk_inputs["input_ids"][idx].tolist(), skip_special_tokens=True)
        for idx in contribute_indices
    ]

    return sentiment_class, probabilities, contribute_indices, contribute_texts


In [None]:
# `tokenizers` is bound to a single tokenizer in the model-loading cell, so it
# is called directly; indexing it with [0] fails. `max_length` is omitted
# because it has no effect without a truncation strategy when padding=True
# pads to the longest sequence in the batch.
tokenizers(
    ["hi how are you", "i am fine", "thanks", "bye"],
    truncation=False,
    padding=True,
    return_tensors="pt",
)

In [None]:
# Encode a short repeated sample. `tokenizers` is a single tokenizer (see the
# model-loading cell), so it is called directly instead of being indexed.
inputs = tokenizers("hi hi hi hi hi hi", truncation=False, return_tensors="pt").to(device)

# Drop the leading batch dimension so the tensors can be sliced per token.
chunk_input_ids = inputs["input_ids"].squeeze(0)
chunk_attention_mask = inputs["attention_mask"].squeeze(0)

# ones_like inherits the device of its argument, so no extra move is needed.
torch.ones_like(chunk_input_ids), chunk_attention_mask

In [None]:
# Preview the i-th window (0-based) of `chunk_size` tokens from both tensors.
chunk_size = 3
i = 1
tuple(
    tensor[i * chunk_size: (i + 1) * chunk_size]
    for tensor in (chunk_input_ids, chunk_attention_mask)
)

In [5]:
# Load the records from the CSV export into `df`.
df = pd.read_csv(filepath_or_buffer="data/data-1735829992.csv")

In [6]:
# Pick the shortest entry of the `text` column with a single linear scan
# instead of sorting the whole frame. Missing values are skipped and the first
# occurrence wins when several entries share the minimum length.
text = df["text"].iloc[df["text"].str.len().argmin()]

In [None]:
# Tokenize the first two texts as one padded, truncated batch. `tokenizers` is
# a single tokenizer (see the model-loading cell), so it is called directly
# instead of being indexed.
inputs = tokenizers(
    df.text.iloc[:2].to_list(),
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)
inputs.keys()

In [None]:
# Run the chunking batch helper over the first 512 texts.
# NOTE(review): run_batch_with_chunking is not defined in the visible part of
# this notebook — confirm where it comes from.
run_batch_with_chunking(
    models,
    tokenizers,
    df["text"].iloc[:512].tolist(),
)

In [None]:
# Display `x`. In the visible cells it is only assigned by unpacking the result
# of predict_sentiment_single_record, where it receives the second element.
x

In [None]:
# Run the batch helper on rows 4..13 with two texts per batch.
# NOTE(review): run_batch is not defined in the visible part of this notebook.
run_batch(
    models,
    tokenizers,
    df["text"].iloc[4:14].tolist(),
    batch_size=2,
)

In [None]:
# Run the batch helper on the first 500 texts; 64 is passed positionally as the
# fourth argument.
# NOTE(review): run_batch is not defined in the visible part of this notebook,
# so the meaning of the fourth positional argument cannot be confirmed here.
run_batch(
    models,
    tokenizers,
    df["text"].iloc[:500].tolist(),
    64,
)

In [None]:
# Score one clearly positive and one clearly negative sentence.
# NOTE(review): predict_sentiment is not defined in the visible part of this
# notebook — confirm where it comes from.
predict_sentiment(
    models,
    tokenizers,
    [
        "Hi, I am a very happy person",
        "I am a very sad person",
    ],
)