In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install torch
!pip install accelerate


In [None]:
import numpy as np
import pandas as pd
# pyplot is the plotting interface conventionally aliased as `plt`; aliasing the
# top-level `matplotlib` package instead would make calls such as `plt.plot` fail.
import matplotlib.pyplot as plt


# Read the Input CSV
Then use `info()` and `describe()` to inspect the dataset: column types, the number of non-null rows, and summary statistics.

In [None]:
# Load the competition's train.csv into a DataFrame.
df = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")

# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())

In [None]:
# Count how many rows carry each value of the 'sentiment' column.
df["sentiment"].value_counts()

# Drop rows that have NaN values
If the 'text' column is NaN for a row, we drop that row.

In [None]:
# Keep only the rows whose 'text' value is present.
df = df.dropna(subset=['text'])

# info() prints directly and returns None, so it is not wrapped in print()
# (that would add a stray "None" line to the output).
df.info()
print(df.describe())


# Convert Pandas Dataframe to HuggingFace Dataset

In [None]:
from datasets import Dataset

# If rows were dropped earlier, the DataFrame index is no longer a plain range.
# preserve_index=False stops that leftover index from being stored as an extra
# '__index_level_0__' column in the Dataset.
dataset = Dataset.from_pandas(df, preserve_index=False)
print(dataset)

# Split the Dataset into train and test

In [None]:
# Hold out 20% of the rows for evaluation. A fixed seed makes the shuffle, and
# therefore the train/test membership, identical on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Show the columns and row count of each split.
for split_name in ("train", "test"):
    print(dataset[split_name])

# Import and Load the Tokenizer

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name that the model cell loads.
checkpoint_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Create mappings from label to ids and ids to label

In [None]:
# Sentiment names in id order: the position of each name is its integer id.
_SENTIMENT_NAMES = ("negative", "neutral", "positive")

# name -> id, used when encoding the dataset's 'sentiment' column.
label2id = {name: idx for idx, name in enumerate(_SENTIMENT_NAMES)}

# id -> name, the inverse mapping that is handed to the model config.
id2label = {idx: name for name, idx in label2id.items()}


* Map each row's sentiment string to its respective integer id.
* 'example' is each row from the dataset.
* We create a 'labels' column and add the mapped ids in that column.
* The .map() method calls the function passed as its argument (encode_labels() in this case) for every row in the dataset.




In [None]:
def encode_labels(example):
    """Attach the integer class id to one row.

    Reads the row's 'sentiment' string, looks it up in ``label2id`` and stores
    the result under the 'labels' key of that same row, which is then returned.
    """
    sentiment_name = example["sentiment"]
    example["labels"] = label2id[sentiment_name]
    return example


# Run the encoder over every row of the dataset.
dataset = dataset.map(function=encode_labels)

In [None]:
print(dataset)

# Printing the whole 'labels' column would emit one integer per training row;
# the first few rows are enough to confirm the encoding worked.
print(dataset["train"][:10]["labels"])

# Tokenize the Dataset
* 'batch' is a dictionary of lists: {"text": ["Tweet1", "Tweet2", ...], "labels": [1, 2, 0, ...]}
* Therefore batch["text"] is a list of strings: ["Tweet1", "Tweet2", ...]
* The tokenizer returns a dictionary: {"input_ids": [...], "attention_mask": [...]}
* input_ids -> each number maps to a token in the input text
* attention_mask ([1,1,1,0,0,0]) -> the model uses this to ignore padding tokens, etc.


  

In [None]:
def tokenize_function(batch):
    """Tokenize the 'text' entries of a batch.

    Every sequence is truncated and padded to exactly 128 tokens, and the
    tokenizer's output is returned as-is.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

In [None]:
# Tokenize the dataset, handing the tokenizer whole batches of rows at a time.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)


It is important to set the format of the dataset to "torch" so that it returns PyTorch tensors, or else it will cause problems during training.

In [None]:
# Switch the dataset's output format to "torch" (changes the dataset in place).
tokenized_dataset.set_format(type="torch")

# Import and Load the model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint for sequence classification. The number of
# output classes follows the label mapping, and both mappings are passed along
# to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label2id),
)

# Set the training arguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch, so every evaluation has a
    # matching checkpoint that can be reloaded at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=100,
    # Reload the best checkpoint when training ends. Without an explicit metric
    # the "best" checkpoint is chosen by evaluation loss; this notebook scores
    # the model with F1, so the checkpoint is selected by that instead.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)


# Set the Evaluation Metrics
* logits -> the raw, unnormalized scores the model outputs for each class
* prediction -> the class with the highest logit for each sample (argmax); softmax is not needed because it does not change which class scores highest
* metric.compute() -> computes the F1 score over all evaluated samples
* F1 score -> we calculate the F1 score for each class and then take the weighted average

In [None]:
import evaluate

# F1 scorer from the `evaluate` library, loaded once and reused by compute_metrics.
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Score one evaluation pass with weighted F1.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each sample is the position of its largest logit along axis 1.
    """
    raw_scores, true_labels = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=1)
    return metric.compute(
        references=true_labels,
        predictions=predicted_ids,
        average="weighted",
    )


# Start the training

In [None]:
from transformers import Trainer

# Name the two splits up front so the Trainer call reads at a glance.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]

# NOTE(review): newer transformers releases may expect `processing_class=` in
# place of `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    eval_dataset=eval_split,
    train_dataset=train_split,
)


In [None]:
# Run fine-tuning with the model, datasets and arguments given to the Trainer.
trainer.train()

In [None]:
# Evaluate the trainer's current model on the eval_dataset it was built with.
trainer.evaluate()

# Save the Trained Model

In [None]:
# Write the model and its tokenizer into the same directory so both can be
# reloaded from a single path.
export_dir = "/kaggle/working/xlm_roberta_tone_model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
# Bundle the saved model directory into a single zip archive.
# NOTE(review): both paths are relative, so this presumably relies on the
# working directory being /kaggle/working (where the model was saved) — confirm.
!zip -r xlm_roberta_tone_model.zip xlm_roberta_tone_model

# Load the saved model for inference

In [1]:
# Imports for the inference section. pandas and Dataset are included because
# the submission cell further down uses `pd` and `Dataset`, so this section
# does not depend on the training cells having been run in the same kernel.
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Directory holding the fine-tuned model and tokenizer files.
MODEL_PATH="/kaggle/input/xlm-roberta-tweetdetection/pytorch/default/1/xlm_roberta_tone_model"

saved_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
saved_model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# Switch to evaluation mode (this disables dropout) before running predictions.
saved_model.eval()

2025-12-27 08:23:14.025579: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766823794.354662      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766823794.450445      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766823795.240265      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766823795.240336      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766823795.240339      55 computation_placer.cc:177] computation placer alr

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [5]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model onto the chosen device.
saved_model.to(device)


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [6]:
def predict_sentiment(text):
    """Return the sentiment label the saved model predicts for ``text``.

    The text is tokenized (truncated to at most 128 tokens), moved to
    ``device`` and run through ``saved_model`` with gradients disabled. The
    index of the largest logit is translated to its label name through the
    model config's ``id2label`` mapping.
    """
    encoded = saved_tokenizer(
        text,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Every input tensor has to live on the same device as the model.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = saved_model(**on_device)

    # .item() converts the one-element result into a plain Python int.
    best_class = outputs.logits.argmax(dim=1).item()
    return saved_model.config.id2label[best_class]


# Use the below cell to manually check the output

In [7]:
# Spot-check the loaded model on a single hand-written sentence.
predict_sentiment("I am not coming there")


'neutral'

# Generate a submission.csv file for the Competition submission

In [47]:
# Read the competition's test.csv and wrap it in a HuggingFace Dataset.
# Note: `df` is rebound here from the training data to the test data.
test_csv_path = "/kaggle/input/tweet-sentiment-extraction/test.csv"
df = pd.read_csv(test_csv_path)
test_dataset = Dataset.from_pandas(df)
print(df)

          textID                                               text sentiment
0     f87dea47db  Last session of the day  http://twitpic.com/67ezh   neutral
1     96d74cb729   Shanghai is also really exciting (precisely -...  positive
2     eee518ae67  Recession hit Veronique Branquinho, she has to...  negative
3     01082688c6                                        happy bday!  positive
4     33987a8ee5             http://twitpic.com/4w75p - I like it!!  positive
...          ...                                                ...       ...
3529  e5f0e6ef4b  its at 3 am, im very tired but i can`t sleep  ...  negative
3530  416863ce47  All alone in this old house again.  Thanks for...  positive
3531  6332da480c   I know what you mean. My little dog is sinkin...  negative
3532  df1baec676  _sutra what is your next youtube video gonna b...  positive
3533  469e15c5a8   http://twitpic.com/4woj2 - omgssh  ang cute n...  positive

[3534 rows x 3 columns]
