In [None]:
!pip install transformers datasets torch scikit-learn pandas



In [None]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaTokenizer, XLMRobertaModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the FAQ dataset from Drive into a DataFrame.
FAQ_CSV_PATH = '/content/drive/MyDrive/DataSets/FAQs dataset.csv'
df = pd.read_csv(FAQ_CSV_PATH)

# Preview the first rows followed by the raw column index.
print(df.head(), df.columns, sep='\n')


                                              text             entity  \
0               Mata tax return file karanna oney.  income_tax_filing   
1            Mata tax return submit karanna oneda?  income_tax_filing   
2      Income tax return danna widiyak kiyanawada?  income_tax_filing   
3             Tax return submit karanna puluwanda?  income_tax_filing   
4  Mata income tax return ekak file karanna oneda?  income_tax_filing   

                    intent  
0          file_tax_return  
1        submit_tax_return  
2   how_to_file_tax_return  
3    can_submit_tax_return  
4  need_to_file_tax_return  
Index(['text', ' entity', ' intent'], dtype='object')


In [None]:
# The CSV header carries stray spaces (' entity', ' intent'), so normalise the
# column names before using them.
df.columns = df.columns.str.strip()

# Trim the label strings as well: otherwise ' x' and 'x' would be encoded as
# two distinct classes. `.str.strip()` leaves missing values untouched.
for label_column in ('entity', 'intent'):
    df[label_column] = df[label_column].str.strip()

# One encoder per task; each maps its label strings to integer class ids.
entity_encoder = LabelEncoder()
intent_encoder = LabelEncoder()

df['entity_label'] = entity_encoder.fit_transform(df['entity'])
df['intent_label'] = intent_encoder.fit_transform(df['intent'])

# Reproducible row shuffle (fixed seed) with a fresh 0..n-1 index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# 80/20 train/validation split. The three lists are split with the same row
# partition, and a (train, val) pair is returned for each list in order.
train_texts, val_texts, train_entity_labels, val_entity_labels, train_intent_labels, val_intent_labels = train_test_split(
    df['text'].tolist(),
    df['entity_label'].tolist(),
    df['intent_label'].tolist(),
    test_size=0.2,
    random_state=42
)

In [None]:
# Tokenizer belonging to the 'xlm-roberta-base' checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Both splits are encoded with identical settings: truncate to at most
# 128 tokens and pad.
tokenize_options = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
class EntityIntentDataset(Dataset):
    """Dataset pairing tokenizer encodings with entity and intent labels.

    Args:
        encodings: Mapping from field name to a per-example sequence
            (as produced by the tokenizer call in this notebook).
        entity_labels: Integer entity class id for each example.
        intent_labels: Integer intent class id for each example.
    """

    def __init__(self, encodings, entity_labels, intent_labels):
        self.encodings = encodings
        self.entity_labels = entity_labels
        self.intent_labels = intent_labels

    def __len__(self):
        """Number of examples, taken from the entity label list."""
        return len(self.entity_labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds every encoding field followed by ``entity_labels``
        and ``intent_labels`` (both ``torch.long`` scalars).
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['entity_labels'] = torch.tensor(self.entity_labels[idx], dtype=torch.long)
        sample['intent_labels'] = torch.tensor(self.intent_labels[idx], dtype=torch.long)
        return sample

# Wrap each split's encodings together with its entity and intent label lists.
train_dataset = EntityIntentDataset(train_encodings, train_entity_labels, train_intent_labels)
val_dataset = EntityIntentDataset(val_encodings, val_entity_labels, val_intent_labels)


In [None]:
class XLMRobertaForMultiTaskClassification(nn.Module):
    """XLM-RoBERTa encoder shared by two linear classification heads.

    The encoder's last hidden state at sequence position 0 is fed to an
    entity head and an intent head. When both label tensors are supplied,
    the loss is the sum of the two cross-entropy losses.

    Args:
        model_name: Checkpoint name or path passed to
            ``XLMRobertaModel.from_pretrained``.
        num_entity_labels: Output size of the entity head.
        num_intent_labels: Output size of the intent head.
    """

    def __init__(self, model_name, num_entity_labels, num_intent_labels):
        super().__init__()
        self.roberta = XLMRobertaModel.from_pretrained(model_name)
        hidden_size = self.roberta.config.hidden_size
        self.entity_classifier = nn.Linear(hidden_size, num_entity_labels)
        self.intent_classifier = nn.Linear(hidden_size, num_intent_labels)

    def forward(self, input_ids, attention_mask=None, entity_labels=None, intent_labels=None):
        """Run the encoder and both heads.

        Returns:
            dict with keys ``'loss'`` (``None`` unless both ``entity_labels``
            and ``intent_labels`` are given), ``'entity_logits'`` and
            ``'intent_logits'``.
        """
        encoder_out = self.roberta(input_ids, attention_mask=attention_mask)
        # Use the hidden state of the first sequence position as the summary.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]

        result = {
            'loss': None,
            'entity_logits': self.entity_classifier(first_token_state),
            'intent_logits': self.intent_classifier(first_token_state),
        }

        # Without both label sets there is nothing to score.
        if entity_labels is None or intent_labels is None:
            return result

        criterion = nn.CrossEntropyLoss()
        entity_loss = criterion(result['entity_logits'], entity_labels)
        intent_loss = criterion(result['intent_logits'], intent_labels)
        result['loss'] = entity_loss + intent_loss
        return result

# Instantiate the multi-task model; each head is sized by the number of
# classes its LabelEncoder has learned.
model = XLMRobertaForMultiTaskClassification(
    'xlm-roberta-base',
    num_entity_labels=len(entity_encoder.classes_),
    num_intent_labels=len(intent_encoder.classes_)
)

In [None]:
# Force a clean reinstall of transformers at version 4.50.1.
# NOTE(review): `transformers` was already imported by an earlier cell;
# presumably the runtime must be restarted before a changed version is
# picked up — confirm.
!pip uninstall -y transformers
!pip install transformers==4.50.1

Found existing installation: transformers 4.50.1
Uninstalling transformers-4.50.1:
  Successfully uninstalled transformers-4.50.1
Collecting transformers==4.50.1
  Using cached transformers-4.50.1-py3-none-any.whl.metadata (39 kB)
Using cached transformers-4.50.1-py3-none-any.whl (10.2 MB)
Installing collected packages: transformers
Successfully installed transformers-4.50.1


In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: 3 epochs, batch size 8, with evaluation and a
# checkpoint at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Warm up over the first 10% of the optimisation steps. A fixed 500-step
    # warmup would span most of this run (about 612 steps in total), leaving
    # the learning rate still ramping up for the bulk of training.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Disable external experiment trackers.
    report_to="none"
)

# The Trainer feeds each dataset item (a dict of tensors) to model.forward
# and optimises the 'loss' entry of the returned dict.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)



In [None]:
# Run fine-tuning. As the last expression of the cell, the returned
# TrainOutput is displayed below the per-epoch loss table.
trainer.train()

Epoch,Training Loss,Validation Loss
1,9.0539,8.351834
2,5.3432,4.953611
3,2.2678,3.270554


TrainOutput(global_step=612, training_loss=6.9832531953948775, metrics={'train_runtime': 5504.8883, 'train_samples_per_second': 0.888, 'train_steps_per_second': 0.111, 'total_flos': 0.0, 'train_loss': 6.9832531953948775, 'epoch': 3.0})

In [None]:
# Index <-> name lookup tables for the entity task, in encoder class order.
entity_classes = list(entity_encoder.classes_)
id2label_entity = dict(enumerate(entity_classes))
label2id_entity = {name: idx for idx, name in id2label_entity.items()}

# Same lookup tables for the intent task.
intent_classes = list(intent_encoder.classes_)
id2label_intent = dict(enumerate(intent_classes))
label2id_intent = {name: idx for idx, name in id2label_intent.items()}

# Everything needed to rebuild the heads and decode their predictions.
config = dict(
    id2label_entity=id2label_entity,
    label2id_entity=label2id_entity,
    id2label_intent=id2label_intent,
    label2id_intent=label2id_intent,
    num_entity_labels=len(entity_encoder.classes_),
    num_intent_labels=len(intent_encoder.classes_),
)


In [None]:
import os
import pickle
import torch
import json

# Destination folder on Drive for every artefact written by this cell.
model_path = "/content/drive/MyDrive/fine_tuned_xlm_roberta"
os.makedirs(model_path, exist_ok=True)

# Index <-> name lookup tables for both tasks, in encoder class order.
entity_classes = list(entity_encoder.classes_)
id2label_entity = dict(enumerate(entity_classes))
label2id_entity = {name: idx for idx, name in id2label_entity.items()}

intent_classes = list(intent_encoder.classes_)
id2label_intent = dict(enumerate(intent_classes))
label2id_intent = {name: idx for idx, name in id2label_intent.items()}

# Model weights: only the state_dict is stored, not the module object.
weights_file = os.path.join(model_path, "pytorch_model.bin")
torch.save(model.state_dict(), weights_file)

# Label metadata as JSON (the integer keys of the id2label maps are written
# as strings, as JSON object keys always are).
config = dict(
    id2label_entity=id2label_entity,
    label2id_entity=label2id_entity,
    id2label_intent=id2label_intent,
    label2id_intent=label2id_intent,
    num_entity_labels=len(entity_encoder.classes_),
    num_intent_labels=len(intent_encoder.classes_),
)
config_file = os.path.join(model_path, "config.json")
with open(config_file, "w") as f:
    json.dump(config, f)

# Tokenizer files go into the same folder.
tokenizer.save_pretrained(model_path)

# Pickle both fitted label encoders so predictions can be decoded later.
for encoder_file, encoder in (
    ("entity_encoder.pkl", entity_encoder),
    ("intent_encoder.pkl", intent_encoder),
):
    with open(os.path.join(model_path, encoder_file), "wb") as f:
        pickle.dump(encoder, f)

print(f"Custom model, config, tokenizer, and encoders saved to {model_path}")


Custom model, config, tokenizer, and encoders saved to /content/drive/MyDrive/fine_tuned_xlm_roberta


In [None]:
# scikit-learn is already covered by the first install cell; this repeats the
# install before the metrics imports below.
!pip install scikit-learn



In [None]:
import torch
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score
from torch.utils.data import DataLoader

In [None]:
def _print_task_metrics(task_name, labels, preds):
    """Print accuracy and weighted precision/recall/F1 for one task."""
    # zero_division=0 keeps the same scores as the default (classes that are
    # never predicted contribute 0) but without the per-call warnings.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    accuracy = accuracy_score(labels, preds)

    print(f"\n **{task_name} Classification Metrics:**")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")


def evaluate_model(model, dataloader, entity_encoder, intent_encoder, device):
    """Print entity and intent classification metrics over a dataloader.

    Args:
        model: Callable returning a dict with 'entity_logits' and
            'intent_logits' when given ``input_ids`` and ``attention_mask``.
        dataloader: Iterable of batches; each batch is indexed by the keys
            'input_ids', 'attention_mask', 'entity_labels', 'intent_labels'.
        entity_encoder: Not used by the computation; kept so existing calls
            keep working.
        intent_encoder: Not used by the computation; kept so existing calls
            keep working.
        device: Device the model inputs are moved to.

    Returns:
        None. Accuracy and weighted precision, recall and F1 are printed
        for each task.
    """
    model.eval()
    all_entity_preds, all_entity_labels = [], []
    all_intent_preds, all_intent_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            # Only the model inputs need to be on the device; labels are
            # compared on the host afterwards.
            inputs = {key: batch[key].to(device) for key in ("input_ids", "attention_mask")}
            outputs = model(**inputs)

            all_entity_preds.extend(outputs['entity_logits'].argmax(dim=1).tolist())
            all_intent_preds.extend(outputs['intent_logits'].argmax(dim=1).tolist())
            all_entity_labels.extend(batch["entity_labels"].tolist())
            all_intent_labels.extend(batch["intent_labels"].tolist())

    _print_task_metrics("Entity", all_entity_labels, all_entity_preds)
    _print_task_metrics("Intent", all_intent_labels, all_intent_preds)
    print("\n ")

In [None]:
# Batches of 8 over the validation split, in fixed order (shuffle=False).
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [None]:
# Use CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# As the last expression of the cell, this also echoes the module structure.
model.to(device)

XLMRobertaForMultiTaskClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features

In [None]:
# Print validation metrics for both classification heads.
evaluate_model(
    model,
    val_dataloader,
    entity_encoder,
    intent_encoder, device
    )


 **Entity Classification Metrics:**
Accuracy: 0.8137
Precision: 0.7883
Recall: 0.8137
F1-score: 0.7818

 **Intent Classification Metrics:**
Accuracy: 0.6912
Precision: 0.5554
Recall: 0.6912
F1-score: 0.6070

 


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Test the trained XLM-RoBERTa model on a given text input**

In [None]:
def predict(text, model, tokenizer, entity_encoder, intent_encoder, device):
    """Predict the entity and intent labels for one input text.

    Args:
        text: Input passed to the tokenizer.
        model: Callable returning a dict with 'entity_logits' and
            'intent_logits'; it is switched to eval mode first.
        tokenizer: Callable producing a mapping of tensors for ``text``.
        entity_encoder: Object whose ``inverse_transform`` maps entity class
            ids back to label names.
        intent_encoder: Same, for intent class ids.
        device: Device the tokenized inputs are moved to.

    Returns:
        Tuple ``(entity_label, intent_label)``.
    """
    model.eval()

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)

    with torch.no_grad():
        logits = model(**batch)

    # .item() requires exactly one prediction per head.
    entity_idx = logits['entity_logits'].argmax(dim=1).item()
    intent_idx = logits['intent_logits'].argmax(dim=1).item()

    decoded_entity = entity_encoder.inverse_transform([entity_idx])
    decoded_intent = intent_encoder.inverse_transform([intent_idx])
    return decoded_entity[0], decoded_intent[0]

In [None]:
import pickle

# Reload every artefact from the directory the save cell writes to, so the
# notebook reproduces from a fresh kernel. (This cell used to read from
# ".../DataSets/fine_tuned_xlm_roberta", a folder nothing in this notebook
# writes.)
model_path = "/content/drive/MyDrive/fine_tuned_xlm_roberta"

# Restore the label encoders first: they determine the size of each head.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# pickles that this notebook itself produced.
with open(os.path.join(model_path, "entity_encoder.pkl"), "rb") as f:
    entity_encoder = pickle.load(f)
with open(os.path.join(model_path, "intent_encoder.pkl"), "rb") as f:
    intent_encoder = pickle.load(f)

tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)

# Rebuild the architecture, then load the fine-tuned weights onto the CPU
# before moving the model to the chosen device.
model = XLMRobertaForMultiTaskClassification(
    'xlm-roberta-base',
    num_entity_labels=len(entity_encoder.classes_),
    num_intent_labels=len(intent_encoder.classes_)
)
state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location=torch.device("cpu"))
model.load_state_dict(state_dict)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaForMultiTaskClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features

In [None]:
# Smoke-test the reloaded model on a single question and print both labels.
text = "IRD email address eka mokakda?"
entity, intent = predict(text, model, tokenizer, entity_encoder, intent_encoder, device)

print(f" **Text:** {text}")
print(f" **Predicted Entity:** {entity}")
print(f" **Predicted Intent:** {intent}")


 **Text:** IRD email address eka mokakda?
 **Predicted Entity:** Contact_Information_for_IRD_Offices
 **Predicted Intent:** get_ird_email
