<a href="https://colab.research.google.com/github/RanulRathnayake/Final-Year-Research-/blob/main/XML_RoBERTa(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch scikit-learn pandas


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaTokenizer, XLMRobertaModel, Trainer, TrainingArguments
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the FAQ utterances together with their entity / intent annotations.
faq_csv_path = '/content/drive/MyDrive/DataSets/FAQs dataset.csv'
df = pd.read_csv(faq_csv_path)

# Preview the first rows and the header. Note that the CSV header yields the
# column names ' entity' and ' intent' WITH a leading space.
for preview in (df.head(), df.columns):
    print(preview)


                                              text             entity  \
0               Mata tax return file karanna oney.  income_tax_filing   
1            Mata tax return submit karanna oneda?  income_tax_filing   
2      Income tax return danna widiyak kiyanawada?  income_tax_filing   
3             Tax return submit karanna puluwanda?  income_tax_filing   
4  Mata income tax return ekak file karanna oneda?  income_tax_filing   

                    intent  
0          file_tax_return  
1        submit_tax_return  
2   how_to_file_tax_return  
3    can_submit_tax_return  
4  need_to_file_tax_return  
Index(['text', ' entity', ' intent'], dtype='object')


In [None]:
# One LabelEncoder per task so each integer id can later be mapped back to its
# original string label through the matching encoder.
entity_encoder = LabelEncoder()
intent_encoder = LabelEncoder()

# The source column names keep the leading space present in the CSV header.
for encoder, source_col, target_col in (
    (entity_encoder, ' entity', 'entity_label'),
    (intent_encoder, ' intent', 'intent_label'),
):
    df[target_col] = encoder.fit_transform(df[source_col])

In [None]:
# Shuffle once with a fixed seed, then carve out a 20% validation split.
# The split is taken from `df_shuffled` (previously the shuffled frame was
# computed but the split silently read from the unshuffled `df`).
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
train_texts, val_texts, train_entity_labels, val_entity_labels, train_intent_labels, val_intent_labels = train_test_split(
    df_shuffled['text'].tolist(),
    df_shuffled['entity_label'].tolist(),
    df_shuffled['intent_label'].tolist(),
    test_size=0.2,
    random_state=42
)

# Sanity checks: no row lost, and texts / labels stay aligned within each split.
assert len(train_texts) + len(val_texts) == len(df_shuffled)
assert len(train_texts) == len(train_entity_labels) == len(train_intent_labels)
assert len(val_texts) == len(val_entity_labels) == len(val_intent_labels)

In [None]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize each split as a single batch, with truncation and padding enabled
# and identical options for both splits.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts)
)


In [None]:
class EntityIntentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with two label lists.

    Each item is a dict holding one tensor per encoding field plus the
    'entity_labels' and 'intent_labels' tensors for the same index.
    """

    def __init__(self, encodings, entity_labels, intent_labels):
        """Store the encodings mapping and the two index-aligned label sequences."""
        self.encodings = encodings
        self.entity_labels = entity_labels
        self.intent_labels = intent_labels

    def __len__(self):
        """Number of examples, taken from the entity label sequence."""
        return len(self.entity_labels)

    def __getitem__(self, idx):
        """Build the tensor dict for example `idx`."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['entity_labels'] = torch.tensor(self.entity_labels[idx])
        sample['intent_labels'] = torch.tensor(self.intent_labels[idx])
        return sample

In [None]:
# Wrap each split's encodings and labels in the torch Dataset handed to Trainer.
train_dataset = EntityIntentDataset(
    encodings=train_encodings,
    entity_labels=train_entity_labels,
    intent_labels=train_intent_labels,
)
val_dataset = EntityIntentDataset(
    encodings=val_encodings,
    entity_labels=val_entity_labels,
    intent_labels=val_intent_labels,
)


In [None]:
class XLMRobertaForMultiTaskClassification(torch.nn.Module):
    """XLM-RoBERTa encoder with two sentence-level classification heads.

    One linear head predicts the entity label and the other the intent label.
    Both heads read the encoder's pooled output, so each produces exactly one
    row of logits per input sequence, matching the one-label-per-sentence
    targets.
    """

    def __init__(self, model_name, num_entity_labels, num_intent_labels):
        """Load the pretrained encoder and create the two linear heads.

        Args:
            model_name: Identifier passed to `XLMRobertaModel.from_pretrained`.
            num_entity_labels: Output size of the entity head.
            num_intent_labels: Output size of the intent head.
        """
        super().__init__()
        self.roberta = XLMRobertaModel.from_pretrained(model_name)
        hidden_size = self.roberta.config.hidden_size
        self.entity_classifier = torch.nn.Linear(hidden_size, num_entity_labels)
        self.intent_classifier = torch.nn.Linear(hidden_size, num_intent_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, entity_labels=None, intent_labels=None):
        """Run the encoder and both heads, optionally computing the joint loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.
            entity_labels: Optional class indices, one per sequence.
            intent_labels: Optional class indices, one per sequence.

        Returns:
            Tuple `(loss, entity_logits, intent_logits)`. `loss` is the sum of
            the cross-entropy losses of the heads whose labels were supplied,
            or the integer 0 when no labels are given. Each logits tensor has
            one row per input sequence.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # outputs[1] is the encoder's pooled (per-sequence) representation.
        # The per-token output in outputs[0] must NOT feed these heads: it
        # yields batch*seq_len rows of logits against only batch labels, which
        # is what raised "Expected input batch_size (248) to match target
        # batch_size (8)".
        pooled_output = outputs[1]

        entity_logits = self.entity_classifier(pooled_output)
        intent_logits = self.intent_classifier(pooled_output)

        loss = 0
        loss_fct = torch.nn.CrossEntropyLoss()
        if entity_labels is not None:
            # view(-1) flattens the labels to 1-D so they pair with the logit rows.
            loss = loss + loss_fct(entity_logits, entity_labels.view(-1))
        if intent_labels is not None:
            loss = loss + loss_fct(intent_logits, intent_labels.view(-1))

        return loss, entity_logits, intent_logits

# Size each head from the number of distinct classes its LabelEncoder learned.
model = XLMRobertaForMultiTaskClassification(
    model_name='xlm-roberta-base',
    num_entity_labels=len(entity_encoder.classes_),
    num_intent_labels=len(intent_encoder.classes_),
)

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate on the validation set once per epoch.
    evaluation_strategy="epoch",
)

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)



In [None]:
# Start fine-tuning with the Trainer configured above (3 epochs, evaluation
# after each epoch per `training_args`).
trainer.train()

ValueError: Expected input batch_size (248) to match target batch_size (8).