# NLP Project

# Imports

In [1]:
from transformers import pipeline
import torch
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset




# Data Preprocessing

In [15]:
# Read the translated health transcripts, one sentence per line.
# (The "-xh" file suffix and the sample rows suggest the text is Xhosa, not English.)
#
# Plain line reading is used instead of pd.read_fwf: read_fwf is a fixed-width
# parser that infers column boundaries from the first rows, so free-text lines
# can be truncated or split, and numeric/NA-like lines get coerced to non-strings.
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
# "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM if one is present.
with open("data/TranslatedForMeMaT/health_transcripts1-xh.txt", encoding="utf-8-sig") as transcript_file:
    # Strip surrounding whitespace and skip blank lines, as read_fwf did by default.
    sentences = [line.strip() for line in transcript_file if line.strip()]

# Single-column frame; downstream cells expect the column to be named 'text'.
df = pd.DataFrame({"text": sentences})

# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)


In [16]:
# Preview up to the first 30 rows of the loaded transcripts as a sanity check
df.head(30)

Unnamed: 0,text
0,Ingaba zikhona izinto ezinokwenziwa ukuthomala...
1,Kubalulekile ukuzilawula izinto ezingumngciphe...
2,Abantu abanesifo seswekile basemngciphekweni o...
3,Ingaba sisenokulungiseka?
4,Wakhe wahlaselwa sisifo sentliziyo?
5,Eyona ndlela isebenzayo yokuthintela isandulel...
6,Xa ikholesteroli embi ekwabizwa ngokuba yiLDL ...
7,Ndinekholesteroli ephezulu kunye noxinzelelo l...
8,Utata wayenesifo sentliziyo kunye neswekile wa...
9,Indlela enditya nendizilolonga ngayo intle.


# Language Model

# General Fine Tuning

In [None]:
# Load the pretrained XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    rows of the resulting dataset have the same length.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)


# Tokenize the dataset in batches (one tokenizer call per batch instead of one
# per row) and drop the raw "text" column, which the model does not consume.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,  # Each token is selected for masking with probability 0.15
)



Map:   0%|          | 0/508 [00:00<?, ? examples/s]

In [None]:
# Check GPU availability and pick the device to train on
print(torch.cuda.is_available())  # True if a CUDA GPU is accessible
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use the GPU if available
    print(torch.cuda.get_device_name(0))  # Name of the first GPU
else:
    device = torch.device("cpu")  # Fall back to the CPU

# Load the pretrained XLM-RoBERTa model with a masked-language-modeling head
model = XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-base").to(device)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Adjust based on GPU memory
    save_steps=10_000,
    save_total_limit=2,
    # The recorded run has 192 optimizer steps in total, so the previous value
    # of 200 never logged an intermediate training loss. Logging every 20 steps
    # makes the loss curve visible while training.
    logging_steps=20,
    prediction_loss_only=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,  # Tokenized dataset
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer side by side so they can be reloaded together
model.save_pretrained("./fine-tuned-xlm-roberta-mlm")
tokenizer.save_pretrained("./fine-tuned-xlm-roberta-mlm")


True
NVIDIA RTX A1000 6GB Laptop GPU


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/192 [00:00<?, ?it/s]

{'train_runtime': 1140.4688, 'train_samples_per_second': 1.336, 'train_steps_per_second': 0.168, 'train_loss': 3.9301487604777017, 'epoch': 3.0}


('./fine-tuned-xlm-roberta-mlm\\tokenizer_config.json',
 './fine-tuned-xlm-roberta-mlm\\special_tokens_map.json',
 './fine-tuned-xlm-roberta-mlm\\sentencepiece.bpe.model',
 './fine-tuned-xlm-roberta-mlm\\added_tokens.json')

# Task Specific Fine Tuning
