In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import torch
import re
import os
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup, MarianMTModel, MarianTokenizer
from collections import defaultdict, Counter
from sklearn.preprocessing import LabelEncoder

# Pick the compute device once for the whole notebook: the GPU when CUDA is
# available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
device
# os.system() runs its command in a separate, short-lived shell, so a variable
# assigned there never reaches this Python process. Write it into os.environ so
# the setting is visible to code running in this kernel.
# NOTE(review): the CUDA runtime presumably reads this flag when it initialises;
# consider moving this line above the first CUDA call — confirm.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

1

In [None]:
# Checkpoint names for the two translation hops (en -> fr, then fr -> en).
first_model_name = 'Helsinki-NLP/opus-mt-en-fr'
second_model_name = 'Helsinki-NLP/opus-mt-fr-en'

# First hop: tokenizer plus pretrained model, moved onto the selected device.
first_model_tkn = MarianTokenizer.from_pretrained(first_model_name)
first_model = MarianMTModel.from_pretrained(first_model_name).to(device)

# Second hop: same recipe with the reverse-direction checkpoint.
second_model_tkn = MarianTokenizer.from_pretrained(second_model_name)
second_model = MarianMTModel.from_pretrained(second_model_name).to(device)

# Last expression of the cell: display the second model's architecture.
second_model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [None]:
# Small demo corpus: four English sentences used to try out the translation
# helpers before they are applied to the real training data.
original_texts = [
    "All of us do not have equal talent. But, all of us have an equal opportunity to develop our talents.",
    "You have to dream before your dreams can come true.",
    "A dream is not that which you see while sleeping, it is something that does not let you sleep.",
    "You should not give up and we should not allow the problem to defeat us."
]

In [None]:
# adding target source
def format_batch_texts(language_code, batch_texts):
    """Prefix every text of a batch with a ``>>language_code<<`` tag.

    Args:
        language_code: Code placed between ``>>`` and ``<<`` (e.g. ``"fr"``).
        batch_texts: Iterable of texts to tag.

    Returns:
        A new list with one ``">>{language_code}<< {text}"`` string per input
        text, in the same order.
    """
    tagged_texts = []
    for sentence in batch_texts:
        tagged_texts.append(f">>{language_code}<< {sentence}")
    return tagged_texts

# Quick check of format_batch_texts: each sentence should come back prefixed with ">>fr<<".
format_batch_texts("fr", original_texts)  # "fr" because the first translation step goes to French

['>>fr<< All of us do not have equal talent. But, all of us have an equal opportunity to develop our talents.',
 '>>fr<< You have to dream before your dreams can come true.',
 '>>fr<< A dream is not that which you see while sleeping, it is something that does not let you sleep.',
 '>>fr<< You should not give up and we should not allow the problem to defeat us.']

In [None]:
def perform_translation(batch_texts, model, tokenizer, language="fr"):
    """Translate a batch of texts with the given model/tokenizer pair.

    Args:
        batch_texts: Iterable of source texts.
        model: Translation model exposing ``generate(**inputs)`` and ``device``.
        tokenizer: Tokenizer matching ``model``; it is called on the tagged
            texts and must provide ``batch_decode``.
        language: Target-language code passed to ``format_batch_texts`` to
            build the ``>>code<<`` prefix.

    Returns:
        A list with one decoded translation per input text. An empty batch
        yields an empty list without calling the tokenizer or the model.
    """
    batch_texts = list(batch_texts)
    if not batch_texts:
        # Nothing to translate; skip tokenisation and generation entirely.
        return []

    # Prepare the text data into the tagged format fed to the model.
    formatted_batch_texts = format_batch_texts(language, batch_texts)

    # Tokenise as one padded batch and move the tensors to wherever the model
    # lives, instead of relying on a notebook-level ``device`` global.
    inputs = tokenizer(formatted_batch_texts, return_tensors="pt", padding=True, truncation=True)
    inputs = inputs.to(model.device)

    # Trace of the tokenised batch shape, kept so existing cell outputs still match.
    print(inputs['input_ids'].shape)
    translated = model.generate(**inputs)

    # Convert the generated token indices back into text in a single call.
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

# First hop of the demo: English -> French with the en-fr model
# (perform_translation's language argument defaults to "fr").
# The result is kept in translated_texts so a later cell can translate it back.
translated_texts = perform_translation(original_texts, first_model, first_model_tkn)

# Show the source sentences followed by their translations.
print('Orignail texts:\n', original_texts)
print('\nTranslated texts:\n', translated_texts)

torch.Size([4, 25])
Orignail texts:
 ['All of us do not have equal talent. But, all of us have an equal opportunity to develop our talents.', 'You have to dream before your dreams can come true.', 'A dream is not that which you see while sleeping, it is something that does not let you sleep.', 'You should not give up and we should not allow the problem to defeat us.']

Translated texts:
 ["Nous n'avons pas tous les mêmes talents, mais nous avons tous les mêmes chances de développer nos talents.", 'Vous devez rêver avant que vos rêves puissent se réaliser.', "Un rêve n'est pas ce que vous voyez en dormant, c'est quelque chose qui ne vous laisse pas dormir.", 'Vous ne devriez pas abandonner et nous ne devrions pas laisser le problème nous vaincre.']


In [None]:
# Second hop of the demo: French -> English with the fr-en model.
# The target-language tag must be "en" here. Relying on the default ("fr")
# would tag the French input with the wrong target, unlike the second hop in
# perform_back_translation_with_augmentation, which passes the original language.
back_translated_texts = perform_translation(translated_texts, second_model, second_model_tkn, language="en")
print(back_translated_texts)

torch.Size([4, 29])
['We do not all have the same talents, but we all have the same opportunities to develop our talents.', 'You have to dream before your dreams can come true.', 'A dream is not what you see when you sleep, it is something that does not let you sleep.', 'You should not give up and we should not let the problem defeat us.']


In [None]:
# Let's add all the process in single function
def perform_back_translation_with_augmentation(batch_texts, original_language="en", temporary_language="fr"):
    """Paraphrase a batch of texts by a round trip through a pivot language.

    The batch is first translated with ``first_model`` / ``first_model_tkn``
    using ``temporary_language`` as the target tag, and that result is then
    translated with ``second_model`` / ``second_model_tkn`` using
    ``original_language`` as the target tag.

    NOTE(review): both model pairs are notebook-level globals and are not
    chosen from the language arguments, so other language codes only change
    the tag, not the checkpoints — confirm this is intended.

    Args:
        batch_texts: Texts to augment.
        original_language: Language code used for the second (return) hop.
        temporary_language: Language code used for the first (pivot) hop.

    Returns:
        The output of ``perform_translation`` for the second hop.
    """
    # Each hop: (model, tokenizer, target-language tag), applied in order.
    hops = (
        (first_model, first_model_tkn, temporary_language),
        (second_model, second_model_tkn, original_language),
    )

    current_batch = batch_texts
    for hop_model, hop_tokenizer, target_language in hops:
        current_batch = perform_translation(current_batch, hop_model, hop_tokenizer, target_language)

    return current_batch

# Run the full round trip (English -> French -> English, using the default
# language arguments) on the demo corpus.
final_augmented = perform_back_translation_with_augmentation(original_texts)

# Show the input sentences followed by their back-translated versions.
print('Input corpus\n', original_texts)
print('\nAugumented texts \n',final_augmented)

torch.Size([4, 25])
torch.Size([4, 29])
Input corpus
 ['All of us do not have equal talent. But, all of us have an equal opportunity to develop our talents.', 'You have to dream before your dreams can come true.', 'A dream is not that which you see while sleeping, it is something that does not let you sleep.', 'You should not give up and we should not allow the problem to defeat us.']

Augumented texts 
 ['We do not all have the same talents, but we all have the same opportunities to develop our talents.', 'You have to dream before your dreams can come true.', 'A dream is not what you see when you sleep, it is something that does not let you sleep.', 'You should not give up and we should not let the problem defeat us.']


In [None]:
# Read the train / test / dev splits (JSON Lines files, in that order) and keep
# only the text and label columns of each.
train_df, test_df, val_df = (
    pd.read_json(path_or_buf=split_path, lines=True)[['string', 'label']]
    for split_path in (r'train.jsonl', r'test.jsonl', r'dev.jsonl')
)

train_df

Unnamed: 0,string,label
0,"However, how frataxin interacts with the Fe-S ...",background
1,"In the study by Hickey et al. (2012), spikes w...",background
2,"The drug also reduces catecholamine secretion,...",background
3,By clustering with lowly aggressive close kin ...,background
4,Ophthalmic symptoms are rare manifestations of...,background
...,...,...
8238,"Importantly, the results of Pascalis et al. (2...",background
8239,"As suggested by Nguena et al, there is a need ...",background
8240,Skeletal muscle is also a primary site of dise...,background
8241,ACTIVATION OF TRANSCRIPTION FACTORS Roles for ...,method


In [None]:
# Back-translate every training sentence in fixed-size batches.
# batch_size is the single knob: the batch count and the slice bounds are both
# derived from it, so changing it here actually changes the batching.
batch_size = 10
# Ceiling division so a final, shorter batch is still processed.
total_batches = (len(train_df) + batch_size - 1) // batch_size
back_translated_sentences = []
for i in range(total_batches):
    print(i)  # progress trace: index of the batch being translated
    # .iloc makes the slice explicitly positional, whatever the index labels are.
    batch = train_df["string"].iloc[i * batch_size: (i + 1) * batch_size]
    back_translated = perform_back_translation_with_augmentation(batch.tolist())
    back_translated_sentences.extend(back_translated)

# Later cells pair these sentences with train_df's labels row by row, so the
# two must line up one-to-one.
assert len(back_translated_sentences) == len(train_df), "expected one back-translation per training row"




# new_data = {
#     "string": train_df["string"][:5].apply(perform_back_translation_with_augmentation),
#     "label": train_df["label"][:5]
# }

# new_data

0
torch.Size([10, 92])
torch.Size([10, 104])
1
torch.Size([10, 69])
torch.Size([10, 73])
2
torch.Size([10, 87])
torch.Size([10, 93])
3
torch.Size([10, 100])
torch.Size([10, 109])
4
torch.Size([10, 105])
torch.Size([10, 109])
5
torch.Size([10, 87])
torch.Size([10, 100])
6
torch.Size([10, 126])
torch.Size([10, 120])
7
torch.Size([10, 82])
torch.Size([10, 96])
8
torch.Size([10, 81])
torch.Size([10, 88])
9
torch.Size([10, 112])
torch.Size([10, 132])
10
torch.Size([10, 92])
torch.Size([10, 93])
11
torch.Size([10, 94])
torch.Size([10, 108])
12
torch.Size([10, 138])
torch.Size([10, 156])
13
torch.Size([10, 96])
torch.Size([10, 96])
14
torch.Size([10, 117])
torch.Size([10, 157])
15
torch.Size([10, 106])
torch.Size([10, 118])
16
torch.Size([10, 82])
torch.Size([10, 99])
17
torch.Size([10, 97])
torch.Size([10, 112])
18
torch.Size([10, 76])
torch.Size([10, 92])
19
torch.Size([10, 120])
torch.Size([10, 129])
20
torch.Size([10, 137])
torch.Size([10, 97])
21
torch.Size([10, 100])
torch.Size([10, 112

In [None]:
# Ask PyTorch to release the GPU memory it is still caching.
torch.cuda.empty_cache()

In [None]:
# Pair every back-translated sentence with the label column of train_df
# (assumes back_translated_sentences follows train_df's row order — verify).
data = {"string": back_translated_sentences, "label": train_df["label"]}
new_df = pd.DataFrame(data)

# Stack the original and augmented rows, drop exact duplicate rows, and report
# the shape before and after.
print(train_df.shape)
new_train_df = pd.concat([train_df, new_df], ignore_index=True).drop_duplicates()
print(new_train_df.shape)

# Persist the augmented training set as JSON Lines (one record per line).
new_train_df.to_json("new_train_df.jsonl", orient="records", lines=True)

(8243, 2)
(16352, 2)
