In [1]:
from transformers import MarianMTModel, MarianTokenizer
import os
import torch
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, SubsetRandomSampler

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [2]:
# Run the translation models on the GPU when CUDA is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Show which device was selected.
device

'cuda'

In [4]:
# Checkpoint for the first model (English -> French, judging by its name);
# it is the model used for the forward translation step.
first_model_name = 'Helsinki-NLP/opus-mt-en-fr'

# Tokenizer that belongs to this checkpoint
first_model_tkn = MarianTokenizer.from_pretrained(first_model_name)

# Load the pretrained weights and move the model to the selected device
first_model = MarianMTModel.from_pretrained(first_model_name).to(device)

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/784k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287M [00:00<?, ?B/s]

In [5]:
# Checkpoint for the second model (French -> English, judging by its name);
# it is the model used for the backward translation step.
second_model_name = 'Helsinki-NLP/opus-mt-fr-en'

# Tokenizer that belongs to this checkpoint
second_model_tkn = MarianTokenizer.from_pretrained(second_model_name)

# Load the pretrained weights and move the model to the selected device
second_model = MarianMTModel.from_pretrained(second_model_name).to(device)

Downloading:   0%|          | 0.00/784k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287M [00:00<?, ?B/s]

In [6]:
def format_batch_texts(language_code, batch_texts):
    """Prefix every text with a ``>>language_code<<`` tag.

    Args:
        language_code: Value placed between ``>>`` and ``<<``.
        batch_texts: Iterable of texts to tag.

    Returns:
        A new list holding one ``">>code<< text"`` string per input text,
        in the same order as the input.
    """
    tag = ">>{}<<".format(language_code)
    return ["{} {}".format(tag, text) for text in batch_texts]

In [7]:
def get_dataset(dataset_name, tokenizer=first_model_tkn, language='fr'):
    """Load a labelled text dataset and tokenize it for the translation model.

    Args:
        dataset_name: One of ``'covid'``, ``'cancer'`` or ``'medical_texts'``.
        tokenizer: Tokenizer used to encode the texts. The default is bound
            when this function is defined, so it must be (re)defined after
            ``first_model_tkn`` exists.
        language: Language code used for the ``>>code<<`` prefix that
            ``format_batch_texts`` puts in front of every text.

    Returns:
        A tuple ``(original_texts, original_labels, train_data)`` where the
        first two are the selected texts/labels in their pre-shuffle order and
        without prefix, and ``train_data`` is a ``TensorDataset`` of
        ``(input_ids, attention_mask, labels)`` built from a shuffled copy.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    if dataset_name == 'covid':
        # Load the dataset from the Hugging Face hub and keep its training split.
        dataset = load_dataset('llangnickel/long-covid-classification-data')
        batch = dataset['train']['text']
        labels = dataset['train']['label']

    elif dataset_name == 'cancer':
        classes = {'Thyroid_Cancer' : 0,  'Lung_Cancer' : 1,  'Colon_Cancer' : 2}
        # Load the dataset into a pandas dataframe.
        df = pd.read_csv('../../cancer.csv', encoding='latin')
        values = df.values

        # Column 2 holds the text, column 1 the class name.
        all_batch = list(values[:, 2])
        str_labels = list(values[:, 1])
        all_labels = [classes[k] for k in str_labels]

        # Keep the training half of a fixed 50/50 split.
        batch, _, labels, _ = train_test_split(all_batch, all_labels, test_size=0.5, random_state=42)
        batch = batch[:100]  # since it is an easy dataset use just 100 data points
        labels = labels[:100]

    elif dataset_name == 'medical_texts':
        batch = []
        labels = []
        # The context manager closes the file even if a line fails to parse.
        with open('../../Medical texts/train.dat', 'r') as f:
            for line in f:
                if not line.strip():
                    # A blank (e.g. trailing) line carries no sample.
                    continue
                # The first character is the label; shift it so labels start at 0.
                labels.append(int(line[0]) - 1)
                # The text starts after the label and one separator character.
                # Strip only the newline so a final line without one keeps its
                # last character.
                batch.append(line[2:].rstrip('\n'))

        # Keep the training part of a fixed 80/20 split.
        batch, _, labels, _ = train_test_split(batch, labels, test_size=0.2, random_state=42)

        batch = batch[:500]  # since it is an easy dataset use just 500 data points
        labels = labels[:500]

    else:
        raise ValueError(
            "Unknown dataset_name {!r}; expected 'covid', 'cancer' or 'medical_texts'".format(dataset_name)
        )

    # Keep the untouched texts/labels for the caller before adding the prefix.
    original_texts = batch
    original_labels = labels
    # NOTE(review): the Marian documentation describes the >>lang<< prefix for
    # multilingual checkpoints; confirm it is wanted for the tokenizer in use.
    batch = format_batch_texts(language, batch)

    # Shuffle texts and labels together so that pairs stay aligned.
    data = list(zip(batch, labels))
    random.shuffle(data)
    train_batch, labels = zip(*data)
    train_labels = torch.tensor(labels)

    # Tokenize, padding to the longest text and truncating at seq_length tokens.
    seq_length = 128
    encoding = tokenizer(list(train_batch), return_tensors='pt', padding=True, truncation = True, max_length=seq_length)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # print the shapes
    print("Input shape: ")
    print(input_ids.shape, attention_mask.shape,train_labels.shape,train_labels)

    # Bundle everything so a DataLoader can batch ids, masks and labels together.
    train_data = TensorDataset(input_ids, attention_mask, train_labels)

    return original_texts, original_labels, train_data

In [8]:
# Choose the dataset to augment; the same name is reused further down to
# build the output file name.
dataset = 'medical_texts'
original_texts, original_labels, train_data_classifier = get_dataset(dataset)

Input shape: 
torch.Size([500, 128]) torch.Size([500, 128]) torch.Size([500]) tensor([4, 4, 0, 1, 1, 2, 4, 2, 3, 0, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 4, 4, 4,
        3, 4, 2, 4, 4, 4, 2, 4, 4, 3, 4, 4, 3, 2, 4, 4, 0, 2, 3, 0, 2, 3, 1, 4,
        3, 4, 4, 0, 1, 3, 4, 0, 1, 4, 0, 2, 4, 0, 0, 2, 2, 0, 3, 4, 4, 2, 1, 2,
        4, 1, 4, 1, 4, 3, 4, 4, 3, 0, 3, 0, 4, 3, 4, 4, 3, 4, 4, 0, 3, 0, 3, 0,
        2, 4, 0, 3, 1, 2, 0, 2, 3, 0, 4, 4, 0, 4, 4, 0, 3, 3, 4, 4, 4, 0, 0, 2,
        4, 2, 4, 0, 4, 4, 4, 3, 3, 0, 3, 3, 4, 4, 4, 3, 3, 4, 2, 4, 4, 3, 4, 0,
        4, 3, 4, 4, 3, 2, 4, 2, 1, 4, 4, 2, 4, 4, 4, 4, 3, 4, 3, 0, 3, 1, 4, 0,
        0, 2, 3, 4, 4, 4, 2, 1, 0, 1, 3, 3, 4, 3, 3, 0, 4, 3, 4, 4, 4, 2, 2, 3,
        2, 4, 2, 2, 0, 4, 1, 3, 4, 1, 3, 3, 2, 4, 0, 3, 1, 3, 4, 4, 4, 3, 3, 2,
        2, 3, 0, 4, 4, 0, 0, 0, 3, 0, 0, 4, 0, 4, 3, 3, 1, 4, 2, 4, 3, 0, 3, 3,
        4, 1, 1, 4, 0, 1, 4, 4, 3, 0, 4, 0, 4, 4, 4, 2, 0, 3, 4, 4, 4, 0, 4, 0,
        2, 2, 3, 3, 4, 0, 4, 1, 0, 0, 2, 0

In [9]:
# Serve the tokenized dataset in randomly ordered mini-batches of 16,
# loading in the main process and using pinned host memory.
train_classifier = DataLoader(
    train_data_classifier,
    batch_size=16,
    sampler=RandomSampler(train_data_classifier),
    pin_memory=True,
    num_workers=0,
)

In [10]:
def perform_translation_forward(dataloader, model, tokenizer):
    """Translate every batch of a tokenized dataloader and decode the output.

    Args:
        dataloader: Iterable of batches indexed as ``batch[0]`` = input ids,
            ``batch[1]`` = attention mask and ``batch[2]`` = labels.
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer exposing ``batch_decode``.

    Returns:
        A tuple ``(translated_texts, aug_labels)``: the decoded translations
        and the matching labels, both in the order the dataloader produced
        them (which may differ from the dataset order when it samples
        randomly).
    """
    translated_texts = []
    aug_labels = []

    # Send inputs to wherever the model lives instead of relying on a global.
    target_device = model.device

    for step, batch in enumerate(dataloader):

        print("At batch : " + str(step))

        # Pass the mask explicitly so padded positions are ignored by the
        # encoder rather than leaving that for generate() to infer.
        translated = model.generate(
            batch[0].to(target_device),
            attention_mask=batch[1].to(target_device),
        )

        # extend() grows the lists in place instead of copying them per batch.
        translated_texts.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))
        aug_labels.extend(batch[2].tolist())

    return translated_texts, aug_labels

In [11]:
# Forward step: translate the tokenized texts with the first model and keep
# the labels in the same (sampled) order as the translations.
translated_texts, aug_labels = perform_translation_forward(train_classifier, first_model, first_model_tkn)

At batch : 0
At batch : 1
At batch : 2
At batch : 3
At batch : 4
At batch : 5
At batch : 6
At batch : 7
At batch : 8
At batch : 9
At batch : 10
At batch : 11
At batch : 12
At batch : 13
At batch : 14
At batch : 15
At batch : 16
At batch : 17
At batch : 18
At batch : 19
At batch : 20
At batch : 21
At batch : 22
At batch : 23
At batch : 24
At batch : 25
At batch : 26
At batch : 27
At batch : 28
At batch : 29
At batch : 30
At batch : 31


In [12]:
# Number of forward-translated texts.
len(translated_texts)

500

In [13]:
# Inspect the first forward-translated text.
translated_texts[0]

"Le traitement par reperfusion a été clairement démontré pour diminuer la mortalité précoce après un infarctus aigu du myocarde, mais l'impact de ce traitement sur la survie à long terme a été moins largement évalué. Cette étude rapporte le suivi prolongé d'une grande cohorte de 810 patients traités par un traitement thrombolytique par voie intraveineuse combinés, lorsqu'il est jugé nécessaire de maintenir ou d'augmenter la patiesse du vaisseau infarctus, avec des traitements de reperfusion mécanique."

In [14]:
def perform_translation_backward(translated_texts, model, tokenizer, language='en', batch_size=16, max_length=128):
    """Translate a list of texts in mini-batches and return the decoded output.

    Args:
        translated_texts: Texts to translate (the output of the forward step).
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.
        language: Language code used for the ``>>code<<`` prefix added by
            ``format_batch_texts``.
        batch_size: Number of texts tokenized and translated per step.
        max_length: Maximum number of tokens kept per text when tokenizing.

    Returns:
        List of decoded translations, one per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would silently yield no batches at all.
        raise ValueError("batch_size must be at least 1, got {!r}".format(batch_size))

    # Prepare the text data into appropriate format for the model
    formated_batch_texts = format_batch_texts(language, translated_texts)

    # Separate name so the input argument is not shadowed by the result.
    back_translated_texts = []

    # Send inputs to wherever the model lives instead of relying on a global.
    target_device = model.device

    for start in range(0, len(formated_batch_texts), batch_size):

        print("At batch : " + str(start // batch_size))

        encoding = tokenizer(
            formated_batch_texts[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )

        # Generate the translation; the mask is passed explicitly so padded
        # positions are ignored by the encoder.
        translated = model.generate(
            encoding['input_ids'].to(target_device),
            attention_mask=encoding['attention_mask'].to(target_device),
        )

        # Convert the generated token indices back into text; extend() avoids
        # copying the whole result list on every batch.
        back_translated_texts.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))

    return back_translated_texts

In [15]:
# Backward step: translate the forward translations with the second model.
back_translated_texts = perform_translation_backward(translated_texts, second_model, second_model_tkn)

At batch : 0
At batch : 1
At batch : 2
At batch : 3
At batch : 4
At batch : 5
At batch : 6
At batch : 7
At batch : 8
At batch : 9
At batch : 10
At batch : 11
At batch : 12
At batch : 13
At batch : 14
At batch : 15
At batch : 16
At batch : 17
At batch : 18
At batch : 19
At batch : 20
At batch : 21
At batch : 22
At batch : 23
At batch : 24
At batch : 25
At batch : 26
At batch : 27
At batch : 28
At batch : 29
At batch : 30
At batch : 31


In [16]:
# Number of back-translated texts; it should equal len(translated_texts).
len(back_translated_texts)

500

In [17]:
# Write one "<label>\t<back-translated text>" line per sample.
# Fail early with a clear message instead of an IndexError halfway through.
if len(aug_labels) != len(back_translated_texts):
    raise ValueError(
        "Label/text count mismatch: {} labels vs {} texts".format(
            len(aug_labels), len(back_translated_texts)
        )
    )

output_path = "Data/backtranslation_"+str(dataset)+".txt"
# Create the output directory on a fresh checkout so open() does not fail.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# NOTE(review): append mode means re-running this cell adds a second copy of
# every row; switch to "w" if the file should hold a single run.
# The explicit encoding keeps the output independent of the platform default.
with open(output_path, "a", encoding="utf-8") as out_file:
    for label, text in zip(aug_labels, back_translated_texts):
        out_file.write(str(label) + '\t' + text + '\n')
