In [None]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
import torch
import time
import math

# Load the tweet spreadsheet and clean it in a single chained expression:
#   * text  - every "http..." run of non-whitespace characters is removed
#   * label - ids are remapped through the dict below (old id -> new id)
df = pd.read_excel("data_tweets2.xlsx").assign(
    text=lambda frame: frame["text"].str.replace(r"http\S+", "", regex=True),
    label=lambda frame: frame["label"].replace({2: 3, 1: 2, 3: 1}),
)


# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Language codes for the round trip (source -> target -> source).
src_lang, tgt_lang = 'en', 'fr'

# Checkpoint identifiers, one per translation direction.
model_name_en_fr = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
model_name_fr_en = f'Helsinki-NLP/opus-mt-{tgt_lang}-{src_lang}'

# Forward direction: tokenizer, then the model moved onto `device`.
tokenizer_en_fr = MarianTokenizer.from_pretrained(model_name_en_fr)
model_en_fr = MarianMTModel.from_pretrained(model_name_en_fr)
model_en_fr = model_en_fr.to(device)

# Reverse direction: tokenizer, then the model moved onto `device`.
tokenizer_fr_en = MarianTokenizer.from_pretrained(model_name_fr_en)
model_fr_en = MarianMTModel.from_pretrained(model_name_fr_en)
model_fr_en = model_fr_en.to(device)

# Define a batched translation function.
def batch_translate(texts, model, tokenizer, batch_size=8):
    """Translate ``texts`` with ``model``/``tokenizer``, ``batch_size`` items at a time.

    Args:
        texts: Iterable of input strings; it is materialised into a list so
            that any sliceable or non-sliceable iterable is accepted.
        model: Object exposing ``generate(**encoded_inputs)``. Encoded inputs
            are moved to ``model.device`` when that attribute exists, so they
            always land on the same device as the model that consumes them.
            If the model has no ``device`` attribute, the module-level
            ``device`` is used instead.
        tokenizer: Callable that encodes a list of strings into a mapping of
            tensors and exposes ``decode(sequence, skip_special_tokens=...)``.
        batch_size: Number of texts encoded and generated per step (>= 1).

    Returns:
        list[str]: One decoded translation per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value would
            otherwise make the batching range empty and silently return ``[]``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)

    # Prefer the model's own device over the notebook-level global so the
    # function works for whichever model is passed in.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = device

    translated_texts = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        # Pad to the longest item in the batch and truncate over-long inputs.
        batch = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
        batch = {key: tensor.to(target_device) for key, tensor in batch.items()}
        translated_tokens = model.generate(**batch)
        # Decode each generated sequence back to text, dropping special tokens.
        translated_texts.extend(
            tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens
        )
    return translated_texts

# Define a batched back-translation function.
def batch_back_translate(texts, batch_size=8):
    """Round-trip ``texts`` through the en->fr model and back through fr->en.

    Args:
        texts: Input strings handed to ``batch_translate`` for the first hop.
        batch_size: Batch size forwarded to both ``batch_translate`` calls.

    Returns:
        The output of the second (fr->en) ``batch_translate`` call.
    """
    # First hop (en -> fr); its result feeds the second hop (fr -> en).
    french_texts = batch_translate(
        texts, model=model_en_fr, tokenizer=tokenizer_en_fr, batch_size=batch_size
    )
    return batch_translate(
        french_texts, model=model_fr_en, tokenizer=tokenizer_fr_en, batch_size=batch_size
    )

# Build the augmented dataset: every original row is immediately followed by its
# back-translated counterpart carrying the same label. Rows are processed in
# batches so that progress and a remaining-time estimate can be logged.
new_rows = []
all_texts = df["text"].tolist()
all_labels = df["label"].tolist()

total_original_rows = len(all_texts)
batch_size = 16  # Adjust this value based on available GPU memory/speed.
total_batches = math.ceil(total_original_rows / batch_size)

start_time = time.time()
print("\nStarting batched back-translation processing...\n")

for batch_index in range(total_batches):
    start = batch_index * batch_size
    end = start + batch_size
    batch_texts = all_texts[start:end]
    batch_labels = all_labels[start:end]

    # Back-translate the whole slice in one call.
    bt_texts = batch_back_translate(batch_texts, batch_size=batch_size)

    # zip() would silently drop rows on a length mismatch; fail loudly instead.
    if len(bt_texts) != len(batch_texts):
        raise ValueError(
            f"Back-translation returned {len(bt_texts)} texts for "
            f"{len(batch_texts)} inputs in batch {batch_index + 1}"
        )

    # Original row first, then its back-translated twin with the same label.
    for orig_text, bt_text, label in zip(batch_texts, bt_texts, batch_labels):
        new_rows.append({"text": orig_text, "label": label})
        new_rows.append({"text": bt_text, "label": label})

    # Progress log: the ETA extrapolates the running average time per row.
    processed_rows = min(end, total_original_rows)
    elapsed = time.time() - start_time
    avg_time_per_row = elapsed / processed_rows
    rows_left = total_original_rows - processed_rows
    est_remaining = avg_time_per_row * rows_left
    print(f"Processed batch {batch_index+1}/{total_batches} - {processed_rows}/{total_original_rows} original rows complete. " +
          f"Elapsed: {elapsed:0.2f}s, Estimated remaining time: {est_remaining:0.2f}s")

end_time = time.time()
total_time = end_time - start_time
# With an empty dataset the loop never runs; avoid dividing by zero rows.
avg_time_per_row = total_time / total_original_rows if total_original_rows else 0.0

print("\nBatched processing complete!")
print(f"Total processing time: {total_time:0.2f} seconds")
print(f"Average time per original row: {avg_time_per_row:0.2f} seconds")

# Create the combined DataFrame. Explicit columns keep the schema stable even
# when there are no rows; a freshly built frame already has a 0..n-1 index, so
# no reset_index is needed.
df_combined = pd.DataFrame(new_rows, columns=["text", "label"])
print("\nFirst few rows of the combined DataFrame:")
print(df_combined.head())



Starting batched back-translation processing...

Processed batch 1/597 - 16/9543 original rows complete. Elapsed: 2.98s, Estimated remaining time: 1772.04s
Processed batch 2/597 - 32/9543 original rows complete. Elapsed: 6.36s, Estimated remaining time: 1890.26s
Processed batch 3/597 - 48/9543 original rows complete. Elapsed: 9.41s, Estimated remaining time: 1861.82s
Processed batch 4/597 - 64/9543 original rows complete. Elapsed: 147.47s, Estimated remaining time: 21841.93s
Processed batch 5/597 - 80/9543 original rows complete. Elapsed: 151.46s, Estimated remaining time: 17915.44s
Processed batch 6/597 - 96/9543 original rows complete. Elapsed: 154.11s, Estimated remaining time: 15165.01s
Processed batch 7/597 - 112/9543 original rows complete. Elapsed: 157.32s, Estimated remaining time: 13247.42s
Processed batch 8/597 - 128/9543 original rows complete. Elapsed: 161.61s, Estimated remaining time: 11886.89s
Processed batch 9/597 - 144/9543 original rows complete. Elapsed: 165.61s, Es

In [None]:
# Write the combined frame (original + back-translated rows) to CSV;
# index=False keeps the DataFrame index out of the file.
df_combined.to_csv('output_file2.csv', index=False)