In [None]:
# Install required libraries
!pip install transformers datasets sacrebleu

# Import the libraries
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
from sacrebleu.metrics import BLEU
import torch


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.me

In [None]:
from google.colab import files

# Open Colab's upload dialog; the result is indexed by key below, and the
# first key is handed to pandas as the CSV path.
uploaded = files.upload()

# Load dataset into a DataFrame (assuming it's a CSV).
# Only the first uploaded entry is read; any additional uploads are ignored.
# NOTE(review): if nothing is uploaded, the [0] lookup raises IndexError.
import pandas as pd
df = pd.read_csv(list(uploaded.keys())[0])

# Preview the dataset (the recorded run shows `english` and `spanish` columns,
# which the later preprocessing cells rely on).
print(df.head())


Saving data.csv to data.csv
  english  spanish
0     Go.      Ve.
1     Go.    Vete.
2     Go.    Vaya.
3     Go.  Váyase.
4     Hi.    Hola.


In [None]:
# Import necessary libraries
from transformers import MarianTokenizer, MarianMTModel
from datasets import Dataset

# Define model and tokenizer: the pretrained English->Spanish MarianMT
# checkpoint is loaded by name for both the tokenizer and the model.
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Convert your DataFrame into a Hugging Face Dataset.
# `df` is the DataFrame created by the upload cell; this uses every row of it.
dataset = Dataset.from_pandas(df)

# Tokenize the dataset
def preprocess_function(batch):
    """Tokenize a batch of English/Spanish sentence pairs for seq2seq training.

    Args:
        batch: Mapping with "english" and "spanish" entries, each a list of
            strings (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The encoding of the English sentences, with an added "labels" entry
        holding the token ids of the Spanish sentences. Both sides are
        truncated/padded to 128 tokens.
    """
    inputs = tokenizer(batch["english"], max_length=128, truncation=True, padding="max_length")
    # The checkpoint ships separate source and target SentencePiece models
    # (source.spm / target.spm). `text_target=` makes the tokenizer encode the
    # Spanish side in target mode instead of reusing the English source model.
    labels = tokenizer(text_target=batch["spanish"], max_length=128, truncation=True, padding="max_length")
    # NOTE(review): padded label positions keep the pad token id, so they still
    # contribute to the training loss; consider masking them with -100.
    inputs["labels"] = labels["input_ids"]
    return inputs

# Apply preprocessing to every row; the raw text columns are dropped so only
# the tokenizer outputs and labels remain.
tokenized_data = dataset.map(preprocess_function, batched=True, remove_columns=["english", "spanish"])

# Split into training and test sets (90% / 10%).
# A fixed seed keeps the held-out set identical across kernel restarts, so
# validation loss and BLEU numbers stay comparable between runs.
train_test_split = tokenized_data.train_test_split(test_size=0.1, seed=42)
train_data = train_test_split["train"]
test_data = train_test_split["test"]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Map:   0%|          | 0/118964 [00:00<?, ? examples/s]

In [None]:
!pip install --upgrade transformers

from transformers import Seq2SeqTrainingArguments
import torch

training_args = Seq2SeqTrainingArguments(
    # Output locations for checkpoints and logs
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    # Evaluation: once per epoch, producing predictions with generate()
    eval_strategy="epoch",
    predict_with_generate=True,
    # Checkpointing and logging cadence (in steps)
    save_steps=1000,
    save_total_limit=2,
    logging_steps=500,
)





In [None]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import torch

# Check if GPU is available; `device` is reused when placing the model below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the dataset straight from the CSV file (replaces the DataFrame-based
# `dataset` built in the earlier cell).
data_file = "data.csv"  # Ensure data.csv is in the same directory or provide a full path
dataset = load_dataset("csv", data_files=data_file)

# Use a smaller subset for quick testing: shuffle with a fixed seed, then keep
# the first 500 rows. Note that `dataset` is rebound from the loaded object to
# this subset of its "train" split.
# NOTE(review): select(range(500)) presumably fails on files with fewer than 500 rows - confirm.
dataset = dataset["train"].shuffle(seed=42).select(range(500))
# Define model and tokenizer (reloaded from the Hub by name; the model is
# moved to `device`)
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

# Tokenize the dataset
def preprocess_function(batch):
    """Tokenize a batch of English/Spanish sentence pairs for seq2seq training.

    Args:
        batch: Mapping with "english" and "spanish" entries, each a list of
            strings (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The encoding of the English sentences, with an added "labels" entry
        holding the token ids of the Spanish sentences. Both sides are
        truncated/padded to 64 tokens.
    """
    inputs = tokenizer(batch["english"], max_length=64, truncation=True, padding="max_length")
    # The checkpoint ships separate source and target SentencePiece models;
    # `text_target=` makes the tokenizer encode the Spanish side in target
    # mode instead of reusing the English source model.
    labels = tokenizer(text_target=batch["spanish"], max_length=64, truncation=True, padding="max_length")
    # NOTE(review): padded label positions keep the pad token id, so they still
    # contribute to the training loss; consider masking them with -100.
    inputs["labels"] = labels["input_ids"]
    return inputs

# Tokenize the subset; the raw text columns are dropped so only the tokenizer
# outputs and labels remain.
tokenized_data = dataset.map(preprocess_function, batched=True, remove_columns=["english", "spanish"])

# Split into training and evaluation sets (80% training, 20% testing).
# A fixed seed (matching the shuffle seed used for the subset) keeps the
# held-out examples identical across runs, so metrics stay comparable.
train_test_split = tokenized_data.train_test_split(test_size=0.2, seed=42)
train_data = train_test_split["train"]
test_data = train_test_split["test"]

# Data collator: assembles tokenized examples into batches for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training arguments for the quick test run
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option (the earlier
    # training-arguments cell already uses it); the old `evaluation_strategy`
    # spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_steps=1000,  # Checkpoint every 1000 steps
    save_total_limit=1,  # Keep only the most recent checkpoint
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available
    logging_dir="./logs",
    logging_steps=100,  # Log the training loss every 100 steps
    eval_accumulation_steps=16,  # Offload accumulated eval outputs every 16 steps
)

# Initialize the trainer; `test_data` is the default dataset for evaluation.
# NOTE(review): the recorded run emits a warning on this call - newer
# transformers releases appear to prefer `processing_class=` over `tokenizer=`;
# confirm against the installed version before switching.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate on the held-out split (not the data the model was just trained on)
# and show the metrics; the previous call scored `train_data` and discarded
# the result.
eval_metrics = trainer.evaluate(eval_dataset=test_data)
print(eval_metrics)

# Save the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded with from_pretrained()
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

print("Quick test training completed. Model saved successfully.")




Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,1.300821




Quick test training completed. Model saved successfully.


In [None]:
from nltk.translate.bleu_score import corpus_bleu
from transformers import MarianMTModel, MarianTokenizer
import torch

# Define BLEU scorer
def compute_bleu(model, tokenizer, test_data):
    """Compute a corpus-level BLEU score for `model` on tokenized examples.

    Each example is translated with ``model.generate`` and compared against
    the decoded ``labels`` of the same example.

    Args:
        model: Seq2seq model exposing ``to``, ``eval`` and ``generate``. It is
            moved to the GPU when one is available and put in eval mode.
        tokenizer: Tokenizer used to decode generated ids and label ids.
        test_data: Iterable of examples with "input_ids", "attention_mask"
            and "labels" entries (lists of token ids).

    Returns:
        The value returned by nltk's ``corpus_bleu`` for whitespace-tokenized
        hypotheses and references, or 0.0 when `test_data` is empty.
    """
    # Check if CUDA is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)  # Move model to the appropriate device
    model.eval()  # Disable dropout so generation is deterministic after training

    hypotheses = []
    references = []

    for example in test_data:
        # Convert the id lists to (1, seq_len) tensors on the model's device
        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)

        # Generate translation
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Negative ids (e.g. -100 loss-masking placeholders) are not real
        # tokens and cannot be decoded, so drop them first.
        label_ids = [token_id for token_id in example["labels"] if token_id >= 0]
        reference = tokenizer.decode(label_ids, skip_special_tokens=True)

        # corpus_bleu expects *token lists*: one list of tokens per hypothesis
        # and a list of reference token lists per example. Passing raw strings
        # makes it iterate characters, which yields an inflated
        # character-level score.
        hypotheses.append(prediction.split())
        references.append([reference.split()])

    if not hypotheses:
        return 0.0

    # Compute BLEU score
    return corpus_bleu(references, hypotheses)

# Calculate BLEU score of the current `model` on `test_data` (the held-out
# part of the train/test split) and print the raw value.
bleu_score = compute_bleu(model, tokenizer, test_data)
print(f"BLEU Score: {bleu_score}")


BLEU Score: 0.782405662172993


In [None]:
# Save the model and tokenizer into the same directory so both can later be
# restored with from_pretrained("fine_tuned_en_es_model").
model.save_pretrained("fine_tuned_en_es_model")
tokenizer.save_pretrained("fine_tuned_en_es_model")


('fine_tuned_en_es_model/tokenizer_config.json',
 'fine_tuned_en_es_model/special_tokens_map.json',
 'fine_tuned_en_es_model/vocab.json',
 'fine_tuned_en_es_model/source.spm',
 'fine_tuned_en_es_model/target.spm',
 'fine_tuned_en_es_model/added_tokens.json')

In [None]:
# Load the fine-tuned model and tokenizer back from the directory written by
# the save cell. This rebinds the global `model` / `tokenizer`; no device is
# selected here, so the model is not explicitly moved to a GPU.
from transformers import MarianMTModel, MarianTokenizer

model = MarianMTModel.from_pretrained("fine_tuned_en_es_model")
tokenizer = MarianTokenizer.from_pretrained("fine_tuned_en_es_model")




In [None]:
def translate_to_spanish(text):
    """Translate English `text` to Spanish with the global `model`/`tokenizer`.

    Args:
        text: English sentence (or list of sentences; only the first
            translation is returned).

    Returns:
        The decoded translation with special tokens removed.
    """
    # Put the inputs on whatever device the model currently lives on. The
    # previous hard-coded "cuda" crashed on CPU-only machines and mismatched
    # the freshly reloaded model, which is never moved to the GPU.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Generate translation
    outputs = model.generate(**inputs)
    # Decode the translated text
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text


In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Load models and tokenizers for both translations. These are the stock
# checkpoints loaded by name, one per direction.
en_to_es_model_name = "Helsinki-NLP/opus-mt-en-es"
es_to_en_model_name = "Helsinki-NLP/opus-mt-es-en"

en_to_es_tokenizer = MarianTokenizer.from_pretrained(en_to_es_model_name)
es_to_en_tokenizer = MarianTokenizer.from_pretrained(es_to_en_model_name)

en_to_es_model = MarianMTModel.from_pretrained(en_to_es_model_name)
es_to_en_model = MarianMTModel.from_pretrained(es_to_en_model_name)

# Check if CUDA is available and move both models to the chosen device; the
# translation helpers below send their inputs to the same `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
en_to_es_model.to(device)
es_to_en_model.to(device)

# Function to translate English to Spanish
def translate_to_spanish(text):
    """Translate an English string to Spanish using the en->es model.

    The text is encoded to token ids, moved to the global `device`, decoded
    with 4-beam search capped at 128 tokens, and returned without special
    tokens.
    """
    token_ids = en_to_es_tokenizer.encode(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    generated = en_to_es_model.generate(
        token_ids.to(device),
        max_length=128,
        num_beams=4,
        early_stopping=True,
    )
    return en_to_es_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to translate Spanish to English
def translate_to_english(text):
    """Translate a Spanish string to English using the es->en model.

    The text is encoded to token ids, moved to the global `device`, decoded
    with 4-beam search capped at 128 tokens, and returned without special
    tokens.
    """
    token_ids = es_to_en_tokenizer.encode(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    generated = es_to_en_model.generate(
        token_ids.to(device),
        max_length=128,
        num_beams=4,
        early_stopping=True,
    )
    return es_to_en_tokenizer.decode(generated[0], skip_special_tokens=True)

# Example sentences for both English to Spanish and Spanish to English
sample_sentences_en_to_es = [
    "How are you today?",
    "I am learning deep learning.",
    "Where is the nearest restaurant?",
    "This project is about machine translation.",
    "Have a great day!"
]

sample_sentences_es_to_en = [
    "¿Cómo estás hoy?",
    "Estoy aprendiendo aprendizaje profundo.",
    "¿Dónde está el restaurante más cercano?",
    "Este proyecto trata sobre traducción automática.",
    "¡Que tengas un gran día!"
]


def print_translations(title, source_label, target_label, sentences, translate_fn):
    """Translate each sentence with `translate_fn` and print source/target pairs.

    Args:
        title: Heading printed once before the pairs.
        source_label: Label printed in front of each input sentence.
        target_label: Label printed in front of each translation.
        sentences: Iterable of input strings.
        translate_fn: Callable mapping one input string to its translation.
    """
    print(f"{title}\n")
    for sentence in sentences:
        translation = translate_fn(sentence)
        print(f"{source_label}: {sentence}")
        print(f"{target_label}: {translation}\n")


# Both directions share one printing routine instead of two copy-pasted loops;
# the printed output is unchanged.
print_translations(
    "English to Spanish Translations:", "English", "Spanish",
    sample_sentences_en_to_es, translate_to_spanish,
)
print_translations(
    "Spanish to English Translations:", "Spanish", "English",
    sample_sentences_es_to_en, translate_to_english,
)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

English to Spanish Translations:

English: How are you today?
Spanish: ¿Cómo estás hoy?

English: I am learning deep learning.
Spanish: Estoy aprendiendo mucho.

English: Where is the nearest restaurant?
Spanish: ¿Dónde está el restaurante más cercano?

English: This project is about machine translation.
Spanish: Este proyecto es sobre traducción automática.

English: Have a great day!
Spanish: ¡Que tengas un buen día!

Spanish to English Translations:

Spanish: ¿Cómo estás hoy?
English: How are you today?

Spanish: Estoy aprendiendo aprendizaje profundo.
English: I'm learning deep learning.

Spanish: ¿Dónde está el restaurante más cercano?
English: Where's the nearest restaurant?

Spanish: Este proyecto trata sobre traducción automática.
English: This project is about machine translation.

Spanish: ¡Que tengas un gran día!
English: Have a great day!



In [None]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.2 (from gradio)
  Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:
!pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install sacremoses


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
import pdfplumber  # To extract text from PDF

# Load the English-to-Spanish model and tokenizer.
# NOTE(review): this loads the stock "Helsinki-NLP/opus-mt-en-es" checkpoint
# from the Hub, not the "fine_tuned_en_es_model" directory saved earlier in
# the notebook - confirm which one the UI is meant to serve.
model_en_to_es = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-es")
tokenizer_en_to_es = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es")

# Load Spanish-to-English model and tokenizer (stock checkpoint)
model_es_to_en = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-es-en")
tokenizer_es_to_en = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-en")

# Translation function
def translate(text, translate_to):
    """Translate `text` into the language named by `translate_to`.

    Args:
        text: Text to translate. With ``truncation=True`` anything beyond the
            tokenizer's maximum input length is cut off, so very long text is
            only partially translated.
        translate_to: "Spanish" (English -> Spanish) or "English"
            (Spanish -> English).

    Returns:
        The decoded translation, or "" when `text` is None or blank.

    Raises:
        ValueError: If `translate_to` is not "Spanish" or "English".
    """
    if translate_to == "Spanish":  # English -> Spanish
        tokenizer, model = tokenizer_en_to_es, model_en_to_es
    elif translate_to == "English":  # Spanish -> English
        tokenizer, model = tokenizer_es_to_en, model_es_to_en
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError(
            f"Unsupported target language: {translate_to!r} (expected 'Spanish' or 'English')"
        )

    # Nothing to translate: skip the model call instead of decoding whatever
    # it generates for an empty prompt.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Interface function
def translation_interface(text, source_language):
    """Translate `text` away from `source_language`.

    English input is translated to Spanish; any other source value is
    translated to English.

    Returns:
        A ``(translation, target_language)`` tuple.
    """
    target_language = "Spanish" if source_language == "English" else "English"
    return translate(text, target_language), target_language

# UI with Gradio
with gr.Blocks() as translator_ui:
    # Page title, rendered as centered HTML
    gr.Markdown("<h1 style='text-align: center;'>English ↔ Spanish Translator</h1>")

    # Input and Output Areas: two columns in one row - the text to translate
    # and its source language in the first, the translation and the (read-only)
    # target language in the second.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", placeholder="Enter text here...", lines=4)
            source_language = gr.Radio(["English", "Spanish"], value="English", label="Source Language")
        with gr.Column():
            output_text = gr.Textbox(label="Translated Text", placeholder="Translation will appear here...", lines=4)
            target_language = gr.Textbox(label="Target Language", interactive=False)

    # Translate Button
    translate_button = gr.Button("Translate")

    # Interchange Button (flips the selected source language)
    interchange_button = gr.Button("Switch Languages")

    # Connect buttons to actions: translation_interface returns a
    # (translation, target language) pair that fills the two output boxes.
    translate_button.click(translation_interface, inputs=[input_text, source_language], outputs=[output_text, target_language])

    def switch_languages(source_lang):
        """Return the other language: "Spanish" for "English", otherwise "English"."""
        if source_lang == "English":
            return "Spanish"
        return "English"

    # The switch button reads the current source language and writes the
    # opposite one back into the same radio control.
    interchange_button.click(switch_languages, inputs=source_language, outputs=source_language)

    # Upload PDF area: the file is handed to the callback as a filesystem path
    # (type="filepath"), with its own source-language selector.
    pdf_input = gr.File(label="Upload PDF", type="filepath")
    pdf_source_language = gr.Radio(["English", "Spanish"], value="English", label="Source Language for PDF")

    # Display PDF translation results
    pdf_output = gr.Textbox(label="Translated PDF Text", placeholder="Translated text will appear here...", lines=10)

    # Translate PDF Button
    def translate_pdf(pdf_file, pdf_source_lang):
        """Extract the text of an uploaded PDF and translate it.

        Args:
            pdf_file: Path of the uploaded PDF, or None when nothing was
                uploaded.
            pdf_source_lang: "English" translates to Spanish; any other value
                translates to English.

        Returns:
            The translated text, or a human-readable message when no file was
            given, the PDF has no extractable text, or an error occurred.
        """
        if not pdf_file:
            return "Error: no PDF file was uploaded."

        try:
            with pdfplumber.open(pdf_file) as pdf:
                # Guard against extract_text() yielding None for pages without
                # a text layer (presumably version-dependent), which would
                # break string concatenation.
                page_texts = [page.extract_text() or "" for page in pdf.pages]

            # Join pages with a newline so the last word of one page does not
            # fuse with the first word of the next.
            pdf_text = "\n".join(page_text for page_text in page_texts if page_text)

            if not pdf_text.strip():
                return "No text found in the PDF."

            # Translate extracted text
            target_lang = "Spanish" if pdf_source_lang == "English" else "English"
            return translate(pdf_text, target_lang)

        except Exception as e:
            # UI boundary: surface the failure in the output box instead of
            # raising inside the Gradio callback.
            return f"Error: {str(e)}"

    # Connect PDF translation action: the button passes the uploaded file and
    # its selected source language to translate_pdf and shows the returned
    # string in the PDF output box.
    translate_pdf_button = gr.Button("Translate PDF")
    translate_pdf_button.click(translate_pdf, inputs=[pdf_input, pdf_source_language], outputs=pdf_output)

# Launch the UI with default settings. In the recorded Colab run Gradio
# reports that it enabled `share=True` automatically and served the app on a
# temporary public URL.
translator_ui.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ef28eb57c773245e13.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


