In [2]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Define the path to the local directory containing the pretrained model files
local_model_path = "/kaggle/input/pretrainedd"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer using the SentencePiece files directly
print("Loading tokenizer...")
tokenizer = MarianTokenizer.from_pretrained(
    local_model_path, sp_model_kwargs={"model_file": f"{local_model_path}/source.spm"}
)

# Load the MarianMT model
print("Loading model...")
model = MarianMTModel.from_pretrained(local_model_path).to(device)

# Example texts to translate
texts = [
    "Hello, how are you?",
    "The weather is beautiful today.",
    "I love programming and learning new things."
]

# Function to translate text
def translate_texts(texts, tokenizer, model, device):
    translations = []
    for text in texts:
        # Tokenize and prepare input tensors
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
        # Generate translation
        translated = model.generate(**inputs)
        # Decode and add to results
        translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
        translations.append(translated_text)
    return translations

# Perform translations
print("Translating...")
translated_texts = translate_texts(texts, tokenizer, model, device)

# Output results
for i, (src, tgt) in enumerate(zip(texts, translated_texts)):
    print(f"{i + 1}. Original: {src}")
    print(f"   Translated: {tgt}")


Using device: cuda
Loading tokenizer...




Loading model...
Translating...
1. Original: Hello, how are you?
   Translated: Bonjour, comment allez-vous ?
2. Original: The weather is beautiful today.
   Translated: Le temps est beau aujourd'hui.
3. Original: I love programming and learning new things.
   Translated: J'adore programmer et apprendre de nouvelles choses.


In [3]:
!pip install sacrebleu


  pid, fd = os.forkpty()




In [4]:
from transformers import MarianMTModel, MarianTokenizer
import torch
import sacrebleu

# Path to the folder containing your model files
local_model_path = "/kaggle/input/pretrainedd"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = MarianTokenizer.from_pretrained(local_model_path)
model = MarianMTModel.from_pretrained(local_model_path).to(device)

# Example sentences (source) and their reference translations
source_sentences = [
    "Hello, how are you?",
    "The weather is beautiful today.",
    "I love programming and learning new things."
]

reference_translations = [
    ["Bonjour, comment ça va ?"],
    ["Le temps est magnifique aujourd'hui."],
    ["J'adore programmer et apprendre de nouvelles choses."]
]

# Function to generate translations
def translate_sentences(sentences, tokenizer, model, device, num_beams=5):
    translations = []
    for sentence in sentences:
        # Tokenize and prepare input tensors
        inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True).to(device)
        # Generate translation with beam search
        translated = model.generate(**inputs, num_beams=num_beams, no_repeat_ngram_size=2)
        # Decode and append the result
        translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
        translations.append(translated_text)
    return translations

# Generate translations for source sentences
print("Translating sentences...")
generated_translations = translate_sentences(source_sentences, tokenizer, model, device)

# Compute BLEU score
print("Calculating BLEU score...")
bleu = sacrebleu.corpus_bleu(generated_translations, reference_translations)

# Display the results
for i, (src, gen, ref) in enumerate(zip(source_sentences, generated_translations, reference_translations)):
    print(f"\nSentence {i + 1}:")
    print(f"Original: {src}")
    print(f"Generated: {gen}")
    print(f"Reference: {ref[0]}")

print(f"\nBLEU score: {bleu.score}")


Using device: cuda
Loading tokenizer and model...
Translating sentences...
Calculating BLEU score...

Sentence 1:
Original: Hello, how are you?
Generated: Bonjour, comment allez-vous ?
Reference: Bonjour, comment ça va ?

Sentence 2:
Original: The weather is beautiful today.
Generated: Le temps est beau aujourd'hui.
Reference: Le temps est magnifique aujourd'hui.

Sentence 3:
Original: I love programming and learning new things.
Generated: J'adore programmer et apprendre de nouvelles choses.
Reference: J'adore programmer et apprendre de nouvelles choses.

BLEU score: 34.98330125272253


In [None]:
pip install sacremoses


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd
import torch

# Load your pre-tokenized dataset
file_path = "/kaggle/input/datasett/cleaned_en_fr_sample_tokenized (2).xlsx"
data = pd.read_excel(file_path)

# Define your tokenizer and model
model_name = "/kaggle/input/pretrainedd"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Preprocess the data
def preprocess_data(row):
    # Tokenize and pad sequences to max length (128)
    inputs = tokenizer(row["source_tokens"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(row["target_tokens"], truncation=True, padding="max_length", max_length=128)

    # Return the processed inputs and targets
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"]
    }

# Convert pandas DataFrame to a Dataset object and apply preprocessing
dataset = Dataset.from_pandas(data).map(preprocess_data, batched=True)

# Split the dataset into train and test sets (90% for training, 10% for validation)
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Data collator for handling padding during training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
)

# Initialize the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()


In [None]:
from transformers import MarianMTModel, MarianTokenizer
import sacrebleu
import torch

# Load the pretrained model and tokenizer
model_name = "/kaggle/input/pretrainedd"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

# Example source sentences and reference translations
source_sentences = [
    "She has been working at the company for over five years and has gained a lot of experience.",
    "I will visit my grandparents this weekend if I finish all my work on time.",
    "I love programming and learning new things."
]

reference_translations = [
    ["Elle travaille dans l'entreprise depuis plus de cinq ans et a acquis beaucoup d'expérience."],
    ["Je rendrai visite à mes grands-parents ce week-end si je termine tout mon travail à temps."],
    ["J'adore programmer et apprendre de nouvelles choses."]
]

# Function to translate sentences
def translate_sentences(sentences, tokenizer, model, device, num_beams=4):
    translations = []
    for sentence in sentences:
        # Tokenize input sentence
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512).to(device)
        # Generate translation with beam search
        outputs = model.generate(
            **inputs, 
            num_beams=num_beams, 
            max_length=512, 
            no_repeat_ngram_size=2, 
            early_stopping=True
        )
        # Decode the generated translation
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translations.append(translated_text)
    return translations

# Translate the source sentences
generated_translations = translate_sentences(source_sentences, tokenizer, model, device)

# Compute BLEU score
bleu_score = sacrebleu.corpus_bleu(generated_translations, reference_translations)

# Display results
for i, (src, gen, ref) in enumerate(zip(source_sentences, generated_translations, reference_translations)):
    print(f"\nSentence {i + 1}:")
    print(f"Original: {src}")
    print(f"Generated: {gen}")
    print(f"Reference: {ref[0]}")

print(f"\nBLEU score: {bleu_score.score}")


In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Load pretrained model and tokenizer
model_name = "/kaggle/input/pretrainedd"  # Replace with your model path
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

# Function to translate a single sentence
def translate_sentence(sentence, tokenizer, model, device, num_beams=4):
    # Tokenize input sentence
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512).to(device)
    # Generate translation with beam search
    outputs = model.generate(
        **inputs, 
        num_beams=num_beams, 
        max_length=512, 
        no_repeat_ngram_size=2, 
        early_stopping=True
    )
    # Decode and return the generated translation
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

# Input your English sentence
english_sentence = input("Enter an English sentence to translate: ")

# Get the translation
translated_french = translate_sentence(english_sentence, tokenizer, model, device)

# Display the result
print(f"\nEnglish: {english_sentence}")
print(f"French: {translated_french}")


In [5]:
pip install gradio

Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.2 (from gradio)
  Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting starlette<1.0,>=0.40.0 (from gradio)
  Downloading starlette-0.43.0-py3-none-any.whl.metadata (6

In [6]:
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
import torch
import nltk

nltk.download("punkt")  # For sentence tokenization
from nltk.tokenize import sent_tokenize

# Load pretrained model and tokenizer
model_name = "/kaggle/input/pretrainedd"  # Replace with your model path
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

# Function to translate a paragraph (from Code 3)
def translate_paragraph(paragraph):
    # Split the paragraph into sentences
    sentences = sent_tokenize(paragraph.strip())
    translated_sentences = []

    # Translate each sentence separately
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)
        outputs = model.generate(
            **inputs, 
            num_beams=4, 
            max_length=512, 
            no_repeat_ngram_size=2, 
            early_stopping=True
        )
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_sentences.append(translated_text)

    # Join translated sentences back into a paragraph
    return " ".join(translated_sentences)

# Function to translate the content of a file (from Code 2)
def translate_file(file):
    # Read content from the uploaded file
    with open(file.name, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read()
    
    # Split content into sentences
    sentences = sent_tokenize(content.strip())
    translated_sentences = []

    # Translate each sentence in chunks to avoid memory issues
    batch_size = 10  # Process 10 sentences at a time
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)
        outputs = model.generate(
            **inputs,
            num_beams=4,
            max_length=512,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
        for output in outputs:
            translated_text = tokenizer.decode(output, skip_special_tokens=True)
            translated_sentences.append(translated_text)

    # Combine the translated sentences into a single string
    translated_content = "\n".join(translated_sentences)

    # Write the translated content to a new file
    output_file = "translated_file.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(translated_content)

    return output_file  # Return the file path for Gradio to offer as download

# CSS styling for the Gradio interface (Black Background Theme)
custom_css = """
    .gradio-container {
        background-color: #000000;
        padding: 30px;
        border-radius: 10px;
        box-shadow: 0 6px 15px rgba(0, 0, 0, 0.5);
        color: white;
    }
    .gradio-input, .gradio-output {
        font-size: 18px;
        padding: 15px;
        border-radius: 5px;
        margin-top: 10px;
        background-color: #1e1e1e;
        color: white;
        border: 2px solid #444;
    }
    .gradio-input:focus, .gradio-output:focus {
        outline: none;
        border-color: #f39c12;
    }
    .gradio-button {
        background-color: #f39c12;
        color: white;
        padding: 12px 20px;
        border-radius: 5px;
        font-size: 16px;
        border: none;
        cursor: pointer;
    }
    .gradio-button:hover {
        background-color: #e67e22;
    }
    .gradio-title {
        color: #f39c12;
        font-size: 32px;
        text-align: center;
        font-family: 'Arial', sans-serif;
        margin-bottom: 20px;
    }
    .gradio-description {
        font-size: 18px;
        color: #bdc3c7;
        text-align: center;
        font-family: 'Arial', sans-serif;
        margin-bottom: 30px;
    }
    .gradio-footer {
        font-size: 14px;
        color: #7f8c8d;
        text-align: center;
    }
"""

# Create the interface with multiple tabs for File and Text translation
with gr.Blocks() as demo:
    with gr.Tab("Text Translation"):
        # Text Translation
        text_input = gr.Textbox(label="Enter Text")
        text_output = gr.Textbox(label="Translated Text")
        text_translate_button = gr.Button("Translate Text")
        text_translate_button.click(translate_paragraph, inputs=text_input, outputs=text_output)
    
    with gr.Tab("File Translation"):
        # File Translation
        file_input = gr.File(label="Upload a text file")
        file_output = gr.File(label="Download Translated File")
        file_translate_button = gr.Button("Translate File")
        file_translate_button.click(translate_file, inputs=file_input, outputs=file_output)

demo.launch()


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://dc32f387a85d02884d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


