In [2]:
import pandas as pd
import string
import re



In [3]:
# Define the dataset path
dataset_path = r"C:\Users\Nishant\OneDrive\Desktop\french.csv"

# Load the dataset
data = pd.read_csv(dataset_path)

# Display the first few rows to understand the structure
data.head()


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [4]:
def normalize_text(text):
    """
    Normalizes text by:
    1. Lowercasing
    2. Removing punctuation
    3. Removing extra whitespace
    """
    # Lowercasing
    text = text.lower()
    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Removing extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [5]:
# Apply normalization to all object (string) columns in the dataset
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].apply(normalize_text)

# Display the first few rows after normalization
data.head()


Unnamed: 0,English words/sentences,French words/sentences
0,hi,salut
1,run,cours
2,run,courez
3,who,qui
4,wow,ça alors


In [6]:
# Define the output path for the normalized data
normalized_path = r"C:\Users\Nishant\OneDrive\Desktop\normalized_french.csv"

# Save the normalized dataset to a new CSV file
data.to_csv(normalized_path, index=False)

print(f"Normalized data saved to: {normalized_path}")


Normalized data saved to: C:\Users\Nishant\OneDrive\Desktop\normalized_french.csv


In [7]:
# Install SentencePiece (uncomment if not already installed)
!pip install sentencepiece

import pandas as pd
import sentencepiece as spm


Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   --------------------- ------------------ 524.3/991.5 kB 4.2 MB/s eta 0:00:01
   ---------------------------------------- 991.5/991.5 kB 2.2 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [8]:
# Define the dataset path
dataset_path = r"C:\Users\Nishant\OneDrive\Desktop\french.csv"

# Load the dataset
data = pd.read_csv(dataset_path)

# Display the first few rows
data.head()


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [9]:
# Combine all text into a single file for training SentencePiece
raw_text_path = r"C:\Users\Nishant\OneDrive\Desktop\raw_text.txt"
with open(raw_text_path, 'w', encoding='utf-8') as f:
    for column in data.select_dtypes(include=['object']).columns:
        f.write('\n'.join(data[column]) + '\n')

print(f"Raw text prepared for SentencePiece at: {raw_text_path}")


Raw text prepared for SentencePiece at: C:\Users\Nishant\OneDrive\Desktop\raw_text.txt


In [10]:
# # Train the SentencePiece model
# spm.SentencePieceTrainer.train(
#     input=raw_text_path, 
#     model_prefix='spm_model', 
#     vocab_size=8000,  # Define the vocabulary size
#     model_type='bpe'  # 'bpe' for Byte Pair Encoding
# )
# 
# print("SentencePiece model trained and saved with prefix 'spm_model'.")
# 

SentencePiece model trained and saved with prefix 'spm_model'.


In [11]:
# Load the trained model
sp = spm.SentencePieceProcessor(model_file='spm_model.model')


In [13]:
# Apply SentencePiece tokenization to all text columns
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].apply(lambda x: ' '.join(sp.encode_as_pieces(x)))

# Display the first few rows of the tokenized dataset
data.head()


Unnamed: 0,English words/sentences,French words/sentences
0,▁H ▁i ▁ .,▁S ▁al ▁ ut ▁!
1,▁R ▁un ▁!,▁C ▁ours ▁!
2,▁R ▁un ▁!,▁C ▁ou ▁re z ▁!
3,▁Who ▁?,▁Qui ▁?
4,▁W ▁ow ▁!,▁Ça ▁alors ▁!


In [14]:
# Define the output path for the tokenized data
tokenized_path = r"C:\Users\Nishant\OneDrive\Desktop\subword_tokenized_french.csv"

# Save the tokenized dataset to a new CSV file
data.to_csv(tokenized_path, index=False)

print(f"Tokenized data saved to: {tokenized_path}")


Tokenized data saved to: C:\Users\Nishant\OneDrive\Desktop\subword_tokenized_french.csv


In [16]:
import re
import pandas as pd

# Define the dataset path on your system
file_path = r"C:\Users\Nishant\OneDrive\Desktop\french.csv"

# Load the dataset
data = pd.read_csv(file_path)

# Define function to replace numbers with <NUM>
def handle_numbers(text):
    return re.sub(r'\d+', '<NUM>', text)

# Apply the function to all columns in the dataset
for column in data.columns:
    data[column] = data[column].apply(handle_numbers)

# Save the processed dataset
processed_path = r"C:\Users\Nishant\OneDrive\Desktop\processed_french.csv"
data.to_csv(processed_path, index=False)

print(f"Processed data with numbers handled saved to: {processed_path}")



Processed data with numbers handled saved to: C:\Users\Nishant\OneDrive\Desktop\processed_french.csv


In [17]:
import pandas as pd


In [18]:
# Define the dataset path
file_path = r"C:\Users\Nishant\OneDrive\Desktop\french.csv"

# Load the dataset
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [19]:
# Drop rows where either the source or target column has missing values
aligned_data = data.dropna()

# Display information about the aligned dataset
print(f"Original dataset size: {data.shape[0]} rows")
print(f"Aligned dataset size: {aligned_data.shape[0]} rows")
aligned_data.head()


Original dataset size: 175621 rows
Aligned dataset size: 175621 rows


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [20]:
# Save the aligned dataset to a new file
aligned_path = r"C:\Users\Nishant\OneDrive\Desktop\aligned_corpus.csv"
aligned_data.to_csv(aligned_path, index=False)

print(f"Aligned parallel corpus saved to: {aligned_path}")


Aligned parallel corpus saved to: C:\Users\Nishant\OneDrive\Desktop\aligned_corpus.csv


In [22]:
!pip install pandas langdetect pyspellchecker



Collecting langdetect
  Using cached langdetect-1.0.9.tar.gz (981 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pyspellchecker
  Using cached pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Using cached pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993254 sha256=7b401e81ca1a144bf884e9e4f13534599f6342bed90433a55b4caa9321b3867e
  Stored in directory: c:\users\nishant\appdata\local\pip\cache\wheels\0a\f2\b2\e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: pyspellchecker, langdetect
Successfully installed langdetect-1.0.9 pyspellchecker-0.8.1


In [24]:
print(data.columns)




Index(['English words/sentences', 'French words/sentences'], dtype='object')


In [27]:
import os
import pandas as pd

# Define the file path (update this with your actual file path)
file_path = r"C:\Users\Nishant\Desktop\french.csv"  # Ensure the path is correct

# Check if the file exists
if os.path.exists(file_path):
    print("File found, loading dataset...")
    # Load the dataset
    data = pd.read_csv(file_path)
else:
    print(f"Error: The file {file_path} does not exist. Please check the path.")


Error: The file C:\Users\Nishant\Desktop\french.csv does not exist. Please check the path.


In [2]:
import os
import pandas as pd

# Define the correct file path
file_path = r"C:\Users\Nishant\OneDrive\Desktop\french.csv"

# Check if the file exists
if os.path.exists(file_path):
    print("File found, loading dataset...")
    # Load the dataset
    data = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
else:
    print(f"Error: The file {file_path} does not exist. Please check the path.")


File found, loading dataset...
Dataset loaded successfully.


In [4]:
# Basic libraries
import pandas as pd
import numpy as np
import re

# Language detection
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Spell-checking
from spellchecker import SpellChecker

# Deep learning frameworks
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, TimeDistributed, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
DetectorFactory.seed = 42


In [5]:
# Load data
file_path = r"C:\Users\Nishant\OneDrive\Desktop\french.csv"
data = pd.read_csv(file_path)

# Rename columns for consistency
data.rename(columns={
    'English words/sentences': 'source', 
    'French words/sentences': 'target'
}, inplace=True)

# Display dataset preview
print(data.head())


  source      target
0    Hi.      Salut!
1   Run!     Cours !
2   Run!    Courez !
3   Who?       Qui ?
4   Wow!  Ça alors !


In [6]:
# Remove duplicate rows
data = data.drop_duplicates()
print(f"Dataset size after removing duplicates: {len(data)}")


Dataset size after removing duplicates: 175621


In [1]:
!pip install langdetect





In [2]:
from langdetect import detect


In [3]:
# Example text for language detection
text = "This is a simple sentence to detect language."

# Detect the language
language = detect(text)

# Print the detected language
print("Detected language:", language)


Detected language: en


In [4]:
texts = [
    "This is a simple sentence to detect language.",
    "Ceci est une phrase simple pour détecter la langue.",
    "Dies ist ein einfacher Satz, um die Sprache zu erkennen."
]

for text in texts:
    print(f"Text: {text}\nDetected Language: {detect(text)}\n")


Text: This is a simple sentence to detect language.
Detected Language: en

Text: Ceci est une phrase simple pour détecter la langue.
Detected Language: fr

Text: Dies ist ein einfacher Satz, um die Sprache zu erkennen.
Detected Language: de



In [6]:
def filter_sentence_length(sentences, min_length=5, max_length=50):
    filtered_sentences = [sentence for sentence in sentences if min_length <= len(sentence.split()) <= max_length]
    return filtered_sentences



In [7]:
from spellchecker import SpellChecker

def correct_spelling(sentences):
    spell = SpellChecker()
    corrected_sentences = []
    
    for sentence in sentences:
        words = sentence.split()
        corrected_words = [spell.correction(word) if word not in spell else word for word in words]
        corrected_sentences.append(" ".join(corrected_words))
    
    return corrected_sentences




In [8]:
def filter_irrelevant_sentences(sentences, keywords):
    filtered_sentences = [sentence for sentence in sentences if any(keyword in sentence for keyword in keywords)]
    return filtered_sentences


In [9]:
import re

def check_consistency(sentences, keyword="example"):
    consistent_sentences = [sentence for sentence in sentences if re.search(r'\b' + keyword + r'\b', sentence, re.IGNORECASE)]
    return consistent_sentences


In [10]:
import re

def remove_noise(sentences):
    cleaned_sentences = []
    for sentence in sentences:
        # Remove HTML tags and URLs
        sentence = re.sub(r'<.*?>', '', sentence)
        sentence = re.sub(r'http\S+', '', sentence)
        sentence = re.sub(r'[^a-zA-Z0-9\s]', '', sentence)  # Removing special characters
        cleaned_sentences.append(sentence)
    return cleaned_sentences


In [11]:
import random

def quality_control(sentences, sample_size=3):
    sample = random.sample(sentences, sample_size)
    return sample


In [12]:
def clean_data(sentences):
    sentences = filter_sentence_length(sentences)
    sentences = correct_spelling(sentences)
    sentences = filter_irrelevant_sentences(sentences, keywords=["relevant", "important"])
    sentences = check_consistency(sentences, keyword="example")
    sentences = remove_noise(sentences)
    quality_control_sample = quality_control(sentences)
    
    return sentences, quality_control_sample


In [14]:
import random

def quality_control(sentences, sample_size=3):
    # Ensure the sample size doesn't exceed the number of available sentences
    sample_size = min(sample_size, len(sentences))
    
    # Take the sample
    sample = random.sample(sentences, sample_size)
    
    return sample


In [16]:
original_data = sentences[:5]  # First 5 original sentences
cleaned_data = clean_data(sentences)[0][:5]  # First 5 cleaned sentences

print("Original Data:")
for sentence in original_data:
    print(sentence)

print("\nCleaned Data:")
for sentence in cleaned_data:
    print(sentence)


Original Data:
This is a short sentence.
This sentence is way too long and it exceeds the set word limit by a lot.
This is an adequate length.

Cleaned Data:


In [17]:
import re
from spellchecker import SpellChecker

# Function to check if data is cleaned based on various criteria
def check_data_cleanliness(original_data, cleaned_data):
    cleanliness_report = {}
    
    # 1. Check if cleaned data has no empty sentences
    cleanliness_report['empty_sentences'] = sum(1 for sentence in cleaned_data if len(sentence.strip()) == 0)
    
    # 2. Check if sentences have appropriate length (e.g., between 5 and 50 words)
    def sentence_length_check(sentences):
        return sum(1 for sentence in sentences if len(sentence.split()) < 5 or len(sentence.split()) > 50)

    cleanliness_report['invalid_length_sentences'] = sentence_length_check(cleaned_data)
    
    # 3. Check for spelling errors in cleaned data
    spell = SpellChecker()
    def check_spelling_errors(sentences):
        errors = 0
        for sentence in sentences:
            words = sentence.split()
            misspelled = spell.unknown(words)
            errors += len(misspelled)
        return errors

    cleanliness_report['spelling_errors'] = check_spelling_errors(cleaned_data)
    
    # 4. Check if noise like HTML tags or URLs are present in the cleaned data
    def check_noise(sentences):
        noise_count = 0
        for sentence in sentences:
            if re.search(r'<.*?>|http[s]?://\S+', sentence):  # Looks for HTML tags or URLs
                noise_count += 1
        return noise_count
    
    cleanliness_report['noise_in_data'] = check_noise(cleaned_data)
    
    # 5. Compare if there are sentences removed due to irrelevance
    cleanliness_report['removed_irrelevant_sentences'] = len(original_data) - len(cleaned_data)
    
    return cleanliness_report

# Example usage:
original_data = [
    "This is a good sentence.",
    "This sentence has a spelling mistakee.",
    "www.example.com is a website.",
    "Too short.",
    "This sentence is irrelevant and will be removed.",
    "Another normal sentence that will stay."
]

# Assume `clean_data()` is the function that cleans the data and returns the cleaned version
cleaned_data = [
    "This is a good sentence.",
    "This sentence has a spelling mistakee.",
    "Another normal sentence that will stay."
]

# Check the cleanliness of the data
cleanliness_report = check_data_cleanliness(original_data, cleaned_data)
print(cleanliness_report)


{'empty_sentences': 0, 'invalid_length_sentences': 0, 'spelling_errors': 3, 'noise_in_data': 0, 'removed_irrelevant_sentences': 3}


In [19]:
from spellchecker import SpellChecker

# Function to correct spelling errors in a sentence
def correct_spelling(sentences):
    spell = SpellChecker()
    corrected_sentences = []
    
    for sentence in sentences:
        words = sentence.split()  # Split sentence into words
        corrected_words = []
        
        for word in words:
            # Check if the word is spelled correctly
            corrected_word = spell.correction(word)  # Get the most likely correction
            corrected_words.append(corrected_word)
        
        corrected_sentence = ' '.join(corrected_words)  # Rejoin words into a sentence
        corrected_sentences.append(corrected_sentence)
    
    return corrected_sentences



In [21]:
# 1. Count spelling errors before correction
original_error_count = count_spelling_errors(sentences)

# 2. Correct spelling errors
corrected_sentences = correct_spelling(sentences)

# 3. Count spelling errors after correction
corrected_error_count = count_spelling_errors(corrected_sentences)

# 4. Compare the results
print(f"Original spelling errors: {original_error_count}")
print(f"Corrected spelling errors: {corrected_error_count}")

if original_error_count > corrected_error_count:
    print("Spelling errors have been successfully corrected.")
else:
    print("No significant improvement in spelling error correction.")


Original spelling errors: 4
Corrected spelling errors: 0
Spelling errors have been successfully corrected.


In [6]:
!pip install pandas transformers datasets torch




In [7]:
import pandas as pd
from datasets import Dataset

# Load the CSV file
data_path = r"C:\Users\Nishant\OneDrive\Desktop\french.csv"
df = pd.read_csv(data_path)

# Preview the data
print(df.head())

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)  # Split into train (90%) and test (10%)


  English words/sentences French words/sentences
0                     Hi.                 Salut!
1                    Run!                Cours !
2                    Run!               Courez !
3                    Who?                  Qui ?
4                    Wow!             Ça alors !


In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load pre-trained tokenizer and model
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [12]:
print(df.columns)



Index(['English words/sentences', 'French words/sentences'], dtype='object')


In [13]:
def preprocess_function(examples):
    # Use the correct column names
    inputs = examples['English words/sentences']  # Source language column
    targets = examples['French words/sentences']  # Target language column
    model_inputs = tokenizer(inputs, text_target=targets, truncation=True, padding=True)
    return model_inputs


In [14]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/158058 [00:00<?, ? examples/s]

Map:   0%|          | 0/17563 [00:00<?, ? examples/s]

In [15]:
from torch.utils.data import DataLoader

# Create DataLoaders
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


In [2]:
!pip install transformers torch




In [3]:
# import numpy as np
# import pandas as pd
# import torch
# from torch.utils.data import Dataset, DataLoader
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
# 
# 

In [4]:
import pandas as pd

# Load and inspect the cleaned data
data_path = r"C:\Users\Nishant\cleaned_both_languages.csv"
df = pd.read_csv(data_path)

# Display the first few rows
print(df.head())

# Display column names
print("Column names:", df.columns.tolist())


  English words/sentences     French words/sentences
0         I am a shy boy.  Je suis un garçon timide.
1         I am in a spot.    Je suis dans le pétrin.
2         I am in a spot.   Je suis dans un endroit.
3         I had to do it.     Il m'a fallu le faire.
4         I saw it on TV.      Je l'ai vu à la télé.
Column names: ['English words/sentences', 'French words/sentences']


In [6]:
pip install sacremoses


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting click (from sacremoses)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
   ---------------------------------------- 0.0/897.5 kB ? eta -:--:--
   ----------------------- ---------------- 524.3/897.5 kB 2.1 MB/s eta 0:00:01
   ---------------------------------------- 897.5/897.5 kB 2.0 MB/s eta 0:00:00
Downloading click-8.1.7-py3-none-any.whl (97 kB)
Installing collected packages: click, sacremoses
Successfully installed click-8.1.7 sacremoses-0.1.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import sacremoses
print("Sacremoses installed successfully!")


Sacremoses installed successfully!


In [9]:
# Load dataset
data_path = r"C:\Users\Nishant\cleaned_both_languages.csv"
df = pd.read_csv(data_path)

# Extract source and target texts
source_texts = df['English words/sentences'].tolist()  # Source: English
target_texts = df['French words/sentences'].tolist()   # Target: French

# Load tokenizer for English-to-French translation
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")  # Example: English-to-French model

# Tokenize source and target texts
source_encodings = tokenizer(source_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
target_encodings = tokenizer(target_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")


In [10]:
class TranslationDataset(Dataset):
    def __init__(self, source_encodings, target_encodings):
        self.source_encodings = source_encodings
        self.target_encodings = target_encodings

    def __len__(self):
        return len(self.source_encodings['input_ids'])

    def __getitem__(self, idx):
        source_item = {key: val[idx] for key, val in self.source_encodings.items()}
        target_item = {key: val[idx] for key, val in self.target_encodings.items()}
        source_item['labels'] = target_item['input_ids']
        return source_item

dataset = TranslationDataset(source_encodings, target_encodings)


In [16]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"


In [17]:
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")


In [18]:
# from transformers import Seq2SeqTrainingArguments
# 
# training_args = Seq2SeqTrainingArguments(
#     output_dir="./results",             # Save model checkpoints
#     evaluation_strategy="epoch",       # Evaluate model at the end of each epoch
#     learning_rate=5e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     save_total_limit=2,
#     predict_with_generate=True,        # Enable text generation during evaluation
#     logging_dir='./logs',              # Log directory
#     logging_steps=10,
# )
# 
# 
# 

In [22]:
# from transformers import Trainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
# 
# # Set up training arguments (as before)
# training_args = Seq2SeqTrainingArguments(
#     output_dir="./results",             # Save model checkpoints
#     eval_strategy="epoch",              # Evaluate model at the end of each epoch
#     learning_rate=5e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     save_total_limit=2,
#     predict_with_generate=True,         # Enable text generation during evaluation
#     logging_dir='./logs',               # Log directory
#     logging_steps=10,
# )
# 
# # Create the data collator (handles tokenization and padding)
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# 
# # Create the Trainer instance without passing `tokenizer`
# trainer = Trainer(
#     model=model,                        # The model to train
#     args=training_args,                 # The training arguments
#     train_dataset=dataset,              # The training dataset
#     eval_dataset=dataset,               # The evaluation dataset (or validation set)
#     data_collator=data_collator,        # Data collator handles tokenization and padding
#     processing_class=DataCollatorForSeq2Seq  # NEW: Add this line for processing class (no tokenizer needed)
# )
# 

In [36]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
import pandas as pd

# Load the tokenizer (you can use a different model as required)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Assuming 'dataset' is a dictionary containing 'train' and 'test' splits, 
# and both 'train' and 'test' are pandas DataFrames.

# Convert pandas DataFrames to Hugging Face Datasets
train_df = pd.DataFrame({
    'English words/sentences': ['Hello', 'How are you?', 'Goodbye'],  # Example data
    'Target': ['Hola', '¿Cómo estás?', 'Adiós']  # Example translations
})

test_df = pd.DataFrame({
    'English words/sentences': ['What is your name?', 'Where are you from?'],
    'Target': ['¿Cuál es tu nombre?', '¿De dónde eres?']
})

# Convert DataFrames to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict object
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Define the tokenization function
def tokenize_function(examples):
    return tokenizer(examples['English words/sentences'], padding="max_length", truncation=True, max_length=128)

# Apply tokenization with map() function (batch-wise)
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Inspect the tokenized datasets
print(tokenized_datasets['train'][0])
print(tokenized_datasets['test'][0])





Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

{'English words/sentences': 'Hello', 'Target': 'Hola', 'input_ids': [101, 7592, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [41]:
import pandas as pd
from datasets import Dataset

# Load your data (make sure this is a valid DataFrame)
train_df = pd.DataFrame({
    "English words/sentences": ["Hello", "How are you?", "Good morning"],
    "Target": ["Hola", "¿Cómo estás?", "Buenos días"]
})
test_df = pd.DataFrame({
    "English words/sentences": ["Good night", "Thank you", "Goodbye"],
    "Target": ["Buenas noches", "Gracias", "Adiós"]
})

# Convert DataFrame to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Check the dataset structure
print(train_dataset)




Dataset({
    features: ['English words/sentences', 'Target'],
    num_rows: 3
})


In [42]:
from transformers import AutoTokenizer

# Load the tokenizer (BERT example)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function that tokenizes both input and target columns
def tokenize_function(examples):
    source_encodings = tokenizer(examples['English words/sentences'], padding='max_length', truncation=True, max_length=128)
    target_encodings = tokenizer(examples['Target'], padding='max_length', truncation=True, max_length=128)
    
    # Add the target as the 'labels' for training
    source_encodings['labels'] = target_encodings['input_ids']
    
    return source_encodings

# Apply the tokenization to the train and test datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Check the tokenized dataset
print(train_dataset[0])  # Check a sample after tokenization


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

{'English words/sentences': 'Hello', 'Target': 'Hola', 'input_ids': [101, 7592, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [43]:
from transformers import DataCollatorForSeq2Seq

# Create a DataCollator to handle padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)

# Check a sample to confirm the correct padding and labels
print(train_dataset[0])  # Check the first sample


{'English words/sentences': 'Hello', 'Target': 'Hola', 'input_ids': [101, 7592, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [44]:
# from transformers import AutoModelForSeq2SeqLM, Trainer, TrainingArguments
# 
# # Load a pre-trained model (example: BART)
# model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
# 
# # Set up training arguments
# training_args = TrainingArguments(
#     output_dir="./results",  # Output directory for model checkpoints
#     evaluation_strategy="epoch",  # Evaluate after each epoch
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )
# 
# # Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
# )
# 
# # Train the model
# trainer.train()
# 

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,14.804253
2,No log,11.62213
3,No log,8.572099




TrainOutput(global_step=3, training_loss=13.149733225504557, metrics={'train_runtime': 229.5207, 'train_samples_per_second': 0.039, 'train_steps_per_second': 0.013, 'total_flos': 2437992677376.0, 'train_loss': 13.149733225504557, 'epoch': 3.0})

In [45]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"


In [48]:
# import os
# from transformers import Trainer, TrainingArguments, BartForConditionalGeneration, BartTokenizer
# from datasets import Dataset
# 
# # Disable the symlink warning (optional)
# os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
# 
# # Load the dataset (assuming it's a pandas DataFrame)
# dataset = {'train': train_df, 'test': test_df}
# 
# # Convert to Hugging Face Dataset format
# train_dataset = Dataset.from_pandas(dataset['train'])
# test_dataset = Dataset.from_pandas(dataset['test'])
# 
# # Load model and tokenizer
# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# 
# # Set generation parameters
# model.config.max_length = 142
# model.config.min_length = 56
# model.config.num_beams = 4
# model.config.length_penalty = 2.0
# model.config.no_repeat_ngram_size = 3
# model.config.forced_bos_token_id = 0
# 
# # Define the training arguments
# training_args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     weight_decay=0.01
# )
# 
# # Tokenization function
# def tokenize_function(examples):
#     # Tokenize the source and target
#     tokenized_inputs = tokenizer(examples['English words/sentences'], padding="max_length", truncation=True, max_length=128)
#     tokenized_targets = tokenizer(examples['Target'], padding="max_length", truncation=True, max_length=128)
#     
#     # Add the 'labels' key to the dictionary
#     tokenized_inputs['labels'] = tokenized_targets['input_ids']
#     return tokenized_inputs
# 
# # Apply the tokenization
# train_dataset = train_dataset.map(tokenize_function, batched=True)
# test_dataset = test_dataset.map(tokenize_function, batched=True)
# 
# # Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset
# )
# 
# # Train the model
# trainer.train()
# 
# 
# 

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,10.94917
2,No log,10.679952
3,No log,10.41141




TrainOutput(global_step=3, training_loss=11.25826899210612, metrics={'train_runtime': 156.1462, 'train_samples_per_second': 0.058, 'train_steps_per_second': 0.019, 'total_flos': 2437992677376.0, 'train_loss': 11.25826899210612, 'epoch': 3.0})

In [51]:
# from transformers import BartForConditionalGeneration, BartTokenizer
# 
# # Load the model and tokenizer
# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# 
# # Set generation parameters in the model's config
# model.config.max_length = 142  # Set a reasonable max_length
# model.config.min_length = 56
# model.config.num_beams = 4
# model.config.length_penalty = 2.0
# model.config.no_repeat_ngram_size = 3
# model.config.forced_bos_token_id = 0
# model.config.early_stopping = True
# 
# # Your input sentence
# input_text = "Your input sentence here."
# 
# # Tokenize the input text with explicit max_length during tokenization
# inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
# 
# # Generate text (summary, translation, etc.)
# generated_ids = model.generate(
#     inputs['input_ids'],
#     attention_mask=inputs['attention_mask'],
#     max_length=142  # Set max_length for generation to control output size
# )
# 
# # Decode the generated text
# generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
# print(generated_text)
# 
# 
# 

Your input sentence here. Your input sentence of the day will appear at the bottom of this article. Please submit your input sentence by Friday at 8:30 a.m. ET. For confidential support call the Samaritans on 08457 90 90 90 or visit a local Samaritans branch, see www.samaritans.org.


In [52]:
# from transformers import BartForConditionalGeneration, BartTokenizer
# 
# # Load the pre-trained BART model and tokenizer
# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# 
# # Simple input text (e.g., a news article or a short paragraph)
# input_text = """
# The quick brown fox jumps over the lazy dog. It is a common phrase used in typing tests.
# The phrase contains all the letters of the English alphabet, making it useful for testing typewriters, keyboards, and fonts.
# """
# 
# # Tokenize the input text (with truncation and padding as necessary)
# inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
# 
# # Generate summary with the model
# generated_ids = model.generate(
#     inputs['input_ids'],
#     attention_mask=inputs['attention_mask'],
#     max_length=50,  # Maximum length of the summary
#     num_beams=4,  # Use beam search for better results
#     length_penalty=2.0,  # Penalize longer summaries
#     early_stopping=True  # Stop if the summary is complete
# )
# 
# # Decode the generated output (summary)
# generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
# 
# # Print the summary
# print("Summary:", generated_summary)
# 



Summary: The quick brown fox jumps over the lazy dog is a common phrase used in typing tests. The phrase contains all the letters of the English alphabet, making it useful for testing typewriters, keyboards, and fonts. It is also used


In [53]:
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)


In [54]:
# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# 

In [60]:
# from transformers import BartForConditionalGeneration, Trainer, TrainingArguments
# from datasets import Dataset
# from transformers import BartTokenizer
# 
# # Step 1: Load the model and tokenizer
# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
# 
# # Step 2: Prepare the data (Assume `train_df` is already loaded as a pandas DataFrame)
# train_df = {
#     "text": [
#         "Translate this sentence.",
#         "This is a second sentence."
#     ],
#     "target": [
#         "Traduce esta oración.",
#         "Esta es una segunda oración."
#     ]
# }
# 
# # Convert to Hugging Face Dataset format
# train_dataset = Dataset.from_dict(train_df)
# 
# # Step 3: Split dataset into train and validation sets (80-20 split)
# train_dataset, eval_dataset = train_dataset.train_test_split(test_size=0.2).values()
# 
# # Step 4: Tokenize the datasets (Include 'labels' as the target)
# def tokenize_function(examples):
#     inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)
#     targets = tokenizer(examples['target'], padding="max_length", truncation=True, max_length=512)
#     inputs['labels'] = targets['input_ids']  # Add target input_ids as labels
#     return inputs
# 
# # Tokenize both the train and eval datasets
# train_dataset = train_dataset.map(tokenize_function, batched=True)
# eval_dataset = eval_dataset.map(tokenize_function, batched=True)
# 
# # Step 5: Define Training Arguments
# training_args = TrainingArguments(
#     output_dir='./results',  # Output directory for model checkpoints
#     evaluation_strategy="epoch",  # Evaluate after each epoch
#     learning_rate=2e-5,  # Learning rate
#     per_device_train_batch_size=8,  # Batch size for training
#     per_device_eval_batch_size=8,  # Batch size for evaluation
#     num_train_epochs=3,  # Number of training epochs
# )
# 
# # Step 6: Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,  # Tokenized training dataset
#     eval_dataset=eval_dataset,  # Tokenized validation dataset
# )
# 
# # Step 7: Train the model
# trainer.train()
# 
# 
#  
# 

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,10.41086
2,No log,10.41858
3,No log,10.258078




TrainOutput(global_step=3, training_loss=10.923394521077475, metrics={'train_runtime': 165.8148, 'train_samples_per_second': 0.018, 'train_steps_per_second': 0.018, 'total_flos': 3250656903168.0, 'train_loss': 10.923394521077475, 'epoch': 3.0})

In [61]:
# from transformers import BartForConditionalGeneration, BartTokenizer, GenerationConfig
# 
# # Load model and tokenizer
# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# 
# # Create a generation config with the parameters
# gen_config = GenerationConfig(
#     max_length=142,
#     min_length=56,
#     early_stopping=True,
#     num_beams=4,
#     length_penalty=2.0,
#     no_repeat_ngram_size=3,
#     forced_bos_token_id=0
# )
# 
# # Example generation
# input_text = "Translate this sentence"
# input_ids = tokenizer(input_text, return_tensors="pt").input_ids
# 
# output = model.generate(input_ids, generation_config=gen_config)
# print(tokenizer.decode(output[0], skip_special_tokens=True))
# 

Translate this sentence into English. Use the weekly Newsquiz to test your knowledge of stories you saw on CNN.com. Today's News Quiz includes the question, "What do you know about the U.S. military?" The answer, of course, is "I know a lot of things."


In [63]:
# trainer.train()
# 

Epoch,Training Loss,Validation Loss
1,No log,9.075006
2,No log,8.808546
3,No log,8.647938


TrainOutput(global_step=3, training_loss=9.185171127319336, metrics={'train_runtime': 178.4206, 'train_samples_per_second': 0.017, 'train_steps_per_second': 0.017, 'total_flos': 3250656903168.0, 'train_loss': 9.185171127319336, 'epoch': 3.0})

In [64]:
# eval_results = trainer.evaluate(eval_dataset)  # Or test_dataset
# print(eval_results)
# 

{'eval_loss': 8.647937774658203, 'eval_runtime': 3.0596, 'eval_samples_per_second': 0.327, 'eval_steps_per_second': 0.327, 'epoch': 3.0}


In [65]:
model.save_pretrained('./my_model')
tokenizer.save_pretrained('./my_model')




('./my_model\\tokenizer_config.json',
 './my_model\\special_tokens_map.json',
 './my_model\\vocab.json',
 './my_model\\merges.txt',
 './my_model\\added_tokens.json')

In [68]:
# from transformers import BartForConditionalGeneration, BartTokenizer, GenerationConfig
# 
# # Load model and tokenizer
# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# 
# # Example of setting generation parameters using GenerationConfig
# generation_config = GenerationConfig(
#     max_length=142,  # Max length of generated text
#     min_length=56,   # Minimum length of generated text
#     early_stopping=True,
#     num_beams=4,
#     length_penalty=2.0,
#     no_repeat_ngram_size=3,
#     forced_bos_token_id=0
# )
# 
# # You can now use generation_config while generating
# inputs = tokenizer("Your input sentence here.", return_tensors="pt")
# output = model.generate(inputs["input_ids"], generation_config=generation_config)
# 
# # Decode the output
# decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
# print(decoded_output)
# 
# 
# 

Your input sentence here. Your input sentence of the day will appear at the bottom of this article. Please submit your input sentence by Friday at 8:30 a.m. ET. For confidential support call the Samaritans on 08457 90 90 90 or visit a local Samaritans branch, see www.samaritans.org.


In [70]:
import os

saved_model_path = './my_saved_model'
if os.path.exists(saved_model_path):
    print(f"Model is saved at: {os.path.abspath(saved_model_path)}")
else:
    print("Model path does not exist.")



Model path does not exist.


In [71]:
model.save_pretrained('./my_saved_model')




In [72]:
import os

# Check if the directory exists
model_save_path = './my_saved_model'
if os.path.exists(model_save_path):
    print(f"Model saved at: {os.path.abspath(model_save_path)}")
else:
    print(f"Model path '{model_save_path}' does not exist. Please check the path.")


Model saved at: C:\Users\Nishant\my_saved_model


In [74]:
# from transformers import BartTokenizer
# 
# # Load the tokenizer from the pre-trained model (before saving it)
# tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# 
# # Save the tokenizer to the same directory where the model is saved
# tokenizer.save_pretrained(model_save_path)
# 
# print("Tokenizer saved successfully!")
# 
# 

Tokenizer saved successfully!


In [75]:
# from transformers import BartForConditionalGeneration, BartTokenizer
# 
# # Path to the saved model
# model_save_path = r'C:\Users\Nishant\my_saved_model'
# 
# # Load the saved model and tokenizer
# model = BartForConditionalGeneration.from_pretrained(model_save_path)
# tokenizer = BartTokenizer.from_pretrained(model_save_path)
# 
# print("Model and tokenizer loaded successfully!")
# 

Model and tokenizer loaded successfully!


In [76]:
input_text = "This is a simple test sentence."


In [78]:
# Tokenize the input text with an explicit max_length
input_text = "This is a simple test sentence."
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)


In [80]:
# Generate the translated output
translated_ids = model.generate(inputs['input_ids'], max_length=50, num_beams=4, early_stopping=True)

# Decode the output tokens to get the translated text
translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

print(f"Translated Text: {translated_text}")


Translated Text: This is a simple test sentence.


In [81]:
from transformers import MarianMTModel, MarianTokenizer

# Load pre-trained translation model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-fr'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)


In [82]:
# Example input text (English)
input_text = "This is a simple test sentence."

# Tokenize the input sentence
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)


In [83]:
# Generate translation (French)
translated_ids = model.generate(inputs['input_ids'], max_length=50, num_beams=4, early_stopping=True)


In [84]:
# Decode and print the translated text
translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
print(f"Translated Text: {translated_text}")


Translated Text: C'est une simple phrase d'essai.


In [88]:
 from transformers import MarianMTModel, MarianTokenizer

# Load the MarianMT model and tokenizer for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Function to translate from English to French
def translate_english_to_french(english_sentence):
    # Tokenize the input English sentence
    inputs = tokenizer(english_sentence, return_tensors="pt", padding=True, truncation=True)

    # Generate the translation (French) from the model
    translated_ids = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)

    # Decode the translated sentence
    french_translation = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    
    return french_translation

# Example input
english_sentence = "How are you today?"

# Translate to French
french_sentence = translate_english_to_french(english_sentence)

print("English: ", english_sentence)
print("French: ", french_sentence)



English:  How are you today?
French:  Comment allez-vous aujourd'hui ?


In [1]:
 from transformers import MarianMTModel, MarianTokenizer

# Load the MarianMT model and tokenizer for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Function to translate from English to French
def translate_english_to_french(english_sentence):
    # Tokenize the input English sentence
    inputs = tokenizer(english_sentence, return_tensors="pt", padding=True, truncation=True)

    # Generate the translation (French) from the model
    translated_ids = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)

    # Decode the translated sentence
    french_translation = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    
    return french_translation

# Example input
english_sentence = "I am a shy boy."

# Translate to French
french_sentence = translate_english_to_french(english_sentence)

print("English: ", english_sentence)
print("French: ", french_sentence)



English:  I am a shy boy.
French:  Je suis un garçon timide.


In [89]:
print(model.config)


MarianConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "Helsinki-NLP/opus-mt-en-fr",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {


In [1]:
from transformers import MarianMTModel, MarianTokenizer

# Load the MarianMT model and tokenizer for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Function to translate from English to French
def translate_english_to_french(english_sentence):
    # Tokenize the input English sentence
    inputs = tokenizer(english_sentence, return_tensors="pt", padding=True, truncation=True)

    # Generate the translation (French) from the model
    translated_ids = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)

    # Decode the translated sentence
    french_translation = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    
    return french_translation

# Example input
english_sentence = "I hate you."

# Translate to French
french_sentence = translate_english_to_french(english_sentence)

print("English: ", english_sentence)
print("French: ", french_sentence)



English:  I hate you.
French:  Je te déteste.


In [2]:
from transformers import MarianMTModel, MarianTokenizer

# Load the MarianMT model and tokenizer for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Function to translate from English to French
def translate_english_to_french(english_sentence):
    # Tokenize the input English sentence
    inputs = tokenizer(english_sentence, return_tensors="pt", padding=True, truncation=True)

    # Generate the translation (French) from the model
    translated_ids = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)

    # Decode the translated sentence
    french_translation = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    
    return french_translation

# Example input
english_sentence = "I miss you."

# Translate to French
french_sentence = translate_english_to_french(english_sentence)

print("English: ", english_sentence)
print("French: ", french_sentence)



English:  I miss you.
French:  Tu me manques.


In [3]:
from transformers import MarianMTModel, MarianTokenizer

# Load the MarianMT model and tokenizer for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Function to translate from English to French
def translate_english_to_french(english_sentence):
    # Tokenize the input English sentence
    inputs = tokenizer(english_sentence, return_tensors="pt", padding=True, truncation=True)

    # Generate the translation (French) from the model
    translated_ids = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)

    # Decode the translated sentence
    french_translation = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    
    return french_translation

# Example input
english_sentence = "My name is Aditya."

# Translate to French
french_sentence = translate_english_to_french(english_sentence)

print("English: ", english_sentence)
print("French: ", french_sentence)



English:  My name is Aditya.
French:  Mon nom est Aditya.


In [6]:
# import nbformat
# 
# # Path to your notebook
# notebook_path = 'Untitled1.ipynb'  # Replace with the actual notebook path if not in the same directory
# 
# # Load the notebook file
# with open(notebook_path, 'r', encoding='utf-8') as f:
#     notebook_content = nbformat.read(f, as_version=4)
# 
# # List of keywords related to BART and other models you want to check
# keywords = ['BartForConditionalGeneration', 'BartTokenizer', 'facebook/bart', 'Trainer', 'TrainingArguments']
# 
# # Initialize a list to store the indices of irrelevant cells (containing BART code)
# unused_cells = []
# 
# # Iterate through all cells in the notebook
# for idx, cell in enumerate(notebook_content.cells):
#     if cell.cell_type == "code":  # Check only code cells
#         code_content = cell.source
#         # Check if any of the keywords are found in the code content
#         if any(keyword.lower() in code_content.lower() for keyword in keywords):
#             unused_cells.append(idx + 1)  # Store the kernel number (Jupyter starts counting from 1)
# 
# # Output the kernel numbers of cells containing BART code
# print("Unused cells (containing BART-related code):", unused_cells)
# 
# 

Unused cells (containing BART-related code): [9, 49, 57, 58, 63, 65, 66, 67, 69, 70, 71, 72, 73, 75, 79, 80, 94]


In [2]:
from transformers import MarianMTModel, MarianTokenizer

# Load the MarianMT model and tokenizer for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Function to translate from English to French
def translate_english_to_french(english_sentence):
    # Tokenize the input English sentence
    inputs = tokenizer(english_sentence, return_tensors="pt", padding=True, truncation=True)

    # Generate the translation (French) from the model
    translated_ids = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)

    # Decode the translated sentence
    french_translation = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    
    return french_translation

# Example input
english_sentence = "I am sad."

# Translate to French
french_sentence = translate_english_to_french(english_sentence)

print("English: ", english_sentence)
print("French: ", french_sentence)


English:  I am sad.
French:  Je suis triste.


In [6]:
import pandas as pd

# Load the dataset
data = pd.read_csv('C:/Users/Nishant/cleaned_both_languages.csv')

# Print the column names
print(data.columns)


Index(['English words/sentences', 'French words/sentences'], dtype='object')


In [7]:
import pandas as pd

# Load the dataset
data = pd.read_csv('C:/Users/Nishant/cleaned_both_languages.csv')

# Define the English sentence to search for
english_input = "Technology has transformed the way we communicate and work. It has made it possible for people from different parts of the world to connect instantly. With the rapid advancements in artificial intelligence, the future promises even greater innovations. However, it is important to balance technological progress with social and ethical considerations to ensure a better future for everyone."

# Check if the sentence exists in the 'English words/sentences' column of your dataset
if english_input in data['English words/sentences'].values:
    print("The English sentence is present in the dataset.")
else:
    print("The English sentence is not present in the dataset.")


The English sentence is not present in the dataset.


In [3]:
!pip install nltk




In [5]:
from nltk.translate.bleu_score import corpus_bleu

# Ground truth (reference translation) and model output (hypothesis)
references = [["mon nom est aditya".split()]]  # Reference: Human-translated sentence
hypotheses = ["mon nom est aditya".split()]   # Hypothesis: Model-generated sentence

# Compute BLEU score
bleu_score = corpus_bleu(references, hypotheses)
print(f"Corpus BLEU Score: {bleu_score:.4f}")


Corpus BLEU Score: 1.0000


In [2]:
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from transformers import MarianMTModel, MarianTokenizer

# Load dataset
file_path = r"C:\Users\Nishant\OneDrive\Desktop\french.csv"
dataset = pd.read_csv(file_path)

# Load MarianMT model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-fr'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Function to translate a small batch of sentences
def translate(sentences):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(inputs['input_ids'], max_length=512, num_beams=4, early_stopping=True)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# Limit to a small subset for testing
subset = dataset['English words/sentences'][:10]  # First 10 sentences for testing

# Generate translations for the subset
model_outputs = translate(list(subset))

# Prepare references (ground truth translations)
references = [[ref.split()] for ref in dataset['French words/sentences'][:10]]

# Prepare hypotheses (model-generated translations)
hypotheses = [output.split() for output in model_outputs]

# Use SmoothingFunction to prevent zero BLEU scores
smooth = SmoothingFunction().method4

# Calculate BLEU score with smoothing
bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smooth)
print(f"Corpus BLEU Score: {bleu_score:.4f}")



Corpus BLEU Score: 0.0627


In [3]:
import pandas as pd

# Load preprocessed data files
normalized_data = pd.read_csv(r'C:\Users\Nishant\OneDrive\Desktop\normalized_tokenized_data.csv')
cleaned_data = pd.read_csv(r'C:\Users\Nishant\cleaned_both_languages.csv')

# Display the first few rows of each to verify
print("Normalized Data:\n", normalized_data.head())
print("\nCleaned Data:\n", cleaned_data.head())


Normalized Data:
   English words/sentences French words/sentences English Tokens  \
0                     Hi.                 Salut!         ['hi']   
1                    Run!                Cours !        ['run']   
2                    Run!               Courez !        ['run']   
3                    Who?                  Qui ?        ['who']   
4                    Wow!             Ça alors !        ['wow']   

    French Tokens  
0       ['salut']  
1       ['cours']  
2      ['courez']  
3         ['qui']  
4  ['a', 'alors']  

Cleaned Data:
   English words/sentences     French words/sentences
0         I am a shy boy.  Je suis un garçon timide.
1         I am in a spot.    Je suis dans le pétrin.
2         I am in a spot.   Je suis dans un endroit.
3         I had to do it.     Il m'a fallu le faire.
4         I saw it on TV.      Je l'ai vu à la télé.


In [4]:
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from transformers import MarianMTModel, MarianTokenizer

# Load cleaned data
file_path_cleaned = r'C:\Users\Nishant\cleaned_both_languages.csv'
dataset = pd.read_csv(file_path_cleaned)

# Load MarianMT model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-fr'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Function to translate a batch of sentences
def translate(sentences):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(inputs['input_ids'], max_length=512, num_beams=4, early_stopping=True)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# Generate translations for the first 10 sentences for testing
subset = dataset['English words/sentences'][:10]
model_outputs = translate(list(subset))

# Prepare references (ground truth translations)
references = [[ref.split()] for ref in dataset['French words/sentences'][:10]]

# Prepare hypotheses (model-generated translations)
hypotheses = [output.split() for output in model_outputs]

# Use SmoothingFunction to prevent zero BLEU scores
smooth = SmoothingFunction().method4

# Calculate BLEU score with smoothing
bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smooth)
print(f"Corpus BLEU Score: {bleu_score:.4f}")


Corpus BLEU Score: 0.7821


In [5]:
import streamlit as st
import pandas as pd
from io import StringIO
import os

# Set up the title and description of the app
st.title("Upload Any File")
st.write("Please upload any file (CSV, TXT, DOCX, PDFs, Images, etc.).")

# Add file uploader widget for any type of file
uploaded_file = st.file_uploader("Choose a file", type=None)  # Accepts any file type

# Check if a file has been uploaded
if uploaded_file is not None:
    # Get the file name and extension
    file_name = uploaded_file.name
    file_extension = os.path.splitext(file_name)[1].lower()
    
    # Display file details
    st.write(f"File Name: {file_name}")
    st.write(f"File Extension: {file_extension}")
    
    # Handling CSV file separately for processing
    if file_extension == '.csv':
        # If the file is a CSV, read it into a pandas DataFrame
        dataset = pd.read_csv(uploaded_file)
        st.write("Uploaded CSV Data:")
        st.write(dataset.head())

    # Handling TXT file
    elif file_extension == '.txt':
        # If it's a text file, read the content and display it
        file_content = uploaded_file.getvalue().decode("utf-8")
        st.write("Uploaded TXT File Content:")
        st.text(file_content)

    # Handling other file types (e.g., DOCX, PDF, images)
    elif file_extension in ['.docx', '.pdf']:
        st.write(f"Sorry, we do not support processing {file_extension} files yet.")
        
    else:
        # For other file types, simply display the file type and size
        st.write(f"Uploaded {file_extension} file, file size: {uploaded_file.size} bytes.")
        st.write("File content preview is not available for this type.")
    
    # Optionally, allow the user to download the uploaded file
    st.download_button(
        label="Download Uploaded File",
        data=uploaded_file,
        file_name=file_name,
        mime="application/octet-stream"
    )


2024-12-14 20:55:35.727 
  command:

    streamlit run C:\Users\Nishant\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [3]:
!pip install matplotlib




In [6]:
from transformers import MarianMTModel, MarianTokenizer

# Load the pre-trained MarianMT model for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'  # Example: English to French translation
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Example sentence in English
sentence = "Hello, how are you?"

# Tokenize the input sentence
inputs = tokenizer.encode(sentence, return_tensors="pt")

# Perform translation (generate the output in French)
translated = model.generate(inputs, max_length=50)

# Decode the output to get the translated text
translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)

# Print the translated sentence
print(f"Original: {sentence}")
print(f"Translated: {translated_text}")

 

Original: Hello, how are you?
Translated: Bonjour, comment allez-vous ?


In [9]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, TensorDataset
import time

# Example data (replace with actual data)
X_train = ["Hello, how are you?"] * 100  # Example training data (list of sentences)
y_train = ["Hola, ¿cómo estás?"] * 100   # Example target data (list of sentences)

# Load MarianMT Model and Tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-es'  # Example model (English to Spanish)
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Set device (use GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Prepare DataLoader (convert to tensors)
X_train_encoded = [tokenizer.encode(text, padding=True, truncation=True) for text in X_train]
y_train_encoded = [tokenizer.encode(text, padding=True, truncation=True) for text in y_train]

X_train_tensor = torch.tensor(X_train_encoded)
y_train_tensor = torch.tensor(y_train_encoded)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Set optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Start tracking time
start_time = time.time()

# Training loop (just for checking)
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_idx, (input_ids, labels) in enumerate(train_loader):
        input_ids, labels = input_ids.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Print batch info
        if batch_idx % 10 == 0:  # Print every 10th batch
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# End tracking time
end_time = time.time()
training_time = end_time - start_time
print(f"Training time: {training_time:.2f} seconds")


config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

Epoch 1, Batch 0, Loss: 4.3054
Epoch 1/3, Average Loss: 3.4114
Epoch 2, Batch 0, Loss: 2.1993
Epoch 2/3, Average Loss: 1.7297
Epoch 3, Batch 0, Loss: 1.1231
Epoch 3/3, Average Loss: 0.8645
Training time: 27.83 seconds


In [10]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, TensorDataset
import time

# Example data (replace with actual data)
X_train = ["Hello, how are you?"] * 100  # Example training data (list of sentences)
y_train = ["Hola, ¿cómo estás?"] * 100   # Example target data (list of sentences)

# Load MarianMT Model and Tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-es'  # Example model (English to Spanish)
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Set device (use GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Prepare DataLoader (convert to tensors)
X_train_encoded = [tokenizer.encode(text, padding=True, truncation=True) for text in X_train]
y_train_encoded = [tokenizer.encode(text, padding=True, truncation=True) for text in y_train]

X_train_tensor = torch.tensor(X_train_encoded)
y_train_tensor = torch.tensor(y_train_encoded)

batch_size = 64  # Change batch size to 64 or desired number
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Set optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Start tracking time
start_time = time.time()

# Training loop (just for checking)
epochs = 20  # Set the number of epochs to 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_idx, (input_ids, labels) in enumerate(train_loader):
        input_ids, labels = input_ids.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Print batch info
        if batch_idx % 10 == 0:  # Print every 10th batch
            print(f"Batch Size: {batch_size}, Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# End tracking time
end_time = time.time()
training_time = end_time - start_time
print(f"Training time: {training_time:.2f} seconds")


Batch Size: 64, Epoch 1, Batch 0, Loss: 4.2424
Epoch 1/20, Average Loss: 3.9655
Batch Size: 64, Epoch 2, Batch 0, Loss: 3.1393
Epoch 2/20, Average Loss: 2.8662
Batch Size: 64, Epoch 3, Batch 0, Loss: 2.1641
Epoch 3/20, Average Loss: 2.0029
Batch Size: 64, Epoch 4, Batch 0, Loss: 1.5683
Epoch 4/20, Average Loss: 1.4387
Batch Size: 64, Epoch 5, Batch 0, Loss: 1.0765
Epoch 5/20, Average Loss: 0.9875
Batch Size: 64, Epoch 6, Batch 0, Loss: 0.7712
Epoch 6/20, Average Loss: 0.7138
Batch Size: 64, Epoch 7, Batch 0, Loss: 0.5818
Epoch 7/20, Average Loss: 0.5339
Batch Size: 64, Epoch 8, Batch 0, Loss: 0.4324
Epoch 8/20, Average Loss: 0.3967
Batch Size: 64, Epoch 9, Batch 0, Loss: 0.3060
Epoch 9/20, Average Loss: 0.2727
Batch Size: 64, Epoch 10, Batch 0, Loss: 0.1892
Epoch 10/20, Average Loss: 0.1684
Batch Size: 64, Epoch 11, Batch 0, Loss: 0.1052
Epoch 11/20, Average Loss: 0.0938
Batch Size: 64, Epoch 12, Batch 0, Loss: 0.0628
Epoch 12/20, Average Loss: 0.0645
Batch Size: 64, Epoch 13, Batch 0,

In [11]:
import torch
from transformers import MarianMTModel, MarianTokenizer

# Example setup with MarianMT model
model_name = 'Helsinki-NLP/opus-mt-en-es'  # Example model (English to Spanish)
model = MarianMTModel.from_pretrained(model_name)

# Set device (use GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Print optimizer details
print(optimizer)


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 1e-05
    maximize: False
    weight_decay: 0
)


In [13]:
import torch
from transformers import MarianMTModel, MarianTokenizer
import torch.optim as optim
import torch.nn as nn

# Example setup with MarianMT model
model_name = 'Helsinki-NLP/opus-mt-en-es'  # Example model (English to Spanish)
model = MarianMTModel.from_pretrained(model_name)

# Set device (use GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Dummy dataset (replace these with your actual data)
# X_train and y_train should be your tokenized input and target data
# For demonstration, we'll use dummy tensors (replace with actual data)
X_train = torch.randint(0, 1000, (100, 20))  # 100 samples, 20 tokens each (replace with actual input)
y_train = torch.randint(0, 1000, (100, 20))  # 100 samples, 20 tokens each (replace with actual target)

# Train for a few epochs
epochs = 3
batch_size = 32

for epoch in range(epochs):
    model.train()  # Set model to training mode
    total_loss = 0  # Initialize total loss for each epoch
    
    # Example batch iteration (replace with your actual data loader)
    for batch_idx in range(0, len(X_train), batch_size):
        # Get the current batch (this is just an example)
        input_batch = X_train[batch_idx: batch_idx + batch_size]
        target_batch = y_train[batch_idx: batch_idx + batch_size]

        # Move batches to the correct device (GPU or CPU)
        input_batch = input_batch.to(device)
        target_batch = target_batch.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_batch, labels=target_batch)
        loss = outputs.loss  # Loss is part of the output in MarianMTModel

        # Backward pass
        loss.backward()

        # Optimize
        optimizer.step()

        # Accumulate loss
        total_loss += loss.item()

    # Calculate average loss for the epoch
    average_loss = total_loss / len(X_train) * batch_size  # Adjust based on batch size
    
    print(f"Epoch {epoch + 1}/{epochs}, Total Loss: {total_loss:.4f}, Average Loss: {average_loss:.4f}")


Epoch 1/3, Total Loss: 45.8199, Average Loss: 14.6624
Epoch 2/3, Total Loss: 41.7106, Average Loss: 13.3474
Epoch 3/3, Total Loss: 39.5434, Average Loss: 12.6539
