# Model m2m 1.2b

In [None]:
import os
import subprocess
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from datasets import load_dataset, DatasetDict
from transformers import Trainer, TrainingArguments
import torch


In [None]:
# Ensure SentencePiece is installed
try:
    import sentencepiece
except ImportError:
    print("Installing SentencePiece...")
    subprocess.check_call(["pip", "install", "sentencepiece"])
    import sentencepiece  # Retry importing after installation

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


In [None]:
# Load model and tokenizer
model_name = "facebook/m2m100_1.2B"
print("Loading model and tokenizer...")
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name).to(device)

# Set source and target languages
source_lang = "km"  # Correct Khmer language code
target_lang = "en"  # Correct English language code
tokenizer.src_lang = source_lang


In [None]:

# Load dataset
file_path = r"dataset_khmer.csv"
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset file not found at {file_path}")

print("Loading dataset...")
dataset = load_dataset("csv", data_files=file_path)["train"]  # Load as a single dataset

# Display column names for debugging
print("Column names in dataset:", dataset.column_names)

# Dynamically rename columns if needed
source_column = "Action Sentence                                                                           "
target_column = "Formatted Output"

if source_column not in dataset.column_names or target_column not in dataset.column_names:
    raise ValueError(f"Expected columns '{source_column}' and '{target_column}' not found in the dataset.")

# Rename columns for consistency
dataset = dataset.rename_column(source_column, "Source")
dataset = dataset.rename_column(target_column, "Target")

# Remove rows with empty Source or Target
print("Filtering dataset for valid rows...")
dataset = dataset.filter(lambda example: example["Source"].strip() and example["Target"].strip())

# Split the dataset into train and test
print("Splitting dataset...")
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
split_dataset = DatasetDict({"train": split_dataset["train"], "test": split_dataset["test"]})


In [None]:

# Tokenize the dataset
def preprocess_function(examples):
    print("Processing examples:", examples["Source"][:5])  # Debug input samples
    print("Processing targets:", examples["Target"][:5])  # Debug output samples

    inputs = examples["Source"]
    targets = examples["Target"]

    # Tokenize inputs and targets separately
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        targets,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    # Add labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Tokenizing dataset...")
tokenized_datasets = split_dataset.map(preprocess_function, batched=True, remove_columns=["Source", "Target"])


In [None]:

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    report_to=["none"],
    remove_unused_columns=False,  # Prevent removal of unused columns
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# Train the model
print("Starting training...")
trainer.train()


In [None]:

# Evaluate the model
print("Evaluating model...")
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

# Save the fine-tuned model
print("Saving the fine-tuned model...")
model.save_pretrained("./fine_tuned_m2m100")
tokenizer.save_pretrained("./fine_tuned_m2m100")
print("Model and tokenizer saved in ./fine_tuned_m2m100")

# Chatbot Testing Loop
print("\nStarting chatbot testing... Type 'exit' to quit.")
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Goodbye!")
        break
    tokenizer.src_lang = source_lang
    inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding="max_length").to(device)
    outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(target_lang))
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    print(f"Bot: {response}")


In [10]:
import torch
import pandas as pd
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sacrebleu import corpus_bleu  # BLEU score for translation evaluation


In [None]:

# Fix column names issue
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    data.columns = data.columns.str.strip()  # Remove extra spaces in column names
    print("Dataset Columns:", data.columns)
    return data
data


In [13]:

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer):
        self.input_texts = input_texts.tolist()
        self.target_texts = target_texts.tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.input_texts[idx], return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        targets = self.tokenizer(self.target_texts[idx], return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        return inputs['input_ids'].squeeze(), targets['input_ids'].squeeze()


In [None]:
import torch
import pandas as pd
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sacrebleu import corpus_bleu

# Fix column names issue
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    data.columns = data.columns.str.strip()  # Remove extra spaces in column names
    print("Dataset Columns:", data.columns)
    return data

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer):
        self.input_texts = input_texts.tolist()
        self.target_texts = target_texts.tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.input_texts[idx], return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        targets = self.tokenizer(self.target_texts[idx], return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        return inputs['input_ids'].squeeze(0), targets['input_ids'].squeeze(0)

# Collate function to handle variable-length sequences
def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
    targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=tokenizer.pad_token_id)
    return inputs, targets

# Load and preprocess dataset
data = load_data('dataset_khmer.csv')
if 'Action Sentence' not in data.columns or 'Formatted Output' not in data.columns:
    raise ValueError("Dataset must contain 'Action Sentence' and 'Formatted Output' columns")

input_texts = data['Action Sentence']
target_texts = data['Formatted Output']

# Split dataset into training and validation sets
train_inputs, val_inputs, train_targets, val_targets = train_test_split(input_texts, target_texts, test_size=0.1, random_state=42)

# Initialize model and tokenizer
model_name = "facebook/m2m100_1.2B"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

# Create DataLoaders
train_dataset = TranslationDataset(train_inputs, train_targets, tokenizer)
val_dataset = TranslationDataset(val_inputs, val_targets, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

# Define training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop with gradient clipping
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    epoch_loss = 0
    for batch in train_dataloader:
        inputs, targets = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevent exploding gradients
        optimizer.step()

        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(train_dataloader):.4f}")

# Save the trained model
save_path = "trained_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved successfully!")

# Evaluation function
def evaluate(model, dataloader, tokenizer):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            inputs, targets = [b.to(device) for b in batch]
            outputs = model.generate(input_ids=inputs)
            preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            labels = tokenizer.batch_decode(targets, skip_special_tokens=True)

            all_preds.extend([p.lower() for p in preds])  # Lowercase for fair comparison
            all_labels.extend([l.lower() for l in labels])
    
    # BLEU Score Evaluation
    bleu_score = corpus_bleu(all_preds, [all_labels]).score
    return bleu_score

# Evaluate the model
bleu = evaluate(model, val_dataloader, tokenizer)
print(f"BLEU Score: {bleu:.2f}")


In [None]:
import torch
import pandas as pd
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sacrebleu import corpus_bleu
import tqdm


In [None]:

# Fix column names issue
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    data.columns = data.columns.str.strip()  # Remove extra spaces in column names
    print("Dataset Columns:", data.columns)
    return data

In [7]:

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer):
        self.input_texts = input_texts.tolist()
        self.target_texts = target_texts.tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.input_texts[idx], return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        targets = self.tokenizer(self.target_texts[idx], return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        return inputs['input_ids'].squeeze(0), targets['input_ids'].squeeze(0)


In [8]:

# Collate function to handle variable-length sequences
def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
    targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=tokenizer.pad_token_id)
    return inputs, targets


In [None]:

# Load and preprocess dataset
data = load_data('dataset_khmer.csv')
if 'Action Sentence' not in data.columns or 'Formatted Output' not in data.columns:
    raise ValueError("Dataset must contain 'Action Sentence' and 'Formatted Output' columns")

input_texts = data['Action Sentence']
target_texts = data['Formatted Output']

# Split dataset into training and validation sets
train_inputs, val_inputs, train_targets, val_targets = train_test_split(input_texts, target_texts, test_size=0.1, random_state=42)


In [None]:

# Initialize model and tokenizer
model_name = "facebook/m2m100_1.2B"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

# Create DataLoaders
train_dataset = TranslationDataset(train_inputs, train_targets, tokenizer)
val_dataset = TranslationDataset(val_inputs, val_targets, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

# Define training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = torch.device("cpu")  # Force CPU execution
model.to(device)
print(device)

In [None]:

# Training loop with gradient clipping
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    epoch_loss = 0
    progress_bar = tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training", unit="batch")
    for batch in progress_bar:
        inputs, targets = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevent exploding gradients
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=epoch_loss / len(train_dataloader))
    
    print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(train_dataloader):.4f}")

# Save the trained model
save_path = "trained_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved successfully!")

# Evaluation function
def evaluate(model, dataloader, tokenizer):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc="Evaluating", unit="batch"):
            inputs, targets = [b.to(device) for b in batch]
            outputs = model.generate(input_ids=inputs)
            preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            labels = tokenizer.batch_decode(targets, skip_special_tokens=True)

            all_preds.extend([p.lower() for p in preds])  # Lowercase for fair comparison
            all_labels.extend([l.lower() for l in labels])
    
    # BLEU Score Evaluation
    bleu_score = corpus_bleu(all_preds, [all_labels]).score
    return bleu_score

# Evaluate the model
bleu = evaluate(model, val_dataloader, tokenizer)
print(f"BLEU Score: {bleu:.2f}")


Epoch 1 Training:   1%|          | 1/113 [05:57<11:07:14, 357.45s/batch, loss=0.071]

In [1]:
import torch
import pandas as pd
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sacrebleu import corpus_bleu
import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Fix column names issue
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    data.columns = data.columns.str.strip()
    print("Dataset Columns:", data.columns)
    return data

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer):
        self.input_texts = input_texts.tolist()
        self.target_texts = target_texts.tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.input_texts[idx], return_tensors='pt', padding='longest', truncation=True, max_length=128)
        targets = self.tokenizer(self.target_texts[idx], return_tensors='pt', padding='longest', truncation=True, max_length=128)
        return inputs['input_ids'].squeeze(0), targets['input_ids'].squeeze(0)


Dataset Columns: Index(['Action Sentence', 'Formatted Output'], dtype='object')
Downloading model... Ensure internet connection is active.


OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like facebook/m2m100_418M is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [None]:

# Create DataLoaders
train_dataset = TranslationDataset(train_inputs, train_targets, tokenizer)
val_dataset = TranslationDataset(val_inputs, val_targets, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Define training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop with mixed precision for speed
scaler = torch.cuda.amp.GradScaler()
model.train()
for epoch in range(3):
    epoch_loss = 0
    progress_bar = tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training", unit="batch")
    for batch in progress_bar:
        inputs, targets = [b.to(device) for b in batch]
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=epoch_loss / len(train_dataloader))
    
    print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(train_dataloader):.4f}")

# Save the trained model
save_path = "trained_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved successfully!")

# Evaluation function
def evaluate(model, dataloader, tokenizer):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc="Evaluating", unit="batch"):
            inputs, targets = [b.to(device) for b in batch]
            outputs = model.generate(input_ids=inputs, max_length=128)
            preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            labels = tokenizer.batch_decode(targets, skip_special_tokens=True)

            all_preds.extend([p.lower() for p in preds])
            all_labels.extend([l.lower() for l in labels])
    
    bleu_score = corpus_bleu(all_preds, [all_labels]).score
    return bleu_score

# Evaluate the model
bleu = evaluate(model, val_dataloader, tokenizer)
print(f"BLEU Score: {bleu:.2f}")

In [7]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model_name = "facebook/m2m100_418M"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

# Save to a local directory
model.save_pretrained("./m2m100_418M")
tokenizer.save_pretrained("./m2m100_418M")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


('./m2m100_418M\\tokenizer_config.json',
 './m2m100_418M\\special_tokens_map.json',
 'm2m100_418M\\vocab.json',
 'm2m100_418M\\sentencepiece.bpe.model',
 './m2m100_418M\\added_tokens.json')

In [9]:
import torch
import pandas as pd
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sacrebleu import corpus_bleu
import tqdm
import os

# Fix column names issue
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    data.columns = data.columns.str.strip()
    print("Dataset Columns:", data.columns)
    return data

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer):
        self.input_texts = input_texts.tolist()
        self.target_texts = target_texts.tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.input_texts[idx], return_tensors='pt', padding='longest', truncation=True, max_length=128)
        targets = self.tokenizer(self.target_texts[idx], return_tensors='pt', padding='longest', truncation=True, max_length=128)
        return inputs['input_ids'].squeeze(0), targets['input_ids'].squeeze(0)

# Collate function to handle variable-length sequences
def collate_fn(batch):
    input_ids, target_ids = zip(*batch)
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    target_ids = torch.nn.utils.rnn.pad_sequence(target_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    return input_ids, target_ids

# Load and preprocess dataset
data = load_data('dataset_khmer.csv')
if 'Action Sentence' not in data.columns or 'Formatted Output' not in data.columns:
    raise ValueError("Dataset must contain 'Action Sentence' and 'Formatted Output' columns")

input_texts = data['Action Sentence']
target_texts = data['Formatted Output']

# Split dataset
train_inputs, val_inputs, train_targets, val_targets = train_test_split(input_texts, target_texts, test_size=0.1, random_state=42)

# Ensure offline mode if no internet
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Load model from local storage if available
model_path = "./models/m2m100_418M"
if os.path.exists(model_path):
    print("Loading model from local storage...")
    model = M2M100ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = M2M100Tokenizer.from_pretrained(model_path)
else:
    raise OSError("Model not found locally. Please download and place it in './models/m2m100_418M'")

# Create DataLoaders
train_dataset = TranslationDataset(train_inputs, train_targets, tokenizer)
val_dataset = TranslationDataset(val_inputs, val_targets, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Define training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
model.train()
for epoch in range(3):
    epoch_loss = 0
    progress_bar = tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training", unit="batch")
    for batch in progress_bar:
        inputs, targets = [b.to(device) for b in batch]
        optimizer.zero_grad()

        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=epoch_loss / len(train_dataloader))
    
    print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(train_dataloader):.4f}")

# Save the trained model
save_path = "trained_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved successfully!")

# Evaluation function
def evaluate(model, dataloader, tokenizer):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc="Evaluating", unit="batch"):
            inputs, targets = [b.to(device) for b in batch]
            outputs = model.generate(input_ids=inputs, max_length=128)
            preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            labels = tokenizer.batch_decode(targets, skip_special_tokens=True)
            all_preds.extend([p.lower() for p in preds])
            all_labels.extend([l.lower() for l in labels])
    
    bleu_score = corpus_bleu(all_preds, [all_labels]).score
    return bleu_score

# Evaluate the model
bleu = evaluate(model, val_dataloader, tokenizer)
print(f"BLEU Score: {bleu:.2f}")

Dataset Columns: Index(['Action Sentence', 'Formatted Output'], dtype='object')
Loading model from local storage...


Epoch 1 Training:   0%|          | 0/57 [00:00<?, ?batch/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1 Training: 100%|██████████| 57/57 [12:18<00:00, 12.96s/batch, loss=0.584]


Epoch 1, Loss: 0.5844


Epoch 2 Training: 100%|██████████| 57/57 [13:30<00:00, 14.22s/batch, loss=0.0985]


Epoch 2, Loss: 0.0985


Epoch 3 Training: 100%|██████████| 57/57 [13:02<00:00, 13.73s/batch, loss=0.032]  


Epoch 3, Loss: 0.0320
Model saved successfully!


Evaluating: 100%|██████████| 7/7 [04:13<00:00, 36.16s/batch]

BLEU Score: 100.00





In [None]:
import torch
import pandas as pd
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sacrebleu import corpus_bleu
import tqdm
import os

# Fix column names issue
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    data.columns = data.columns.str.strip()
    print("Dataset Columns:", data.columns)
    return data

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, tokenizer):
        self.input_texts = input_texts.tolist()
        self.target_texts = target_texts.tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.input_texts[idx], return_tensors='pt', padding='longest', truncation=True, max_length=128)
        targets = self.tokenizer(self.target_texts[idx], return_tensors='pt', padding='longest', truncation=True, max_length=128)
        return inputs['input_ids'].squeeze(0), targets['input_ids'].squeeze(0)

# Collate function to handle variable-length sequences
def collate_fn(batch):
    input_ids, target_ids = zip(*batch)
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    target_ids = torch.nn.utils.rnn.pad_sequence(target_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    return input_ids, target_ids

# Load and preprocess dataset
data = load_data('dataset_khmer.csv')
if 'Action Sentence' not in data.columns or 'Formatted Output' not in data.columns:
    raise ValueError("Dataset must contain 'Action Sentence' and 'Formatted Output' columns")

input_texts = data['Action Sentence']
target_texts = data['Formatted Output']

# Split dataset
train_inputs, val_inputs, train_targets, val_targets = train_test_split(input_texts, target_texts, test_size=0.1, random_state=42)

# Ensure offline mode if no internet
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Load model from local storage if available
model_path = "./models/m2m100_418M"
if os.path.exists(model_path):
    print("Loading model from local storage...")
    model = M2M100ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = M2M100Tokenizer.from_pretrained(model_path)
else:
    raise OSError("Model not found locally. Please download and place it in './models/m2m100_418M'")

# Chatbot function
def chatbot():
    print("Chatbot is ready! Type 'exit' to quit.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            print("Goodbye!")
            break
        
        inputs = tokenizer(user_input, return_tensors='pt', padding='longest', truncation=True, max_length=128).input_ids.to(device)
        outputs = model.generate(input_ids=inputs, max_length=128)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("Bot:", response)

# Start chatbot
# tokenizer does not need to be moved to device
chatbot()ទៅមុខ 7.68 ម៉ែត្រ នៅល្បឿន 1.75 ម៉ែត្រ/វិនាទី, បន្ទាប់មក បង្វែស្ដាំ 74 ដឺក្រេ។


Dataset Columns: Index(['Action Sentence', 'Formatted Output'], dtype='object')
Loading model from local storage...
Chatbot is ready! Type 'exit' to quit.
Bot: Қаланың тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының саны 75 адамды құрайды.
Bot: Қаланың тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының саны 75 адамды құрайды.
Bot: Қазіргі уақытта 7,68 метр
Bot: Қаланың тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының саны 75 адамды құрайды.
Bot: Βοήθεια
Bot: Қаланың тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының тұрақты тұрғындарының саны 75 адамды құрайды.
Bot: Қаланың тұрақты тұрғындарының тұ