<a href="https://colab.research.google.com/github/RodBurr/Project-3/blob/main/EN_to_ES_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas
!pip install datasets
!pip install transformers
!pip install sklearn
!pip install nbstripout
!nbstripout --install --global
!nbstripout EN_to_ES_Model.ipynb

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
import pandas as pd  # For handling tabular data
from sklearn.feature_extraction.text import CountVectorizer  # For text vectorization

In [3]:

# Read the tab-separated file (no header row), keep the first two positional
# columns and label them "EN" and "ES".
df_filtered = (
    pd.read_csv("./EN-ES.txt", sep='\t', header=None)
    .loc[:, [0, 1]]
    .rename(columns={0: "EN", 1: "ES"})
)

FileNotFoundError: [Errno 2] No such file or directory: './EN-ES.txt'

In [None]:
# Preprocessing function to clean text using CountVectorizer's analyzer
def clean_text_sklearn(text):
    """Lowercase ``text``, tokenize it and remove English stop words.

    Args:
        text: The raw string to clean.

    Returns:
        A single string of the surviving tokens joined by spaces, in the
        order they appear in ``text`` (repeated words are kept). An input
        with no surviving tokens yields an empty string.
    """
    vectorizer = CountVectorizer(lowercase=True, stop_words='english', token_pattern=r'\b\w+\b')

    # build_analyzer() exposes the vectorizer's preprocessing, tokenization
    # and stop-word filtering as a plain callable. Unlike joining
    # get_feature_names_out(), it does not sort or de-duplicate the tokens,
    # and it needs no fit, so stop-word-only input no longer raises
    # "empty vocabulary".
    analyzer = vectorizer.build_analyzer()
    return ' '.join(analyzer(text))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Restrict the frame to the EN/ES columns and discard rows missing either value.
df_filtered = df_filtered.loc[:, ['EN', 'ES']].dropna()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df_filtered,
    test_size=0.2,
    random_state=42,
)

# Write both splits to CSV without the index column.
train_df.to_csv(path_or_buf="train.csv", index=False)
test_df.to_csv(path_or_buf="test.csv", index=False)

In [None]:
# Display the column labels of df_filtered as a quick sanity check.
df_filtered.columns

In [None]:
# Preview the first 20 rows of df_filtered.
df_filtered.head(20)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, DatasetDict

model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load datasets from the saved CSV files
train_dataset = load_dataset('csv', data_files={'train': 'train.csv'})['train']
test_dataset = load_dataset('csv', data_files={'test': 'test.csv'})['test']

# Rename the raw-text columns to 'source' and 'target'. The names 'input_ids'
# and 'labels' are left free for the token ids produced by preprocess_function.
train_dataset = train_dataset.rename_columns({'EN': 'source', 'ES': 'target'})
test_dataset = test_dataset.rename_columns({'EN': 'source', 'ES': 'target'})
# Combine train and test datasets into a DatasetDict
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})


def is_valid_pair(example):
    """Return True when both sides of a sentence pair are non-empty strings.

    Args:
        example: A single row with 'source' and 'target' entries.
    """
    source, target = example["source"], example["target"]
    return (
        isinstance(source, str)
        and isinstance(target, str)
        and bool(source)
        and bool(target)
    )


def preprocess_function(examples):
    """Tokenize a batch of aligned source/target sentences.

    Args:
        examples: A batch (dict of lists) with 'source' and 'target' text
            columns of equal length. Rows are expected to have been
            validated already, e.g. with ``is_valid_pair``.

    Returns:
        The tokenizer output for the batch. The target token ids are stored
        under 'labels' because the targets are passed via ``text_target``.
        Sequences are truncated but NOT padded here; padding is left to the
        data collator, which pads per batch.
    """
    return tokenizer(
        examples["source"],
        text_target=examples["target"],
        truncation=True,
    )


# Drop unusable rows as whole pairs BEFORE tokenizing. Filtering sources and
# targets independently would shift the two lists against each other and
# pair sentences with the wrong translations.
valid_dataset = dataset.filter(is_valid_pair)

# Replace the text columns with token ids so only model inputs remain.
tokenized_dataset = valid_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["source", "target"],
)

# Define training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    remove_unused_columns=False
)

# The collator pads inputs and labels to the longest sequence in each batch,
# so the examples do not need to be padded to a fixed length up front.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="pt")
)

# Train the model
trainer.train()