<a href="https://colab.research.google.com/github/RodBurr/Project-3/blob/Chris/EN_to_ES_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install datasets
!pip install transformers
!pip install sklearn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [None]:
import pandas as pd  # For handling tabular data
from sklearn.feature_extraction.text import CountVectorizer  # For text vectorization

In [None]:

# Read the tab-separated corpus (no header row), keep only the first two
# columns and label them by language.
df_filtered = (
    pd.read_csv("./EN-ES.txt", sep='\t', header=None)
    .loc[:, [0, 1]]
    .rename(columns={0: "EN", 1: "ES"})
)

In [None]:
# Preprocessing function to clean text using CountVectorizer
def clean_text_sklearn(text):
    """Lowercase ``text``, tokenize it and drop English stop words.

    Args:
        text: A single document as a string.

    Returns:
        The surviving tokens joined by single spaces, in their original order
        and with repeated words preserved. An empty string is returned when no
        token survives (for example when the text holds only stop words).
    """
    # build_analyzer() exposes the vectorizer's own preprocessing pipeline
    # (lowercasing, token_pattern matching, stop-word removal) as a callable
    # that yields tokens in document order.
    #
    # Joining get_feature_names_out() after fitting would instead return the
    # fitted *vocabulary*: alphabetically sorted and de-duplicated, which
    # scrambles the sentence. Fitting also raises ValueError when the
    # vocabulary ends up empty.
    analyzer = CountVectorizer(
        lowercase=True,
        stop_words='english',
        token_pattern=r'\b\w+\b',
    ).build_analyzer()

    return ' '.join(analyzer(text))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Keep just the two language columns and discard pairs with a missing side
df_filtered = df_filtered.loc[:, ['EN', 'ES']].dropna()

# Hold out 20% of the sentence pairs for evaluation; the split is seeded so
# it is reproducible across runs.
train_df, test_df = train_test_split(
    df_filtered,
    test_size=0.2,
    random_state=42,
)

# Persist both splits as CSV (index column omitted) for the training step
train_df.to_csv(path_or_buf="train.csv", index=False)
test_df.to_csv(path_or_buf="test.csv", index=False)

In [None]:
# Sanity check: display the column labels of the filtered frame
df_filtered.columns

Index(['EN', 'ES'], dtype='object')

In [None]:
# Preview the first 20 rows of aligned EN/ES sentence pairs
df_filtered.head(20)

Unnamed: 0,EN,ES
0,Commission Regulation (EC) No 1788/2004,Reglamento (CE) no 1788/2004 de la Comisión
1,of 15 October 2004,de 15 de octubre de 2004
2,fixing the minimum selling prices for butter f...,por el que se fijan los precios mínimos de ven...
3,"THE COMMISSION OF THE EUROPEAN COMMUNITIES,","LA COMISIÓN DE LAS COMUNIDADES EUROPEAS,"
4,Having regard to the Treaty establishing the E...,Visto el Tratado constitutivo de la Comunidad ...
5,Having regard to Council Regulation (EC) No 12...,Visto el Reglamento (CE) no 1255/1999 del Cons...
6,Whereas:,Considerando lo siguiente:
7,"The intervention agencies are, pursuant to Com...",De conformidad con lo dispuesto en el Reglamen...
8,The amount(s) of the processing securities mus...,El o los importes de las garantías de transfor...
9,The measures provided for in this Regulation a...,Las medidas previstas en el presente Reglament...


In [None]:
#from datasets import load_dataset

# Load dataset
#dataset = load_dataset("csv", data_files="en-es-parallel.csv")

# Display example
#print(dataset["train"][0])

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, DatasetDict

# Pretrained English->Spanish checkpoint used as the starting point
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Read each saved CSV split back in as a Dataset and rename its text columns.
# The raw text columns become 'input_ids' (English) and 'labels' (Spanish),
# which are the keys preprocess_function reads from each batch.
train_dataset = (
    load_dataset('csv', data_files={'train': 'train.csv'})['train']
    .rename_columns({'EN': 'input_ids', 'ES': 'labels'})
)
test_dataset = (
    load_dataset('csv', data_files={'test': 'test.csv'})['test']
    .rename_columns({'EN': 'input_ids', 'ES': 'labels'})
)

# Bundle both splits so they can be mapped/filtered together
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})


def _is_valid_pair(example):
    """Return True when both sides of a sentence pair are non-blank strings."""
    source, target = example["input_ids"], example["labels"]
    return (
        isinstance(source, str) and bool(source.strip())
        and isinstance(target, str) and bool(target.strip())
    )


def preprocess_function(examples):
    """Tokenize a batch of raw EN/ES sentence pairs for seq2seq training.

    Args:
        examples: Batch dict supplied by ``map(batched=True)``. Because of the
            column rename earlier in this cell, ``examples["input_ids"]`` holds
            the raw English strings and ``examples["labels"]`` the raw Spanish
            strings. Every entry is expected to be a non-empty string; invalid
            pairs are filtered out before this function is mapped.

    Returns:
        Dict with unpadded token-id lists under ``"input_ids"``,
        ``"attention_mask"`` and ``"labels"``, one entry per input pair, so the
        row count and source/target alignment of the batch are preserved.
    """
    # text_target= makes the tokenizer encode the Spanish side with its
    # target-language settings; this checkpoint ships separate source.spm and
    # target.spm models, so a plain tokenizer(targets) call would segment the
    # labels with the wrong SentencePiece model.
    #
    # No padding here: DataCollatorForSeq2Seq pads each batch dynamically and
    # fills label padding with -100 so it is ignored by the loss. Padding to
    # max_length up front would put pad-token ids into the labels (counted in
    # the loss) and make every example as long as the model maximum.
    encoded = tokenizer(
        examples["input_ids"],
        text_target=examples["labels"],
        truncation=True,
    )
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": encoded["labels"],
    }


# Drop unusable pairs (None or blank on either side) *before* tokenizing, one
# whole pair at a time. Filtering sources and targets independently inside the
# mapped function would misalign the pairs and change the batch length.
tokenized_dataset = dataset.filter(_is_valid_pair).map(preprocess_function, batched=True)

# Fine-tuning hyperparameters. Checkpoints are written under ./results and at
# most two are kept; evaluation runs once per epoch. The number of epochs is
# not set here, so the library default applies.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    save_total_limit=2,
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    remove_unused_columns=False,
)

# Wire the model, arguments, tokenized splits and a seq2seq collator (which
# returns PyTorch tensors) into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="pt"),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Run fine-tuning; as the last expression, the returned TrainOutput is displayed
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/33769 [00:00<?, ? examples/s]

Map:   0%|          | 0/8443 [00:00<?, ? examples/s]

Filter:   0%|          | 0/33769 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8443 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcorrianderchris[0m ([33mcorrianderchris-michigan-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.0607,0.047117
2,0.0464,0.039347
3,0.039,0.037404




TrainOutput(global_step=12666, training_loss=0.057700090680889625, metrics={'train_runtime': 11199.0152, 'train_samples_per_second': 9.046, 'train_steps_per_second': 1.131, 'total_flos': 1.3736566623043584e+16, 'train_loss': 0.057700090680889625, 'epoch': 3.0})

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Recursively archive /content/results into dataset.zip
# (presumably the Trainer's ./results output_dir, assuming the notebook runs from /content)
!zip -r dataset.zip /content/results

  adding: content/results/ (stored 0%)
  adding: content/results/checkpoint-12666/ (stored 0%)
  adding: content/results/checkpoint-12666/vocab.json (deflated 71%)
  adding: content/results/checkpoint-12666/generation_config.json (deflated 43%)
  adding: content/results/checkpoint-12666/model.safetensors (deflated 7%)
  adding: content/results/checkpoint-12666/tokenizer_config.json (deflated 68%)
  adding: content/results/checkpoint-12666/trainer_state.json (deflated 73%)
  adding: content/results/checkpoint-12666/training_args.bin (deflated 51%)
  adding: content/results/checkpoint-12666/optimizer.pt (deflated 8%)
  adding: content/results/checkpoint-12666/target.spm (deflated 50%)
  adding: content/results/checkpoint-12666/scheduler.pt (deflated 56%)
  adding: content/results/checkpoint-12666/rng_state.pth (deflated 25%)
  adding: content/results/checkpoint-12666/special_tokens_map.json (deflated 35%)
  adding: content/results/checkpoint-12666/config.json (deflated 63%)
  adding: con

In [None]:
from google.colab import files
# Send dataset.zip to the user's browser as a download
files.download('dataset.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>