# HUGGING FACE BART TRANSLATOR (TORCH)
I followed the instructions at https://huggingface.co/docs/transformers/tasks/translation

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


#### Install Dependencies

In [5]:
%pip install transformers==4.33.1 datasets==2.14.5 evaluate==0.4.0 sacrebleu==2.3.1 accelerate==0.22.0 wandb==0.15.12 python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
# %pip uninstall -y wandb

Found existing installation: wandb 0.15.12
Uninstalling wandb-0.15.12:
  Successfully uninstalled wandb-0.15.12
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
%pip list | grep 'transformers\|datasets\|evaluate\|sacrebleu\|accelerate\|wandb\|dotenv'

accelerate               0.22.0
datasets                 2.14.5
evaluate                 0.4.0
python-dotenv            1.0.0
sacrebleu                2.3.1
transformers             4.33.1
wandb                    0.15.12
Note: you may need to restart the kernel to use updated packages.


#### Set up WANDB

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mnikkideez[0m ([33mnlp-ltl[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
%env WANDB_PROJECT=nlp-ltl-capstone
%env WANDB_LOG_MODEL=all

env: WANDB_PROJECT=nlp-ltl-capstone
env: WANDB_LOG_MODEL=all


#### Load Data

In [4]:
from datasets import load_dataset

# dataset = load_dataset("cRick/NL-to-LTL-Synthetic-Dataset")
# dataset = load_dataset("parquet", data_files={'train': '/content/drive/MyDrive/ERP/train/0000.parquet', 'test': '/content/drive/MyDrive/ERP/test/0000.parquet'})
# dataset = load_dataset("text", data_files={'ltl': '/content/drive/MyDrive/ERP/data_src_combined.txt', 'en': '/content/drive/MyDrive/ERP/data_tar_combined.txt'})
dataset = load_dataset("csv", data_files='/home/nikx/Downloads/CECW-en-ltl-dataset(combined) - Sheet1.csv')

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ltl', 'en'],
        num_rows: 7185
    })
})


#### Split the data into Train, Valid and Test


In [6]:
from datasets import DatasetDict

train_test = dataset["train"].train_test_split(test_size=0.3, seed=42)
test_valid = train_test["test"].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    'train': train_test["train"],
    'test': train_test["test"],
    'valid': test_valid["test"],
    })

In [7]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ltl', 'en'],
        num_rows: 5029
    })
    test: Dataset({
        features: ['ltl', 'en'],
        num_rows: 2156
    })
    valid: Dataset({
        features: ['ltl', 'en'],
        num_rows: 1078
    })
})


#### Check for longest word in dataset to find max_length
###### Note this part may not be necessary - dynamic padding would be a more effective solution

In [8]:
# Assuming your 2D array is named 'data'
max_length = 0

for column in dataset['train']['en']:
    column_length = len(column)
    if column_length > max_length:
        max_length = column_length

print("Maximum length:", max_length)

Maximum length: 154


#### Preprocess/Tokenize Data

In [9]:
from transformers import AutoTokenizer

checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
source_lang = "en"
target_lang = "ltl"
prefix = "translate English to LTL: "


def preprocess_function(examples):
    inputs = [prefix + example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)
    return model_inputs

In [11]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2156 [00:00<?, ? examples/s]

#### Data Collator

In [12]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

#### Metrics

In [13]:
# evalpred_global = None
# result_global = None

In [14]:

# def write_to_txt(data, filename):
#     with open(filename, 'w') as f:
#         f.write(str(data))


In [15]:
import evaluate

metric = evaluate.combine(["sacrebleu","exact_match"])
# metric = evaluate.combine(["accuracy","sacrebleu"])

In [16]:
import spot
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

# custom metric to use the SPOT library to get metrics
def are_formulas_equal(preds, labels):
    if len(preds) != len(labels):
        raise ValueError("Both lists should be of the same length.")
    
    c = spot.language_containment_checker()
    results = []
    
    for pred, label in zip(preds, labels):
        try:
            f = spot.formula(pred)
            g = spot.formula(label)
            result = 1 if pred == label or c.equal(f, g) else 0
            results.append(result)
        except Exception as e:
            # print(e)
            result = 1 if pred == label else 0
            results.append(result)
    return results

# Computing metrics
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Calculate formula equality accuracy
    spot_results = are_formulas_equal(decoded_preds, decoded_labels)
    spot_accuracy = sum(spot_results) / len(spot_results)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"], "exact_match": result["exact_match"], "spot_acc": spot_accuracy}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


## Model

In [17]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DefaultFlowCallback

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, dropout=0.25)

In [18]:
print(model.config)

BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.25,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 1024,
  "model_t

In [19]:
# Write metrics to a CSV

import os
import csv
from transformers import TrainerCallback
import datetime

class CSVLoggerCallback(TrainerCallback):
    CSV_HEADER = ["Epoch", "Training Loss", "Validation Loss", "Bleu", "Exact Match", "Spot Accuracy", "Gen Len"]
    VALIDATION_METRICS = [
        ("Validation Loss", "eval_loss"),
        ("Bleu", "eval_bleu"),
        ("Exact Match", "eval_exact_match"),
        ("Spot Accuracy", "eval_spot_acc"),
        ("Gen Len", "eval_gen_len")
    ]

    def __init__(self, output_dir):
        super().__init__()
        formatted_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        self.filepath = os.path.join(output_dir, f"training_logs_{formatted_time}.csv")

        with open(self.filepath, 'w', newline='') as file:
            csv.writer(file).writerow(self.CSV_HEADER)
        self.current_epoch_data = {}

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None:
            return

        if "loss" in logs:
            self.current_epoch_data["Training Loss"] = logs["loss"]

        for metric_name, log_key in self.VALIDATION_METRICS:
            if log_key in logs:
                self.current_epoch_data[metric_name] = logs[log_key]

        if (state.epoch).is_integer() and all(key in self.current_epoch_data for key in self.CSV_HEADER[1:]):
            with open(self.filepath, 'a', newline='') as file:
                csv.writer(file).writerow([state.epoch] + [self.current_epoch_data.get(key, "") for key in self.CSV_HEADER[1:]])
            self.current_epoch_data = {}


#### Hyperparameter Search with WANDB

In [20]:
# sweep_config = {
#     'method': 'random'
# }


# # hyperparameters
# parameters_dict = {
#     'epochs': {
#         'value': 10
#         },
#     'batch_size': {
#         'values': [8, 16, 32, 64]
#         },
#     'learning_rate': {
#         'distribution': 'log_uniform_values',
#         'min': 1e-5,
#         'max': 1e-3
#     },
#     'weight_decay': {
#         'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
#     },
# }


# sweep_config['parameters'] = parameters_dict


In [21]:
# def model_init():
#     return AutoModelForSeq2SeqLM.from_pretrained(
#         checkpoint
#     )

In [22]:
# sweep_id = wandb.sweep(sweep_config, project='nlp-ltl-capstone')

In [23]:
# def train(config=None):
#   with wandb.init(config=config):
#     # set sweep configuration
#     config = wandb.config

#     # set training arguments
#     training_args = Seq2SeqTrainingArguments(
#         output_dir="/content/drive/MyDrive/ERP",
#         learning_rate=config.learning_rate,
#         per_device_train_batch_size=config.batch_size,
#         per_device_eval_batch_size=32,
#         weight_decay=config.weight_decay,
#         save_strategy='epoch',
#         evaluation_strategy='epoch',
#         logging_strategy='epoch',
#         num_train_epochs=config.epochs,
#         predict_with_generate=True,
#         load_best_model_at_end=True,
#         fp16=True,
#         push_to_hub=False,
#         logging_steps=1,
#         report_to="wandb"
#     )
#     # define training loop
#     trainer = Seq2SeqTrainer(
#         model=model,
#         args=training_args,
#         train_dataset=tokenized_dataset["train"],
#         eval_dataset=tokenized_dataset["valid"],
#         tokenizer=tokenizer,
#         data_collator=data_collator,
#         compute_metrics=compute_metrics,
#         callbacks=[DefaultFlowCallback,CSVLoggerCallback("/content/drive/MyDrive/ERP")]
#     )

#     # start training loop
#     trainer.train()

In [24]:
# wandb.agent(sweep_id, train, count=20)

#### Train the Model

In [25]:
%env WANDB_DISABLED=true

env: WANDB_DISABLED=true


In [26]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/tmp",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=50,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    logging_steps=1,
    report_to=None
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[DefaultFlowCallback,CSVLoggerCallback("/home/nikx/capstone/dump")]
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
You are adding a <class 'transformers.trainer_callback.DefaultFlowCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Exact Match,Spot Acc,Gen Len
1,0.3088,0.207475,69.8094,0.3701,0.3701,14.5167
2,0.203,0.120336,82.8043,0.6308,0.6419,15.192
3,0.0395,0.092193,86.7303,0.7171,0.7171,15.0594
4,0.0885,0.078612,88.493,0.7301,0.7551,15.2254
5,0.1032,0.077702,88.6569,0.7291,0.7597,15.1605
6,0.0205,0.075851,87.9749,0.7263,0.769,14.8516
7,0.0009,0.076926,87.837,0.7226,0.7514,15.0983
8,0.0093,0.07616,88.0451,0.7282,0.7356,15.0881
9,0.0005,0.073026,88.9997,0.7301,0.7737,15.1494
10,0.0778,0.070111,88.4263,0.731,0.7579,15.1224


TrainOutput(global_step=7900, training_loss=0.09853137246574183, metrics={'train_runtime': 1729.3033, 'train_samples_per_second': 145.405, 'train_steps_per_second': 4.568, 'total_flos': 5151694894295040.0, 'train_loss': 0.09853137246574183, 'epoch': 50.0})

In [27]:
test_predictions = trainer.predict(tokenized_dataset["test"])
print(test_predictions)

PredictionOutput(predictions=array([[    2,     0,  1640, ...,    36,   272,     2],
       [    2,     0,   597, ...,     1,     1,     1],
       [    2,     0,   597, ...,     1,     1,     1],
       ...,
       [    2,     0, 34437, ...,     1,     1,     1],
       [    2,     0,   597, ...,     1,     1,     1],
       [    2,     0,   597, ...,     1,     1,     1]]), label_ids=array([[    0,  1640,    36, ...,  -100,  -100,  -100],
       [    0,   597,    36, ...,  -100,  -100,  -100],
       [    0,   597,    36, ...,  -100,  -100,  -100],
       ...,
       [    0, 34437,    36, ...,  -100,  -100,  -100],
       [    0,   597,    36, ...,  -100,  -100,  -100],
       [    0,   597,    36, ...,  -100,  -100,  -100]]), metrics={'test_loss': 0.06871870160102844, 'test_bleu': 89.208, 'test_exact_match': 0.7324, 'test_spot_acc': 0.7866, 'test_gen_len': 15.0422, 'test_runtime': 31.6534, 'test_samples_per_second': 68.113, 'test_steps_per_second': 2.148})


In [28]:
# text = "it is never the case that HUymGnOfeUvUnxi"
# # Answer: G(!( HUymGnOfeUvUnxi ))
# text2 = "a train has arrived involves that as a semaphore is green, in the future the bar is down"
# # Answer2: "G(( train_has_arrived ) -> G(( semaphore_is_green ) -> F( bar_is_down )))"

In [26]:
# from transformers import pipeline

# translator = pipeline("translation", model="/content/drive/MyDrive/ERP/checkpoint-500", max_length=198)


In [None]:
# print(translator(text))
# print(translator(text2))

In [None]:
# !pip list | grep torch

In [79]:
random_words = [
    "absquatulate",   # To leave hurriedly.
    "jentacular",     # Pertaining to breakfast.
    "cachinnate",     # To laugh loudly.
    "yarborough",     # A hand of cards with no card above a nine.
    "floccinaucinihilipilification",   # The act of describing or regarding something as unimportant.
    "nudiustertian",  # Relating to the day before yesterday.
    "limerence",      # The state of being infatuated with another person.
    "susurrus",       # Whispering or rustling sound.
    "defenestration", # The act of throwing someone out a window.
    "agastopia",      # Admiration of a particular part of someone's body.
    "ultracrepidarian",  # A person who gives opinions on subjects they know nothing about.
    "bibliopole",     # A person who buys and sells books.
    "cynosure",       # A person or thing that is the center of attention or admiration.
    "digerati",       # People with expertise or professional involvement in information technology.
    "esemplastic",    # Having the ability to shape diverse elements or concepts into a unified whole.
    "fugacious",      # Transient or fleeting.
    "gargalesthesia", # The sensation caused by tickling.
    "heterodox",      # Not conforming with accepted standards or beliefs.
    "illecebrous",    # Alluring.
    "jumentous",      # Smelling like horse urine.
    "kenspeckle",     # Conspicuous or easily recognizable.
    "logomachy",      # An argument about words.
    "mellifluous",    # A sound that is sweet and smooth.
    "nepenthe",       # Something that can make you forget pain or sorrow.
    "obambulate",     # To walk around aimlessly.
    "peregrinate",    # To travel or wander around.
    "quixotic",       # Extremely idealistic, unrealistic, or impractical.
    "rumbustious",    # Boisterous or unruly.
    "slubberdegullion", # A slovenly or slobbish person.
    "triskaidekaphobia", # Fear of the number 13.
    "uxorious",       # Excessively fond of or submissive to one's wife.
    "ventriloquy",    # The art of speaking in such a manner that the voice appears to come from elsewhere.
    "widdershins",    # In a left-handed, wrong, or contrary direction.
    "xenoglossy",     # The ability to speak a language without having learned it.
    "yonderly",       # Mentally or emotionally distant; absent-minded.
    "zeugma",         # The use of a word to modify two or more words when it should only modify one.
    "ailurophile",    # A person who loves cats.
    "borborygm",      # The sound of gas traveling through the intestines.
    "catawampus",     # Askew or awry.
    "doodle-sack",    # Old English word for bagpipe.
    "ephemeral",      # Lasting for a very short time.
    "farraginous",    # Consisting of a confused mixture; formed of various materials in no fixed order or arrangement.
    "gobbledygook",   # Language that is meaningless or hard to understand.
    "hobbledehoy",    # An awkward adolescent boy.
    "illuminati",     # People claiming to possess special enlightenment or knowledge of something.
    "jiggumbob",      # A thingamajig or whatchamacallit.
    "kludge",         # A workaround or quick-and-dirty solution that is clumsy yet effective.
    "lollygag",       # To spend time aimlessly; to dawdle.
    "mugwump",        # An independent politician who does not follow any party.
    "noodle",         # A person's head.
    "oxter",          # Old word meaning armpit.
    "pauciloquent",   # Uttering few words; brief in speech.
    "quockerwodger",  # A wooden puppet controlled by strings.
    "ratoon",         # A small shoot growing from the root of a plant.
    "sialoquent",     # Spitting while speaking.
    "tittynope",      # A small quantity of something left over.
    "ulotrichous",    # Having woolly or crispy hair.
    "vex",            # To irritate or annoy.
    "wabbit",         # Exhausted or slightly unwell.
    "xertz",          # To gulp something down quickly and greedily.
    "yex",            # A hiccup or belch.
    "zoanthropy",     # Delusion of a person who believes they've changed into an animal.
    "abibliophobia",  # The fear of running out of reading material.
    "bloviate",       # To speak at length in a pompous or boastful manner.
    "callypygian",    # Having a beautiful backside.
    "dactylonomy",    # Counting using one's fingers.
    "ergophobia",     # Fear of work.
    "frugivorous",    # Fruit-eating.
    "gallivant",      # To go around from one place to another in the pursuit of pleasure or entertainment.
    "hullabaloo",     # Uproar or commotion.
    "inaniloquent",   # Pertaining to idle talk.
    "juxtapose",      # To place side by side for comparison.
    "kakorrhaphiophobia", # Fear of failure.
    "ludibrious",     # Apt to be a subject of jest or mockery.
    "mumpsimus",      # A traditional custom or notion adhered to although shown to be unreasonable.
    "niggle",         # To spend excessive time on minor details.
    "onomatomania",   # Vexation at having difficulty in finding the right word.
    "philoprogenitive",  # Fond of children.
    "quidnunc",       # One who always wants to know what is going on.
    "ragamuffin",     # A ragged often disreputable person.
    "snickersnee",    # A large knife.
    "tatterdemalion", # A person wearing tattered clothing.
    "unputdownable",  # Impossible to put down or stop reading or watching.
    "vomitory",       # An exit or outlet.
    "whippersnapper", # A young, impertinent person.
    "xylography",     # The art of engraving on wood.
    "yare",           # Ready or prepared.
    "zephyr"          # A gentle, mild breeze.
]


In [110]:
import re
from random import choice, shuffle

def number_to_word(n):
    words = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    return words[n]

def tokenize_formula(formula):
    # Tokenize the formula into atomic propositions
    return set(re.findall(r'\b[A-Za-z_0-9]+\b', formula))

def replace_phrases_with_random_words(data, random_words):
    ltl_phrases = set()
    
    # 1. Tokenize the LTL formulas to identify the atomic propositions.
    for formula in data["ltl"]:
        tokens = tokenize_formula(formula)
        ltl_phrases.update(tokens)

    # 2. Convert these tokens to their English equivalent.
    eng_phrases = {}
    for token in ltl_phrases:
        if "_" in token:
            parts = token.split("_")
            if parts[1].isdigit():
                eng_phrase = parts[0] + " " + number_to_word(int(parts[1]))
            else:
                eng_phrase = " ".join(parts)
            eng_phrases[token] = eng_phrase

    # 3. Map English phrases to random words
    word_mappings = {}
    unused_words = list(random_words)  # Keep a separate list of unused words

    for ltl_phrase, eng_phrase in eng_phrases.items():
        word = choice(unused_words)
        word_mappings[eng_phrase] = word
        unused_words.remove(word)  # Remove the word from the unused list

    # 4. Replace the identified phrases in the LTL formulas and English sentences.
    updated_ltl = []
    for formula in data["ltl"]:
        for ltl_phrase, eng_phrase in eng_phrases.items():
            formula = re.sub(r'\b' + ltl_phrase + r'\b', word_mappings[eng_phrase], formula)
        updated_ltl.append(formula)

    updated_eng = []
    for sentence in data["eng"]:
        for eng_phrase, random_word in word_mappings.items():
            sentence = sentence.replace(eng_phrase, random_word)
        updated_eng.append(sentence)

    return {"ltl": updated_ltl, "eng": updated_eng}


In [112]:
# print({key: value[:5] for key, value in result.items()})
# print(result)

print(result["ltl"][:5])
print(dataset["test"]["ltl"][:5])
print(result["eng"][:5])
print(dataset["test"]["en"][:5])



['( ( F G ( flag ) ) | ( F G ( orange ) ) )', 'F ( limerence & F ( dactylonomy ) )', 'F ( ventriloquy & F ( rumbustious ) )', '~ ( quixotic ) U ( kakorrhaphiophobia )', 'F ( dactylonomy & F ( quixotic ) )']
['( ( F G ( flag ) ) | ( F G ( orange ) ) )', 'F ( green_room & F ( third_floor ) )', 'F ( blue_room & F ( red_room ) )', '~ ( orange_room ) U ( landmark_1 )', 'F ( third_floor & F ( orange_room ) )']
['pick up the peach or go to the flag', 'move to the dactylonomy , travelling through the limerence .', 'move to the ventriloquy then continue to the rumbustious .', 'go to kakorrhaphiophobia avoiding the quixotic . then go to the quixotic or stay on kakorrhaphiophobia .', 'go to the dactylonomy then go to the quixotic']
['pick up the peach or go to the flag', 'move to the third floor , travelling through the green room .', 'move to the blue room then continue to the red room .', 'go to landmark one avoiding the orange room . then go to the orange room or stay on landmark one .', 'go t

In [111]:
# Example usage:
data = {
    "ltl": dataset["test"]["ltl"],
    "eng": dataset["test"]["en"]
}

result = replace_phrases_with_random_words(data, random_words)
print(dataset["test"]["ltl"][1])
print(dataset["test"]["en"][1])
print(result["ltl"][1])
print(result["eng"][1])

F ( green_room & F ( third_floor ) )
move to the third floor , travelling through the green room .
F ( limerence & F ( dactylonomy ) )
move to the dactylonomy , travelling through the limerence .


In [114]:
def write_array_to_file(data, filename="output.txt"):
    """
    Writes an array of strings to a text file.

    :param data: List of strings to be written to the file.
    :param filename: Name of the output file.
    """
    with open(filename, "w") as file:
        for item in data:
            file.write(item + ",\n")

# Usage example:
# data = ["string1", "string2", "string3"]
# write_array_to_file(data, "my_output.txt")


In [116]:
write_array_to_file(result["eng"], "/home/nikx/capstone/dump/test-rare-eng.txt")