In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq

from sklearn.model_selection import train_test_split

import re

## Load Data

In [2]:
def load_data(data_dir):
    dataset = DatasetDict({
        "train": load_dataset("json", data_files=f"{data_dir}/train.json", field=None)["train"],
        "validation": load_dataset("json", data_files=f"{data_dir}/val.json", field=None)["train"],
        "test": load_dataset("json", data_files=f"{data_dir}/test.json", field=None)["train"],
    })
    return dataset

## Preprocess Data

### Filter Data

In [3]:
def is_garbage(text):
    """
    Determines if a given text is garbage based on repetition, incoherence, or unrelated content.
    """
    # Check for excessive repetition of characters or phrases
    if re.search(r"(.)\1{5,}", text):  # e.g., "aaaaaa"
        return True
    if re.search(r"(\b\w+\b)(\s+\1){3,}", text):  # e.g., "Amanda Amanda Amanda"
        return True
    
    # Check for lack of meaningful sentences (excessive non-alphanumeric content)
    if len(re.findall(r"[a-zA-Z0-9]", text)) / len(text) < 0.5:  # More than 50% non-alphanumeric
        return True
    
    # Check for excessive random punctuation or unusual patterns
    if re.search(r"(\.{3,}|\_\_+|\={3,})", text):  # Excessive ellipses, underscores, or equals
        return True

    return False

In [4]:
def load_and_process_data(file_path, test_size=0.2, val_size=0.1, random_state=42):
    """
    Loads data from a JSON file, applies filtering, and splits it into train, validation, and test sets.
    
    Args:
        file_path (str): Path to the JSON file.
        test_size (float): Proportion of the dataset to include in the test split.
        val_size (float): Proportion of the dataset to include in the validation split.
        random_state (int): Random seed for reproducibility.
        
    Returns:
        DatasetDict: A DatasetDict containing train, validation, and test datasets.
    """
    # Load the JSON file into a Hugging Face Dataset
    dataset = Dataset.from_json(file_path)
    
    # Apply a filter to remove garbage data
    def filter_garbage(example):
        return not is_garbage(example.get("generated_story", ""))
    
    filtered_dataset = dataset.filter(filter_garbage)
    
    # Split the data into train and test sets
    train_val_data, test_data = train_test_split(
        filtered_dataset.to_pandas(), test_size=test_size, random_state=random_state
    )
    
    # Split the train data into train and validation sets
    train_data, val_data = train_test_split(
        train_val_data, test_size=val_size / (1 - test_size), random_state=random_state
    )
    
    # Convert pandas DataFrames back to Hugging Face Datasets
    train_dataset = Dataset.from_pandas(train_data)
    val_dataset = Dataset.from_pandas(val_data)
    test_dataset = Dataset.from_pandas(test_data)
    
    # Wrap datasets into a DatasetDict
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    })
    
    return dataset_dict

In [5]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
def preprocess_function(examples):
    inputs = examples["summary"]
    targets = examples["generated_story"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Initialize T5-model, Tokenizer and Tokenize dataset

In [21]:
data_dir = "../../stories/stories_v2.json"
dataset = load_and_process_data(data_dir)

print(dataset['train'][0]['summary'])
story = dataset['train'][0]['generated_story']
print("\n".join(story.split(".")))

model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenized_datasets = dataset.map(preprocess_function, batched=True)


Liz has to study this weekend but finds time to go to the cinema, go shopping and have a pizza with Liz on Saturday. The movie starts at 1 p.m.
[1] Maggie is a student at the University of California, Santa Barbara
 She is the only one of her class to have a job
 Maggie has a crush on Liz
 Liz is also a fan of Maggie



[[1]] Maggie's crush is Liz's boyfriend
 He is an actor
 The two of them have been dating for a while
 They have also been friends for years
 It is unknown if they have ever met
 [2] Liz has been a friend of Liz for some time
 In fact, she has even been in a relationship with him
 However, it is not known if she ever had a romantic relationship
 [[3] The only time Liz ever has had an affair is when she was a child
 This is because she is still a virgin
 There is no way that she could have had any relationship other than that with Liz and her boyfriend, and that is why she never had sex with her
 If she had had intercourse with his boyfriend she would have died
 Therefor

Map:   0%|          | 0/1244 [00:00<?, ? examples/s]



Map:   0%|          | 0/178 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

## Setup training arguments

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



## Initialize Sequence to Sequence trainer (to generate text-to-text output)

In [23]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


## Train model

In [24]:
trainer.train()

  0%|          | 0/468 [00:00<?, ?it/s]

{'loss': 6.0191, 'grad_norm': 10.196195602416992, 'learning_rate': 4.893162393162393e-05, 'epoch': 0.06}
{'loss': 4.385, 'grad_norm': 14.421276092529297, 'learning_rate': 4.786324786324787e-05, 'epoch': 0.13}
{'loss': 3.9039, 'grad_norm': 2.009028911590576, 'learning_rate': 4.67948717948718e-05, 'epoch': 0.19}
{'loss': 3.7089, 'grad_norm': 20.088640213012695, 'learning_rate': 4.572649572649573e-05, 'epoch': 0.26}
{'loss': 3.6361, 'grad_norm': 1.415466070175171, 'learning_rate': 4.465811965811966e-05, 'epoch': 0.32}
{'loss': 3.4558, 'grad_norm': 34.69195556640625, 'learning_rate': 4.358974358974359e-05, 'epoch': 0.38}
{'loss': 3.4941, 'grad_norm': 8.70056438446045, 'learning_rate': 4.252136752136752e-05, 'epoch': 0.45}
{'loss': 3.3734, 'grad_norm': 1.414243221282959, 'learning_rate': 4.145299145299146e-05, 'epoch': 0.51}
{'loss': 3.3143, 'grad_norm': 1.067173957824707, 'learning_rate': 4.038461538461539e-05, 'epoch': 0.58}
{'loss': 3.3629, 'grad_norm': 1.1821787357330322, 'learning_rate

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 2.9879403114318848, 'eval_runtime': 2.2695, 'eval_samples_per_second': 78.431, 'eval_steps_per_second': 10.134, 'epoch': 1.0}
{'loss': 3.1704, 'grad_norm': 2.7858970165252686, 'learning_rate': 3.290598290598291e-05, 'epoch': 1.03}
{'loss': 3.2677, 'grad_norm': 0.7576261162757874, 'learning_rate': 3.183760683760684e-05, 'epoch': 1.09}
{'loss': 3.2347, 'grad_norm': 0.8691450953483582, 'learning_rate': 3.0769230769230774e-05, 'epoch': 1.15}
{'loss': 3.2329, 'grad_norm': 1.1175466775894165, 'learning_rate': 2.97008547008547e-05, 'epoch': 1.22}
{'loss': 3.2713, 'grad_norm': 1.877809762954712, 'learning_rate': 2.863247863247863e-05, 'epoch': 1.28}
{'loss': 3.2101, 'grad_norm': 0.8784136176109314, 'learning_rate': 2.756410256410257e-05, 'epoch': 1.35}
{'loss': 3.155, 'grad_norm': 4.514078617095947, 'learning_rate': 2.64957264957265e-05, 'epoch': 1.41}
{'loss': 3.1535, 'grad_norm': 5.824220657348633, 'learning_rate': 2.5427350427350426e-05, 'epoch': 1.47}
{'loss': 3.1784, 'grad_n

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 2.87398099899292, 'eval_runtime': 2.1125, 'eval_samples_per_second': 84.261, 'eval_steps_per_second': 10.888, 'epoch': 2.0}
{'loss': 3.0903, 'grad_norm': 0.7216551899909973, 'learning_rate': 1.581196581196581e-05, 'epoch': 2.05}
{'loss': 3.1385, 'grad_norm': 1.1805016994476318, 'learning_rate': 1.4743589743589745e-05, 'epoch': 2.12}
{'loss': 3.0514, 'grad_norm': 1.4104477167129517, 'learning_rate': 1.3675213675213677e-05, 'epoch': 2.18}
{'loss': 3.0693, 'grad_norm': 0.7679086923599243, 'learning_rate': 1.2606837606837608e-05, 'epoch': 2.24}
{'loss': 3.0605, 'grad_norm': 0.751008927822113, 'learning_rate': 1.153846153846154e-05, 'epoch': 2.31}
{'loss': 3.0996, 'grad_norm': 0.9917963147163391, 'learning_rate': 1.0470085470085471e-05, 'epoch': 2.37}
{'loss': 3.135, 'grad_norm': 2.0268051624298096, 'learning_rate': 9.401709401709402e-06, 'epoch': 2.44}
{'loss': 3.1514, 'grad_norm': 0.7496169805526733, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}
{'loss': 3.0616, 'gra

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 2.8448233604431152, 'eval_runtime': 2.1235, 'eval_samples_per_second': 83.825, 'eval_steps_per_second': 10.831, 'epoch': 3.0}
{'train_runtime': 146.9538, 'train_samples_per_second': 25.396, 'train_steps_per_second': 3.185, 'train_loss': 3.301574723333375, 'epoch': 3.0}


TrainOutput(global_step=468, training_loss=3.301574723333375, metrics={'train_runtime': 146.9538, 'train_samples_per_second': 25.396, 'train_steps_per_second': 3.185, 'total_flos': 53147033665536.0, 'train_loss': 3.301574723333375, 'epoch': 3.0})

## Evaluate Model

In [25]:
metrics = trainer.evaluate()
print(metrics)

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 2.8448233604431152, 'eval_runtime': 2.3355, 'eval_samples_per_second': 76.214, 'eval_steps_per_second': 9.848, 'epoch': 3.0}


## Save Model

In [26]:
save_directory = "./fine_tuned_t5_model_stories"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./fine_tuned_t5_model_stories\\tokenizer_config.json',
 './fine_tuned_t5_model_stories\\special_tokens_map.json',
 './fine_tuned_t5_model_stories\\spiece.model',
 './fine_tuned_t5_model_stories\\added_tokens.json')

# Testing

## Load Model for inference testing

In [27]:
model_path = "fine_tuned_t5_model_stories"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

## Test Model on sample input

In [28]:
"""
Test model on a sample input from a file
"""
sample_input = []
with open('sample_input.txt') as file: 

    for line in file:
        sample_input.append(line.strip())

def test(si):

    input_ids = tokenizer.encode("".join(si), return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(output_text)

test(sample_input)

#Person1#: Mom, I'm flying to visit uncle Lee's family next Saturday. Should I pack my bags today?
#Person2#: Yes, I think so.\n#Person1#: OK. What clothes should I take? I know it's hot there.
#Person2#: Yes, but it rains a lot. You can borrow an umbrella or a jacket if it's wet. Just pack some T-shirts.
#Person1#: OK. And who is meeting me at the airport?
#Person2#: Well, uncle Lee and aunt Wong will be busy, but your cousin Susan can pick you up.

rnMatt: Well, I don't know what to expect.rnMatt: I don't know what to expect.rnMatt: I don't know what to expect.rnMatt: I don't know what to expect.rnMatt: I don't know what to expect.rnMatt: I don't know what to expect.rnMatt: rnMatt: rnMatt is going on.rnMatt: I'm not sure what to do.rnMatt is going on.rnMatt: I don't know what to expect.rnMatt: I'm not sure what to do.rnMatt says.rnMatt: I don't know what to expect.rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt: rnMatt:


# Test

## Llama Model

In [14]:
from transformers import LlamaForCausalLM, LlamaTokenizer

hf_token = "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"


# Load tokenizer and model
# model_name = "meta-llama/Llama-2-7b-hf"  # Replace with the model you want
# tokenizer = LlamaTokenizer.from_pretrained(model_name)
# model = LlamaForCausalLM.from_pretrained(model_name, device_map="auto")  # Use 'device_map="auto"' for GPU, or 'map_location' for CPU.


"""Meta code"""
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

# # Input prompt
# prompt = "What is the capital of France?"

# # Tokenize input and generate response
# inputs = tokenizer(prompt, return_tensors="pt")
# outputs = model.generate(inputs["input_ids"], max_length=50, temperature=0.7)

# # Decode the response
# response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print(response)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
403 Client Error. (Request ID: Root=1-675753ac-03f4bf552be2768e673f009f;9d464f70-686e-40d6-b341-232210574cae)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Your request to access model meta-llama/Llama-2-7b-chat-hf has been rejected by the repo's authors.

## Alpaca Model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Replace with the name of an Alpaca model on Hugging Face
model_name = "chavinlo/alpaca-native"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Generate text
prompt = "Explain why the sky is blue."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(inputs["input_ids"], max_length=50)

# Decode and print the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)