**1. Imports**

Let's start with imports and installing Hugging Face datasets.

In [None]:
!pip install datasets



In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoModelForCausalLM
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType
import matplotlib.pyplot as plt
import random
import os


In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


**2. Tokenization and Data Blocks**

Let's access our .txt files from Google Drive and prepare them with the tokenizer.

*   Step 1. Read all files as a concatenated text and Tokenize everything.
*   Step 2. Create blocks of data from the one long piece of tokens.
*   Step 3. Make Hugging Face dataset.



In [None]:
# Attach Google Drive to the Colab runtime so the data files stored under it can be opened.
from google.colab import drive
drive.mount("/content/drive")

# Load the fast GPT-2 medium tokenizer and use its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    "gpt2-medium",
    use_fast=True,
)
tokenizer.pad_token = tokenizer.eos_token

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ---------Saber's path--------------

# # read and tokenize each file
# def load_and_tokenize(file_path):
#     with open(file_path, 'r', encoding='utf-8') as f:
#         text = f.read()
#     tokenized = tokenizer(text)
#     return tokenized['input_ids']

# # specify file path in Google Drive
# data_path = "/content/drive/MyDrive/SEP775NLP_Final_Project/Friends_script/"

# # run the function above to tokenize the data
# train_ids = load_and_tokenize(f"{data_path}/train_data.txt")
# val_ids = load_and_tokenize(f"{data_path}/val_data.txt")
# test_ids = load_and_tokenize(f"{data_path}/test_data.txt")
# general_test_ids = load_and_tokenize(f"{data_path}/wikitext103_test.txt")

In [None]:
# ---------Rose's path--------------

# # read and tokenize each file
# def load_and_tokenize(file_path):
#     with open(file_path, 'r', encoding='utf-8') as f:
#         text = f.read()
#     tokenized = tokenizer(text)
#     return tokenized['input_ids']

# # specify file path in Google Drive
# data_path = "/content/drive/MyDrive/datasets/"

# # run the function above to tokenize the data
# train_ids = load_and_tokenize(f"{data_path}/train_data.txt")
# val_ids = load_and_tokenize(f"{data_path}/val_data.txt")
# test_ids = load_and_tokenize(f"{data_path}/test_data.txt")
# general_test_ids = load_and_tokenize(f"{data_path}/wikitext103_test.txt")

In [None]:
# ---------Yingtao's path--------------

# read and tokenize each file
def load_and_tokenize(file_path):
    """Read a UTF-8 text file in full and return its token ids.

    The entire file content is passed to the notebook-level ``tokenizer`` in a
    single call, and only the ``input_ids`` entry of the result is returned.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The ``input_ids`` produced by ``tokenizer`` for the whole file.
    """
    with open(file_path, encoding='utf-8') as handle:
        raw_text = handle.read()
    return tokenizer(raw_text)['input_ids']

# folder in Google Drive that holds the four text files
data_path = "/content/drive/MyDrive/NLP/datas/"

# tokenize each file with load_and_tokenize, in the order: train, validation, test, general (wiki) test
train_ids, val_ids, test_ids, general_test_ids = (
    load_and_tokenize(f"{data_path}/{data_file}")
    for data_file in (
        "train_data.txt",
        "val_data.txt",
        "test_data.txt",
        "wikitext103_test.txt",
    )
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1077787 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# create blocks of sequences
def create_blocks(all_tokens, block_size=512):
  """Cut a flat token sequence into consecutive slices of ``block_size`` items.

  Slices are taken at offsets 0, block_size, 2 * block_size, ...; the final
  slice holds whatever remains and can therefore be shorter than ``block_size``.

  Args:
    all_tokens: Sequence of tokens to split.
    block_size: Maximum number of tokens per block.

  Returns:
    A list with one slice of ``all_tokens`` per block, in order.
  """
  block_starts = range(0, len(all_tokens), block_size)
  return [all_tokens[start:start + block_size] for start in block_starts]

# split every tokenized corpus into blocks using the default block size
train_blocks, val_blocks, test_blocks, general_test_blocks = (
    create_blocks(token_ids)
    for token_ids in (train_ids, val_ids, test_ids, general_test_ids)
)

# print the first training block to check that a single block spans several lines of the script
print(train_blocks[0])

[464, 1881, 2080, 262, 520, 12004, 13145, 198, 25354, 416, 25, 5502, 3469, 5714, 1222, 5502, 42971, 628, 198, 58, 36542, 25, 5694, 2448, 74, 11, 15984, 318, 7351, 26154, 11, 9847, 11, 290, 23240, 511, 11758, 8183, 198, 198, 44045, 25, 357, 1462, 26154, 8, 19443, 13, 357, 39, 1746, 340, 284, 683, 2014, 198, 198, 19585, 88, 25, 6952, 345, 13, 198, 198, 44045, 25, 357, 1462, 9847, 8, 327, 1324, 18863, 2879, 13, 357, 39, 1746, 340, 284, 683, 2014, 198, 198, 38328, 25, 402, 3247, 494, 13, 198, 198, 44045, 25, 843, 257, 3621, 3024, 36930, 329, 23240, 13, 357, 39, 1746, 340, 284, 607, 2014, 198, 198, 9069, 3970, 25, 317, 1383, 11, 5875, 345, 13, 357, 3673, 1063, 1223, 2014, 28574, 371, 620, 30, 198, 198, 44045, 25, 9425, 30, 198, 198, 9069, 3970, 25, 4162, 857, 616, 269, 259, 16487, 4859, 423, 281, 1931, 6005, 30, 198, 198, 44045, 25, 3966, 0, 1320, 338, 1521, 13, 357, 44045, 8794, 2157, 607, 1027, 11, 290, 7228, 257, 269, 259, 16487, 4859, 2014, 314, 1101, 7926, 0, 198, 198, 7, 3347, 2753, 2

In [None]:
# wrap each split's token blocks in a Hugging Face Dataset with a single "input_ids" column
train_dataset, val_dataset, test_dataset, general_test_dataset = (
    Dataset.from_dict({"input_ids": split_blocks})
    for split_blocks in (train_blocks, val_blocks, test_blocks, general_test_blocks)
)

# bundle the four splits into one DatasetDict keyed by split name
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
    "general_test": general_test_dataset,
})

In [None]:
# Work on fixed-seed random subsets of the train and validation splits.
# The requested sizes are capped at the split length so that .select() is never
# asked for row indices beyond the end of a smaller split.
train_subset_size = min(2000, len(dataset_dict["train"]))
eval_subset_size = min(200, len(dataset_dict["validation"]))

train_dataset = dataset_dict["train"].shuffle(seed=42).select(range(train_subset_size))  # use at most 2000 training blocks
eval_dataset = dataset_dict["validation"].shuffle(seed=42).select(range(eval_subset_size))  # use at most 200 validation blocks

In [None]:
# collator for causal language modelling (masked-LM mode switched off)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

**3. Model Training**

Let's set up the parameters for training here. We will also benchmark the pre-trained model's ability on a general Wiki language dataset (before fine-tuning). We will test the model again on this dataset after fine-tuning to see if our model retains the general language capability.

In [None]:
# access the model from Hugging Face
# (alternative checkpoint noted by the authors: "deepseek-ai/deepseek-coder-1.3b-instruct")
local_model_path = "/content/drive/MyDrive/SEP775NLP_Final_Project/Models/gpt2_335m_lorar8"
model = AutoModelForCausalLM.from_pretrained("gpt2-medium")

# keep a copy on Google Drive and load the model back from that copy
# NOTE(review): this directory is the same path the commented-out training cell uses as
# output_dir; the save below writes the pre-trained model into it. Confirm this is intended.
model.save_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(local_model_path)

# use the end-of-sequence id as the padding id in the model config
model.config.pad_token_id = model.config.eos_token_id

# --------------------------------------------------------------------------------(comment this section out if running without LoRA)
# freeze all the weights
for param in model.parameters():
    param.requires_grad = False
# -------------------------------------------------------------------------------- (this code might not be needed even when running LoRA, but just in case)

# Move the model to the device selected at the top of the notebook. The generation cells
# send their inputs to model.device, so without this step they run wherever the model was
# loaded rather than on the GPU that was detected.
model = model.to(device)
# -------------------------------------------------------------------------------- (this code might not be needed even when running LoRa, but just in case)

In [None]:
# # set up dummy trainer to benchmark the pre-trained model's capability on the general wiki test set
# dummy_training_args = TrainingArguments(output_dir="./eval_output", do_train=False, per_device_eval_batch_size=4, report_to="none")
# dummy_trainer = Trainer(model=model, args=dummy_training_args, eval_dataset=dataset_dict["general_test"], data_collator=data_collator)

# # test the pre-trained model
# benchmark_general_eval_results = dummy_trainer.evaluate(dataset_dict["general_test"])
# print(f"General Wiki Data before Fine-Tuning: eval_loss = {benchmark_general_eval_results['eval_loss']:.2f}")

In [None]:
# # --------------------------------------------------------------------------------(comment this section out if running without LoRA)
# # setup LoRA configuration and wrap the model with it
# lora_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.05, bias="none", task_type=TaskType.CAUSAL_LM) # increase r for a more powerful adapter
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()# verify we're only training LoRA weights
# # --------------------------------------------------------------------------------
# output_dir = "/content/drive/MyDrive/SEP775NLP_Final_Project/Models/gpt2_335m_lorar8"
# # set up training_args
# training_args = TrainingArguments(output_dir=output_dir, # Rose: "/content/drive/MyDrive/datasets"
#                                   evaluation_strategy="epoch",
#                                   save_strategy="epoch",
#                                   num_train_epochs=3,
#                                   per_device_train_batch_size=8,
#                                   per_device_eval_batch_size=8,
#                                   learning_rate=3e-5,
#                                   #save_steps=2000, (this is disabled because the checkpoints took too much space in Google Drive)
#                                   logging_steps=500,
#                                   save_total_limit=1,
#                                   resume_from_checkpoint=True,
#                                   load_best_model_at_end=True,
#                                   metric_for_best_model="eval_loss",
#                                   weight_decay=0.01, # added regularization
#                                   warmup_steps=1, # added lr scheduler and warmup
#                                   lr_scheduler_type="linear",
#                                   push_to_hub=False,
#                                   report_to=[], )

# # setup trainer
# trainer = Trainer(model=model,
#                   args=training_args,
#                   train_dataset=train_dataset,
#                   eval_dataset=eval_dataset,
#                   data_collator=data_collator)

In [None]:
# # start training
# trainer.train()

**4. Evaluation**

Take the trained model and evaluate its performance on the two different test sets.

*   Test 1. Unseen FRIENDS scripts - to test model's ability to pick up the specific style of the show.
*   Test 2. General Wiki English data - to test model's general language capabilities. (compare to benchmark)



In [None]:
# # evaluation process
# eval_results = trainer.evaluate(dataset_dict["test"])
# general_eval_results = trainer.evaluate(dataset_dict["general_test"])

# print(f"Unseen FRIENDS scripts: eval_loss = {eval_results['eval_loss']:.2f}")
# print(f"General Wiki Data after Fine-Tuning: eval_loss = {general_eval_results['eval_loss']:.2f}")

**5. Generation**

Finally, we'll use this model to generate a new episode to see for ourselves how the model performs. We trialed the following different approaches for text generation and found that better results can be obtained with the last approach, so in this version of the code we only kept that approach.

*   Approach 1. Greedy Search (using argmax)
*   Approach 2. Multinomial selection
*   Approach 3. Multinomial selection with top_k and top_p filters





In [None]:
from transformers import pipeline

# build the Hugging Face summarization pipeline used by the summary-based generation cells
summarizer = pipeline(
    task="summarization",
    model="Falconsai/text_summarization",
)


config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def summarize_long_text(text, max_length=256, summary_max_length=50, summary_min_length=25):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    The text is split on whitespace into chunks of at most ``max_length`` words.
    Each chunk is summarized with the notebook-level ``summarizer`` pipeline and
    the results are joined with single spaces.

    Text of 10 words or fewer is returned unchanged. The same rule is applied to
    an individual chunk (in practice the trailing one): a chunk of 10 words or
    fewer is kept verbatim instead of being sent to the summarizer.

    For chunks shorter than ``summary_max_length`` words, the length limits passed
    to the summarizer are lowered to the chunk's word count so that the requested
    summary is never longer than the chunk itself.

    Args:
        text: Text to summarize.
        max_length: Maximum number of whitespace-separated words per chunk.
        summary_max_length: Upper bound passed to the summarizer as ``max_length``.
        summary_min_length: Lower bound passed to the summarizer as ``min_length``.

    Returns:
        The original ``text`` when it has 10 words or fewer, otherwise the
        space-joined per-chunk results.
    """
    short_text_words = 10  # at or below this many words there is nothing worth summarizing

    words = text.split()
    if len(words) <= short_text_words:
        return text

    summaries = []
    for start in range(0, len(words), max_length):
        chunk_words = words[start:start + max_length]
        chunk = " ".join(chunk_words)

        # A tiny leftover chunk is passed through as-is, mirroring the whole-text rule above.
        if len(chunk_words) <= short_text_words:
            summaries.append(chunk)
            continue

        # Never request a summary longer than the chunk; keep min_length <= max_length.
        chunk_max_length = min(summary_max_length, len(chunk_words))
        chunk_min_length = min(summary_min_length, chunk_max_length)

        result = summarizer(
            chunk,
            max_length=chunk_max_length,
            min_length=chunk_min_length,
            do_sample=False,
            truncation=True,  # guard against a chunk that tokenizes past the summarizer's input limit
        )
        summaries.append(result[0]['summary_text'])

    return " ".join(summaries)

In [None]:


# create initial prompt and tokenize it
# ("\n" is a real newline here; the earlier "/n" put a literal slash-n into the prompt)
prompt = "The One With McMaster University Party \n Written by: GPT2-medium \n Cohen Produced by: Robert Carlock & Wendy Knoller [Scene: McMaster University Student Center. Rachel and Joey are talking] Joey:"
model_inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(model.device)
model.generation_config.pad_token_id = tokenizer.pad_token_id # suppress padding warning

# create a list that holds the prompt, future tokens will be appended into this list later on
all_tokens = model_inputs["input_ids"][0].tolist()

# to avoid hitting memory limits, generate only one batch with the initial prompt, take last few tokens out to generate the next batch.
context_window = 256 # take this many tokens to set up next batch's generation
max_token = 2048
step_size = 256 # generate only 256 new tokens at a time
num_of_loops = max_token // step_size

for loop_index in range(num_of_loops):

  # remember how long the input is so the newly generated part can be separated from it exactly
  prompt_length = model_inputs["input_ids"].shape[1]

  output = model.generate(**model_inputs,
                          max_new_tokens=step_size,
                          do_sample=True,
                          top_k=50, # keep 50 highest probable options
                          top_p=0.9, # keep only the options that add up to p% probability, filter out the remaining less probable options
                          temperature=1) # to add a little bit more creativity if needed

  output_tokens = output[0].tolist()

  # Keep only what was generated in this step. Slicing from prompt_length (rather than taking
  # the last step_size tokens) stays correct when generation stops early and returns fewer
  # than step_size new tokens; the old slice would then re-append part of the input.
  new_tokens = output_tokens[prompt_length:]
  all_tokens.extend(new_tokens) # and place those tokens into the list

  # the last context_window tokens become the input of the next loop
  previous_context = output_tokens[-context_window:]
  previous_context_tensor = torch.tensor([previous_context]).to(model.device)
  # Single unpadded sequence: attend to every position. Comparing against pad_token_id would
  # also mask genuine end-of-sequence tokens, because the pad token was set to the EOS token.
  attention_mask = torch.ones_like(previous_context_tensor)
  model_inputs = {"input_ids": previous_context_tensor, "attention_mask": attention_mask}

# decode from the list and print results
generated_script = tokenizer.decode(all_tokens, skip_special_tokens=True)
print("-------------------------------------------------------------")
print(generated_script)

-------------------------------------------------------------
The One With McMaster University Party /n Written by: GPT2-medium /n Cohen Produced by: Robert Carlock & Wendy Knoller [Scene: McMaster University Student Center. Rachel and Joey are talking] Joey: Do you want to know why I'm dating a guy? Rachel: What about? Joey: We don't have time. We don't want to waste it talking. Joey: He's cool, though. Rachel: I mean, he's not like me. He's like an old lady. It's really hard for me to figure out how to be a woman. [Dates. Rachel goes in to check on her date. Joey is still waiting.] Joey: He's cool. Rachel: He's really cool. Joey: Do you think I should date him? Rachel: It's pretty obvious. [They start dating again. They don't start talking until Joey makes some kind of weird remark. Rachel is shocked at the sound of this.] Joey: [to Rachel] How did you know? You didn't ask. Rachel: The only thing that makes me really nervous is that you're a little old. Joey: I didn't know! [They kis

In [None]:

# destination on Google Drive for the generated scripts
output_dir = "/content/drive/MyDrive/SEP775NLP_Final_Project/Models/Falconsai"
file_name = "generated_script.txt"
file_path = os.path.join(output_dir, file_name)

os.makedirs(output_dir, exist_ok=True)
# write explicitly as UTF-8 (the encoding the data files are read with) instead of the platform default
with open(file_path, "w", encoding="utf-8") as f:
    f.write(generated_script)
print(f"File saved to: {file_path}")


File saved to: /content/drive/MyDrive/SEP775NLP_Final_Project/Models/Falconsai/generated_script.txt


In [None]:
# create initial prompt and tokenize it
# ("\n" is a real newline here; the earlier "/n" put a literal slash-n into the prompt)
prompt = "The One With McMaster University Party \n Written by: GPT2-medium \n Cohen Produced by: Robert Carlock & Wendy Knoller [Scene: McMaster University Student Center. Rachel and Joey are talking] Joey:"
model_inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(model.device)
model.generation_config.pad_token_id = tokenizer.pad_token_id # suppress padding warning

# create a list that holds the prompt, future tokens will be appended into this list later on
all_tokens = model_inputs["input_ids"][0].tolist()

# generate in several steps; after each step the next input is a summary of everything produced so far
max_token = 2048
step_size = 256 # generate only 256 new tokens at a time
num_of_loops = max_token // step_size

for loop_index in range(num_of_loops):

  # remember how long the input is so the newly generated part can be separated from it exactly
  prompt_length = model_inputs["input_ids"].shape[1]

  output = model.generate(**model_inputs,
                          max_new_tokens=step_size,
                          do_sample=True,
                          top_k=50, # keep 50 highest probable options
                          top_p=0.9, # keep only the options that add up to p% probability, filter out the remaining less probable options
                          temperature=1) # to add a little bit more creativity if needed

  output_tokens = output[0].tolist()

  # Keep only what was generated in this step. Slicing from prompt_length (rather than taking
  # the last step_size tokens) stays correct when generation stops early and returns fewer
  # than step_size new tokens; the old slice would then append part of the summary prompt.
  new_tokens = output_tokens[prompt_length:]
  all_tokens.extend(new_tokens) # and place those tokens into the list

  # summarize the whole script so far and use that summary as the next prompt
  current_text = tokenizer.decode(all_tokens, skip_special_tokens=True)
  summary = summarize_long_text(current_text)
  new_prompt = summary
  model_inputs = tokenizer(new_prompt, return_tensors="pt", return_attention_mask=True).to(model.device)

# decode from the list and print results
generated_script_with_summary = tokenizer.decode(all_tokens, skip_special_tokens=True)
print("-------------------------------------------------------------")
print(generated_script_with_summary)

Your max_length is set to 50, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


-------------------------------------------------------------
The One With McMaster University Party /n Written by: GPT2-medium /n Cohen Produced by: Robert Carlock & Wendy Knoller [Scene: McMaster University Student Center. Rachel and Joey are talking] Joey: We're like, "We want to make some noise. And we're gonna sing." (beat) Rachel: You guys did a fantastic job of it! (beat) And you guys even talked to the president! (beat) And he said he wants us to try again next year! That's awesome! Joey: I was so worried!

[Scene: Professor Ponder's office] Ponder: Whoops! No students came into my office. Joey: So we had to do something! Rachel: You did great!

[Scene: Principal's office] Principal: Joey? What's the problem? Joey: We're really bored with the dance party. Principal: Well, I've been told that dance parties are awful. Joey: Really? How can you be mad? You guys have been dancing for eight minutes straight!

[Scene: Principal's office] Principal: The dance was great. How did you li

In [None]:
# output_dir is defined in the cell that saves generated_script.txt
file_name = "generated_script_with_summary.txt"
file_path = os.path.join(output_dir, file_name)

os.makedirs(output_dir, exist_ok=True)
# write explicitly as UTF-8 (the encoding the data files are read with) instead of the platform default
with open(file_path, "w", encoding="utf-8") as f:
    f.write(generated_script_with_summary)
print(f"File saved to: {file_path}")

File saved to: /content/drive/MyDrive/SEP775NLP_Final_Project/Models/Falconsai/generated_script_with_summary.txt


In [None]:
# speaker cues; one is picked at random and appended to each follow-up prompt
guides = [
    "Rachel:",
    "Monica:",
    "Phoebe:",
    "Joey:",
    "Chandler:",
    "Ross:"
]
# create initial prompt and tokenize it
# ("\n" is a real newline here; the earlier "/n" put a literal slash-n into the prompt)
prompt = "The One With McMaster University Party \n Written by: GPT2-medium \n Cohen Produced by: Robert Carlock & Wendy Knoller [Scene: McMaster University Student Center. Rachel and Joey are talking] Joey:"
model_inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(model.device)
model.generation_config.pad_token_id = tokenizer.pad_token_id # suppress padding warning

# create a list that holds the prompt, future tokens will be appended into this list later on
all_tokens = model_inputs["input_ids"][0].tolist()

# generate in several steps; each follow-up prompt is: summary so far + most recent text + a speaker cue
max_token = 2048
step_size = 256 # generate only 256 new tokens at a time
num_of_loops = max_token // step_size

for loop_index in range(num_of_loops):

  # remember how long the input is so the newly generated part can be separated from it exactly
  prompt_length = model_inputs["input_ids"].shape[1]

  output = model.generate(**model_inputs,
                          max_new_tokens=step_size,
                          do_sample=True,
                          top_k=50, # keep 50 highest probable options
                          top_p=0.9, # keep only the options that add up to p% probability, filter out the remaining less probable options
                          temperature=1) # to add a little bit more creativity if needed

  output_tokens = output[0].tolist()

  # Keep only what was generated in this step. Slicing from prompt_length (rather than taking
  # the last step_size tokens) stays correct when generation stops early and returns fewer
  # than step_size new tokens; the old slice would then append part of the prompt.
  new_tokens = output_tokens[prompt_length:]
  all_tokens.extend(new_tokens) # and place those tokens into the list

  # summary of the whole script so far
  current_text = tokenizer.decode(all_tokens, skip_special_tokens=True)
  summary = summarize_long_text(current_text)

  # the last 40 tokens, decoded, give the model the immediate context verbatim
  recent_part = tokenizer.decode(all_tokens[-40:], skip_special_tokens=True)

  new_prompt = summary + "\n\n" + recent_part.strip() + "\n\n" + random.choice(guides)
  model_inputs = tokenizer(new_prompt, return_tensors="pt", return_attention_mask=True).to(model.device)

# decode from the list and print results
generated_script_with_summary_guide = tokenizer.decode(all_tokens, skip_special_tokens=True)
print("-------------------------------------------------------------")
print(generated_script_with_summary_guide)

-------------------------------------------------------------
The One With McMaster University Party /n Written by: GPT2-medium /n Cohen Produced by: Robert Carlock & Wendy Knoller [Scene: McMaster University Student Center. Rachel and Joey are talking] Joey: You're saying we're crazy? Rachel: Yeah, you're just as crazy. I mean, do you realize how much of that stuff is just so... It's just bullshit. Joey: No, I mean, we're not crazy. We're just the ones who like it. You have a sense of humor. Rachel: Oh my God, I mean... But we're not gonna leave the world like this. That's why I'm going to start doing it. And you know why? Because I don't want you to go back to the old, old world of doing this shit like... [Shouts.] Joey: You don't even wanna go back to this shit, Rachel? Rachel: I'm not gonna go back to it! I'm gonna do it! It's okay! You can just stop thinking about it! You don't want to hear about it, you know? You don't want to hear about us doing stuff like this. It's not the sam

In [None]:
# output_dir is defined in the cell that saves generated_script.txt
file_name = "generated_script_with_summary_guide.txt"
file_path = os.path.join(output_dir, file_name)

os.makedirs(output_dir, exist_ok=True)
# write explicitly as UTF-8 (the encoding the data files are read with) instead of the platform default
with open(file_path, "w", encoding="utf-8") as f:
    f.write(generated_script_with_summary_guide)
print(f"File saved to: {file_path}")

File saved to: /content/drive/MyDrive/SEP775NLP_Final_Project/Models/Falconsai/generated_script_with_summary_guide.txt
