In [1]:
%pip install --upgrade gradio accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
import re
import sys
from tqdm.auto import tqdm
import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset


In [3]:
transcript_fp = '../Final Project/data/transcripts_separated-1.txt'
with open(transcript_fp, 'r') as f:
    transcript = f.read()

delimter = '=================================================='
transcripts = transcript.split(delimter)
_ = transcripts.pop(0)

In [4]:
transcripts[0]

"\n[Music] Runk hello hello hello [Music] H I used to list that in my car I used to like it dude I used to like it you still like it I I do yeah but I us that soundtrack was a banger it was a really good soundtrack yeah surprised you liked it you don't like any music what other movies is that song in Pulp Fiction was the song I want to know what other definitely one of those movies like a bugs life that ripped off that ripped off you definitely Dick Dale say s wordss so yeah so welcome to Lifeline thank you very much sign on up for a patreon lifeline.com I'm sorry patreon.com Lifeline that's where you see the live shows that's where you see Lifeline luxury we do extra episodes of Lifeline we do Lifeline luxury we do at least two three a month um so uh at least three a month you you still say two and that pisses me off I want to do what I want to do we've grown it we've grown it we've expanded it used to be two now it's three you get three episodes a month at least at patreon.com lifean

In [5]:
# -----------------------------------------
# Instructions:
# 1. Replace 'your_local_path_here' with a local directory on your machine.
#    For example: transcripts_dir = 'C:/Users/YourName/Documents/Transcripts'
#
# 2. Ensure that the directory exists or use os.makedirs to create it.
#
# 3. Provide your own list of transcripts or load them from a file.
# -----------------------------------------

# Your transcripts as a Python list of strings.
# Replace these with your own transcripts from YouTube or another source.

# Set your local directory for storing transcripts
transcripts_dir = './data/transcripts/'  # e.g. 'C:/Users/YourName/Documents/Transcripts'
os.makedirs(transcripts_dir, exist_ok=True)

# Option 1: Save as a Text File with Separators
text_file_path = os.path.join(transcripts_dir, 'transcripts_separated.txt')
separator = "\n" + "="*50 + "\n"

with open(text_file_path, 'w', encoding='utf-8') as txt_file:
    for transcript in transcripts:
        txt_file.write(separator)
        txt_file.write(transcript + "\n")

print(f"Transcripts have been saved to {text_file_path}")

# Option 2: Save as a JSON File with Pretty-Printing
json_file_path = os.path.join(transcripts_dir, 'transcripts_separated.json')

with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(transcripts, json_file, ensure_ascii=False, indent=4)

print(f"Transcripts have been saved to {json_file_path}")



Transcripts have been saved to ./data/transcripts/transcripts_separated.txt
Transcripts have been saved to ./data/transcripts/transcripts_separated.json


In [6]:

# -----------------------------------------
# Cleaning function for text
# -----------------------------------------
def clean_text(text):
    # Remove non-verbal cues like [Applause], [Laughter], etc.
    text = re.sub(r"\[.*?\]", "", text)
    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text)
    # Strip leading and trailing spaces
    text = text.strip()

    # Insert commas for likely sentence continuations
    text = re.sub(r"\b(and|but|so|or)\b", r", \1", text, flags=re.IGNORECASE)

    # Insert periods before capital letters indicating new sentences
    text = re.sub(r"(?<!\w)([a-zA-Z]+)(\s+)([A-Z])", r"\1.\2\3", text)

    # Capitalize the start of each sentence after periods
    sentences = text.split(". ")
    sentences = [sentence.capitalize() for sentence in sentences]
    text = ". ".join(sentences)

    # Add a period at the end if missing
    if text and text[-1] not in ".!?":
        text += "."

    return text

# Clean each transcript
cleaned_transcripts = [clean_text(transcript) for transcript in transcripts]

In [7]:

# -----------------------------------------
# Fine-tuning the GPT-2 model
# -----------------------------------------
os.environ["WANDB_DISABLED"] = "true"

# Create a Hugging Face Dataset from the cleaned transcripts
dataset = Dataset.from_dict({'text': cleaned_transcripts})

# Split the dataset into training and test sets (e.g., 90-10 split)
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Load the GPT-2 tokenizer and set up padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Pad token set to EOS

# Tokenize the data
def tokenize(batch):
    encoding = tokenizer(batch["text"], padding=True, truncation=True, max_length=512)
    encoding["labels"] = encoding["input_ids"].copy()  # labels = input_ids for language modeling
    return encoding

train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["text"])
test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=["text"])

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./my_chatbot_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start fine-tuning
trainer.train()

# Save the Fine-Tuned Model and tokenizer to a local directory
local_model_dir = 'your_local_path_here/ComBot'  # e.g. 'C:/Users/YourName/Documents/ComBot'
os.makedirs(local_model_dir, exist_ok=True)
model.save_pretrained(local_model_dir)
tokenizer.save_pretrained(local_model_dir)

print(f"Model and tokenizer saved to {local_model_dir}")




Map:   0%|          | 0/426 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 3.1370296478271484, 'eval_runtime': 13.2729, 'eval_samples_per_second': 3.616, 'eval_steps_per_second': 1.808, 'epoch': 1.0}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 3.0843374729156494, 'eval_runtime': 13.5434, 'eval_samples_per_second': 3.544, 'eval_steps_per_second': 1.772, 'epoch': 2.0}
{'loss': 3.1023, 'learning_rate': 1.0876369327073553e-05, 'epoch': 2.35}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 3.0744972229003906, 'eval_runtime': 12.3103, 'eval_samples_per_second': 3.899, 'eval_steps_per_second': 1.95, 'epoch': 3.0}
{'train_runtime': 1444.272, 'train_samples_per_second': 0.885, 'train_steps_per_second': 0.442, 'train_loss': 3.0630564323986453, 'epoch': 3.0}
Model and tokenizer saved to your_local_path_here/ComBot


In [10]:

# -----------------------------------------
# Creating a Chat Interface using Gradio
# -----------------------------------------

# Load the fine-tuned model and tokenizer from local directory
model = GPT2LMHeadModel.from_pretrained(local_model_dir)
tokenizer = GPT2Tokenizer.from_pretrained(local_model_dir)

def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(inputs.input_ids, max_length=100, 
                             num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="My Fine-Tuned Chatbot",
    description="Interact with your chatbot trained on your custom transcripts!"
)

# Launch the Gradio interface
iface.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
