In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
import torch
from transformer_lens import HookedTransformer
from jaxtyping import Float, Int
from muutils.dictmagic import condense_tensor_dict
import tkinter as tk
from tkinter import simpledialog, messagebox
from transformers import pipeline
import csv
import os  
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import json
import wandb
from transformers import TrainingArguments
import evaluate

In [2]:
def load_model(model_name):
    """Build a Hugging Face ``text-generation`` pipeline for ``model_name``.

    Args:
        model_name: Model identifier forwarded to ``pipeline`` as ``model``.

    Returns:
        Whatever ``pipeline("text-generation", model=model_name)`` returns.
    """
    text_generator = pipeline("text-generation", model=model_name)
    return text_generator

def generate_text(prompt, model, max_length=80):
    """Generate text for ``prompt`` using a text-generation callable.

    Args:
        prompt: Text handed to ``model`` as its first positional argument.
        model: Callable invoked as
            ``model(prompt, max_length=..., truncation=True)``. It must return
            a sequence whose first item is a mapping with a
            ``'generated_text'`` entry (for example the object returned by
            ``load_model``).
        max_length: Value forwarded to ``model`` as ``max_length``. Defaults
            to 80, the value that used to be hard-coded here.

    Returns:
        The ``'generated_text'`` entry of the first result.
    """
    result = model(prompt, max_length=max_length, truncation=True)
    return result[0]['generated_text']

# Interactive demo: load GPT-2, read one prompt from stdin and print the
# model's generated text for it.
if __name__ == "__main__":
    model_name = "gpt2"
    model = load_model(model_name)
    
    # Get input from the user
    user_input = input("Enter your prompt: ")
    
    # Generate and print the output text
    output_text = generate_text(user_input, model)
    print("Generated Text:", output_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: today has been such a hot topic of discussion across the Internet lately, but with an ongoing scandal that's made headlines worldwide, there's no stopping that. "Just ask "The X," among other "hot topics" on Reddit, over the weekend:


Advertisement

There's even conspiracy-minded Twitter users running around wondering whether an actual search result actually comes from the actual site. And


In [3]:
def load_model(model_name, trust_remote_code=False):
    """Load a model as a text generation pipeline.

    Defining this function replaces any earlier ``load_model`` already present
    in the kernel namespace.

    Args:
        model_name: Model identifier forwarded to ``pipeline`` as ``model``.
        trust_remote_code: Forwarded to ``pipeline``. When true, Python code
            shipped inside the model repository may be executed locally, so it
            is off by default and should only be enabled for repositories you
            have reviewed. The models this notebook loads ("gpt2" and
            "EleutherAI/gpt-neo-1.3B") are architectures built into
            ``transformers`` and do not need it.

    Returns:
        Whatever ``pipeline("text-generation", ...)`` returns.
    """
    return pipeline(
        "text-generation",
        model=model_name,
        trust_remote_code=trust_remote_code,
    )

class App:
    """Tkinter window for collecting pairwise preferences between two models.

    The user types a prompt, both models generate text for it, and the user
    picks the output they prefer. Each choice is appended as one JSON object
    per line to ``FEEDBACK_FILE``, after which a fresh pair of outputs is
    generated.
    """

    # Forwarded to both pipelines as ``max_length``.
    MAX_LENGTH = 80
    # JSON Lines file that accumulates the user's choices (opened in append mode).
    FEEDBACK_FILE = 'user_feedback.jsonl'

    def __init__(self, root, model1_name="gpt2", model2_name="EleutherAI/gpt-neo-1.3B"):
        """Load both models and build the widgets.

        Args:
            root: The ``tk.Tk`` (or other Tk container) hosting the widgets.
            model1_name: Model identifier passed to ``load_model`` for
                "Output 1".
            model2_name: Model identifier passed to ``load_model`` for
                "Output 2".
        """
        self.root = root
        root.title("Model Comparison Tool")

        self.model1 = load_model(model1_name)
        self.model2 = load_model(model2_name)

        # Snapshot of the most recent generation. ``None`` until the first
        # call to ``generate_texts``; the select buttons stay disabled until
        # then.
        self.prompt = None
        self.output1 = None
        self.output2 = None

        # Setup the UI
        tk.Label(root, text="Enter your prompt:").pack()

        self.prompt_entry = tk.Entry(root, width=50)
        self.prompt_entry.pack()

        self.generate_button = tk.Button(root, text="Generate Texts", command=self.generate_texts)
        self.generate_button.pack()

        tk.Label(root, text="", height=1).pack()  # vertical spacer

        self.output1_label = tk.Label(root, text="", wraplength=300)
        self.output1_label.pack()

        tk.Label(root, text="", height=2).pack()  # vertical spacer

        self.output2_label = tk.Label(root, text="", wraplength=300)
        self.output2_label.pack()

        tk.Label(root, text="", height=1).pack()  # vertical spacer

        self.select_button1 = tk.Button(root, text="Select Output 1", command=lambda: self.update_model(1))
        self.select_button1.pack()
        self.select_button1.config(state="disabled")  # Initially disabled

        self.select_button2 = tk.Button(root, text="Select Output 2", command=lambda: self.update_model(2))
        self.select_button2.pack()
        self.select_button2.config(state="disabled")  # Initially disabled

        self.corgis_label = tk.Label(root, text="Press the 'X' once you have had enough fun.")
        self.corgis_label.pack(side=tk.BOTTOM)

    def generate_texts(self):
        """Generate one output per model for the prompt currently in the entry.

        Stores the prompt and both outputs on the instance, shows the outputs
        in their labels and enables the two select buttons.
        """
        prompt = self.prompt_entry.get()
        result1 = self.model1(prompt, max_length=self.MAX_LENGTH, truncation=True)
        result2 = self.model2(prompt, max_length=self.MAX_LENGTH, truncation=True)

        # Keep the prompt together with the outputs it produced: the entry
        # widget can be edited before the user picks an output, and the
        # feedback log must record the prompt that was actually used.
        self.prompt = prompt
        self.output1 = result1[0]['generated_text']
        self.output2 = result2[0]['generated_text']

        self.output1_label.config(text=self.output1)
        self.output2_label.config(text=self.output2)

        self.select_button1.config(state="normal")
        self.select_button2.config(state="normal")

    def log_user_feedback(self, prompt, selected_output, model_name):
        """Append one feedback record to ``FEEDBACK_FILE``.

        Args:
            prompt: Prompt the selected output was generated from.
            selected_output: Text of the output the user chose.
            model_name: Label of the model that produced it.
        """
        feedback_data = {
            'prompt': prompt,
            'selected_output': selected_output,
            'model_name': model_name
        }

        with open(self.FEEDBACK_FILE, 'a', encoding='utf-8') as file:
            json.dump(feedback_data, file)
            file.write('\n')  # Write a newline to separate JSON objects

    def update_model(self, selected_model):
        """Record the user's choice and generate a new pair of outputs.

        Args:
            selected_model: ``1`` to select output 1; any other value selects
                output 2.
        """
        if self.prompt is None:
            # Nothing has been generated yet, so there is nothing to record.
            return

        selected_output = self.output1 if selected_model == 1 else self.output2
        model_name = "model1" if selected_model == 1 else "model2"

        # Log the prompt captured at generation time, not the entry's current
        # contents, so the record always matches the outputs on screen.
        self.log_user_feedback(self.prompt, selected_output, model_name)
        messagebox.showinfo("Selection", "Feedback recorded! Generating new texts...")
        self.generate_texts()

# Launch the model-comparison window when this cell/script is run directly.
if __name__ == "__main__":
    root = tk.Tk()
    app = App(root)  # loads both models and builds the widgets
    root.mainloop()  # hand control to the Tk event loop


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


: 

In [12]:
def fine_tune_model(model_name, jsonl_file, output_dir):
    # Initialize wandb
    wandb.init(project="just_work", name="running_ostrich")

    # Load tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    
    # Set padding token if undefined
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = GPT2LMHeadModel.from_pretrained(model_name)

    metric = evaluate.load("accuracy")

    # Make sure the model's embedding sizes match if you're changing the tokenizer's pad token
    model.config.pad_token_id = tokenizer.pad_token_id

    # Load dataset from JSONL file
    dataset = load_dataset('json', data_files=jsonl_file, split='train')
    
    # Define preprocessing function to concatenate prompt and selected_output
    def preprocess_function(examples):
        texts = [p + " " + o for p, o in zip(examples['prompt'], examples['selected_output'])]
        return tokenizer(texts, truncation=True, padding="max_length", max_length=512)

    # Map preprocessing function
    dataset = dataset.map(preprocess_function, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

    # Data collator for padding
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Define a function to compute the loss and accuracy
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=6,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        compute_metrics=compute_metrics  # Add the compute_metrics function
    )

    # Start training
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Finish wandb run
    wandb.finish()

# Example usage: fine-tune the 'gpt2' checkpoint on the records in
# 'user_feedback.jsonl' and save the result under 'fine_tuned_model'.
fine_tune_model('gpt2', 'user_feedback.jsonl', 'fine_tuned_model')




[34m[1mwandb[0m: Currently logged in as: [33mmyeasyemailforstuff[0m ([33mml_stuff[0m). Use [1m`wandb login --relogin`[0m to force relogin


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 37%|███▋      | 11/30 [02:49<05:02, 15.89s/it]

In [6]:
class TextGeneratorApp:
    """Tkinter window that generates text with the fine-tuned model.

    The tokenizer and model are loaded from ``model_dir`` the first time text
    is generated and then reused for every later request from the same
    instance.
    """

    # Upper bound on prompt plus generated tokens, forwarded to ``generate``.
    MAX_LENGTH = 80
    # Default directory holding the saved model and tokenizer.
    model_dir = 'fine_tuned_model'
    # Per-instance cache filled by ``_load_model``.
    _tokenizer = None
    _model = None

    def __init__(self, master, model_dir='fine_tuned_model'):
        """Build the widgets.

        Args:
            master: The ``tk.Tk`` (or other Tk container) hosting the widgets.
            model_dir: Directory passed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.master = master
        self.model_dir = model_dir
        master.title("Text Generator")

        # Entry widget to take user input for the prompt
        self.prompt_entry = tk.Entry(master, width=50)
        self.prompt_entry.pack()

        # Button to trigger text generation
        self.generate_button = tk.Button(master, text="Generate", command=self.generate_text)
        self.generate_button.pack()

        # Label to display the generated text
        self.result_label = tk.Label(master, text="", wraplength=400)
        self.result_label.pack()

    def _load_model(self):
        """Return ``(tokenizer, model)``, loading them from disk on first use."""
        if self._model is None:
            tokenizer = GPT2Tokenizer.from_pretrained(self.model_dir)
            model = GPT2LMHeadModel.from_pretrained(self.model_dir)

            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            model.config.pad_token_id = tokenizer.pad_token_id

            self._tokenizer = tokenizer
            self._model = model
        return self._tokenizer, self._model

    def generate_text_from_prompt(self, prompt):
        """Generate text for ``prompt`` with the fine-tuned model.

        Args:
            prompt: Text to continue.

        Returns:
            The decoded first generated sequence, or ``""`` when ``prompt`` is
            empty or whitespace only (there is nothing to feed the model).
        """
        if not prompt.strip():
            return ""

        tokenizer, model = self._load_model()

        # Truncate the prompt below the generation limit so that ``generate``
        # always has room for at least one new token.
        encoding = tokenizer(
            prompt,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.MAX_LENGTH - 1,
        )
        outputs = model.generate(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
            max_length=self.MAX_LENGTH,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            temperature=0.9,
            top_k=50,
            top_p=0.92,
            repetition_penalty=1.2,
            do_sample=True
        )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text

    def generate_text(self):
        """Button callback: generate text for the entry's prompt and show it."""
        user_input = self.prompt_entry.get()
        if not user_input.strip():
            self.result_label.config(text="Please enter a prompt.")
            return
        result = self.generate_text_from_prompt(user_input)
        self.result_label.config(text=result)

# Launch the text-generator window when this cell/script is run directly.
if __name__ == "__main__":
    root = tk.Tk()
    app = TextGeneratorApp(root)  # builds the entry, button and result label
    root.mainloop()  # hand control to the Tk event loop

