## 1. Prerequisites

In [None]:
!pip install transformers datasets

In [2]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
import re

## 2. Import dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Positive Examples (Keep only entries where the Rating column is not null)
normalized_jester_df = pd.read_csv('/content/drive/MyDrive/Proiect NLP/Datasets/Preprocessed-Datasets/Positive-Examples/jester/normalized_jester.csv')
normalized_reddit_jokes_df = pd.read_csv('/content/drive/MyDrive/Proiect NLP/Datasets/Preprocessed-Datasets/Positive-Examples/joke-dataset/normalized_reddit_jokes.csv')
normalized_stupidstuff_df = pd.read_csv('/content/drive/MyDrive/Proiect NLP/Datasets/Preprocessed-Datasets/Positive-Examples/joke-dataset/normalized_stupidstuff.csv')

normalized_jester_df = normalized_jester_df[normalized_jester_df['Rating'].notna()]
normalized_reddit_jokes_df = normalized_reddit_jokes_df[normalized_reddit_jokes_df['Rating'].notna()]
normalized_stupidstuff_df = normalized_stupidstuff_df[normalized_stupidstuff_df['Rating'].notna()]

df = pd.concat([normalized_jester_df, normalized_reddit_jokes_df, normalized_stupidstuff_df], ignore_index=True)
df = df.dropna(subset=['Body'])
df = df[df['Body'].str.strip() != '']


def is_clean(text):
    banned = ["fuck", "shit", "sex", "rape"]
    return not any(bad in text.lower() for bad in banned)

def preprocessed_sample(sample):
    sample = str(sample)
    sample = sample.replace('\r', ' ').replace('\n', ' ')  # Replace line breaks with space
    sample = re.sub(r'[^a-zA-Z0-9.,!?\'\";:()\[\]{}-]', ' ', sample)  # Keep common punctuation
    sample = sample.lower()
    sample = re.sub(r'\s+', ' ', sample).strip()  # Normalize spaces
    return sample

df['Body'] = df['Body'].apply(preprocessed_sample)
df = df[df['Body'].apply(is_clean)]
df = df[df['Rating'] > 0.5]
print(len(df))

2282


Prepare a text file of only your positive jokes

In [5]:
from pathlib import Path

# — after your df filtering —
# df already has only positive, clean jokes in df['Body']

def split_joke(text: str):
    # Heuristic: split on the last sentence-ending punctuation
    # (?!.*[.!?]) ensures we grab the final sentence as punchline
    parts = re.split(r'(?<=[\.!?])\s+(?!.*[\.!?])', text.strip())
    if len(parts) == 1:
        # If no clear split, treat first half as setup
        mid = len(text)//2
        return text[:mid].strip(), text[mid:].strip()
    setup = " ".join(parts[:-1])
    punch = parts[-1]
    return setup.strip(), punch.strip()

# Apply split_joke and drop any that failed
jokes = df['Body'].tolist()
pairs = [split_joke(j) for j in jokes if len(j) > 20]
setups, punchlines = zip(*pairs)

# Write out JSONL for HuggingFace style fine-tuning
import json
out = []
for s, p in zip(setups, punchlines):
    out.append({"prompt": s, "completion": " " + p})

Path("jokes_pairs.jsonl").write_text(
    "\n".join(json.dumps(x) for x in out),
    encoding="utf-8"
)

1599026

Load GPT-2 and tokenizer


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # needed for batching
model = GPT2LMHeadModel.from_pretrained("gpt2")

Create Dataset and DataCollator

In [None]:
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

# 1. Load JSONL
ds = load_dataset("json", data_files="jokes_pairs.jsonl", split="train")

# 2. Tokenize
def tokenize_batch(example):
    # we concatenate prompt + completion so the model learns the whole sequence
    full = example["prompt"] + example["completion"]  # note the space in completion
    tokens = tokenizer(full, truncation=True,
                       max_length=128, padding="max_length")
    # labels = copy of input_ids, but we mask the prompt region with -100
    labels = tokens["input_ids"].copy()
    # Calculate the length of the prompt within the tokenized sequence
    prompt_len = len(tokenizer(example["prompt"], add_special_tokens=False, truncation=True, max_length=128)["input_ids"])
    # mask prompt tokens so loss only on punchline
    # Ensure prompt_len does not exceed the length of labels
    prompt_len = min(prompt_len, len(labels))
    for i in range(prompt_len):
        labels[i] = -100
    tokens["labels"] = labels
    return tokens

tokenized = ds.map(tokenize_batch,
                   remove_columns=ds.column_names,
                   batched=False)

# 3. Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

Set up Trainer

In [8]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Training on GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Training on CPU.")


CUDA is available. Training on GPU.


In [9]:
training_args = TrainingArguments(
    output_dir="./gpt2-jokes",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    # Specify the device for training
    fp16=True, # Enable mixed precision training (if supported by your GPU
    **{"no_cuda": False} # for cuda
)

# Move the model to the selected device
model.to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)

# 5. Fine-tune!
trainer.train()
trainer.save_model("./gpt2-jokes")
tokenizer.save_pretrained("./gpt2-jokes")



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmihneavicentiu[0m ([33mmihneavicentiu-bucharest-university-of-economic-studies[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,3.4992
200,3.3193
300,3.2534
400,3.2274
500,3.2601
600,3.1438
700,2.8947
800,2.8872
900,2.9207
1000,2.8649


('./gpt2-jokes/tokenizer_config.json',
 './gpt2-jokes/special_tokens_map.json',
 './gpt2-jokes/vocab.json',
 './gpt2-jokes/merges.txt',
 './gpt2-jokes/added_tokens.json')

In [None]:
# 1. Save to a local folder (inside Colab VM)
model.save_pretrained("./gpt2-jokes")       # saves pytorch_model.bin and config.json
tokenizer.save_pretrained("./gpt2-jokes")   # saves vocab and special-tokens files

# 2. (Optional) Copy that folder into your Google Drive for persistence
!cp -r ./gpt2-jokes "/content/drive/MyDrive/Proiect NLP/models/gpt2-jokes"

In [25]:
# 4. Generate jokes
model.eval()
prompt = "Yo mamma"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Generate 5 joke variants
outputs = model.generate(
    input_ids,
    max_length=30,
    do_sample=True,
    temperature=0.94,
    top_p=0.9,
    num_return_sequences=5,
    pad_token_id=tokenizer.eos_token_id
)

def cut_at_last_punctuation(text):
    match = re.search(r'(.+[.!?])', text)
    return match.group(1).strip() if match else text.strip()

for i, out in enumerate(outputs, 1):
    text = tokenizer.decode(out, skip_special_tokens=True)
    clean = cut_at_last_punctuation(text)
    print(f"{i}. {clean}")

1. Yo mamma so fat she stuck her head out of the window on a mule!" said the young lady. "what?
2. Yo mamma so fat she lost her weight! her husband is like, "damn! she lost it!
3. Yo mamma so fat she walked into a limo, put on a pair of white panties, and got in the way of a car.
4. Yo mamma so dumb she walked over to the kitchen and said, "i can't do that" and got out her cell phone.
5. Yo mamma so stupid, that she threw a birthday party for her 10th birthday party, and all the guests were invited to join her.
