<h1> Generate jokes using BART </h1>

<h2> 1. Prerequisites </h2>

In [1]:
!pip install transformers datasets torch scikit-learn
!pip install --upgrade transformers

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

In [2]:
import pandas as pd
from transformers import BartTokenizerFast
import re

<h2> 2. Import dataset </h2>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
# Positive Examples (Keep only entries where the Rating column is not null)

normalized_jester_df = pd.read_csv('/content/drive/MyDrive/Proiect NLP/Datasets/Preprocessed-Datasets/Positive-Examples/jester/normalized_jester.csv')
normalized_reddit_jokes_df = pd.read_csv('/content/drive/MyDrive/Proiect NLP/Datasets/Preprocessed-Datasets/Positive-Examples/joke-dataset/normalized_reddit_jokes.csv')
normalized_stupidstuff_df = pd.read_csv('/content/drive/MyDrive/Proiect NLP/Datasets/Preprocessed-Datasets/Positive-Examples/joke-dataset/normalized_stupidstuff.csv')

normalized_jester_df = normalized_jester_df[normalized_jester_df['Rating'].notna()]
normalized_reddit_jokes_df = normalized_reddit_jokes_df[normalized_reddit_jokes_df['Rating'].notna()]
normalized_stupidstuff_df = normalized_stupidstuff_df[normalized_stupidstuff_df['Rating'].notna()]

# Negative Examples (All of them, put 0 in the Rating column)

news_category_dataset_df = pd.read_csv('/content/drive/MyDrive/Proiect NLP/Datasets/Preprocessed-Datasets/Negative-Examples/News_Category_Dataset_v3/News_Category_Dataset_v3.csv')
news_category_dataset_df['Rating'] = news_category_dataset_df['Rating'].fillna(0.0)

df = pd.concat([normalized_jester_df, normalized_reddit_jokes_df, normalized_stupidstuff_df, news_category_dataset_df], ignore_index=True)
df = df.dropna(subset=['Body'])
df = df[df['Body'].str.strip() != '']
print(df[df['Rating'] != 0.0])
print(df[df['Rating'] == 0.0])

def preprocessed_sample(sample):
  sample = str(sample)
  sample = re.sub(r'[^a-zA-Z0-9\s]', '', sample) # Only letters (lowercase and uppercase) + digits
  sample = sample.lower() # Everything lowercase
  sample = re.sub(r'\d+', 'NUMBER', sample) # Replace all numerical values with a common label.
  return sample

df['Body'] = df['Body'].apply(preprocessed_sample)

          ID Title       Category  \
0          5   NaN            NaN   
1          7   NaN            NaN   
2          8   NaN            NaN   
3         13   NaN            NaN   
4         15   NaN            NaN   
...      ...   ...            ...   
198457  3765   NaN  Miscellaneous   
198458  3766   NaN  Miscellaneous   
198462  3770   NaN  Miscellaneous   
198464  3772   NaN  Miscellaneous   
198465  3773   NaN  Miscellaneous   

                                                     Body    Rating  
0       Q.\tWhat's O. J. Simpson's Internet address? \...  0.153659  
1       How many feminists does it take to screw in a ...  0.145475  
2       Q. Did you hear about the dyslexic devil worsh...  0.321407  
3       They asked the Japanese visitor if they have e...  0.334060  
4       Q:  What did the blind person say when given s...  0.212328  
...                                                   ...       ...  
198457  Britain decided it was time to switch left lan...  0.8000

<h2> 3. Keep only positive examples </h2>

In [30]:
pos_df = df[df["Rating"] > 0].reset_index(drop=True)
pos_df = pos_df.head(30000)

<h2> 4. Build a dataset </h2>

In [31]:
from datasets import Dataset
train_ds = Dataset.from_pandas(pos_df[["Body"]])
val_ds   = train_ds.train_test_split(test_size=0.05, seed=42)["test"]

<h2> 5. Tokenizer & Model <h2>

In [32]:
from transformers import BartTokenizerFast, BartForConditionalGeneration
model_name = "facebook/bart-base"
tok = BartTokenizerFast.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

<h2> 6. Data Collator </h2>

In [33]:
def encode(batch):
    inputs = ["Tell me a joke:" for _ in batch["Body"]]
    enc_inputs = tok(inputs, truncation=True, padding="max_length", max_length=32)
    enc_labels = tok(batch["Body"], truncation=True, padding="max_length", max_length=64)

    batch["input_ids"] = enc_inputs["input_ids"]
    batch["attention_mask"] = enc_inputs["attention_mask"]
    batch["labels"] = enc_labels["input_ids"]
    return batch

train_ds = train_ds.map(encode, batched=True, remove_columns=["Body"])
val_ds   = val_ds.map(encode,   batched=True, remove_columns=["Body"])

from transformers import DataCollatorForSeq2Seq
collator = DataCollatorForSeq2Seq(tok, model=model)


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

<h2> 7. Train args & Trainer </h2>

In [34]:
from transformers import TrainingArguments, Trainer
import torch
args = TrainingArguments(
    output_dir          = "/content/bart‑jokes",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    gradient_accumulation_steps = 4,
    eval_strategy       = "steps",
    eval_steps          = 500,
    logging_steps       = 100,
    save_steps          = 1000,
    save_total_limit    = 2,
    learning_rate       = 5e-5,
    num_train_epochs    = 3,
    lr_scheduler_type   = "cosine",
    warmup_ratio        = 0.1,
    fp16                = torch.cuda.is_available(),
    report_to           = "none"
)

trainer = Trainer(
    model         = model,
    args          = args,
    train_dataset = train_ds,
    eval_dataset  = val_ds,
    data_collator = collator,
)

trainer.train()

Step,Training Loss,Validation Loss
500,1.4758,1.326297
1000,1.3235,1.238811
1500,1.3494,1.189488
2000,1.2406,1.158931
2500,1.2533,1.147676




TrainOutput(global_step=2811, training_loss=1.6778561474377047, metrics={'train_runtime': 714.9999, 'train_samples_per_second': 125.874, 'train_steps_per_second': 3.931, 'total_flos': 1713359447654400.0, 'train_loss': 1.6778561474377047, 'epoch': 2.997333333333333})

<h2> 8. Generate brand new joke </h2>

In [66]:
def make_joke(min_len = 30, max_len = 70):
    input_text = "Tell me a one-liner joke: "
    dummy = tok(input_text, return_tensors="pt").to(model.device)

    ids = model.generate(
        input_ids=dummy["input_ids"],
        attention_mask=dummy["attention_mask"],
        min_length=min_len,
        max_new_tokens=max_len,
        decoder_start_token_id=tok.eos_token_id,
        num_return_sequences=1,
        do_sample=True,
        top_k=30,
        top_p=0.9,
        temperature=0.7,
        repetition_penalty=1.3,
        no_repeat_ngram_size=3,
    )

    jokes = [
        tok.decode(i, skip_special_tokens=True)
        for i in ids
    ]

    return jokes

import random

def clean_joke(joke):
    joke = joke[0]
    return (
        joke.replace("NUMBER", str(random.randint(1, 100)))
            .replace("NAME", random.choice(["Bob", "Alice", "Charlie"]))
            .replace("PLACE", random.choice(["a bar", "the park", "the store"]))
    )

print("•", clean_joke(make_joke()))


• the bartender asks him what he wants to drink 

the man replies i want to have a drink  
the bartender looks at the man and says im sorry but you cant have a beer  
