In [None]:
import os
from tqdm import tqdm
import pandas as pd
import csv
import json

In [None]:
# Mount Google Drive at /content/drive; the dataset, tokenizer and Hugging Face
# cache paths used by later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Anchor the dataset path to the Drive mount point instead of os.getcwd(), so the
# path stays correct even if the working directory is not /content when this runs.
data_path = os.path.join("/content/drive", "MyDrive", "LLM project", "DATA", "task-a-en.tsv")
print(data_path)

/content/drive/MyDrive/LLM project/DATA/task-a-en.tsv


In [None]:
# 1. Install necessary libraries
!pip install -q -U torch transformers bitsandbytes accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import huggingface_hub
from google.colab import userdata

HF_CACHE_DIR = "/content/drive/MyDrive/LLM project/hf_cache"

# 2. Login to Hugging Face
# Recommended: Add your token to Colab Secrets (key name: HF_TOKEN)
# OR enter it manually when prompted by the login() function below.
try:
    huggingface_hub.login(token=userdata.get('hf_EhcqthzoVNo***LiVCFWsGasuwobcoViZK'))
except:
    huggingface_hub.login()

# 3. Configure 4-bit Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# 4. Load Model and Tokenizer
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer_path = "/content/drive/MyDrive/LLM project/llama_tokenizer"

if os.path.exists(tokenizer_path) :
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
else :
  os.makedirs(tokenizer_path, exist_ok=True)
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.save_pretrained(tokenizer_path)


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir = HF_CACHE_DIR
)

print("Model loaded successfully.")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully.


In [None]:
def create_headline_prompt(headline_text):
    """Build the few-shot prompt asking for a single joke about a news headline.

    Args:
        headline_text: Headline inserted verbatim into the "Target Headline" line.

    Returns:
        The complete prompt string. It ends with the cue "Joke:" so that the
        model's continuation is the joke itself.
    """
    # Fixed part: persona, instruction and the three worked examples.
    intro_and_examples = """### Instruction
You are a witty, cynical stand-up comedian.
Your task is to write EXACTLY ONE punchy joke (1–2 sentences) based on the provided headline.

### Examples
Here is how to turn a headline into a standalone joke (weaving the context into the setup):

Headline: "Study finds 90% of office meetings could be emails."
Joke: "A new study found that 90% of office meetings could be emails, which implies the other 10% could have just been silence."

Headline: "Billionaire builds giant clock inside a mountain."
Joke: "Jeff Bezos is building a giant clock inside a mountain, finally providing a way to tell time for the five people who actually survive the apocalypse."

Headline: "Scientists discover new species of deep-sea jelly."
Joke: "Scientists have discovered a new species of jelly at the bottom of the ocean, mostly because they were tired of looking for the ones in their donuts."

### Task
"""

    # The only variable part of the prompt.
    target_line = f'Target Headline: "{headline_text}"'

    # Fixed part: output constraints followed by the response cue.
    rules_and_cue = """

### Constraints
1. The joke must be **STANDALONE**. Do not assume the audience has read the headline; include the premise in the joke itself.
2. Be clever, cynical, or ironic.
3. **NO** explanations or conversational filler (e.g., do not write "Here is the joke").
4. Output **ONLY** the joke.

### Response
Joke:"""

    return intro_and_examples + target_line + rules_and_cue

In [None]:
def create_words_prompt(word1, word2):
    """Build the few-shot prompt asking for a single joke linking two words.

    Args:
        word1: First concept; appears quoted in the task line and the final cue.
        word2: Second concept; appears quoted in the task line and the final cue.

    Returns:
        The complete prompt string. It ends with the cue "Joke:" so that the
        model's continuation is the joke itself.
    """
    # Both places that mention the words use the same quoted rendering.
    quoted_pair = f'"{word1}" and "{word2}"'

    persona = "You are a witty, cynical stand-up comedian.\n\n"
    task_line = f"Task: Write EXACTLY ONE punchy joke (1–2 sentences) that connects the following two concepts: {quoted_pair}."

    # Fixed part: the three worked examples and the mandatory rules.
    examples_and_rules = """

Here are examples of how to connect random words creatively:

Example 1 (Metaphor/Analogy):
Words: "unplug" + "fridge"
Joke: "My current relationship is exactly like an unplugged fridge: it's cold, dark, and I'm terrified to open it and see what's rotting inside."

Example 2 (Ironic Failure):
Words: "hammer" + "banana"
Joke: "I tried to fix my diet with the same tool I use to fix my furniture, but it turns out taking a hammer to a banana just makes a smoothie with too much crunch."

Example 3 (Cynical Observation):
Words: "measure" + "pizza"
Joke: "Trying to measure happiness with money is like trying to measure a pizza with a thermometer: you're using the wrong tool and you're just going to burn your hand."

MANDATORY Rules:
- You can use the words literally OR metaphorically.
- The logic must hold up (e.g., do not say a laptop cooks food).
- Do NOT explain the joke.
- Do NOT use filler like "Here is a joke."

"""

    closing_cue = f"Words to connect: {quoted_pair}\nJoke:"

    return persona + task_line + examples_and_rules + closing_cue

In [None]:
def append_entry(
    json_path: str,
    entry_id: str,
    entry_type: str,
    input_original: str,
    generated_joke: str
):
    """Add (or overwrite) one generated joke in the JSON results file.

    The file holds a single object of the form
    ``{"ids": {<entry_id>: {"type", "input_original", "generated_joke"}}}``.
    It is created if it does not exist yet.

    Args:
        json_path: Path of the JSON results file.
        entry_id: Identifier of the input row; always stored as a string key so
            that non-string ids (e.g. integers read by pandas) neither fail to
            serialise nor duplicate an existing string key.
        entry_type: Kind of input the joke was generated from.
        input_original: The original input text.
        generated_joke: The generated joke text.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
        OSError: If the file cannot be read or written.
    """
    # 1. Load existing file or create base structure
    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = {}

    # 2. Add the new entry; tolerate an existing file that has no "ids" key yet
    data.setdefault("ids", {})[str(entry_id)] = {
        "type": entry_type,
        "input_original": input_original,
        "generated_joke": generated_joke
    }

    # 3. Write to a sibling temp file, then swap it in. Rewriting json_path in
    #    place would leave a truncated file (losing every earlier result) if
    #    the process were interrupted mid-write.
    tmp_path = f"{json_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, json_path)


In [None]:
def generate_jokes(input_path, output_file="/content/drive/MyDrive/LLM project/DATA/generated_jokes.json"):
    """Generate one joke per input row and save each result as soon as it is produced.

    Reads a tab-separated file with the columns ``id``, ``headline``, ``word1``
    and ``word2``. Rows with a usable headline (not missing and not ``-``) use
    the headline prompt; all other rows use the two-word prompt. Every joke is
    written immediately through ``append_entry``, and ids already present in
    ``output_file`` are skipped, so an interrupted run can be resumed.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects and on
    ``create_headline_prompt``, ``create_words_prompt`` and ``append_entry``.

    Args:
        input_path: Path of the TSV file to process.
        output_file: Path of the JSON results file. If it exists but cannot be
            read, results go to a sibling ``*_backup.json`` file instead so the
            unreadable file is left untouched.
    """
    df = pd.read_csv(input_path, delimiter="\t")

    processed_ids = set()

    if os.path.isfile(output_file):
        print(f"Found existing file: {output_file}. Checking progress...")
        try:
            with open(output_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            # The keys of data["ids"] are exactly the ids that already have a joke.
            processed_ids = {str(done_id) for done_id in data["ids"]}
            print(f"Resuming! {len(processed_ids)} records already finished.")
        except Exception as e:
            print(f"Warning: Could not read existing file ({e}). Starting separate backup.")
            # Keep the backup next to the intended output (JSON, on the same
            # storage) rather than in the ephemeral working directory.
            output_file = os.path.splitext(output_file)[0] + "_backup.json"
    else:
        print("No existing file found. Starting from scratch.")

    # Stop tokens do not depend on the row, so build the list once.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    # Loop through every row in the dataframe
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):

        row_id = str(row["id"])
        if row_id in processed_ids:
            continue

        # A. LOGIC: Decide which prompt to use
        # A headline is usable when it is not missing and not just a dash. The
        # same decision drives both the prompt and the stored entry type.
        headline = row["headline"]
        is_headline = pd.notna(headline) and str(headline).strip() != '-'
        if is_headline:
            prompt = create_headline_prompt(headline)
        else:
            prompt = create_words_prompt(row['word1'], row['word2'])

        # B. TOKENIZATION: Convert prompt to token ids
        # return_dict=True also returns the tokenizer's own attention mask.
        # Deriving the mask from pad_token_id is wrong here: the pad token is
        # set to the EOS token, so real end-of-turn tokens inside the prompt
        # could be masked out.
        inputs = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True
        ).to(model.device)  # Move data to the model's device

        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        # C. GENERATION
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=128,       # Limit output length to save time
            eos_token_id=terminators,
            do_sample=True,          # Sampling instead of greedy decoding
            temperature=0.9,         # Controls randomness of sampling
            top_p=0.9,
            repetition_penalty=1.2,
        )

        # D. DECODING: Extract only the new response
        # We slice [input_ids.shape[-1]:] to remove the prompt from the answer
        response = outputs[0][input_ids.shape[-1]:]
        joke_text = tokenizer.decode(response, skip_special_tokens=True)

        # Collapse tabs/newlines so each joke is stored as a single line
        clean_joke = joke_text.replace('\t', ' ').replace('\n', ' ').strip()

        entry_type = "headline" if is_headline else "words"
        # Single quotes inside the f-string keep this valid before Python 3.12.
        input_original = headline if is_headline else f"{row['word1']}, {row['word2']}"

        append_entry(output_file, row_id, entry_type, input_original, clean_joke)

In [None]:
# Run generation over the task file; results go to generate_jokes' default output_file.
generate_jokes(input_path = data_path)

No existing file found. Starting from scratch.


100%|██████████| 1200/1200 [1:07:53<00:00,  3.39s/it]
