In [1]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import json
import time


import google.generativeai as genai
import typing_extensions as typing
from google.api_core import retry

In [2]:
# Attach Google Drive to the Colab runtime so files under the mount point
# can be read and written like local files.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [3]:
# Instruction text sent to the model. The "{joke}" placeholder is filled in by
# create_prompt; the text contains no other braces, so str.format is safe here.
_JOKE_PROMPT_TEMPLATE = """
Here is a raw joke:
"{joke}"

Based on this joke, generate a JSON object with the following three fields:

1. "instruction": Write a natural, human-like prompt asking for this specific joke.
   - Do not just say "Tell me a joke."
   - Include the specific topic, style, or format of the joke (e.g., "Tell me a dad joke about chemistry," "I need a witty one-liner about marriage," "Give me a pun about fruit").
   - Vary the phrasing (e.g., "Make me laugh with...", "Do you know any jokes about...", "Crack a joke regarding...").

2. "reasoning": A concise analysis (1-2 sentences) of why the joke works.
   - Identify the humor mechanism (e.g., wordplay, pun, double entendre, subversion of expectation, irony).
   - Explain the connection (e.g., "The humor relies on the double meaning of the word 'date' as both a fruit and a social meeting.").

3. "improved_joke": The joke itself, but polished.
   - Fix any typos, capitalization, or punctuation errors from the raw input.
   - Ensure the timing is correct (use newlines or punctuation to separate setup and punchline).

Output strictly valid JSON. Do not include markdown formatting (like ```json). Just the raw JSON string."""


def create_prompt(joke) -> str:
    """Return the model prompt for a single raw joke.

    The joke is inserted, wrapped in double quotes, into a fixed instruction
    text that asks for a JSON object with the fields "instruction",
    "reasoning" and "improved_joke".

    Args:
        joke: The raw joke; it is rendered with its default string formatting.

    Returns:
        The complete prompt text.
    """
    return _JOKE_PROMPT_TEMPLATE.format(joke=joke)

In [4]:
# Typed dictionary with the three string fields requested from the model;
# the field order mirrors the order used in the prompt.
JokeSchema = typing.TypedDict(
    "JokeSchema",
    {
        "instruction": str,
        "reasoning": str,
        "improved_joke": str,
    },
)

# --- GENERATION SETTINGS ---
# Passed as `generation_config` on every generate_content call.
# The keys are the snake_case names used by the google.generativeai SDK.
generation_config = {
    "temperature": 0.7,
    "top_p": 0.9,
    # Request a JSON body so the response text can be parsed with json.loads.
    "response_mime_type": "application/json",
    # NOTE(review): the recorded run skipped a number of jokes with finish
    # reason MAX_TOKENS; consider raising this limit if that keeps happening.
    "max_output_tokens": 2048,
    # The TypedDict class is passed directly as the expected response schema.
    "response_schema": JokeSchema,
}

# --- RETRY POLICY ---
# Exponential backoff applied to each API request.
my_retry_policy = retry.Retry(
    initial=1.0,     # first wait: 1 second
    multiplier=2.0,  # double the wait after every attempt (1s -> 2s -> 4s ...)
    maximum=15.0,    # never wait longer than 15 seconds between attempts
    timeout=40.0,    # stop retrying once 40 seconds have elapsed in total
)

# --- CONFIGURATION ---
# The API key is read from the environment so that it is never stored in the
# notebook. A key that has been committed or shared must be treated as leaked
# and rotated.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Define it in the environment (for example "
        "from a secrets store) before running this cell."
    )
genai.configure(api_key=API_KEY)

model = genai.GenerativeModel('gemini-3-flash-preview')


In [5]:
def append_entry(
    output_path: str,
    original_id: int,
    response: str
) -> None:
    """Append one model response to a JSON Lines file, tagged with its joke ID.

    Args:
        output_path: File that receives the record; opened in append mode, so
            existing content is kept and the file is created if missing.
        original_id: Value stored under the "ID" key of the record (replacing
            any "ID" already present in the response).
        response: JSON text of the record, parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If ``response`` is not valid JSON.
    """
    record = json.loads(response)
    record["ID"] = original_id

    # One record per line; the trailing newline terminates the entry.
    with open(output_path, mode="a", encoding="utf-8") as sink:
        sink.write(f"{json.dumps(record)}\n")


In [6]:
# Both files live in the same folder on the mounted Drive: the CSV is read by
# generate_data and the JSONL file receives the generated entries.
DATA_DIR = "/content/drive/MyDrive/LLM project/DATA"
input_path = f"{DATA_DIR}/sampled_shortjokes.csv"
output_path = f"{DATA_DIR}/synthetic_data.jsonl"

In [7]:
def _load_existing_ids(output_path: str) -> set:
    """Return the set of IDs already stored in the JSONL output file.

    The file is parsed line by line so that one malformed or truncated line
    (for example from an interrupted run) does not hide every finished entry,
    which would cause those jokes to be generated and appended a second time.
    """
    existing_ids = set()  # set membership keeps the per-row "already done?" check cheap
    if not os.path.exists(output_path):
        return existing_ids

    unreadable_lines = 0
    try:
        with open(output_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    existing_ids.add(int(json.loads(line)["ID"]))
                except (ValueError, KeyError, TypeError):
                    # Invalid JSON, no "ID" key, or an ID that is not a number.
                    unreadable_lines += 1
    except (OSError, UnicodeDecodeError) as e:
        print(f"Found existing file but could not read it fully ({e}). Check format.")

    print(f"Resuming: Found {len(existing_ids)} existing entries.")
    if unreadable_lines:
        print(f"Ignored {unreadable_lines} unreadable line(s) in the existing output file.")
    return existing_ids


def generate_data(input_path: str, output_path: str):
    """Generate one synthetic entry per joke and append it to a JSONL file.

    Reads the jokes from the CSV at ``input_path`` (columns ``ID`` and
    ``Joke``), sends the prompt built by ``create_prompt`` to the model and
    appends every successful answer to ``output_path`` through
    ``append_entry``. IDs already present in the output file are skipped, so
    an interrupted run can be resumed by calling the function again.

    Args:
        input_path: Path of the CSV file holding the source jokes.
        output_path: Path of the JSONL file that receives the generated entries.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if not os.path.exists(input_path):
        print("Could not locate the input file")
        return

    df = pd.read_csv(input_path)

    missing_columns = {"ID", "Joke"} - set(df.columns)
    if missing_columns:
        print(f"Input file is missing required column(s): {sorted(missing_columns)}")
        return

    existing_ids = _load_existing_ids(output_path)
    written = skipped = failed = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # A row without an ID cannot be tracked and one without text would be
        # sent to the model as the literal string "nan".
        if pd.isna(row["ID"]) or pd.isna(row["Joke"]):
            skipped += 1
            continue

        joke_id = int(row["ID"])
        if joke_id in existing_ids:
            continue

        try:
            response = model.generate_content(
                create_prompt(row["Joke"]),
                generation_config=generation_config,
                request_options={'retry': my_retry_policy}
            )

            # Guard the index below: an empty candidate list would otherwise
            # only surface as an IndexError in the generic handler.
            if not response.candidates:
                print(f"⚠️ ID {joke_id}: Skipped. No candidates returned.")
                skipped += 1
                continue

            finish_reason = response.candidates[0].finish_reason.name

            # 1. Safety blocks get their own message.
            if finish_reason == "SAFETY":
                print(f"🚫 ID {joke_id}: Blocked by Safety Filters.")
                skipped += 1
                continue

            # 2. Any other non-success reason (e.g. recitation or max tokens).
            if finish_reason != "STOP":
                print(f"⚠️ ID {joke_id}: Skipped. Finish Reason: {finish_reason}")
                skipped += 1
                continue

            # 3. Nothing to store if the text is empty.
            if not response.text:
                print(f"⚠️ ID {joke_id}: Response was empty.")
                skipped += 1
                continue

            append_entry(
                output_path=output_path,
                original_id=joke_id,
                response=response.text
            )
            # Remember the ID so a duplicate row in the input is not generated twice.
            existing_ids.add(joke_id)
            written += 1

        except Exception as e:
            # Best effort: report the failure and move on to the next joke.
            print(f"\nError at ID {joke_id}: {e}")
            failed += 1
            # If it's a serious API error, you might want to wait longer
            time.sleep(1)
            continue

    print("\nGeneration complete!")
    print(f"Written: {written} | Skipped: {skipped} | Errors: {failed}")

In [8]:
# Run (or resume) the generation over the configured input and output files.
generate_data(input_path=input_path, output_path=output_path)

  0%|          | 0/3500 [00:00<?, ?it/s]

‚ö†Ô∏è ID 2181: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 3548: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 6350: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 10636: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 12105: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 12457: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 19951: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 20144: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 23319: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 27262: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 27612: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 27794: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 28090: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 28756: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 28759: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 29004: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 31254: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 32078: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 35668: Skipped. Finish Reason: MAX_TOKENS
‚ö†Ô∏è ID 36113