In [1]:
# -------------------------------------------
# 0.  Install / import dependencies
# -------------------------------------------
# %pip install --quiet "openai>=1.3" pandas tqdm tenacity   # quote the spec: an unquoted >= is a shell redirect

import os, re, random, time, json
import pandas as pd
from tqdm.auto import tqdm
from tenacity import retry, stop_after_attempt, wait_random_exponential
import openai

  from pandas.core import (


In [2]:
# -------------------------------------------
# 1.  Configure DeepSeek API
# -------------------------------------------
import openai
from keys_do_not_upload import open_AI_key
# Authenticate the module-level OpenAI client with the key imported from the
# local `keys_do_not_upload` module (presumably kept out of version control,
# judging by its name — confirm).
openai.api_key = open_AI_key
# Send requests to DeepSeek's endpoint rather than the client's default host.
openai.base_url = "https://api.deepseek.com"

MODEL_NAME = "deepseek-chat"   # model identifier passed to every chat-completion request
TEMPERATURE = 0.9  # sampling temperature passed to the chat-completion call
N_SAMPLES = 5000  # number of unique sentences the generation loop collects before it stops


In [3]:
# -------------------------------------------
# 2.  Word list & helpers
# -------------------------------------------
# Vocabulary the generation loop samples target words from.
slang_words = [
    "bad", "bang", "beat", "bet", "blow", "bomb", "booked", "bounce", "bread", "broke",
    "burn", "buzz", "calm", "cap", "catch", "check", "chef", "chill", "clap", "clean",
    "clutch", "cold", "come", "cook", "cool", "crack", "cringe", "cut", "dank", "dark",
    "dead", "deadass", "dope", "drag", "draw", "drip", "drop", "dust", "extra", "fam",
    "fire", "fit", "flex", "gas", "ghost", "glow", "grind", "grub", "hard", "hater",
    "head", "hit", "hot", "jam", "kick", "kill", "light", "link", "lit", "live",
    "loaded", "long", "loop", "loud", "lowkey", "mad", "man", "mood", "move", "off",
    "peak", "pop", "press", "pressed", "pull", "quiet", "ride", "ripped", "roll", "run",
    "safe", "salty", "savage", "secure", "serve", "shade", "shook", "sick", "slaps",
    "slay", "slide", "smoke", "snap", "snack", "soft", "spill", "squad", "stack",
    "stale", "stan", "stick", "sus", "swag", "tea", "thick", "thin", "thirsty", "tight",
    "ting", "tool", "touch", "trash", "trip", "turnt", "vibe", "wave", "wet", "whip",
    "woke", "work", "bag", "bars", "base", "brick", "cake", "cheese", "dash", "dip",
    "fade", "game", "heat", "ice", "juice", "plug", "poppin", "rack", "sauce", "score",
    "shine", "trap"
]

# Lookup table: word -> pre-compiled, case-insensitive, whole-word regex.
# Compiled once here so the per-sentence sanity check does not rebuild patterns.
word_pattern = dict(
    (term, re.compile(rf"\b{re.escape(term)}\b", flags=re.IGNORECASE))
    for term in slang_words
)


@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(multiplier=1, max=20))
def deepseek_sentence(word: str, slang: bool) -> str:
    """Ask the DeepSeek chat model for one sentence that uses ``word``.

    Args:
        word: Target word; it must appear in the result as a whole word
            (case-insensitive).
        slang: If True, request the slang sense of the word; otherwise request
            its standard, literal meaning.

    Returns:
        The generated sentence with surrounding whitespace stripped.

    Raises:
        ValueError: On a single attempt, when the completion is empty, was cut
            off by the token limit, does not contain ``word``, or has fewer
            than three whitespace-separated tokens.

    Note:
        The ``@retry`` decorator re-runs the call (up to 6 attempts, random
        exponential backoff capped at 20 s) on any exception. Because
        ``reraise`` is not enabled, callers see ``tenacity.RetryError`` once
        the attempts are exhausted.
    """
    sense = "modern slang sense" if slang else "its standard, literal meaning (not slang)"
    prompt = (
        f"Write one original, fluent English sentence that *contains* the word “{word}” "
        f"used in {sense}. Output **only** the sentence, no explanations or quotes."
    )

    response = openai.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are a creative writer."},
            {"role": "user", "content": prompt}
        ],
        temperature=TEMPERATURE,
        max_tokens=50,
        top_p=0.95
    )
    choice = response.choices[0]

    # A completion that hit `max_tokens` is a sentence fragment; reject it so a
    # truncated sample never reaches the dataset.
    if choice.finish_reason == "length":
        raise ValueError("Sentence truncated by token limit, retrying.")

    # `content` can be None; normalise to "" so the checks below raise a clear
    # ValueError instead of an AttributeError on `.strip()`.
    sentence = (choice.message.content or "").strip()
    if not sentence:
        raise ValueError("Empty completion, retrying.")

    # Use the pre-compiled pattern when available; compile on the fly for words
    # outside `slang_words` so the function works for any target word.
    pattern = word_pattern.get(word)
    if pattern is None:
        pattern = re.compile(rf"\b{re.escape(word)}\b", re.IGNORECASE)

    # Basic sanity checks
    if not pattern.search(sentence):
        raise ValueError("Target word not found, retrying.")
    if len(sentence.split()) < 3:
        raise ValueError("Sentence too short, retrying.")
    return sentence

In [None]:
# -------------------------------------------
# 3.  Main generation loop
# -------------------------------------------
samples = []            # accepted records: {"sentence": str, "is_slang": 0 or 1}
seen_sentences = set()  # lower-cased sentences already accepted, for de-duplication

# Each failure below has already been retried inside `deepseek_sentence`, so a
# long unbroken run of failures means a persistent problem (bad key, wrong
# endpoint, network down). Abort instead of spinning forever at 0%.
MAX_CONSECUTIVE_FAILURES = 10
consecutive_failures = 0

# The context manager closes the bar even if the loop is interrupted or aborts.
with tqdm(total=N_SAMPLES, desc="Generating dataset") as pbar:
    while len(samples) < N_SAMPLES:
        w = random.choice(slang_words)
        is_slang = random.random() < 0.5           # roughly balanced

        # Only the API call is guarded, so bugs in the bookkeeping below are not masked.
        try:
            s = deepseek_sentence(w, is_slang)
        except Exception as e:
            consecutive_failures += 1
            # Report the skip rather than swallowing it silently.
            tqdm.write(f"[skipped] word={w!r} slang={is_slang}: {type(e).__name__}: {e}")
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                raise RuntimeError(
                    f"Aborting: {consecutive_failures} consecutive generation failures "
                    f"({len(samples)} of {N_SAMPLES} samples collected)."
                ) from e
            continue

        consecutive_failures = 0

        key = s.lower()
        if key in seen_sentences:
            continue                                # exact duplicate (case-insensitive)
        seen_sentences.add(key)
        samples.append({"sentence": s, "is_slang": int(is_slang)})
        pbar.update(1)

Generating dataset:   0%|          | 0/5000 [00:00<?, ?it/s]

In [1]:
# -------------------------------------------
# 4.  Save to CSV (and inspect)
# -------------------------------------------
# Materialise the collected records as a table, write it to disk without the
# positional index column, then show the first rows as the cell output.
output_csv = "slang_benchmark.csv"
df = pd.DataFrame(data=samples)
df.to_csv(output_csv, index=False)
df.head()


NameError: name 'pd' is not defined