# In [4]:
# ## 1. Install Required Packages
# !pip install transformers torch pandas tqdm

# ## 2. Imports
import random
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from transformers import AutoConfig


# ## 3. Define the Slang Words
# The vocabulary is written as one whitespace-separated string and split into a
# list; the reading order below (left to right, top to bottom) is the list order.
slang_words = """
    bad bang beat bet blow bomb booked bounce bread broke
    burn buzz calm cap catch check chef chill clap clean
    clutch cold come cook cool crack cringe cut dank dark
    dead deadass dope drag draw drip drop dust extra fam
    fire fit flex gas ghost glow grind grub hard hater
    head hit hot jam kick kill light link lit live
    loaded long loop loud lowkey mad man mood move off
    peak pop press pressed pull quiet ride ripped roll run
    safe salty savage secure serve shade shook sick slaps
    slay slide smoke snap snack soft spill squad stack
    stale stan stick sus swag tea thick thin thirsty tight
    ting tool touch trash trip turnt vibe wave wet whip
    woke work bag bars base brick cake cheese dash dip
    fade game heat ice juice plug poppin rack sauce score
    shine trap
""".split()

# ## 4. Choose a Hugging Face Text-Generation Model (DeepSeek-R1-0528)
model_name = "deepseek-ai/DeepSeek-R1-0528"

# Fetch the configuration on its own first so it can be edited before the
# weights are loaded.
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

# Strip the quantization settings from the configuration when present.
if hasattr(config, "quantization_config"):
    del config.quantization_config

# Tokenizer: explicitly request the non-fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Weights: loaded with the edited config as float32; trust_remote_code=True
# allows the custom modelling code shipped with the checkpoint to run.
# NOTE(review): float32 weights for a checkpoint of this size presumably need
# a very large amount of memory — confirm the target hardware can hold it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
model.eval()

# Prefer CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ## 5. Prompt Templates
# Both templates take a single `{word}` placeholder. Each is an instruction
# paragraph and a trailing "Sentence: " cue, joined by one blank line.
slang_template = "\n\n".join((
    "Write a single sentence that uses the word “{word}” in a slang/idiomatic sense. "
    "Make it ambiguous so that someone might not be sure if it’s literal or slang.",
    "Sentence: ",
))
literal_template = "\n\n".join((
    "Write a single sentence that uses the word “{word}” in a purely literal sense "
    "(no slang meaning). Make it ambiguous enough that someone might mistake it for slang.",
    "Sentence: ",
))

# ## 6. Function to Generate One Sentence per Prompt
def _first_sentence(text: str) -> str:
    """Return *text* cut right after its earliest ".", "?" or "!".

    The text before the terminator is stripped of surrounding whitespace.
    If no terminator occurs, *text* is returned unchanged.
    """
    positions = [pos for pos in (text.find(mark) for mark in ".?!") if pos != -1]
    if not positions:
        return text
    # Cut at whichever terminator comes first in the text, not at whichever
    # character happens to be checked first.
    end = min(positions)
    return text[:end].strip() + text[end]


def generate_sentence(
    prompt: str,
    max_length: int = 50,
    *,
    do_sample: bool = False,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> str:
    """Generate a continuation of *prompt* and return its first sentence.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text fed to the model; it is not included in the result.
        max_length: Maximum number of newly generated tokens.
        do_sample: ``False`` (default) decodes greedily, so the same prompt
            always yields the same text. ``True`` enables sampling.
        temperature: Sampling temperature; only forwarded when *do_sample*
            is ``True``.
        top_p: Nucleus-sampling threshold; only forwarded when *do_sample*
            is ``True``.

    Returns:
        The decoded continuation, truncated after the earliest ".", "?" or
        "!" when one is present; otherwise the whole stripped continuation.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    gen_kwargs = {
        # Budget for new tokens only, independent of the prompt length.
        "max_new_tokens": max_length,
        "do_sample": do_sample,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling knobs are ignored (and warned about) under greedy decoding,
        # so they are passed only when sampling is actually requested.
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    # Drop the prompt tokens to keep only the newly generated ones.
    gen_tokens = outputs[0][prompt_len:]
    text = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()
    return _first_sentence(text)

# ## 7. Build Exactly 2 500 Slang and 2 500 Literal Examples
NUM_TOTAL = 5000
NUM_PER_CLASS = NUM_TOTAL // 2  # 2 500 slang and 2 500 literal
sentences = []
labels = []

# Randomise the word order once (in place); the loop below then cycles through
# the shuffled list until both classes are full.
# NOTE(review): NUM_PER_CLASS is larger than the word list, so every word's
# prompt is issued several times; with deterministic decoding this presumably
# produces repeated sentences — confirm whether duplicates are acceptable.
random.shuffle(slang_words)
slang_count = 0
literal_count = 0
i = 0

# The context manager closes the progress bar even when generation raises or
# the run is interrupted, which a trailing pbar.close() would not.
with tqdm(total=NUM_TOTAL, desc="Generating sentences") as pbar:
    while slang_count < NUM_PER_CLASS or literal_count < NUM_PER_CLASS:
        word = slang_words[i % len(slang_words)]
        i += 1

        # Generate one slang-sense example if we still need slang
        if slang_count < NUM_PER_CLASS:
            prompt = slang_template.format(word=word)
            sent = generate_sentence(prompt)
            sentences.append(sent)
            labels.append(1)  # 1 = slang
            slang_count += 1
            pbar.update(1)

        # Generate one literal-sense example if we still need literal
        if literal_count < NUM_PER_CLASS:
            prompt = literal_template.format(word=word)
            sent = generate_sentence(prompt)
            sentences.append(sent)
            labels.append(0)  # 0 = literal
            literal_count += 1
            pbar.update(1)

# ## 8. Shuffle the Combined List and Create a DataFrame
# Pair every sentence with its label so that shuffling keeps them aligned.
combined = list(zip(sentences, labels))
random.shuffle(combined)
sentences_shuffled, labels_shuffled = zip(*combined)

# Two columns: the generated text and its 1 (slang) / 0 (literal) label.
df_out = pd.DataFrame(dict(sentence=sentences_shuffled, binary=labels_shuffled))

# ## 9. Quick Sanity Check (peek at a few rows)
print("Total rows:", df_out.shape[0])
display(df_out.iloc[:10])

# ## 10. Save to CSV
output_path = "slang_literal_benchmark_5000.csv"
df_out.to_csv(output_path, index=False)  # no index column in the file
print(f"Saved dataset to {output_path}")


# Downloading shards:   0%|          | 0/163 [00:00<?, ?it/s]

# model-00001-of-000163.safetensors:   0%|          | 21.0M/5.26G [00:00<?, ?B/s]

# To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


# model-00002-of-000163.safetensors:   0%|          | 0.00/4.30G [00:00<?, ?B/s]

# KeyboardInterrupt: