In [1]:
# ==========================================
# 0. INSTALL DEPENDENCIES (Run this once)
# ==========================================
# Shell-escape (`!`) installs. The order matters: the first command lets pip
# resolve Unsloth's dependencies for the "colab-new" extra, the next three use
# --no-deps so pip installs ONLY the named packages and does not pull in or
# replace any of their dependencies.
# NOTE(review): presumably --no-deps is there to avoid replacing the torch /
# CUDA builds that the Colab runtime preinstalls — confirm before changing.
# NOTE(review): none of these are version-pinned, so a fresh runtime may pick
# up different releases than the ones shown in the saved outputs below.
!pip install "unsloth[colab-new]"
!pip install --no-deps "unsloth_zoo"
!pip install --no-deps packaging ninja einops
!pip install --no-deps xformers trl peft accelerate bitsandbytes
# Retrieval stack and progress bars; installed with normal dependency resolution.
!pip install datasets sentence-transformers faiss-cpu tqdm

import os
import json
import pandas as pd
from unsloth import FastLanguageModel
from google.colab import drive
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np

# Pick the progress-bar flavour: the widget-based notebook bar when running
# inside an IPython kernel, the plain console bar otherwise.
try:
    from IPython import get_ipython
    if get_ipython():
        from tqdm.notebook import tqdm
    else:
        from tqdm import tqdm
except Exception:
    # Fall back to the console bar if IPython or the notebook bar cannot be
    # imported. `Exception` (rather than a bare `except`) keeps
    # KeyboardInterrupt / SystemExit propagating normally.
    from tqdm import tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# ==========================================
# 1. SETUP & CONFIGURATION
# ==========================================
# Mount Drive.
# Check for a real mount point rather than mere existence of the directory:
# a plain local "/content/drive" folder would otherwise skip the mount and
# every "Drive" path below would silently point at the ephemeral VM disk.
DRIVE_MOUNT_POINT = '/content/drive'
if not os.path.ismount(DRIVE_MOUNT_POINT):
    drive.mount(DRIVE_MOUNT_POINT)

# Hugging Face Cache to Drive (Persistent)
DRIVE_CACHE = "/content/drive/MyDrive/LLM project/Cache/HF"
os.makedirs(DRIVE_CACHE, exist_ok=True)

# NOTE(review): the Hugging Face libraries were already imported in the first
# cell. huggingface_hub / datasets appear to read their cache locations when
# they are imported, so these variables may not take effect in this kernel
# unless they are set before those imports — confirm and, if so, move this
# section above the imports.
os.environ["HF_HOME"] = f"{DRIVE_CACHE}/hf_home"
os.environ["HF_HUB_CACHE"] = f"{DRIVE_CACHE}/hf_hub"
os.environ["HF_DATASETS_CACHE"] = f"{DRIVE_CACHE}/datasets"
os.environ["TRANSFORMERS_CACHE"] = f"{DRIVE_CACHE}/transformers"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = f"{DRIVE_CACHE}/sentence_transformers"

from huggingface_hub import login
# Replace 'YOUR_HF_TOKEN' with your actual token or use Colab secrets
# login(token="YOUR_HF_TOKEN")

# Paths
hf_model_id = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
# Smart Load: Use local if available to avoid stuck downloads
drive_model_path = "/content/drive/MyDrive/LLM project/Models/Qwen2.5-7B-Instruct-bnb-4bit"


def _is_complete_model_dir(path):
    """Return True if `path` looks like a fully saved model directory.

    The model-loading section saves the model first and the tokenizer last,
    so the presence of both the model config and the tokenizer config is used
    as the marker that a previous save ran to completion. A directory that
    exists but lacks either file (e.g. an interrupted save) is rejected so the
    notebook falls back to the Hub instead of failing on a broken checkpoint.
    """
    required = ("config.json", "tokenizer_config.json")
    return all(os.path.isfile(os.path.join(path, name)) for name in required)


if _is_complete_model_dir(drive_model_path):
    print(f"Found local Qwen model at {drive_model_path}. Using it to load faster!")
    model_id = drive_model_path
else:
    print(f"Local model not found. Downloading {hf_model_id} from Hugging Face...")
    model_id = hf_model_id

input_path = "/content/drive/MyDrive/LLM project/DATA/task-a-en.tsv"
output_file = "/content/drive/MyDrive/LLM project/DATA/outputs_qwen_rag.jsonl"

# Config for Retrieval
N_WIKI_DOCS = 25000        # number of pre-embedded Wikipedia rows to load
RETRIEVAL_K = 4            # documents retrieved per query
MAX_CONTEXT_CHARS = 1200   # character budget for the context placed in the prompt


Found local Qwen model at /content/drive/MyDrive/LLM project/Models/Qwen2.5-7B-Instruct-bnb-4bit. Using it to load faster!


In [3]:
# ==========================================
# 2. LOAD RAG SYSTEM (Retriever)
# ==========================================
class HFRetriever:
    """Dense retriever over a pre-embedded Wikipedia subset.

    Document vectors come from the dataset's "embeddings" column and are
    L2-normalised once at construction; queries are embedded on CPU and
    scored against them with a dot product (cosine similarity).
    """

    def __init__(self, n_docs=25000):
        """Load the first `n_docs` rows of the embedded dataset and the query encoder.

        Args:
            n_docs: number of rows taken from the start of the "train" split.
        """
        print(f"Loading embedded Wikipedia subset: {n_docs} docs ...")
        # Load pre-embedded dataset (fast)
        self.ds = load_dataset(
            "not-lain/wikipedia",
            revision="embedded",
            split=f"train[:{n_docs}]"
        )
        self.texts = [str(x) for x in self.ds["text"]]
        # Load embeddings into numpy (fast cosine sim)
        self.embs = np.array(self.ds["embeddings"], dtype=np.float32)

        # Normalize doc vectors once; the epsilon guards against all-zero rows.
        self.embs = self.embs / (np.linalg.norm(self.embs, axis=1, keepdims=True) + 1e-12)

        print("Loading query embedding model on CPU (Safe Mode)...")
        # NOTE(review): assumes the dataset's stored embeddings were produced by
        # this same model, otherwise the similarities are meaningless — confirm.
        self.encoder = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", device="cpu")
        print("RAG Index Ready!")

    @staticmethod
    def _fit_to_budget(docs, max_chars, sep="\n\n"):
        """Join `docs` with `sep` into at most `max_chars` characters.

        If the plain join already fits it is returned unchanged. Otherwise the
        budget is shared across the documents in rank order (space a short
        document leaves unused rolls over to the following ones), so every
        retrieved document contributes instead of the first one consuming the
        whole budget.
        """
        joined = sep.join(docs)
        if len(joined) <= max_chars:
            return joined

        remaining = max(max_chars - len(sep) * (len(docs) - 1), 0)
        snippets = []
        for pos, doc in enumerate(docs):
            share = remaining // (len(docs) - pos)
            snippet = doc[:share]
            remaining -= len(snippet)
            snippets.append(snippet)
        # The final slice is a hard guarantee on the length bound.
        return sep.join(s for s in snippets if s)[:max_chars]

    def retrieve(self, query, k=4, max_chars=1200):
        """Return the text of the `k` most similar documents as one string.

        Args:
            query: query string to embed.
            k: number of top-scoring documents to include.
            max_chars: maximum length of the returned string.

        Returns:
            The selected documents separated by blank lines, at most
            `max_chars` characters long; "" if anything fails (the error is
            printed, not raised, so a retrieval problem never aborts the caller).
        """
        try:
            # Encode query
            q = self.encoder.encode([query]).astype(np.float32)
            q = q / (np.linalg.norm(q, axis=1, keepdims=True) + 1e-12)

            # Fast Cosine Similarity (both sides are unit-normalised)
            sims = self.embs @ q[0]
            top_idx = np.argsort(-sims)[:k]

            # Combine results, sharing the character budget between documents
            docs = [self.texts[i] for i in top_idx]
            return self._fit_to_budget(docs, max_chars)
        except Exception as e:
            print(f"Retrieval failed: {e}")
            return ""

# Build the shared retriever once, sized by the N_WIKI_DOCS setting.
retriever = HFRetriever(N_WIKI_DOCS)

Loading embedded Wikipedia subset: 25000 docs ...
Loading query embedding model on CPU (Safe Mode)...
RAG Index Ready!


In [4]:
# ==========================================
# 3. LOAD QWEN MODEL
# ==========================================
print(f"Loading Qwen from {model_id}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = 2048,
    load_in_4bit = True,
    dtype = None,          # None: leave the compute dtype choice to Unsloth
    device_map = "auto",
)
FastLanguageModel.for_inference(model)

# Save model if it was downloaded from HF and no complete copy is on Drive yet.
# The tokenizer is written last, so its config file doubles as the marker of a
# finished save: a directory left behind by an interrupted save is rewritten
# instead of being skipped forever.
_saved_marker = os.path.join(drive_model_path, "tokenizer_config.json")
if model_id == hf_model_id and not os.path.isfile(_saved_marker):
    print(f"Saving model to Drive for future use: {drive_model_path}")
    model.save_pretrained(drive_model_path)
    tokenizer.save_pretrained(drive_model_path)

# Fix for Unsloth padding if missing: fall back to EOS as the pad token.
# Prompts are generated one at a time without padding, so sharing the id with
# EOS is harmless as long as attention masks are not derived from pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


Loading Qwen from /content/drive/MyDrive/LLM project/Models/Qwen2.5-7B-Instruct-bnb-4bit...
==((====))==  Unsloth 2026.1.3: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# ==========================================
# 4. GENERATION LOOP
# ==========================================
def _load_processed_ids(path):
    """Return the set of ids already present in the JSONL file at `path`.

    Blank, truncated, or otherwise malformed lines (e.g. left by an
    interrupted run) are skipped so that the affected item is regenerated.
    """
    done = set()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                done.add(json.loads(line)['id'])
            except (json.JSONDecodeError, KeyError, TypeError):
                continue
    return done


def _cell_text(value):
    """Return a stripped string for a TSV cell, or "-" when the cell is missing."""
    return str(value).strip() if pd.notna(value) else "-"


def _headline_prompt(headline, context):
    """Build the user prompt for a headline input."""
    return f"""### Instruction
You are a witty, cynical stand-up comedian. Write ORIGINAL humor (do not reuse or paraphrase known jokes).
Use the Background Facts only if they help inspire the joke. Do not quote them.

Rules:
- Output EXACTLY ONE joke (1–2 sentences).
- The joke must be STANDALONE: include the premise so it makes sense without reading the headline.
- Be clever, cynical, or ironic; end with a twist if possible.
- Do NOT explain the joke.
- Do NOT summarize the headline. Make it a joke.
- Keep it punchy (max ~35 words).

Background Facts (optional):
{context}

### Examples (style only)
Headline: "Study finds 90% of office meetings could be emails."
Joke: "A new study found that 90% of office meetings could be emails, which implies the other 10% could have just been silence."

Headline: "Billionaire builds giant clock inside a mountain."
Joke: "A billionaire is building a giant clock inside a mountain, finally providing a way to tell time for the five people who actually survive the apocalypse."

### Task
Headline: "{headline}"

### Response
Joke:"""


def _word_pair_prompt(w1, w2, context):
    """Build the user prompt for a word-pair input."""
    return f"""### Instruction
You are a witty, cynical stand-up comedian. Write ORIGINAL humor (do not reuse or paraphrase known jokes).
Use the Background Facts only if they help inspire the joke. Do not quote them.

Rules:
- Output EXACTLY ONE joke (1–2 sentences).
- Must include BOTH words (case-insensitive is OK): "{w1}" and "{w2}".
- Be clever, cynical, or ironic; end with a twist if possible.
- No explanations, no analysis, no extra text.
- Keep it punchy (max ~35 words).

Background Facts (optional):
{context}

### Examples (style only)
Words: "unplug" + "fridge"
Joke: "My current relationship is like an unplugged fridge: cold, dark, and I’m scared to open it and see what’s rotting inside."

### Task
Words: "{w1}", "{w2}"

### Response
Joke:"""


def _generate_once(user_content, max_new_tokens):
    """Run one sampled generation and return only the newly generated text."""
    messages = [{"role": "user", "content": user_content}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt"
    ).to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            input_ids = input_ids,
            # Single unpadded sequence: every position is a real token. Deriving
            # the mask from pad_token_id would hide genuine prompt tokens
            # whenever the pad id equals EOS, which the chat template contains.
            attention_mask = torch.ones_like(input_ids),
            do_sample = True,
            max_new_tokens = max_new_tokens,
            temperature = 0.7,
            top_p = 0.9,
            repetition_penalty = 1.15,
            pad_token_id = tokenizer.eos_token_id
        )

    # Decode only the tokens after the prompt. This removes the need to strip
    # the prompt or role markers by string matching, which damaged jokes that
    # legitimately contain words such as "assistant" or "Response".
    new_tokens = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


def _extract_joke(generated_text):
    """Reduce raw model output to the joke text.

    Drops an echoed "Joke:" label, keeps the first paragraph only, and removes
    surrounding quotes.
    """
    text = generated_text.strip()
    if "Joke:" in text:
        text = text.split("Joke:")[-1]
    text = text.strip().split("\n\n")[0]  # First paragraph
    return text.strip().strip('"').strip("'")


# Read Input
df = pd.read_csv(input_path, sep='\t')
data = df.to_dict('records')

# DEBUG: Pick 3 headlines + 3 word-pairs (Uncomment to use)
#df_head = df[df['headline'].notna() & (df['headline'] != "-")].head(3)
#df_word = df[df['word1'].notna() & (df['word1'] != "-")].head(3)
#data = pd.concat([df_head, df_word]).to_dict('records')


# Resume Check
processed_ids = set()
if os.path.exists(output_file):
    processed_ids = _load_processed_ids(output_file)
    print(f"Resuming... Found {len(processed_ids)} jokes.")

print("Starting RAG Generation with Qwen...")

max_retries = 2

for i, row in enumerate(tqdm(data, desc="Generating Jokes")):
    current_id = row['id']
    if current_id in processed_ids:
        continue

    # Parse Input ("-" marks a missing value)
    headline_str = _cell_text(row.get('headline'))
    w1_str = _cell_text(row.get('word1'))
    w2_str = _cell_text(row.get('word2'))

    # --- DETERMINE TYPE & QUERY ---
    if headline_str not in ("-", "") and headline_str.lower() != "nan":
        input_type = "headline"
        input_content = headline_str

        # Exact query wrapper for headlines
        retrieval_query = "Background facts and context about: " + headline_str
        context = retriever.retrieve(retrieval_query, k=RETRIEVAL_K, max_chars=MAX_CONTEXT_CHARS)
        prompt_text = _headline_prompt(headline_str, context)
    else:
        # Words Case (placeholders keep the prompt well-formed if a word is missing)
        real_w1 = w1_str if w1_str != "-" else "something"
        real_w2 = w2_str if w2_str != "-" else "random"
        input_type = "word-pair"
        input_content = f"{real_w1}, {real_w2}"

        # Exact query wrapper for words
        retrieval_query = "Meaning, usage, and related concepts for: " + real_w1 + " and " + real_w2
        context = retriever.retrieve(retrieval_query, k=RETRIEVAL_K, max_chars=MAX_CONTEXT_CHARS)
        prompt_text = _word_pair_prompt(real_w1, real_w2, context)

    # --- GENERATE WITH RETRY (Word Inclusion / Chat Template) ---
    final_joke = ""

    for attempt in range(max_retries):
        user_content = prompt_text

        # Only add reminder on retry for word-pair
        if attempt > 0 and input_type == "word-pair":
            user_content += f"\n\nREMINDER: You MUST include the words '{real_w1}' and '{real_w2}' in the joke."

        # Increase max tokens for retries
        current_max_tokens = 64 if attempt == 0 else 80

        temp_joke = _extract_joke(_generate_once(user_content, current_max_tokens))

        # Headlines have no inclusion constraint: accept the first attempt.
        if input_type != "word-pair":
            final_joke = temp_joke
            break

        # Validation (Word Pair): case-insensitive substring check
        joke_lower = temp_joke.lower()
        if real_w1.lower() in joke_lower and real_w2.lower() in joke_lower:
            final_joke = temp_joke
            break

        # Keep the latest attempt so the last one is used if none validates.
        final_joke = temp_joke

    if not final_joke:
        final_joke = "Error: Generation failed."

    # Save (appended per item so an interrupted run can resume)
    result_entry = {
        "id": current_id,
        "type": input_type,
        "input_original": input_content,
        "retrieved_context": context,
        "generated_joke": final_joke
    }

    with open(output_file, "a", encoding='utf-8') as f:
        f.write(json.dumps(result_entry, ensure_ascii=False) + "\n")

    processed_ids.add(current_id)

    # Periodic Cache Clear
    if (i + 1) % 50 == 0:
        torch.cuda.empty_cache()

print("Finished RAG Generation!")

Starting RAG Generation with Qwen...


Generating Jokes:   0%|          | 0/1200 [00:00<?, ?it/s]

Finished RAG Generation!
