In [1]:
import os
import pandas as pd
import csv
import json
import re
from tqdm.notebook import tqdm

In [2]:
# Mount Google Drive so the paths under /content/drive used by the later
# cells (model cache, tokenizer, data files) are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# 1. Install necessary libraries
!pip install -q -U torch transformers bitsandbytes accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import huggingface_hub
from google.colab import userdata

# Directory on Drive used as the Hugging Face cache so model shards survive
# across Colab sessions.
HF_CACHE_DIR = "/content/drive/MyDrive/LLM project/hf_cache"

# 1. Authenticate against the Hugging Face Hub.
# SECURITY: the access token must never be written in the notebook itself.
# It is read from the Colab "Secrets" panel (key HF_TOKEN), falling back to
# the HF_TOKEN environment variable. Any token that was previously committed
# in this cell should be revoked on huggingface.co.
try:
    hf_token = userdata.get("HF_TOKEN")
except Exception as secret_err:
    # userdata raises when the secret is missing or notebook access to it
    # has not been granted; fall back to the environment in that case.
    print(f"Could not read HF_TOKEN from Colab secrets ({secret_err}); trying the environment.")
    hf_token = os.environ.get("HF_TOKEN")

if hf_token:
    try:
        huggingface_hub.login(token=hf_token)
    except Exception as login_err:
        # Keep the notebook running (the model may already be cached), but
        # show why the login failed instead of hiding it.
        print(f"Login failed. Check your token. ({login_err})")
else:
    print("No Hugging Face token found. Add an HF_TOKEN secret in Colab before downloading gated models.")


# 2. Configure 4-bit Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# 3. Load Model and Tokenizer
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer_path = "/content/drive/MyDrive/LLM project/llama_tokenizer"

if os.path.exists(tokenizer_path):
    print(f"Loading tokenizer from: {tokenizer_path}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
else:
    print("Tokenizer not found in Drive. Downloading...")
    os.makedirs(tokenizer_path, exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.save_pretrained(tokenizer_path)
    print("Tokenizer saved to Drive.")

# Make sure a pad token exists on both code paths (a tokenizer directory
# saved without one would otherwise leave pad_token unset).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


try:
    # Try loading purely from the Drive cache first; local_files_only=True
    # forbids any download.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        cache_dir=HF_CACHE_DIR,
        local_files_only=True
    )
    print("Success! Loaded from Google Drive cache (No internet used).")

except Exception as e:
    print(f"\nCould not load from Drive (Error: {e})")
    print("Downloading model from Hugging Face (This will happen once)...")

    # Fallback: download into the same cache directory so the next session
    # can load it offline.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        cache_dir=HF_CACHE_DIR
    )
    print("Download complete and saved to Drive.")

print("Model loaded successfully.")

Loading tokenizer from: /content/drive/MyDrive/LLM project/llama_tokenizer

Could not load from Drive (Error: meta-llama/Meta-Llama-3.1-8B-Instruct does not appear to have files named ('model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors'). Checkout 'https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/tree/main' for available files.)
Downloading model from Hugging Face (This will happen once)...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Download complete and saved to Drive.
Model loaded successfully.


In [4]:
# System prompt that frames the model as a neutral judge for every comparison.
SYSTEM_MESSAGE = (
    "You are an impartial humor judge. "
    "You evaluate AI-generated jokes based on quality and adherence to specific constraints."
)

In [5]:
def create_user_message(JOKE_A, JOKE_B, type, INPUT_CONTENT):
    """Build the user prompt asking the judge to compare two jokes.

    Args:
        JOKE_A: Text of the first candidate joke.
        JOKE_B: Text of the second candidate joke.
        type: Task kind. ``"headline"`` selects the headline-satire rule;
            any other value selects the "must contain the exact words" rule.
        INPUT_CONTENT: The headline or the required words given to the
            joke generators.

    Returns:
        The fully formatted prompt string, ending with the JSON output schema.
    """
    # Pick the wording of the rule the jokes are checked against.
    constraint_rule = (
        "The joke must satirize or directly relate to the Input Content headline."
        if type == "headline"
        else f"The joke MUST contain the exact words: {INPUT_CONTENT}"
    )

    # Doubled braces render as literal braces of the JSON example.
    prompt_template = """I will provide you with a context and two jokes (Joke A and Joke B).
Your goal is to choose the winner based on two criteria:
1. CONSTRAINT CHECK: Does the joke follow the specific rule provided below?
2. QUALITY: Which joke is funnier, more clever, or better written?

### CONTEXT
**Constraint Rule:** {constraint}
**Input Content:** {content}

### THE CANDIDATES
**Joke A:**
{first_joke}

**Joke B:**
{second_joke}

### EVALUATION STEPS
1. Analyze if Joke A satisfies the Constraint Rule.
2. Analyze if Joke B satisfies the Constraint Rule.
3. Compare the humor and wit of both jokes.
4. If one joke follows the rule and the other does not, the one that follows the rule MUST win.
5. If both follow the rule (or both fail), pick the funniest one.

### OUTPUT FORMAT
Provide your final decision in the following JSON format. Do not output any other text or markdown.

Keeping the reasoning concise (MAXIMUM 1 SENTENCE).

{{
  "reasoning": "Explain your logic here. Mention if constraints were met.",
  "winner": "A" or "B" or "Tie"
}}"""

    return prompt_template.format(
        constraint=constraint_rule,
        content=INPUT_CONTENT,
        first_joke=JOKE_A,
        second_joke=JOKE_B,
    )

In [6]:
# JSONL files holding the jokes generated by each model; both live in the
# same Drive folder. File 1 is Gemma's output, file 2 is Qwen's.
_JOKES_DIR = "/content/drive/MyDrive/LLM project/DATA"

input_path_1 = f"{_JOKES_DIR}/outputs_gemmaFinal.jsonl"
input_path_2 = f"{_JOKES_DIR}/outputs_qwenFinal.jsonl"

In [7]:
def append_entry(
    json_path: str,
    entry_id: str,
    input_original: str,
    joke_a: str,
    joke_b: str,
    winner: str,
    reason: str
):
    """Add (or overwrite) one judged comparison in the JSON results file.

    The file has the shape ``{"ids": {entry_id: {...}}}``. It is read in
    full, updated in memory and written back.

    Args:
        json_path: Path of the results file; created if it does not exist.
        entry_id: Key of the record under ``"ids"``.
        input_original: The input content the jokes were generated from.
        joke_a: Joke stored under ``"joke_gemma"``.
        joke_b: Joke stored under ``"joke_qwen"``.
        winner: Verdict label stored under ``"winner"``.
        reason: Judge's explanation stored under ``"reason"``.

    Raises:
        json.JSONDecodeError: If an existing file is not valid JSON. The file
            is left untouched so no previous results are destroyed.
    """
    # 1. Load existing file or create base structure
    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = {}

    # Tolerate an existing file that has no "ids" section yet.
    entries = data.setdefault("ids", {})

    # 2. Append new entry
    entries[entry_id] = {
        "content": input_original,
        "joke_gemma": joke_a,
        "joke_qwen": joke_b,
        "winner": winner,
        "reason": reason
    }

    # 3. Write to a sibling temp file, then swap it in. An interruption in the
    # middle of the write (e.g. a Colab disconnect) can then no longer
    # truncate the results file and lose every earlier verdict.
    tmp_path = f"{json_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, json_path)


In [8]:
def parse_model_response(response_text):
    """Extract the judge's verdict from the raw model output.

    Strategies are tried in order:
      1. Parse the whole text (after stripping Markdown code fences) as JSON.
      2. Parse the span from the first ``{`` to the last ``}`` as JSON, which
         ignores chatter before or after the object.
      3. Search the text for a ``"winner": "<A|B|Tie>"`` declaration.

    Args:
        response_text: Decoded text generated by the model.

    Returns:
        A dict that always contains string-valued ``"winner"`` and a
        ``"reasoning"`` key. On total failure ``"winner"`` is ``"Error"`` and
        the cleaned text is returned under ``"raw_output"``.
    """
    # 1. Clean Markdown code blocks
    text = re.sub(r"```json\s*", "", response_text, flags=re.IGNORECASE)
    text = re.sub(r"```", "", text).strip()

    # 2. Candidate JSON strings: the whole text first, then the outermost
    # brace-delimited span.
    candidates = [text]
    start_index = text.find("{")
    end_index = text.rfind("}")
    if start_index != -1 and end_index > start_index:
        candidates.append(text[start_index:end_index + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Valid JSON is not enough: callers index "winner" and "reasoning",
        # so only accept an object whose verdict is a string.
        if isinstance(parsed, dict) and isinstance(parsed.get("winner"), str):
            parsed.setdefault("reasoning", "")
            return parsed

    # 3. Fallback: look for an explicit winner declaration, tolerating
    # single/double quotes, optional whitespace and letter case.
    declared = re.search(
        r"""["']winner["']\s*:\s*["'](A|B|Tie)["']""",
        text,
        flags=re.IGNORECASE,
    )
    if declared:
        canonical = {"A": "A", "B": "B", "TIE": "Tie"}[declared.group(1).upper()]
        return {"winner": canonical, "reasoning": "Parsed via fallback"}

    # 4. Total Failure
    return {"winner": "Error", "reasoning": "JSON Parse Failed", "raw_output": text}

In [9]:
def judge(input_path_1, input_path_2, output_path="/content/drive/MyDrive/LLM project/DATA/result_competition_gemma_qwen.json", total_pairs=1200):
    """Judge paired jokes from two JSONL files and record each verdict.

    The two files are read in lockstep. For every pair of records with the
    same ``id``, ``type`` and ``input_original`` the model is asked to pick a
    winner between joke A (file 1, reported as "Gemma") and joke B (file 2,
    reported as "Qwen"). Each verdict is appended to ``output_path`` right
    away, so an interrupted run resumes where it stopped: ids already present
    in the output file are skipped and their verdicts seed the running totals.

    Relies on the notebook globals ``tokenizer``, ``model``,
    ``SYSTEM_MESSAGE``, ``create_user_message``, ``parse_model_response`` and
    ``append_entry``.

    Args:
        input_path_1: JSONL file with the first model's jokes.
        input_path_2: JSONL file with the second model's jokes.
        output_path: JSON results file to create or extend.
        total_pairs: Expected number of pairs; only sizes the progress bar.
    """
    total_gemma = 0
    total_qwen = 0
    total_tie = 0

    # A set gives O(1) "already judged?" checks inside the loop.
    existing_ids = set()

    if os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as output_f:
            output_obj = json.load(output_f)
        previous_entries = output_obj["ids"]
        # The length is bound to a local first: nesting the same quote type
        # inside an f-string is a syntax error before Python 3.12.
        previous_count = len(previous_entries)
        print(f"Found an existing output folder with {previous_count} entries.")
        existing_ids = set(previous_entries)
        for previous in previous_entries.values():
            if previous["winner"] == "Gemma":
                total_gemma += 1
            elif previous["winner"] == "Qwen":
                total_qwen += 1
            elif previous["winner"] == "Tie":
                # Only real ties count; stored "Error" verdicts are left out,
                # matching how the loop below tallies new results.
                total_tie += 1
    else:
        print("Did not find  any output file, starting from scratch...")

    # Stop generation on either end-of-sequence marker (loop-invariant).
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    with open(input_path_1, "r", encoding="utf-8") as f1, \
         open(input_path_2, "r", encoding="utf-8") as f2:

        # Print the Header
        print(f"| {'ID':<8} | {'Winner':<8} | {'CURRENT SCORE (Running Total)':<35} |")
        print("-" * 60)

        pbar = tqdm(zip(f1, f2), total=total_pairs, desc="Judging")

        for line1, line2 in pbar:
            obj1 = json.loads(line1)
            obj2 = json.loads(line2)

            if obj1["id"] in existing_ids or obj2["id"] in existing_ids:
                continue

            records_match = (
                obj1["id"] == obj2["id"]
                and obj1["type"] == obj2["type"]
                and obj1["input_original"] == obj2["input_original"]
            )
            if not records_match:
                # pbar.write keeps the notice from garbling the progress bar.
                pbar.write("two records did not match, passing to the next one")
                continue

            entry_id = obj1["id"]
            joke_a = obj1["generated_joke"]
            joke_b = obj2["generated_joke"]
            input_type = obj1["type"]
            input_content = obj1["input_original"]

            # 1. Structure the conversation for Llama 3.1
            messages = [
                {"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": create_user_message(joke_a, joke_b, input_type, input_content)}
            ]

            # 2. Apply the Chat Template. return_dict=True also returns the
            # tokenizer's own attention mask. Deriving the mask from
            # `input_ids != pad_token_id` is wrong here: the pad token is set
            # to the EOS token, so any real prompt token with that id (such
            # as a turn terminator inserted by the template) would be masked.
            encoded = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt",
                return_dict=True,
            ).to(model.device)
            input_ids = encoded["input_ids"]
            attention_mask = encoded["attention_mask"]

            # 3. Generate Response
            # Temperature 0.1 ensures consistency (we want a judge, not a creative writer)
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                pad_token_id=tokenizer.eos_token_id,
                max_new_tokens=256,
                eos_token_id=terminators,
                do_sample=True,
                temperature=0.1,
                top_p=0.9,
            )

            # Decode only the newly generated tokens, not the echoed prompt.
            response_text = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

            result = parse_model_response(response_text)
            if not isinstance(result, dict):
                # Guard against a parser returning a non-object JSON value.
                result = {"winner": "Error", "reasoning": "JSON Parse Failed"}

            # 4. Map the A/B/Tie verdict to a model name, case-insensitively
            # for all three labels.
            verdict = str(result.get("winner", "Error")).strip()
            verdict_key = verdict.upper()
            if verdict_key == "A":
                winner = "Gemma"
                total_gemma += 1
            elif verdict_key == "B":
                winner = "Qwen"
                total_qwen += 1
            elif verdict_key == "TIE":
                winner = "Tie"
                total_tie += 1
            else:
                # Unrecognized label (e.g. "Error"): record it, count nothing.
                winner = verdict

            # 5. Persist immediately so progress survives an interruption.
            append_entry(
                json_path=output_path,
                entry_id=entry_id,
                input_original=input_content,
                joke_a=joke_a,
                joke_b=joke_b,
                winner=winner,
                reason=result.get("reasoning", "")
            )

            # 6. Print the clean row
            pbar.write(f"| {entry_id:<8} | {winner:^8} | Gemma: {total_gemma:<4} Qwen: {total_qwen:<4} Tie: {total_tie:<4} |")

In [10]:
judge(input_path_1, input_path_2)

Found an existing output folder with 1197 entries.
| ID       | Winner   | CURRENT SCORE (Running Total)       |
------------------------------------------------------------


Judging:   0%|          | 0/1200 [00:00<?, ?it/s]

| en_1198  |  Gemma   | Gemma: 595  Qwen: 588  Tie: 15   |
| en_1199  |   Qwen   | Gemma: 595  Qwen: 589  Tie: 15   |
| en_1200  |  Gemma   | Gemma: 596  Qwen: 589  Tie: 15   |
