<a href="https://colab.research.google.com/github/Seyed110-99/Applied-Machine-Learning/blob/main/notebooks/tiger_khash.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tiger Khash

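Description: uses Qwen2.5-3B-Instruct to find the word or phrase in an explanation sentence that best matches an intervention word (adjective/adverb), on top of boilerplate that works with both Google Colab and a local machine.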

---

## Boilerplate

In [1]:
import os
import sys

# Environment bootstrap: decide whether the notebook runs on Google Colab or on a
# local machine, then define ROOT_PATH (the code checkout), STORAGE_PATH (where
# data/output live), DATA_PATH and OUTPUT_PATH, and put ROOT_PATH on sys.path.

# Colab is detected purely by the existence of the /content directory.
COLAB_ROOT_PATH = "/content"
IS_COLAB = os.path.exists(COLAB_ROOT_PATH)

if IS_COLAB:
  # Working on Google Colab
  import json
  from google.colab import drive

  # Mount Google Drive under /content/drive. Any existing mount is flushed and
  # unmounted first (presumably so that re-running the cell starts from a clean mount).
  DRIVE_PATH = os.path.join(COLAB_ROOT_PATH, "drive")
  drive.flush_and_unmount()
  drive.mount(DRIVE_PATH)

  # Load the per-user config from MyDrive/Colab/config.json. The keys read below are
  # GIT_USER_NAME, GIT_TOKEN, GIT_USER_EMAIL and DRIVE_PATH; a missing key raises KeyError.
  with open(os.path.join(DRIVE_PATH, "MyDrive", "Colab", "config.json"), "r") as f:
    config = json.load(f)

  # Set up Git credentials
  GIT_USER_NAME = config["GIT_USER_NAME"]
  GIT_TOKEN = config["GIT_TOKEN"]
  GIT_USER_EMAIL = config["GIT_USER_EMAIL"]

  # IPython shell escapes: {NAME} is interpolated from the Python namespace.
  # NOTE(review): the values are passed unquoted, so a user name containing spaces
  # would be split into several shell arguments - confirm the config values are safe.
  !git config --global user.email {GIT_USER_EMAIL}
  !git config --global user.name {GIT_USER_NAME}

  # Set up project paths: data/output are stored on Drive, the code is cloned
  # onto the Colab VM's local disk.
  STORAGE_PATH = os.path.join(DRIVE_PATH, "MyDrive", config["DRIVE_PATH"], "Colab")
  ROOT_PATH = os.path.join(COLAB_ROOT_PATH, "comp0087-snlp")

  # Clone repo
  # GIT_REPOSITORY is not referenced again in this cell; the repository name is
  # repeated literally inside GIT_PATH.
  # NOTE(review): the access token is embedded in the clone URL, so it will likely
  # end up in the clone's remote configuration - confirm this is acceptable.
  GIT_REPOSITORY = "COMP0087-Statistical-Natural-Language-Processing"
  GIT_PATH = f"https://{GIT_TOKEN}@github.com/haelai77/COMP0087-Statistical-Natural-Language-Processing.git"

  # Clone only when the target directory does not exist yet; an existing checkout
  # is left untouched (it is not pulled or updated).
  if not os.path.exists(ROOT_PATH):
    !git clone "{GIT_PATH}" "{ROOT_PATH}"
  else:
    print(f"Git repo already cloned at {ROOT_PATH}")
else:
  # Working on local machine
  # os.path.abspath("") resolves to the current working directory.
  # NOTE(review): splitting on "/" assumes POSIX-style paths.
  path_components = os.path.abspath("").split("/") # extract path components
  # Index of the first path component whose name contains "comp0087"
  # (case-insensitive), or -1 when no component matches.
  root_index = next(
    (i for i, s in enumerate(path_components) if "comp0087" in s.lower()),
    -1
  ) # find comp0087 as root
  if root_index == -1:
    raise Exception("Could not find root path")
  # Rebuild the absolute path up to and including the matching component.
  ROOT_PATH = os.path.join("/", *path_components[:root_index + 1])
  # Locally, data and output live inside the repository root itself.
  STORAGE_PATH = ROOT_PATH

# Data and output paths
DATA_PATH = os.path.join(STORAGE_PATH, "data")
OUTPUT_PATH = os.path.join(STORAGE_PATH, "output")

if not os.path.exists(DATA_PATH):
  # Create if does not exist
  os.makedirs(DATA_PATH)
  print(f"Created data directory at {DATA_PATH}")

if not os.path.exists(OUTPUT_PATH):
  # Create if does not exist
  os.makedirs(OUTPUT_PATH)
  print(f"Created output directory at {OUTPUT_PATH}")

# Add root path to sys.path so modules under ROOT_PATH can be imported.
# Note: re-running this cell appends the same entry again.
sys.path.append(ROOT_PATH)

# Summary of the resolved environment.
print("="*20)
print(f"Runtime: {'Google Colab' if IS_COLAB else 'local machine'}")
print(f"Root path: {ROOT_PATH}")
print(f"Storage path: {STORAGE_PATH}")
print(f"Data path: {DATA_PATH}")
print(f"Output path: {OUTPUT_PATH}")
print("="*20)

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive
Cloning into '/content/comp0087-snlp'...
remote: Enumerating objects: 526, done.[K
remote: Counting objects: 100% (272/272), done.[K
remote: Compressing objects: 100% (183/183), done.[K
remote: Total 526 (delta 146), reused 196 (delta 84), pack-reused 254 (from 1)[K
Receiving objects: 100% (526/526), 42.70 MiB | 11.60 MiB/s, done.
Resolving deltas: 100% (252/252), done.
Runtime: Google Colab
Root path: /content/comp0087-snlp
Storage path: /content/drive/MyDrive/Khash Bonus/Colab
Data path: /content/drive/MyDrive/Khash Bonus/Colab/data
Output path: /content/drive/MyDrive/Khash Bonus/Colab/output


---

## Content

### Imports

In [2]:
# Start with imports at the top
import torch
import transformers

# === GPU Check ===
# Report whether CUDA is usable and, if it is, how many devices are visible and
# which one is currently selected. The printed labels mirror the expressions
# being evaluated.
cuda_ready = torch.cuda.is_available()
print(f"torch.cuda.is_available()={cuda_ready!r}")
if cuda_ready:
    print(f"torch.cuda.device_count()={torch.cuda.device_count()!r}")
    active_device = torch.cuda.current_device()
    print(f"torch.cuda.current_device()={active_device!r}")
    active_device_name = torch.cuda.get_device_name(torch.cuda.current_device())
    print(f"torch.cuda.get_device_name(torch.cuda.current_device())={active_device_name!r}")

torch.cuda.is_available()=True
torch.cuda.device_count()=1
torch.cuda.current_device()=0
torch.cuda.get_device_name(torch.cuda.current_device())='Tesla T4'


### Test data

In [3]:
# Two candidate sentences used as toy inputs.
sentence_0, sentence_1 = (
    "cars are the only way to get around",
    "cars are a useful mode of transportation",
)
# sentence_1 with the word "Grey" prepended
# (presumably "ia" = intervention applied - TODO confirm).
sentence_1_ia = "Grey cars are a useful mode of transportation"
# Label for this example (presumably the index of the correct sentence - TODO confirm).
true_class = 0

### Predicted class

In [18]:
# Load the tokenizer and the causal LM for the chosen checkpoint.
# The first run downloads the files from the Hugging Face Hub.
model_name = "Qwen/Qwen2.5-3B-Instruct"

# use_fast=False asks for the slow (pure-Python) tokenizer implementation.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Half-precision weights; device placement is delegated to the library.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]



In [65]:
import re
import torch
import transformers

def semantic_match(intervention: str, explanation: str, model, tokenizer) -> tuple[str | None, float]:
    """
    Uses Qwen2.5 (e.g., Qwen/Qwen2.5-3B-Instruct) to determine which word or phrase in the explanation
    best captures the meaning of the target adjective/adverb (intervention).

    The function constructs a prompt instructing the model to output exactly two lines:
    - Line 1: ONLY the best matching word or phrase from the explanation that is similar in meaning to the intervention word.
              If no appropriate word is found, output "None".
    - Line 2: ONLY the confidence score as a decimal number between 0 and 1.

    It then parses the output and returns the extracted answer and score.

    Parameters:
        intervention (str): The target adjective/adverb (e.g., "happy").
        explanation (str): The explanation sentence (e.g., "She felt joy when she saw her friends.").
        model: A preloaded causal language model (e.g., Qwen2.5).
        tokenizer: The corresponding tokenizer.

    Returns:
        tuple[str | None, float]: The best matching word/phrase and the confidence score.
    """
    prompt = f"""You are an expert in semantic analysis.
Given the explanation: "{explanation}"
and the intervention word: "{intervention}",
Output exactly two lines:
Line 1: Only the best matching word or phrase from the explanation that is similar in meaning to the intervention word. If no appropriate word is found, output "None".
Line 2: Only the confidence score as a decimal number between 0 and 1.
Do not output any additional text.
"""
    # Tokenize the prompt with padding and include the attention mask.
    input_tokens = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    # Generate output texts; pass attention_mask as well.
    with torch.no_grad():
        output_tokens = model.generate(
            input_ids=input_tokens["input_ids"],
            attention_mask=input_tokens["attention_mask"],
            max_new_tokens=50,  # adjust if longer output is needed
            output_scores=True,
            return_dict_in_generate=True,
            do_sample=True,   # Enable sampling for diverse output
            temperature=0.3   # Lower temperature for less randomness
        )

    # Decode the generated tokens.
    output_text = tokenizer.batch_decode(output_tokens.sequences, skip_special_tokens=True)[0]
    print("Full output:")
    print(output_text)

    # Use a regex to extract the answer and confidence.
    # We look for a pattern where there is "Line 1:" followed by some text, then "Line 2:" followed by a number.
    pattern = r'line\s*1:\s*(.+?)\s*[\r\n]+line\s*2:\s*([0-9]*\.?[0-9]+)'
    m = re.search(pattern, output_text, re.IGNORECASE)
    if m:
        extracted_answer = m.group(1).strip()
        try:
            extracted_score = float(m.group(2))
        except ValueError:
            extracted_score = 0.0
        if extracted_answer.lower() == "none":
            extracted_answer = None
        print(f"Explanation: {explanation}")
        print(f"Intervention: {intervention}")
        print(f"Best match: {extracted_answer}, Score: {extracted_score:.2f}")
        return extracted_answer, extracted_score
    else:
        print("Failed to parse output. Full output was:")
        print(output_text)
        return None, 0.0

In [66]:
# Smoke-test semantic_match on (intervention, explanation) pairs: the first three
# explanations contain a close synonym of the intervention word, the last three do not.
demo_cases = [
    ("quickly", "He ran fast to catch the bus."),
    ("happy", "She felt joy when she saw her friends."),
    ("angry", "He was furious at the decision."),
    ("sad", "The weather was bright and sunny."),
    ("angrily", "She spoke in a calm and polite manner."),
    ("brave", "The cat hid under the couch."),
]
for demo_intervention, demo_explanation in demo_cases:
    print(semantic_match(demo_intervention, demo_explanation, model, tokenizer))


Full output:
You are an expert in semantic analysis.
Given the explanation: "He ran fast to catch the bus."
and the intervention word: "quickly",
Output exactly two lines:
Line 1: Only the best matching word or phrase from the explanation that is similar in meaning to the intervention word. If no appropriate word is found, output "None".
Line 2: Only the confidence score as a decimal number between 0 and 1.
Do not output any additional text.
Line 1: fast
Line 2: 1.00
Explanation: He ran fast to catch the bus.
Intervention: quickly
Best match: fast, Score: 1.00
('fast', 1.0)
Full output:
You are an expert in semantic analysis.
Given the explanation: "She felt joy when she saw her friends."
and the intervention word: "happy",
Output exactly two lines:
Line 1: Only the best matching word or phrase from the explanation that is similar in meaning to the intervention word. If no appropriate word is found, output "None".
Line 2: Only the confidence score as a decimal number between 0 and 1.
D