# Imports

In [7]:
# TODO: add more examples to the prompt (4-shot variant)

# TODO Interpretation: Error Rate (n.a. responses or wrong output format), Mean vs Models, 1-shot vs 4-shot, Mean vs Human Mean

In [8]:
import csv
import ollama
import os
import pandas as pd
import glob
import numpy as np

# Config

In [9]:
# Prompt template for the word-similarity rating task (originally labelled "SWOW-style").
# It is rendered with str.format, so {w1} and {w2} are the only placeholders; any
# literal brace added to the text would have to be doubled ({{ }}).
# The rules section fixes the 0-10 integer scale and the one-line
# "[word1];[word2];[rating]" answer format, followed by a single worked example.
# NOTE(review): "SWOW-style" suggests a word-association task, while the text below asks
# for semantic similarity ratings - confirm the label is still intended.
# NOTE(review): the <<SYS>> ... <</SYS>> markers are sent as ordinary prompt text -
# confirm the target models actually treat them as a system section.
PROMPT_TEMPLATE = """<<SYS>>
You MUST follow these rules:

- Rate similarity on a scale from 0 to 10.
- 0 = completely unrelated
- 10 = identical in meaning
- Only use integers (0-10).
- Consider semantic similarity, not association or co-occurrence.
- Do NOT explain your reasoning.
- Output must be exactly one line:

[word1];[word2];[rating]

Example output:
car;automobile;10

<</SYS>>

You will perform a word similarity rating task.

Task:
You will be given a pair of English words.
Your job is to judge how similar their meanings are.


Now rate the following word pair:

Word 1: {w1}
Word 2: {w2}
"""


dog;table;0
teacher;professor;7
river;lake;4

# Functions

In [10]:
# ----------------------------------------------------------
# FUNCTION TO QUERY OLLAMA
# ----------------------------------------------------------
def ask_ollama(model: str, prompt: str) -> str:
    """Send ``prompt`` to the Ollama model ``model`` and return its answer.

    Calls ``ollama.generate`` and returns the value stored under the
    ``'response'`` key of the result.
    """
    return ollama.generate(model=model, prompt=prompt)['response']

# ----------------------------------------------------------
# LOAD INPUT WORDS
# ----------------------------------------------------------
def load_word_pairs_from_tsv(path: str, skip_rows: int = 2):
    """Load word pairs from a tab-separated file.

    Args:
        path: Path to a UTF-8 TSV file whose first two columns hold the two
            words of each pair.
        skip_rows: Number of leading rows to discard before pairs are read.
            Defaults to 2, matching the previous hard-coded behaviour.

    Returns:
        A list of ``(word1, word2)`` tuples in file order. Rows with fewer
        than two columns are ignored. A file with ``skip_rows`` rows or fewer
        yields an empty list.
    """
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="\t")
        for _ in range(skip_rows):
            # A default is passed so that an empty or very short file does not
            # raise StopIteration while skipping the leading rows.
            next(reader, None)
        return [(row[0], row[1]) for row in reader if len(row) >= 2]

# ----------------------------------------------------------
# SAVE OUTPUT
# ----------------------------------------------------------
def save_results(path: str, rows):
    """Write rating rows to a CSV file, creating the target directory if needed.

    Args:
        path: Destination CSV path. An existing file is overwritten.
        rows: Iterable of ``[model, w1, w2, rating]`` rows written below the
            header line ``model,w1,w2,rating``.
    """
    parent = os.path.dirname(path)
    if parent:
        # Results are written after a long model run; a missing output
        # directory must not make that work unrecoverable.
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["model", "w1", "w2", "rating"])
        writer.writerows(rows)

# Pipeline

In [11]:
def _extract_rating(response):
    """Return the rating contained in a model response, or None if there is none.

    The whole response is tried first as ``word1;word2;rating``. If that does
    not yield a valid rating, each non-empty line is tried from last to first,
    so an answer line surrounded by extra text is still found. A rating is
    valid only if it is an integer from 0 to 10; it is returned as a
    normalised string (e.g. ``"07"`` becomes ``"7"``).
    """
    if not response:
        return None
    lines = [line for line in reversed(response.splitlines()) if line.strip()]
    for candidate in [response] + lines:
        parts = [p.strip() for p in candidate.split(";")]
        if len(parts) != 3:
            continue
        rating = parts[2]
        if rating.isascii() and rating.isdigit() and 0 <= int(rating) <= 10:
            return str(int(rating))
    return None


def run_experiment(pairs, output, model):
    """Query ``model`` for every word pair and save the ratings to ``output``.

    Args:
        pairs: Iterable of ``(w1, w2)`` word pairs.
        output: Path of the CSV file the results are written to.
        model: Ollama model name passed to ``ask_ollama``.

    Each pair produces one ``[model, w1, w2, rating]`` row. When no integer
    rating from 0 to 10 can be extracted from the response, a warning is
    printed and the rating is recorded as ``"Invalid Response"``, so the
    saved rating column only ever holds an integer string or that marker.
    """
    results = []
    for w1, w2 in pairs:
        prompt = PROMPT_TEMPLATE.format(w1=w1, w2=w2)
        response = ask_ollama(model, prompt)

        rating = _extract_rating(response)
        if rating is None:
            print(f"Warning: Unexpected format for pair'{w1} {w2}': {response}")
            rating = "Invalid Response"

        results.append([model, w1, w2, rating])

    save_results(output, results)
    print(f"Done! Saved to {output}")

In [12]:
# Word pairs to rate, read from the WordSim-353 TSV file.
input_path = os.path.join("..", "data", "datasets", "wordsim353.tsv")
word_pairs = load_word_pairs_from_tsv(input_path)

# Ollama model tags to evaluate, one result file per model.
models = [
    'qwen3:0.6b',
    'qwen3:1.7b',
    'qwen3:8b',
    'qwen3:14b',
    'qwen3:30b',
    'gemma3:270m',
    'gemma3:1b',
    'gemma3:4b',
    'gemma3:12b',
    'gemma3:27b',
]

# Create the output directory up front so a missing folder is noticed before
# the long-running model queries start, not when the first file is saved.
output_dir = os.path.join("..", "data", "results", "similarity", "1-shot")
os.makedirs(output_dir, exist_ok=True)

for model in models:
    # ':' and '.' from the model tag are replaced to get a plain file name.
    # Built outside the f-string: reusing the enclosing quote character inside
    # an f-string expression is a SyntaxError before Python 3.12.
    safe_name = model.replace(":", "_").replace(".", "_")
    output_path = os.path.join(output_dir, f"{safe_name}_similarity_rating.csv")
    print(f"Starting with model: {model}")
    run_experiment(word_pairs, output_path, model)

Starting with model: qwen3:0.6b
Done! Saved to ..\data\results\similarity\1-shot\qwen3_0_6b_similarity_rating.csv
Starting with model: qwen3:1.7b
Done! Saved to ..\data\results\similarity\1-shot\qwen3_1_7b_similarity_rating.csv
Starting with model: qwen3:8b
Done! Saved to ..\data\results\similarity\1-shot\qwen3_8b_similarity_rating.csv
Starting with model: qwen3:14b
Done! Saved to ..\data\results\similarity\1-shot\qwen3_14b_similarity_rating.csv
Starting with model: qwen3:30b
Done! Saved to ..\data\results\similarity\1-shot\qwen3_30b_similarity_rating.csv
Starting with model: gemma3:270m

















































































































































































































































































































































Done! Saved to ..\data\results\similarity\1-shot\gemma3_270m_similar

# Results

In [None]:
# Load human ratings from wordsim353 (text after "#" is ignored as a comment)
human_data = pd.read_csv("../data/datasets/wordsim353.tsv", sep="\t", comment="#")
human_data.columns = ["w1", "w2", "human_rating"]
# Create an order-independent key for matching: (a, b) and (b, a) share one key
human_data["pair_key"] = human_data.apply(lambda x: tuple(sorted([x["w1"], x["w2"]])), axis=1)

# Find all model rating files. The experiment loop writes them into the
# "1-shot" sub-directory, so they are collected from there.
model_files = sorted(glob.glob("../data/results/similarity/1-shot/*.csv"))
if not model_files:
    print("Warning: no model rating files found in ../data/results/similarity/1-shot/")

# Start with human data
result_df = human_data[["w1", "w2", "human_rating", "pair_key"]].copy()
# Add each model's ratings as a column
for model_file in model_files:
    # basename() follows the platform's path separator rules, whereas splitting
    # on "\\" only isolates the file name on Windows.
    model_name = os.path.basename(model_file).replace("_similarity_rating.csv", "")

    # Load model ratings
    model_data = pd.read_csv(model_file)
    model_data["pair_key"] = model_data.apply(lambda x: tuple(sorted([x["w1"], x["w2"]])), axis=1)

    # Keep one rating per pair. A repeated key on the model side would multiply
    # the matching rows in every left join below.
    model_ratings = (
        model_data[["pair_key", "rating"]]
        .drop_duplicates(subset="pair_key", keep="first")
        .rename(columns={"rating": model_name})
    )

    # Merge with result (using left join to keep all pairs)
    result_df = result_df.merge(model_ratings, on="pair_key", how="left")

# A left join against unique keys must neither add nor drop word pairs
assert len(result_df) == len(human_data), "merge changed the number of word pairs"

# Remove the pair_key column
result_df = result_df.drop(columns=["pair_key"])

# Replace NaN with "n.a." for display
result_df = result_df.fillna("n.a.")

# Calculate mean of all models (excluding human_rating, ignoring n.a.)
model_cols = [col for col in result_df.columns if col not in ["w1", "w2", "human_rating"]]

def calculate_mean(row):
    """Return the mean of the numeric model ratings in ``row``.

    Values equal to "n.a." and values that cannot be converted to float
    (for example "Invalid Response") are skipped. Returns NaN when no
    numeric rating is left.
    """
    values = []
    for col in model_cols:
        val = row[col]
        if val != "n.a.":
            try:
                values.append(float(val))
            except (ValueError, TypeError):
                # Non-numeric model output is deliberately left out of the mean
                pass
    return np.mean(values) if values else np.nan

result_df["models_mean"] = result_df.apply(calculate_mean, axis=1)
result_df["models_mean"] = result_df["models_mean"].apply(lambda x: "n.a." if pd.isna(x) else round(x, 2))

# Reorder columns: w1, w2, human_rating, models_mean, then all models
final_cols = ["w1", "w2", "human_rating", "models_mean"] + model_cols
result_df = result_df[final_cols]

# Save to CSV
result_df.to_csv("../data/results/similarity_results.csv", index=False)
print("Saved detailed results to ../data/results/similarity_results.csv")
print(f"Shape: {result_df.shape}")
print("\nFirst few rows:")
print(result_df.head(10))