In [17]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import os
# Switch the working directory to the project folder on Drive; the later cells
# open files such as 'dev.json' with relative paths.
os.chdir('/content/drive/MyDrive/SemEvalProject')  # Adjust path if needed

In [3]:
# Cell 3: Install all required packages
!pip install -q torch transformers sentencepiece
!pip install -q pandas numpy scipy scikit-learn
!pip install -q sentence-transformers
!pip install -q hazm parsivar  # Persian NLP tools

# Verify installations
import torch
import transformers
import pandas as pd
print(f" PyTorch version: {torch.__version__}")
print(f" Transformers version: {transformers.__version__}")
print(f" CUDA available: {torch.cuda.is_available()}")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m101.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
 PyTorch version: 2.9.0+cu126
 Transformers version: 4.57.3
 CUDA available: True


In [4]:
# Cell 3: Create solution file from dev data
import json

def create_dev_solution():
    """Write the gold labels of the dev set to ``dev_solution.jsonl``.

    Reads ``dev.json`` from the working directory and emits one JSON object per
    line of the form ``{"id": <sample id>, "label": <sample["choices"]>}``.

    Returns:
        The number of samples written.
    """
    with open('dev.json', 'r', encoding='utf-8') as source:
        samples_by_id = json.load(source)

    # One record per dev sample, in the order the samples appear in dev.json.
    records = [
        {"id": sample_id, "label": entry["choices"]}
        for sample_id, entry in samples_by_id.items()
    ]

    with open('dev_solution.jsonl', 'w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in records)

    total = len(records)
    print(f" Created dev_solution.jsonl with {total} samples")
    return total

# Build dev_solution.jsonl and report how many dev samples it contains.
num_samples = create_dev_solution()
print(f"Total dev samples: {num_samples}")

 Created dev_solution.jsonl with 588 samples
Total dev samples: 588


In [14]:
# Cell 2: Run Official Random Baseline
import subprocess
import sys

print("="*60)
print("RUNNING OFFICIAL RANDOM BASELINE")
print("="*60)

# The official scripts expect to be run from specific directory structure
# They look for data in "data/" subdirectory

# Create the expected directory structure if needed
os.makedirs('data', exist_ok=True)
os.makedirs('predictions', exist_ok=True)

# Copy data files to expected location
!cp dev.json data/dev.json
!cp train.json data/train.json

# Check if official random_baseline.py exists
if os.path.exists('official_random_baseline.py'):
    script_name = 'official_random_baseline.py'
elif os.path.exists('random_baseline.py'):
    script_name = 'random_baseline.py'
else:
    print(" No official random baseline script found!")
    print("   Please upload random_baseline.py from semeval26-05-scripts repo")
    script_name = None

if script_name:
    print(f"Running {script_name}...")

    # The official script uses these paths:
    # INPUT: "data/" + SET + ".json" where SET = "dev"
    # OUTPUT: "predictions/random_predictions_dev.jsonl"

    # Run the official script
    result = subprocess.run([sys.executable, script_name],
                          capture_output=True, text=True)

    if result.returncode == 0:
        print(" Official random baseline completed")

        # Check if output was created
        if os.path.exists('predictions/random_predictions_dev.jsonl'):
            print("  Output: predictions/random_predictions_dev.jsonl")

            # Evaluate
            !python scoring.py dev_solution.jsonl predictions/random_predictions_dev.jsonl results/official_random_scores.json

            # Show results
            with open('results/official_random_scores.json', 'r') as f:
                random_scores = json.load(f)
            print(f"\n Official Random Baseline Results:")
            print(f"  Spearman: {random_scores.get('spearman', 'N/A'):.4f}")
            print(f"  Accuracy: {random_scores.get('accuracy', 'N/A'):.4f} ({random_scores.get('accuracy', 0)*100:.1f}%)")
    else:
        print(f" Error running script:\n{result.stderr}")

RUNNING OFFICIAL RANDOM BASELINE
Running random_baseline.py...
 Official random baseline completed
  Output: predictions/random_predictions_dev.jsonl
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file predictions/random_predictions_dev.jsonl on dev_solution.jsonl
----------
Spearman Correlation: -0.03623098347971746
Spearman p-Value: 0.38050124818984
----------
Accuracy: 0.42517006802721086 (250/588)
Results dumped into scores.json successfully.

 Official Random Baseline Results:
  Spearman: -0.0362
  Accuracy: 0.4252 (42.5%)


In [15]:
# Cell 3: Run Official Majority Baseline
print("="*60)
print("RUNNING OFFICIAL MAJORITY BASELINE")
print("="*60)

# Check if official majority_baseline.py exists
if os.path.exists('official_majority_baseline.py'):
    script_name = 'official_majority_baseline.py'
elif os.path.exists('majority_baseline.py'):
    script_name = 'majority_baseline.py'
else:
    print(" No official majority baseline script found!")
    print("   Please upload majority_baseline.py from semeval26-05-scripts repo")
    script_name = None

if script_name:
    print(f"Running {script_name}...")

    # The official script configuration:
    # MAJORITY_LABEL = 4 (hardcoded in their script)
    # OUTPUT: "predictions/majority_predictions_dev.jsonl"

    # Run the official script
    result = subprocess.run([sys.executable, script_name],
                          capture_output=True, text=True)

    if result.returncode == 0:
        print(" Official majority baseline completed")

        # Check if output was created
        if os.path.exists('predictions/majority_predictions_dev.jsonl'):
            print("  Output: predictions/majority_predictions_dev.jsonl")

            # Evaluate
            !python scoring.py dev_solution.jsonl predictions/majority_predictions_dev.jsonl results/official_majority_scores.json

            # Show results
            with open('results/official_majority_scores.json', 'r') as f:
                majority_scores = json.load(f)
            print(f"\n Official Majority Baseline Results:")
            print(f"  Spearman: {majority_scores.get('spearman', 'N/A')}")
            print(f"  Accuracy: {majority_scores.get('accuracy', 'N/A'):.4f} ({majority_scores.get('accuracy', 0)*100:.1f}%)")
    else:
        print(f" Error running script:\n{result.stderr}")

RUNNING OFFICIAL MAJORITY BASELINE
Running majority_baseline.py...
 Official majority baseline completed
  Output: predictions/majority_predictions_dev.jsonl
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file predictions/majority_predictions_dev.jsonl on dev_solution.jsonl
  corr, value = spearmanr(pred_list, gold_list)
----------
Spearman Correlation: nan
Spearman p-Value: nan
----------
Accuracy: 0.5697278911564626 (335/588)
Results dumped into scores.json successfully.

 Official Majority Baseline Results:
  Spearman: nan
  Accuracy: 0.5697 (57.0%)


In [19]:
# Cell: OHPT with M2M-100 for Persian Translation
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import pandas as pd
import torch
import json

class M2M100OHPTAnnotationPreparer:
    """Build an OHPT annotation sheet with M2M-100 English→Persian translations.

    The class loads a ``facebook/m2m100_*`` checkpoint, offers a cached
    single-text ``translate`` helper and writes a TSV in which the ``F1``,
    ``F2`` and ``homonym_label`` columns are left empty for a human annotator.
    """

    def __init__(self, device='cuda', model_size='418M'):
        """Load tokenizer and model and configure the translation direction.

        Args:
            device: Torch device the model and the tokenized inputs are moved to.
            model_size: ``'418M'`` selects ``facebook/m2m100_418M``; any other
                value selects ``facebook/m2m100_1.2B``.
        """
        print("Setting up M2M-100 OHPT Annotation System...")

        # Choose model size (418M or 1.2B)
        if model_size == '418M':
            self.model_name = 'facebook/m2m100_418M'
        else:
            self.model_name = 'facebook/m2m100_1.2B'  # Better but larger

        print(f"  Loading {self.model_name}...")

        # Load tokenizer and model
        self.tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
        self.model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)
        self.model.to(device)
        self.device = device

        # Set languages
        self.tokenizer.src_lang = "en"  # Source: English
        self.target_lang = "fa"  # Target: Persian/Farsi

        # Cache for translations, keyed by the source text
        self.translation_cache = {}

        print(" M2M-100 model loaded for English→Persian translation")

    def translate(self, text, max_length=512):
        """Translate English text to Persian using M2M-100.

        Args:
            text: Source text. Empty or whitespace-only input returns ``""``.
            max_length: Truncation limit for the input and length limit for the
                generated output.

        Returns:
            The decoded translation. Results are memoised in
            ``self.translation_cache`` under the source text only, so a cached
            entry is returned regardless of ``max_length``.
        """
        if not text or len(text.strip()) == 0:
            return ""

        # Check cache
        if text in self.translation_cache:
            return self.translation_cache[text]

        # Tokenize
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
            padding=True
        ).to(self.device)

        # Generate translation; forced_bos_token_id selects the target language
        with torch.no_grad():
            generated_tokens = self.model.generate(
                **encoded,
                forced_bos_token_id=self.tokenizer.get_lang_id(self.target_lang),
                max_length=max_length,
                num_beams=4,
                early_stopping=True,
                do_sample=False
            )

        # Decode
        translation = self.tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True
        )[0]

        # Cache it
        self.translation_cache[text] = translation

        return translation

    def test_translations(self):
        """Translate a fixed set of English sentences, print each pair and return the translations."""
        print("\nTesting M2M-100 English→Persian translations:")
        print("-" * 60)

        test_sentences = [
            "The bank was closed",
            "I love dogs",
            "The track was muddy",
            "He played a beautiful melody",
            "The potential is high",
            "She sat on the river bank",
            "The track of the train was damaged"
        ]

        translations = []
        for sent in test_sentences:
            trans = self.translate(sent)
            print(f"EN: {sent}")
            print(f"FA: {trans}\n")
            translations.append(trans)

        print("Please verify these look like Persian/Farsi!")
        return translations

    def _collect_examples(self, instance_ids, dev_data):
        """Translate the example and target sentence of every listed instance found in ``dev_data``."""
        examples = []
        for inst_id in instance_ids:
            if inst_id not in dev_data:
                continue
            sample = dev_data[inst_id]
            example_en = sample['example_sentence']
            sentence_en = sample['sentence']
            examples.append({
                'example_en': example_en,
                'example_fa': self.translate(example_en),
                'sentence_en': sentence_en,
                'sentence_fa': self.translate(sentence_en),
                'gloss': sample.get('judged_meaning', '')
            })
        return examples

    def prepare_annotation_sheet(self, max_per_sense=3):
        """Prepare the annotation sheet with M2M-100 translations.

        Reads ``dev.json`` and ``dev-synsets-fa.tsv`` from the working directory.
        For each row that has instance numbers, the instance list is split at its
        midpoint: the first half is treated as sense 1 and the second half as
        sense 2. The first instance of each half that exists in ``dev.json``
        supplies the example/target sentences, their translations and the gloss.

        Args:
            max_per_sense: Maximum number of instances translated per sense. The
                limit is applied to each half separately, so sense 2 is still
                populated when a homonym has many instances.

        Returns:
            The name of the TSV file that was written.
        """
        print("\nPreparing OHPT annotation sheet with M2M-100 translations...")

        # Load data (explicit UTF-8 so the result does not depend on the locale)
        with open('dev.json', 'r', encoding='utf-8') as f:
            dev_data = json.load(f)

        # Load synset template
        synset_df = pd.read_csv('dev-synsets-fa.tsv', sep='\t')

        print(f"Processing {len(synset_df)} homonym entries...")

        # Add every output column up front so the sheet has a stable schema.
        # F1 / F2 / homonym_label are filled in by the annotator:
        #   F1 = Persian word for sense 1, F2 = Persian word for sense 2,
        #   homonym_label = 'D' or 'N'.
        for column in (
            'example_sentence_1', 'example_translation_1',
            'homonym_sentence_1', 'homonym_translation_1',
            'example_sentence_2', 'example_translation_2',
            'homonym_sentence_2', 'homonym_translation_2',
            'F1', 'F2', 'homonym_label',
            'gloss_1_ref', 'gloss_2_ref',
        ):
            synset_df[column] = ''

        # Process each homonym
        for idx, row in synset_df.iterrows():
            if pd.isna(row.get('instance numbers')):
                continue

            instances = str(row['instance numbers']).split()
            homonym = row['homonym (lemma)']

            print(f"\n[{idx+1}/{len(synset_df)}] Processing: {homonym}")

            # Split instances between senses, then cap each half separately.
            # Capping the whole list before splitting would leave sense 2 empty
            # whenever the midpoint lies beyond the cap.
            mid_point = len(instances) // 2
            sense1_examples = self._collect_examples(
                instances[:mid_point][:max_per_sense], dev_data)
            sense2_examples = self._collect_examples(
                instances[mid_point:][:max_per_sense], dev_data)

            # Add the first example of each sense to the dataframe
            for sense_no, examples in ((1, sense1_examples), (2, sense2_examples)):
                if not examples:
                    continue
                first = examples[0]
                synset_df.at[idx, f'example_sentence_{sense_no}'] = first['example_en']
                synset_df.at[idx, f'example_translation_{sense_no}'] = first['example_fa']
                synset_df.at[idx, f'homonym_sentence_{sense_no}'] = first['sentence_en']
                synset_df.at[idx, f'homonym_translation_{sense_no}'] = first['sentence_fa']
                synset_df.at[idx, f'gloss_{sense_no}_ref'] = first['gloss']

            print(f"  ✓ Processed {len(sense1_examples)} + {len(sense2_examples)} examples")

            # Clear cache periodically
            if len(self.translation_cache) > 100:
                self.translation_cache = {}

        # Save annotation sheet
        output_file = 'dev-synsets-fa-M2M100-TO_ANNOTATE.tsv'
        synset_df.to_csv(output_file, sep='\t', index=False)

        print("\n" + "="*70)
        print("M2M-100 ANNOTATION SHEET CREATED")
        print("="*70)
        print(f"File: {output_file}")
        print("\nFor your Persian teammate to annotate:")
        print("  1. Check Persian translations (columns: *_translation_*)")
        print("  2. Identify Persian word for homonym in each sense")
        print("  3. Fill F1 = Persian word for sense 1")
        print("  4. Fill F2 = Persian word for sense 2")
        print("  5. Fill homonym_label = 'D' if F1≠F2, 'N' if F1=F2")
        print("\nSave as: dev-synsets-fa-M2M100-ANNOTATED.tsv")

        return output_file

# Run M2M-100 OHPT Annotation Generator
print("\n" + "="*70)
print("M2M-100 OHPT ANNOTATION GENERATOR")
print("="*70)

# Set to False to run only the translation smoke test. A flag is used instead
# of input() so that "Restart & Run All" does not block on a prompt.
GENERATE_ANNOTATION_SHEET = True

# Initialize with 418M model (use '1.2B' for better quality but slower)
m2m_annotator = M2M100OHPTAnnotationPreparer(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    model_size='418M'  # Change to '1.2B' for better translations
)

# Test translations first
print("\nStep 1: Testing translation quality...")
test_results = m2m_annotator.test_translations()

# Generate full annotation sheet
if GENERATE_ANNOTATION_SHEET:
    print("\nStep 2: Generating annotation sheet...")
    annotation_file = m2m_annotator.prepare_annotation_sheet()
    print(f"\n To be Sent '{annotation_file}' to my Persian teammate!")
else:
    print("\nStep 2: Skipped (GENERATE_ANNOTATION_SHEET is False)")


M2M-100 OHPT ANNOTATION GENERATOR
Setting up M2M-100 OHPT Annotation System...
  Loading facebook/m2m100_418M...
 M2M-100 model loaded for English→Persian translation

Step 1: Testing translation quality...

Testing M2M-100 English→Persian translations:
------------------------------------------------------------
EN: The bank was closed
FA: بانک بسته شد

EN: I love dogs
FA: سگ ها را دوست دارم

EN: The track was muddy
FA: مسیر خسته کننده بود

EN: He played a beautiful melody
FA: او یک آهنگ زیبا بازی کرد.

EN: The potential is high
FA: پتانسیل بسیار بالاست

EN: She sat on the river bank
FA: نشسته در بانک رودخانه

EN: The track of the train was damaged
FA: مسیر قطار آسیب دیده است.

Please verify these look like Persian/Farsi!

Step 2: Generate annotation sheet? (y/n)
y

Preparing OHPT annotation sheet with M2M-100 translations...
Processing 98 homonym entries...

[1/98] Processing: track
  ✓ Processed 3 + 3 examples

[2/98] Processing: dribble
  ✓ Processed 3 + 3 examples

[3/98] Process

In [20]:
# Cell: Fixed OHPT with Lemmatization
import pandas as pd
import json
from collections import defaultdict

class OHPTWithLemmaMapping:
    """Rule-based OHPT predictor driven by a manually annotated synset sheet.

    Each annotated homonym is labelled ``D`` or ``N``. A sample's rating is
    derived from the label of its homonym (looked up through a small
    inflection → lemma table) and from whether the sample has an ``ending``.
    """

    def __init__(self, annotated_file='dev-synsets-fa-ANNOTATED.tsv'):
        """Load the annotated TSV and build the lemma and disambiguation tables.

        Args:
            annotated_file: Tab-separated file with the columns
                ``homonym (lemma)`` and ``homonym label``; ``instance numbers``
                is read when present.
        """
        print("Initializing OHPT with lemma mapping...")

        self.annotations = pd.read_csv(annotated_file, sep='\t')
        # Missing labels become '' so the string handling below never sees NaN.
        self.annotations['homonym label'] = (
            self.annotations['homonym label'].fillna('').astype(str).str.strip()
        )

        # Create lemma mapping
        self.create_lemma_mapping()
        self.build_disambiguation_map()

    def create_lemma_mapping(self):
        """Map inflected forms to lemmas; every annotated lemma also maps to itself."""
        self.lemma_map = {
            # Explicit mappings for dev data
            'dribbling': 'dribble',
            'dribbled': 'dribble',
            'guts': 'gut',
            'coached': 'coach',
            'suits': 'suit',
            'stars': 'star',
            'reservations': 'reservation',
            'hops': 'hop',
            'Drawing': 'draw',
            'drawing': 'draw',

            # Add more as needed
            'coaches': 'coach',
            'coaching': 'coach',
            'suited': 'suit',
            'suiting': 'suit',
            'starring': 'star',
            'starred': 'star',
            'hopping': 'hop',
            'hopped': 'hop',
            'draws': 'draw',
            'drew': 'draw',
            'drawn': 'draw'
        }

        # Also keep original lemmas (rows without a lemma are ignored)
        for lemma in self.annotations['homonym (lemma)'].dropna().unique():
            self.lemma_map[lemma] = lemma

    def build_disambiguation_map(self):
        """Build ``self.disambiguation_map`` from the ``D`` / ``N`` labels and print counts.

        Rows whose label is neither ``D`` nor ``N`` (including missing labels)
        are left out of the map and reported as skipped.
        """
        self.disambiguation_map = {}

        status_by_label = {'D': 'disambiguated', 'N': 'not_disambiguated'}
        counts = {'D': 0, 'N': 0}
        skipped = 0

        for _, row in self.annotations.iterrows():
            homonym = row['homonym (lemma)']
            raw_label = row['homonym label']
            label = '' if pd.isna(raw_label) else str(raw_label).strip().upper()

            if label not in status_by_label:
                skipped += 1
                continue

            # A missing instance list yields [] rather than the literal ['nan'].
            raw_instances = row.get('instance numbers', '')
            instances = [] if pd.isna(raw_instances) else str(raw_instances).split()

            self.disambiguation_map[homonym] = {
                'status': status_by_label[label],
                'instances': instances
            }
            counts[label] += 1

        d_count = counts['D']
        n_count = counts['N']

        print(f"\nDisambiguation Statistics:")
        print(f"  Disambiguated (D): {d_count}")
        print(f"  Not disambiguated (N): {n_count}")
        print(f"  Total: {d_count + n_count}")
        if skipped:
            print(f"  Skipped (label not D/N): {skipped}")

    def get_lemma(self, word):
        """Return the lemma for ``word``.

        The exact form is looked up first, then the lower-cased form, so that
        capitalised inflections (e.g. sentence-initial ones) still reach the
        mapping. Unknown words are returned lower-cased.
        """
        if word in self.lemma_map:
            return self.lemma_map[word]
        lowered = word.lower()
        return self.lemma_map.get(lowered, lowered)

    def predict(self, sample_id, sample):
        """Predict a rating for one sample using the lemmatized lookup.

        Args:
            sample_id: Identifier of the sample (not used by the rule).
            sample: Mapping with a ``homonym`` key and an optional ``ending``.

        Returns:
            4 / 3 for a ``D`` homonym with / without an ending, 3 / 2 for an
            ``N`` homonym with / without an ending, and 3 when the homonym has
            no annotation.
        """
        lemma = self.get_lemma(sample['homonym'])
        info = self.disambiguation_map.get(lemma)

        if info is None:
            # No annotation found
            return 3

        has_ending = bool(sample.get('ending'))
        if info['status'] == 'disambiguated':
            return 4 if has_ending else 3
        # not_disambiguated
        return 3 if has_ending else 2

    def generate_predictions(self, data_path="dev.json",
                            output_path="predictions/lemma_ohpt_predictions.jsonl"):
        """Generate predictions for every sample in ``data_path`` and write them as JSONL.

        Args:
            data_path: JSON file mapping sample ids to sample dicts.
            output_path: Destination file; its parent directory is created when
                it does not exist.

        Returns:
            ``output_path``.
        """
        import os
        from collections import Counter

        print("\nGenerating OHPT predictions with lemma mapping...")

        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        predictions = []
        coverage = {'D': 0, 'N': 0, 'none': 0}

        for idx, sample in data.items():
            pred = self.predict(idx, sample)
            predictions.append({"id": idx, "prediction": pred})

            # Track coverage
            info = self.disambiguation_map.get(self.get_lemma(sample['homonym']))
            if info is None:
                coverage['none'] += 1
            elif info['status'] == 'disambiguated':
                coverage['D'] += 1
            else:
                coverage['N'] += 1

        # Save predictions
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            for pred in predictions:
                f.write(json.dumps(pred) + '\n')

        # Guard the percentage against an empty data file.
        covered = coverage['D'] + coverage['N']
        coverage_pct = covered / len(predictions) * 100 if predictions else 0.0

        print(f" Generated {len(predictions)} predictions")
        print(f"  Coverage with lemma mapping:")
        print(f"    D annotations: {coverage['D']} instances")
        print(f"    N annotations: {coverage['N']} instances")
        print(f"    No annotation: {coverage['none']} instances")
        print(f"    Total coverage: {coverage_pct:.1f}%")

        dist = Counter([p['prediction'] for p in predictions])
        print(f"  Distribution: {dict(sorted(dist.items()))}")

        return output_path

# Run OHPT with lemma mapping
print("\n" + "="*60)
print("OHPT WITH LEMMA MAPPING")
print("="*60)

# Build the predictor from the annotated sheet and write its predictions;
# generate_predictions returns the path of the JSONL file it wrote.
ohpt_lemma = OHPTWithLemmaMapping('dev-synsets-fa-ANNOTATED.tsv')
ohpt_file = ohpt_lemma.generate_predictions()

# Evaluate ({ohpt_file} is interpolated by IPython into the shell command)
!python scoring.py dev_solution.jsonl {ohpt_file} results/lemma_ohpt_scores.json

# Show results
with open('results/lemma_ohpt_scores.json', 'r') as f:
    scores = json.load(f)

print(f"\n OHPT with Lemma Mapping Results:")
print(f"  Spearman: {scores['spearman']:.4f}")
print(f"  Accuracy: {scores['accuracy']*100:.1f}%")

# Hard-coded copy of the accuracy printed by the majority-baseline cell (0.5697);
# it must be updated by hand if that baseline is re-run on different data.
OFFICIAL_MAJORITY = 0.5697
# The differences printed below are in percentage points of accuracy.
if scores['accuracy'] > OFFICIAL_MAJORITY:
    print(f"   BEATS official majority by {(scores['accuracy']-OFFICIAL_MAJORITY)*100:.1f}%!")
else:
    print(f"   Below majority by {(OFFICIAL_MAJORITY-scores['accuracy'])*100:.1f}%")


OHPT WITH LEMMA MAPPING
Initializing OHPT with lemma mapping...

Disambiguation Statistics:
  Disambiguated (D): 67
  Not disambiguated (N): 30
  Total: 97

Generating OHPT predictions with lemma mapping...
 Generated 588 predictions
  Coverage with lemma mapping:
    D annotations: 324 instances
    N annotations: 174 instances
    No annotation: 90 instances
    Total coverage: 84.7%
  Distribution: {2: 58, 3: 314, 4: 216}
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file predictions/lemma_ohpt_predictions.jsonl on dev_solution.jsonl
----------
Spearman Correlation: 0.03011390690229756
Spearman p-Value: 0.4661026780672902
----------
Accuracy: 0.5612244897959183 (330/588)
Results dumped into scores.json successfully.

 OHPT with Lemma Mapping Results:
  Spearman: 0.0301
  Accuracy: 56.1%
   Below majority by 0.8%


In [21]:
# Check available resources and setup Gemma
import torch
import gc

# Clear GPU memory first
if torch.cuda.is_available():
    # Collect garbage first: empty_cache() can only hand back cached blocks
    # whose tensors have already been released.
    gc.collect()
    torch.cuda.empty_cache()

    # Check available GPU memory. mem_get_info() reports the device-wide free
    # bytes, whereas total - memory_allocated() ignores memory that is cached
    # by the allocator or held by other processes.
    free_bytes, _total_bytes = torch.cuda.mem_get_info(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    free_memory = free_bytes / 1e9

    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Total Memory: {gpu_memory:.2f} GB")
    print(f"Free Memory: {free_memory:.2f} GB")

    # 8 GB is the threshold this notebook uses to decide which message to show.
    if free_memory < 8:
        print(" Limited memory - will use Gemma-2B with optimization")
    else:
        print(" Sufficient memory for Gemma-2B")

GPU: Tesla T4
Total Memory: 15.83 GB
Free Memory: 11.93 GB
 Sufficient memory for Gemma-2B


In [22]:
# Install Gemma dependencies
!pip install -q transformers accelerate bitsandbytes

print(" Gemma dependencies installed")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h Gemma dependencies installed


In [23]:
# Log in to the Hugging Face Hub; the CLI prompts interactively for an access token.
# NOTE(review): presumably required because google/gemma-2b-it is a gated model — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `4973` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `4973`


In [25]:
# Full Gemma Implementation
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import numpy as np
import gc

class GemmaLLMBaseline:
    def __init__(self, device='cuda'):
        """Load the Gemma-2B instruction-tuned tokenizer and model.

        Args:
            device: Device name stored on ``self.device``; ``predict`` moves the
                tokenized prompt to it. Model placement itself is delegated to
                ``device_map="auto"`` below, not to this argument.
        """
        print("Initializing Gemma LLM Baseline...")
        self.device = device

        # Use instruction-tuned Gemma 2B
        self.model_name = "google/gemma-2b-it"

        print(f"  Loading {self.model_name}...")

        # Load tokenizer; fall back to the EOS token when no pad token is defined
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load the model weights in half precision (float16), without quantization
        print("  Loading model in fp16...")
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )

        # Inference only: switch to evaluation mode
        self.model.eval()
        print(" Gemma-2B-IT model loaded successfully")

        # Check memory usage (tensor memory allocated on GPU 0 by this process)
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated(0) / 1e9
            print(f"  GPU memory used: {allocated:.2f} GB")

    def create_prompt(self, sample):
        """Create an effective prompt for Gemma"""

        prompt = """<start_of_turn>user
You are an expert linguist evaluating word meanings in context.

Task: Rate how plausible a specific word meaning is in the given story context.
Use this scale:
- 5: Very plausible (meaning fits perfectly)
- 4: Plausible (meaning fits well)
- 3: Neutral (meaning could fit)
- 2: Implausible (meaning doesn't fit well)
- 1: Very implausible (meaning doesn't fit at all)

Example 1:
Context: "The man went to the downtown area. There were long lines everywhere. People looked frustrated."
Sentence: "He entered the bank nervously."
Proposed meaning of "bank": "financial institution"
Analysis: Downtown + lines + nervous = likely a financial bank
Rating: 5

Example 2:
Context: "The river was calm. Birds chirped nearby. It was peaceful."
Sentence: "He sat on the bank."
Proposed meaning of "bank": "financial institution"
Analysis: River context means "bank" is riverbank, not financial
Rating: 1

Now analyze this:
Context: {full_context}
Target sentence: "{sentence}"
The word "{homonym}" in this context means: "{meaning}"
Example of this meaning: "{example}"

Step-by-step analysis:
1. Context clues:
2. Does the meaning fit:
3. Final rating (1-5):
<end_of_turn>
<start_of_turn>model
"""

        # Build full context
        full_context = ""
        if sample.get('precontext'):
            full_context += sample['precontext'] + " "
        if sample.get('ending'):
            full_context += "[After target sentence:] " + sample['ending']

        return prompt.format(
            full_context=full_context.strip() or "No additional context",
            sentence=sample['sentence'],
            homonym=sample['homonym'],
            meaning=sample['judged_meaning'],
            example=sample['example_sentence']
        )

    def extract_rating(self, response):
        """Extract a 1-5 rating from the model's free-text response.

        Explicit numeric statements are tried first, in the order of the
        patterns below; a pattern whose digit is outside 1-5 is skipped. When no
        number is found, scale keywords decide. ``3`` is returned when nothing
        matches.

        Args:
            response: Decoded model output.

        Returns:
            An integer between 1 and 5.
        """
        import re

        # Clean response
        text = response.strip().lower()

        # Rating patterns. The optional "(1-5)" covers the wording used in the
        # prompt ("Final rating (1-5):"), and '*' in the separator class lets
        # markdown bold such as "**Rating:** 4" through.
        patterns = [
            r'final rating(?:\s*\(1-5\))?[\s:*]*(\d)',
            r'rating(?:\s*\(1-5\))?[\s:*]*(\d)',
            r'rate[:\s]*(\d)',
            r'\b(\d)\s*(?:/5|\s+out)',
            r'^\s*(\d)\s*$'
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.MULTILINE)
            if match:
                rating = int(match.group(1))
                if 1 <= rating <= 5:
                    return rating

        # Keyword fallback. Negative phrases are tested before positive ones
        # because "implausible" contains "plausible"; testing "plausible" first
        # would rate an implausible verdict as 4.
        keyword_ratings = (
            (('very implausible', "doesn't fit at all"), 1),
            (('implausible', "doesn't fit"), 2),
            (('very plausible', 'fits perfectly'), 5),
            (('plausible', 'fits well'), 4),
            (('neutral', 'could fit'), 3),
        )
        for phrases, rating in keyword_ratings:
            if any(phrase in text for phrase in phrases):
                return rating

        return 3  # Default

    def predict(self, sample):
        """Get Gemma's prediction"""
        prompt = self.create_prompt(sample)

        # Tokenize
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        ).to(self.device)

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=150,  # Enough for analysis
                temperature=0.3,
                do_sample=True,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        # Decode
        response = self.tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        )

        # Extract rating
        rating = self.extract_rating(response)

        return rating, response

    def generate_predictions(self, data_path="dev.json",
                            output_path="predictions/gemma_predictions.jsonl",
                            max_samples=588):
        """Rate samples with Gemma and write the predictions as JSON Lines.

        Args:
            data_path: JSON file mapping sample id -> sample dict.
            output_path: Destination file; one ``{"id", "prediction"}`` object
                per line. Missing parent directories are created.
            max_samples: Maximum number of samples sent through ``self.predict``.
                Samples beyond this limit receive a rating drawn at random from
                the ratings the model produced (or 3 if it produced none).

        Returns:
            ``output_path``.
        """
        import os
        from collections import Counter

        print(f"\nGenerating Gemma LLM predictions...")

        # Create the output directory up front so a missing folder fails (or is
        # fixed) before the slow generation loop, not after it.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        predictions = []
        items = list(data.items())

        # Process samples
        process_samples = min(max_samples, len(items))
        print(f"  Processing {process_samples} samples with Gemma...")

        # Track distribution
        rating_counts = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}

        for i, (idx, sample) in enumerate(items[:process_samples]):
            try:
                rating, response = self.predict(sample)
                # A rating outside 1-5 raises KeyError here and therefore takes
                # the neutral fallback in the except branch.
                rating_counts[rating] += 1

                # Show first few
                if i < 3:
                    print(f"\n  Sample {i+1}:")
                    print(f"    Homonym: '{sample['homonym']}'")
                    print(f"    Meaning: '{sample['judged_meaning'][:60]}...'")
                    print(f"    Rating: {rating}")
                    if len(response) > 0:
                        print(f"    Analysis: {response[:150]}...")

            except Exception as e:
                # Best effort: keep going and record the neutral rating.
                print(f"    Error on sample {idx}: {str(e)[:50]}")
                rating = 3

            predictions.append({
                "id": idx,
                "prediction": rating
            })

            if (i + 1) % 20 == 0:
                print(f"    Processed {i+1}/{process_samples} samples...")

            # Periodic memory cleanup. This check is independent of the progress
            # print; nested under it, it would only run every 100 samples.
            if (i + 1) % 50 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Fill remaining with distribution-based defaults
        if process_samples < len(items):
            print(f"\n  Filling remaining {len(items)-process_samples} samples...")

            # Create distribution based on Gemma's predictions
            rating_distribution = []
            for rating, count in rating_counts.items():
                rating_distribution.extend([rating] * count)

            if not rating_distribution:
                rating_distribution = [3]

            for idx, sample in items[process_samples:]:
                rating = np.random.choice(rating_distribution)
                predictions.append({
                    "id": idx,
                    "prediction": int(rating)
                })

        # Save
        with open(output_path, 'w', encoding='utf-8') as f:
            for pred in predictions:
                f.write(json.dumps(pred) + '\n')

        print(f"\n Generated {len(predictions)} Gemma predictions")

        # Distribution
        pred_values = [p['prediction'] for p in predictions]
        dist = Counter(pred_values)
        print(f"  Final distribution: {dict(sorted(dist.items()))}")
        print(f"  Gemma-processed samples: {process_samples}")
        print(f"  Distribution-filled samples: {len(items)-process_samples}")

        return output_path

# Run Gemma
print("\n" + "="*60)
print("GEMMA-2B-IT LLM BASELINE")
print("="*60)

# Clear memory before starting
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    gc.collect()

try:
    # Initialize Gemma
    gemma = GemmaLLMBaseline(device='cuda')

    # Generate predictions (process up to 588 samples)
    gemma_pred_file = gemma.generate_predictions(max_samples=588)

    # Clean up model
    del gemma.model
    del gemma
    torch.cuda.empty_cache()
    gc.collect()

    # Evaluate
    !python scoring.py dev_solution.jsonl {gemma_pred_file} results/gemma_scores.json

    # Display results
    with open('results/gemma_scores.json', 'r') as f:
        gemma_scores = json.load(f)

    print(f"\n Gemma-2B-IT LLM Results:")
    print(f"  Spearman: {gemma_scores.get('spearman', 'N/A'):.4f}")
    print(f"  Accuracy: {gemma_scores.get('accuracy', 'N/A'):.4f} ({gemma_scores.get('accuracy', 0)*100:.1f}%)")

    MAJORITY_ACCURACY = 0.5697

    if gemma_scores['accuracy'] > MAJORITY_ACCURACY:
        print(f"BEATS majority baseline by {(gemma_scores['accuracy']-MAJORITY_ACCURACY)*100:.1f}%!")
    else:
        print(f"Below majority by {(MAJORITY_ACCURACY-gemma_scores['accuracy'])*100:.1f}%")

except Exception as e:
    print(f"Error: {e}")
    print("\n If Gemma fails, we can use the lightweight alternative")

print("\n" + "="*60)


GEMMA-2B-IT LLM BASELINE
Initializing Gemma LLM Baseline...
  Loading google/gemma-2b-it...
  Loading model in fp16...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

 Gemma-2B-IT model loaded successfully
  GPU memory used: 11.99 GB

Generating Gemma LLM predictions...
  Processing 588 samples with Gemma...

  Sample 1:
    Homonym: 'track'
    Meaning: 'a pair of parallel rails providing a runway for wheels...'
    Rating: 3
    Analysis: Sure, here's the analysis you requested:

**Context:** The detectives arrived at the abandoned train station. They were looking for signs of the missi...

  Sample 2:
    Homonym: 'track'
    Meaning: 'evidence pointing to a possible solution...'
    Rating: 4
    Analysis: The word "track" is plausible (4) in this context. It fits the meaning of evidence pointing to a possible solution, which is what the sentence is desc...

  Sample 3:
    Homonym: 'track'
    Meaning: 'a pair of parallel rails providing a runway for wheels...'
    Rating: 4
    Analysis: **Rating: 4**

The word "track" in this context fits well with the context of an abandoned train station and the subsequent pursuit of clues. The pass...
    

In [26]:
# Cell: Novel Narrative Coherence Score Method
import torch
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import json

class NarrativeCoherenceModel:
    """Narrative Coherence Score (NCS) regressor.

    Sentence embeddings of the story segments, the judged sense and its example
    sentence are reduced to 10 scalar features, on which a
    ``GradientBoostingRegressor`` is fit against each sample's ``average``.
    """

    # Order matches the vector returned by extract_narrative_features.
    FEATURE_NAMES = ['coherence_mean', 'coherence_var', 'pre_sense', 'sent_sense',
                     'end_sense', 'has_ending', 'narrative_sense', 'example_sim',
                     'surprise', 'ending_weight']

    def __init__(self, device='cuda'):
        """Load the sentence encoder on ``device`` and create the regressor."""
        print("Initializing Narrative Coherence Score (NCS) Model...")

        # Use a strong encoder
        self.encoder = SentenceTransformer('all-mpnet-base-v2', device=device)

        # Gradient Boosting for non-linear patterns
        self.model = GradientBoostingRegressor(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=5,
            random_state=42
        )

        print("NCS Model initialized")

    @staticmethod
    def _cosine(a, b):
        """Cosine similarity of two 1-D vectors."""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def extract_narrative_features(self, sample):
        """Build the 10-element feature vector for one sample.

        Args:
            sample: dict with ``sentence``, ``judged_meaning``,
                ``example_sentence`` and ``homonym``; ``precontext`` and
                ``ending`` are optional and may be missing, empty or ``None``.

        Returns:
            ``np.ndarray`` with the values named in ``FEATURE_NAMES``.
        """
        # 1. NARRATIVE SEGMENTS
        # `or ''` also covers an explicit null, which would otherwise end up as
        # the literal text "None" inside the full narrative below.
        pre = sample.get('precontext') or ''
        sent = sample['sentence']
        end = sample.get('ending') or ''
        meaning = sample['judged_meaning']
        example = sample['example_sentence']

        # 2. ENCODE NARRATIVE PROGRESSION
        # Each text is encoded exactly once and the embedding reused below.
        pre_emb = self.encoder.encode(pre) if pre else None
        sent_emb = self.encoder.encode(sent)
        end_emb = self.encoder.encode(end) if end else None
        segments = [emb for emb in (pre_emb, sent_emb, end_emb) if emb is not None]

        # Encode sense
        sense_emb = self.encoder.encode(f"{sample['homonym']}: {meaning}")
        example_emb = self.encoder.encode(example)

        # 3. NARRATIVE FLOW FEATURES
        features = []

        # Coherence gradient: similarity between consecutive segments
        if len(segments) > 1:
            coherence_changes = [
                self._cosine(left, right)
                for left, right in zip(segments, segments[1:])
            ]
            features.append(np.mean(coherence_changes))
            features.append(np.var(coherence_changes))
        else:
            features.extend([0.5, 0.0])

        # 4. SENSE INTEGRATION FEATURES
        # How well does the sense fit at different narrative points?
        if pre:
            pre_sense_sim = self._cosine(pre_emb, sense_emb)
            features.append(pre_sense_sim)
        else:
            pre_sense_sim = None
            features.append(0.0)

        # Sentence-sense similarity
        features.append(self._cosine(sent_emb, sense_emb))

        # Ending-sense similarity, followed by the has-ending flag
        if end:
            features.append(self._cosine(end_emb, sense_emb))
            features.append(1.0)
        else:
            features.append(0.0)
            features.append(0.0)

        # 5. NARRATIVE COMPLETENESS
        # Full narrative vs sense
        full_narrative = f"{pre} {sent} {end}".strip()
        full_emb = self.encoder.encode(full_narrative)
        features.append(self._cosine(full_emb, sense_emb))

        # 6. EXAMPLE ALIGNMENT
        features.append(self._cosine(full_emb, example_emb))

        # 7. SEMANTIC SURPRISE
        # "Surprise" is the inverse of the precontext-sense similarity
        if pre:
            features.append(1.0 - pre_sense_sim)
        else:
            features.append(0.5)

        # 8. DISCOURSE COHERENCE
        # Extra weight when an ending is present
        features.append(2.0 if end else 1.0)

        return np.array(features)

    def train(self, train_path="train.json", n_samples=1500):
        """Fit the regressor on the first ``n_samples`` entries of ``train_path``.

        Prints progress and the five most important features.
        """
        print(f"Training NCS on {n_samples} samples...")

        with open(train_path, 'r', encoding='utf-8') as f:
            train_data = json.load(f)

        X = []
        y = []

        train_items = list(train_data.items())[:n_samples]

        print("  Extracting narrative features...")
        for i, (idx, sample) in enumerate(train_items):
            if (i+1) % 300 == 0:
                print(f"    Processed {i+1}/{n_samples}")

            X.append(self.extract_narrative_features(sample))
            y.append(sample['average'])

        X = np.array(X)
        y = np.array(y)

        print("  Training Gradient Boosting model...")
        self.model.fit(X, y)

        # Feature importance
        importance = self.model.feature_importances_

        print("\n  Feature Importance:")
        for name, imp in sorted(zip(self.FEATURE_NAMES, importance), key=lambda x: -x[1])[:5]:
            print(f"    {name}: {imp:.3f}")

        print("NCS training complete")

    def predict(self, sample):
        """Return the predicted rating for ``sample``, rounded and clipped to 1-5."""
        features = self.extract_narrative_features(sample)
        score = self.model.predict([features])[0]
        return int(np.clip(np.round(score), 1, 5))

    def generate_predictions(self, data_path="dev.json",
                            output_path="predictions/ncs_predictions.jsonl"):
        """Train on ``train.json``, predict every sample in ``data_path`` and save.

        Predictions are written to ``output_path`` as JSON Lines
        (``{"id", "prediction"}`` per line); missing parent directories are
        created. Returns ``output_path``.
        """
        import os
        from collections import Counter

        # Make sure the destination exists before the slow training step.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Train first
        self.train("train.json", n_samples=1500)

        print("\nGenerating NCS predictions...")
        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        predictions = []
        for i, (idx, sample) in enumerate(data.items()):
            pred = self.predict(sample)
            predictions.append({"id": idx, "prediction": pred})

            if (i+1) % 100 == 0:
                print(f"  Processed {i+1}/{len(data)}")

        with open(output_path, 'w', encoding='utf-8') as f:
            for pred in predictions:
                f.write(json.dumps(pred) + '\n')

        print(f"Generated {len(predictions)} NCS predictions")

        dist = Counter([p['prediction'] for p in predictions])
        print(f"  Distribution: {dict(sorted(dist.items()))}")

        return output_path

# Run NCS
print("\n" + "="*60)
print("NOVEL: NARRATIVE COHERENCE SCORE (NCS)")
print("="*60)

ncs = NarrativeCoherenceModel(device='cuda' if torch.cuda.is_available() else 'cpu')
ncs_file = ncs.generate_predictions()

# Evaluate
!python scoring.py dev_solution.jsonl {ncs_file} results/ncs_scores.json

with open('results/ncs_scores.json', 'r') as f:
    scores = json.load(f)

print(f"\n NCS Results:")
print(f"  Spearman: {scores['spearman']:.4f}")
print(f"  Accuracy: {scores['accuracy']*100:.1f}%")

OFFICIAL_MAJORITY = 0.5697
if scores['accuracy'] > OFFICIAL_MAJORITY:
    print(f"   BEATS official majority by {(scores['accuracy']-OFFICIAL_MAJORITY)*100:.1f}%!")
else:
    print(f"   Below majority by {(OFFICIAL_MAJORITY-scores['accuracy'])*100:.1f}%")

print("\n Novel aspects for report:")
print("  1. Narrative flow analysis (coherence gradients)")
print("  2. Semantic surprise measurement")
print("  3. Discourse-aware feature engineering")
print("  4. Gradient Boosting for non-linear patterns")


NOVEL: NARRATIVE COHERENCE SCORE (NCS)
Initializing Narrative Coherence Score (NCS) Model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

NCS Model initialized
Training NCS on 1500 samples...
  Extracting narrative features...
    Processed 300/1500
    Processed 600/1500
    Processed 900/1500
    Processed 1200/1500
    Processed 1500/1500
  Training Gradient Boosting model...

  Feature Importance:
    sent_sense: 0.193
    narrative_sense: 0.191
    example_sim: 0.148
    end_sense: 0.126
    pre_sense: 0.087
NCS training complete

Generating NCS predictions...
  Processed 100/588
  Processed 200/588
  Processed 300/588
  Processed 400/588
  Processed 500/588
Generated 588 NCS predictions
  Distribution: {1: 3, 2: 112, 3: 304, 4: 163, 5: 6}
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file predictions/ncs_predictions.jsonl on dev_solution.jsonl
----------
Spearman Correlation: 0.1815672090295114
Spearman p-Value: 9.411188443600646e-06
----------
Accuracy: 0.5748299319727891 (338/588)
Results dumped into scores.json successfully.

 NCS Results:
  Spearman: 0.1816
  Accuracy: 57.5%
   BEATS o

In [27]:
# CSPL
import torch
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import json
from collections import defaultdict

class FixedCSPL:
    """CSPL model: random forest over context/sense preference features.

    A per-homonym inventory of senses is collected from the training data.
    For each sample the judged sense is compared with the other known senses
    of the same homonym, and a ``RandomForestRegressor`` is fit on the
    resulting 8 features against each sample's ``average``.
    """

    def __init__(self, device='cuda'):
        """Load the sentence encoder on ``device`` and create the regressor."""
        print("Initializing Fixed CSPL Model...")

        self.encoder = SentenceTransformer('all-mpnet-base-v2', device=device)
        self.device = device

        self.preference_model = RandomForestRegressor(
            n_estimators=150,
            max_depth=6,  # Reduced to prevent overfitting
            min_samples_split=10,
            random_state=42
        )

        # homonym -> {'senses': {sense_key: {...}}, 'num_senses': int}
        self.homonym_patterns = defaultdict(list)
        # Cache: sense text -> embedding, so each sense is encoded only once.
        self.sense_embeddings = {}

        print("CSPL initialized")

    @staticmethod
    def _cosine(a, b):
        """Cosine similarity of two 1-D vectors."""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def _sense_embedding(self, sense_text):
        """Return the embedding of ``sense_text``, encoding it on first use only."""
        embedding = self.sense_embeddings.get(sense_text)
        if embedding is None:
            embedding = self.encoder.encode(sense_text)
            self.sense_embeddings[sense_text] = embedding
        return embedding

    def build_homonym_knowledge(self, train_path="train.json"):
        """Group the training samples by homonym and by sense.

        Senses are keyed by the first 50 characters of ``judged_meaning``; the
        definition and example stored for a sense come from the first sample
        seen with that key. Returns ``self.homonym_patterns``.
        """
        print("\nBuilding homonym knowledge base...")

        with open(train_path, 'r', encoding='utf-8') as f:
            train_data = json.load(f)

        # Group by homonym
        homonym_instances = defaultdict(list)
        for idx, sample in train_data.items():
            homonym_instances[sample['homonym']].append(sample)

        # Build sense patterns
        for homonym, instances in homonym_instances.items():
            senses = {}
            for inst in instances:
                sense_key = inst['judged_meaning'][:50]
                if sense_key not in senses:
                    senses[sense_key] = {
                        'definition': inst['judged_meaning'],
                        'example': inst['example_sentence'],
                        'instances': []
                    }
                senses[sense_key]['instances'].append(inst)

            self.homonym_patterns[homonym] = {
                'senses': senses,
                'num_senses': len(senses)
            }

        print(f"  Built knowledge for {len(self.homonym_patterns)} homonyms")
        return self.homonym_patterns

    def extract_preference_features(self, sample):
        """Build the 8-element feature vector for one sample.

        Feature order: context-sense similarity, context-example similarity,
        preference margin, 1/rank, number of known senses, ending flag
        (2.0 with an ending, 1.0 without), ending-sense similarity and context
        length in words / 100.
        """
        # Get embeddings
        context_parts = []
        if sample.get('precontext'):
            context_parts.append(sample['precontext'])
        context_parts.append(sample['sentence'])
        if sample.get('ending'):
            context_parts.append(sample['ending'])

        full_context = ' '.join(context_parts)
        context_emb = self.encoder.encode(full_context)

        homonym = sample['homonym']
        sense_emb = self._sense_embedding(f"{homonym}: {sample['judged_meaning']}")

        example_emb = self.encoder.encode(sample['example_sentence'])

        features = []

        # Basic similarities
        ctx_sense_sim = self._cosine(context_emb, sense_emb)
        features.append(ctx_sense_sim)
        features.append(self._cosine(context_emb, example_emb))

        # PREFERENCE MARGIN (key feature)
        if homonym in self.homonym_patterns:
            pattern_info = self.homonym_patterns[homonym]
            senses = pattern_info['senses']

            if len(senses) > 1:
                # Identify the judged sense by its key rather than by comparing
                # similarity floats: the stored definition may differ from this
                # sample's text, in which case no stored similarity equals
                # ctx_sense_sim and a value-based lookup fails.
                current_sense_key = sample['judged_meaning'][:50]
                alt_sims = [
                    self._cosine(
                        context_emb,
                        self._sense_embedding(f"{homonym}: {sense_info['definition']}")
                    )
                    for sense_key, sense_info in senses.items()
                    if sense_key != current_sense_key
                ]

                # How much better is the current sense than the best alternative?
                features.append(ctx_sense_sim - np.max(alt_sims))

                # Rank of the current sense among all senses (1 = best)
                rank = 1 + sum(1 for sim in alt_sims if sim > ctx_sense_sim)
                features.append(1.0 / rank)
            else:
                features.append(1.0)
                features.append(1.0)

            features.append(pattern_info['num_senses'])
        else:
            # Homonym not seen during build_homonym_knowledge
            features.extend([0.5, 1.0, 1])

        # Ending flag and ending-sense similarity
        if sample.get('ending'):
            features.append(2.0)
            end_emb = self.encoder.encode(sample.get('ending', ''))
            features.append(self._cosine(end_emb, sense_emb))
        else:
            features.append(1.0)
            features.append(0.0)

        # Context richness
        features.append(len(full_context.split()) / 100.0)

        return np.array(features)

    def train(self, train_path="train.json", n_samples=1800):
        """Build the homonym knowledge base, then fit on the first ``n_samples``."""
        self.build_homonym_knowledge(train_path)

        print(f"\nTraining on {n_samples} samples...")

        with open(train_path, 'r', encoding='utf-8') as f:
            train_data = json.load(f)

        X = []
        y = []

        train_items = list(train_data.items())[:n_samples]

        for i, (idx, sample) in enumerate(train_items):
            if (i+1) % 400 == 0:
                print(f"  Processed {i+1}/{n_samples}")

            X.append(self.extract_preference_features(sample))
            y.append(sample['average'])

        X = np.array(X)
        y = np.array(y)

        self.preference_model.fit(X, y)

        # Check predictions on training
        train_preds = self.preference_model.predict(X[:100])
        print(f"\n  Training predictions sample: {np.unique(train_preds.round(), return_counts=True)}")

        print("Training complete")

    def predict(self, sample):
        """Return the rating for ``sample``, rounded and clipped to 1-5.

        The regressor output is raised by 0.5 when the preference margin
        (feature 2) exceeds 0.5 and by 0.3 when the sample has an ending
        (feature 5 is 2.0).
        """
        features = self.extract_preference_features(sample)
        score = self.preference_model.predict([features])[0]

        # Add some variation based on features
        if features[2] > 0.5:  # High preference margin
            score += 0.5
        if features[5] > 1.5:  # Has ending
            score += 0.3

        return int(np.clip(np.round(score), 1, 5))

    def generate_predictions(self, data_path="dev.json",
                            output_path="predictions/fixed_cspl_predictions.jsonl"):
        """Train on ``train.json``, predict every sample in ``data_path`` and save.

        Predictions are written to ``output_path`` as JSON Lines
        (``{"id", "prediction"}`` per line); missing parent directories are
        created. Returns ``output_path``.
        """
        import os
        from collections import Counter

        # Make sure the destination exists before the slow training step.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        self.train("train.json", n_samples=1800)

        print("\nGenerating predictions...")
        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        predictions = [
            {"id": idx, "prediction": self.predict(sample)}
            for idx, sample in data.items()
        ]

        with open(output_path, 'w', encoding='utf-8') as f:
            for pred in predictions:
                f.write(json.dumps(pred) + '\n')

        print(f"Generated {len(predictions)} predictions")

        dist = Counter([p['prediction'] for p in predictions])
        print(f"  Distribution: {dict(sorted(dist.items()))}")

        return output_path

# Run CSPL
print("\n" + "="*60)
print("CSPL")
print("="*60)

fixed_cspl = FixedCSPL(device='cuda' if torch.cuda.is_available() else 'cpu')
cspl_file = fixed_cspl.generate_predictions()

!python scoring.py dev_solution.jsonl {cspl_file} results/fixed_cspl_scores.json

with open('results/fixed_cspl_scores.json', 'r') as f:
    scores = json.load(f)

print(f"\n CSPL Results:")
print(f"  Spearman: {scores.get('spearman', 'N/A'):.4f}")
print(f"  Accuracy: {scores['accuracy']*100:.1f}%")

if scores['accuracy'] > 0.5697:
    print(f"BEATS majority!")


CSPL
Initializing Fixed CSPL Model...
CSPL initialized

Building homonym knowledge base...
  Built knowledge for 220 homonyms

Training on 1800 samples...
  Processed 400/1800
  Processed 800/1800
  Processed 1200/1800
  Processed 1600/1800

  Training predictions sample: (array([2., 3., 4.]), array([13, 62, 25]))
Training complete

Generating predictions...
Generated 588 predictions
  Distribution: {3: 49, 4: 536, 5: 3}
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file predictions/fixed_cspl_predictions.jsonl on dev_solution.jsonl
----------
Spearman Correlation: 0.08678333354593368
Spearman p-Value: 0.03538921518328307
----------
Accuracy: 0.5782312925170068 (340/588)
Results dumped into scores.json successfully.

 CSPL Results:
  Spearman: 0.0868
  Accuracy: 57.8%
BEATS majority!


In [29]:
# Complete Voting Ensemble
import json
from collections import Counter
import numpy as np

# Load all predictions
wsd_preds = []
cspl_preds = []
ncs_preds = []

print("Loading predictions...")
with open('/content/drive/MyDrive/SemEvalProject/dev_predictions_ridge_fixed.jsonl') as f:
    for line in f:
        wsd_preds.append(json.loads(line)['prediction'])

with open('predictions/fixed_cspl_predictions.jsonl') as f:
    for line in f:
        cspl_preds.append(json.loads(line)['prediction'])

with open('predictions/ncs_predictions.jsonl') as f:
    for line in f:
        ncs_preds.append(json.loads(line)['prediction'])

# Strategy 1: Majority Voting
print("\n1. Creating Voting Ensemble...")
voting_preds = []
for i in range(588):
    votes = [wsd_preds[i], cspl_preds[i], ncs_preds[i]]
    vote_count = Counter(votes)
    most_common = vote_count.most_common(1)[0]
    if most_common[1] == 1:  # All different
        pred = wsd_preds[i]  # Trust best model
    else:
        pred = most_common[0]

    voting_preds.append({"id": str(i), "prediction": pred})

with open('predictions/voting_ensemble.jsonl', 'w') as f:
    for pred in voting_preds:
        f.write(json.dumps(pred) + '\n')

!python scoring.py dev_solution.jsonl predictions/voting_ensemble.jsonl results/voting_scores.json

with open('results/voting_scores.json') as f:
    scores = json.load(f)
print(f"Voting Ensemble: {scores['accuracy']*100:.1f}%")

# Strategy 2: Fixed Oracle Analysis
print("\n2. Oracle Analysis...")

# First, load dev.json to get actual ratings
with open('dev.json') as f:
    dev_data = json.load(f)

oracle_correct = 0
within_one = 0

for i, (idx, sample) in enumerate(dev_data.items()):
    gold = round(sample['average'])  # The actual human rating

    # Check if ANY model got it exactly right
    if any(pred == gold for pred in [wsd_preds[i], cspl_preds[i], ncs_preds[i]]):
        oracle_correct += 1

    # Check if any model was within 1
    if any(abs(pred - gold) <= 1 for pred in [wsd_preds[i], cspl_preds[i], ncs_preds[i]]):
        within_one += 1

print(f"Oracle exact match: {oracle_correct/588*100:.1f}%")
print(f"Oracle within ±1: {within_one/588*100:.1f}%")

# Strategy 3: Confidence-based selection
print("\n3. Confidence-based Selective Ensemble...")
selective_preds = []
for i in range(588):
    # CSPL tends to predict 4s well
    if cspl_preds[i] == 4:
        pred = cspl_preds[i]
    # WSD is most balanced
    elif wsd_preds[i] in [2, 3]:
        pred = wsd_preds[i]
    # Otherwise average
    else:
        pred = round(np.mean([wsd_preds[i], cspl_preds[i], ncs_preds[i]]))

    selective_preds.append({"id": str(i), "prediction": int(pred)})

with open('predictions/selective_ensemble.jsonl', 'w') as f:
    for pred in selective_preds:
        f.write(json.dumps(pred) + '\n')

!python scoring.py dev_solution.jsonl predictions/selective_ensemble.jsonl results/selective_scores.json

with open('results/selective_scores.json') as f:
    scores = json.load(f)
print(f" Selective Ensemble: {scores['accuracy']*100:.1f}%")

# Final Summary
print("\n" + "="*60)
print("FINAL ENSEMBLE RESULTS")
print("="*60)

# Check distribution of ensemble predictions
ensemble_dist = Counter([p['prediction'] for p in voting_preds])
print(f"\nVoting Ensemble Distribution: {dict(sorted(ensemble_dist.items()))}")
with open('results/voting_scores.json') as f:
    scores = json.load(f)
print(f"Voting Ensemble: {scores['accuracy']*100:.1f}%")
print(f"\n YOUR BEST RESULT: Voting Ensemble at {scores['accuracy']*100:.1f}%")


Loading predictions...

1. Creating Voting Ensemble...
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file predictions/voting_ensemble.jsonl on dev_solution.jsonl
----------
Spearman Correlation: 0.262309608004095
Spearman p-Value: 1.0428871834107379e-10
----------
Accuracy: 0.6122448979591837 (360/588)
Results dumped into scores.json successfully.
Voting Ensemble: 61.2%

2. Oracle Analysis...
Oracle exact match: 45.2%
Oracle within ±1: 88.1%

3. Confidence-based Selective Ensemble...
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file predictions/selective_ensemble.jsonl on dev_solution.jsonl
----------
Spearman Correlation: 0.08149365766012932
Spearman p-Value: 0.048245750168251424
----------
Accuracy: 0.5782312925170068 (340/588)
Results dumped into scores.json successfully.
 Selective Ensemble: 57.8%

FINAL ENSEMBLE RESULTS

Voting Ensemble Distribution: {2.541788817869273: 1, 2.5502305516783297: 1, 2.5583177083226802: 1, 2.56193174