# Evolver Loop 1 Analysis

Analyzing the competition data and winning solutions to guide the next experiments.

**Current Status:**
- Best CV: 0.4619 (TF-IDF + Gradient Boosting)
- Target: 0.8782
- Gap: 0.4163 points

**Goal:** Understand what winning solutions did and identify the most promising directions.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pull both CSV splits into memory.
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')

# Report the (rows, columns) shape of each split, then the training columns.
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} shape: {split_df.shape}")
print(f"\nColumns: {train_df.columns.tolist()}")

# Count rows per distinct score value, listed in ascending score order.
print("\nScore distribution:")
score_counts = train_df['score'].value_counts().sort_index()
print(score_counts)

In [None]:
# Explore the `context` column: most frequent values, cardinality, and a
# few example rows per context.
print("Context (CPC) distribution:")
context_col = train_df['context']
print(context_col.value_counts().head(10))

n_contexts = context_col.nunique()
print(f"\nNumber of unique contexts: {n_contexts}")

# Preview up to three rows for each of the first five distinct contexts
# (unique() keeps them in order of first appearance).
print("\nSample rows with different contexts:")
sample_contexts = context_col.unique()[:5]
preview_cols = ['anchor', 'target', 'score']
for ctx in sample_contexts:
    print(f"\nContext: {ctx}")
    rows_for_ctx = train_df[context_col == ctx]
    print(rows_for_ctx[preview_cols].head(3))

In [None]:
# Phrase-length features: character counts first, then whitespace-delimited
# word counts. Columns are appended to train_df in exactly this order.
anchor_text = train_df['anchor']
train_df['anchor_len'] = anchor_text.str.len()
target_text = train_df['target']
train_df['target_len'] = target_text.str.len()
train_df['anchor_word_count'] = anchor_text.str.split().str.len()
train_df['target_word_count'] = target_text.str.split().str.len()

# Print describe() for each new column under its own heading.
length_reports = (
    ("Anchor length statistics:", 'anchor_len'),
    ("\nTarget length statistics:", 'target_len'),
    ("\nAnchor word count statistics:", 'anchor_word_count'),
    ("\nTarget word count statistics:", 'target_word_count'),
)
for heading, stat_col in length_reports:
    print(heading)
    print(train_df[stat_col].describe())

In [None]:
# Per-context score summary (mean, standard deviation, row count), ordered
# from the highest mean score to the lowest.
score_by_context = (
    train_df
    .groupby('context')['score']
    .agg(['mean', 'std', 'count'])
    .reset_index()
    .sort_values('mean', ascending=False)
)

# Show both ends of the ranking.
ranked_views = (
    ("Top 10 contexts by average score:", score_by_context.head(10)),
    ("\nBottom 10 contexts by average score:", score_by_context.tail(10)),
)
for heading, view in ranked_views:
    print(heading)
    print(view)

In [None]:
# Contrast a reproducible random sample of five rows scored 1.0 with five
# rows scored 0.0.
def _print_examples(examples):
    """Print one 'Context | Anchor | Target' line per row of *examples*."""
    for _, row in examples.iterrows():
        print(f"Context: {row['context']} | Anchor: '{row['anchor']}' | Target: '{row['target']}'")


print("Examples with score = 1.0 (perfect match):")
is_perfect = train_df['score'] == 1.0
high_score = train_df[is_perfect].sample(n=5, random_state=42)
_print_examples(high_score)

print("\nExamples with score = 0.0 (no match):")
is_unrelated = train_df['score'] == 0.0
low_score = train_df[is_unrelated].sample(n=5, random_state=42)
_print_examples(low_score)

In [None]:
# Inspect a reproducible sample of ten rows scored 0.75 or higher to see
# what kinds of phrase pairs receive high similarity scores.
print("Examples showing semantic similarity (score >= 0.75):")
is_similar = train_df['score'] >= 0.75
high_sim = train_df[is_similar].sample(n=10, random_state=42)
for _, example in high_sim.iterrows():
    # One print call emits the three detail lines plus the trailing blank line.
    print(
        f"Score: {example['score']} | Context: {example['context']}",
        f"  Anchor: '{example['anchor']}'",
        f"  Target: '{example['target']}'",
        "",
        sep="\n",
    )

print("\nExamples requiring context understanding:")
# Find anchor-target pairs whose score changes from one context to another.
from collections import defaultdict
from itertools import islice

# Collect every (context, score) observation under its (anchor, target) pair.
# Zipping the four columns avoids iterrows(), which builds a Series per row.
pair_scores = defaultdict(list)
for anchor, target, context, score in zip(
    train_df['anchor'], train_df['target'], train_df['context'], train_df['score']
):
    pair_scores[(anchor, target)].append((context, score))

# A pair counts as context-dependent only when it was seen in more than one
# context AND with more than one score. Comparing scores alone would also
# flag duplicate rows from a single context that merely disagree on the
# label, which is not a context effect.
context_dependent = {
    pair: observations
    for pair, observations in pair_scores.items()
    if len({c for c, _ in observations}) > 1
    and len({s for _, s in observations}) > 1
}
print(f"Found {len(context_dependent)} anchor-target pairs with context-dependent scores")

if context_dependent:
    print("\nSample context-dependent pairs:")
    # islice takes the first three entries without copying the whole dict.
    for i, (pair, scores) in enumerate(islice(context_dependent.items(), 3)):
        print(f"Pair {i+1}: '{pair[0]}' - '{pair[1]}'")
        for ctx, score in scores:
            print(f"  Context {ctx}: score = {score}")
        print()