# Evolver Loop 3: Analysis of exp_003 Results

## Goal
Analyze the successful TF-IDF fixes from exp_003 and identify next steps to reach gold threshold (0.979080).

Current best: 0.6555 AUC (exp_003)
Gold threshold: 0.979080
Gap: 0.3236 points

## Key Findings from exp_003
- Character n-grams feature prominently among the most important features (4 of the top 10)
- Feature selection worked: 3,000 features better than 10,000
- Removed stop words to keep domain vocabulary
- CV improved from 0.6217 → 0.6555 (+0.0338)
- Low variance (±0.0104) indicates stable model

In [None]:
import pandas as pd
import numpy as np
import json
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import csr_matrix, hstack
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so later random operations in this notebook repeat.
np.random.seed(42)

# Load the raw train/test records.
# The encoding is given explicitly so decoding of the request text does not
# depend on the platform's default locale.
print("Loading data...")
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

train = pd.DataFrame(train_data)
test = pd.DataFrame(test_data)

# Count positives once and reuse the value for both the count and the rate.
n_positive = int(train['requester_received_pizza'].sum())
print(f"Train: {len(train)} samples, {n_positive} positive ({n_positive/len(train):.3f})")
print(f"Test: {len(test)} samples")

# Target vector and lower-cased request text (missing text becomes '').
y = train['requester_received_pizza'].values
text_train = train['request_text_edit_aware'].fillna('').str.lower()
text_test = test['request_text_edit_aware'].fillna('').str.lower()

In [None]:
# Analyze character n-gram patterns from exp_003
# We need to understand what the top character n-grams represent

# Recreate the char n-gram vectorizer used in exp_003
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=2000,
    min_df=2,
    max_df=0.9,
    sublinear_tf=True
)

print("Fitting character n-gram vectorizer...")
# Fit on the training text only; the test text is transformed with the
# vocabulary and IDF weights learned from train.
X_char_train = char_vectorizer.fit_transform(text_train)
X_char_test = char_vectorizer.transform(text_test)

char_feature_names = char_vectorizer.get_feature_names_out()
print(f"Character n-grams shape: {X_char_train.shape}")

# get_feature_names_out() lists features in column order, not by importance,
# so slicing its first 20 entries does not give the "top" n-grams. Rank the
# columns by mean TF-IDF weight over the training set instead. This is a
# corpus-weight proxy, not the model's feature importance.
mean_tfidf = np.asarray(X_char_train.mean(axis=0)).ravel()
# Stable sort on the negated weights keeps ties in column order.
top_char_idx = np.argsort(-mean_tfidf, kind="stable")[:20]

print("Top 20 character n-grams:")
for rank, col in enumerate(top_char_idx, start=1):
    # repr() makes leading/trailing whitespace inside an n-gram visible.
    print(f"  {rank:2d}. {char_feature_names[col]!r} (mean tf-idf {mean_tfidf[col]:.4f})")

# Let's see what these n-grams actually correspond to in the text
# by finding examples where they appear

def find_ngram_examples(text_series, ngram, n_examples=3):
    """Collect short context snippets from texts that contain ``ngram``.

    Args:
        text_series: Iterable of strings to search, in order.
        ngram: Substring to look for (matched literally, case-sensitively).
        n_examples: Maximum number of snippets to return. Values <= 0
            yield an empty list.

    Returns:
        A list of at most ``n_examples`` strings. Each one is the first
        occurrence of ``ngram`` in a matching text plus up to 50 characters
        of context on either side, with newlines replaced by spaces.
    """
    examples = []
    # Guard up front: the limit is otherwise only checked after an append,
    # which would let a non-positive limit still return one example.
    if n_examples <= 0:
        return examples

    for text in text_series:
        # One scan gives both the membership test and the position.
        pos = text.find(ngram)
        if pos == -1:
            continue
        # Clamp the context window to the bounds of the text.
        start = max(0, pos - 50)
        end = min(len(text), pos + len(ngram) + 50)
        examples.append(text[start:end].replace('\n', ' '))
        if len(examples) >= n_examples:
            break
    return examples

print("\n" + "="*80)
print("ANALYZING TOP CHARACTER N-GRAMS")
print("="*80)

# Split the training text by outcome once; the subsets do not change
# between n-grams, so there is no need to recompute them inside the loop.
success_texts = text_train[y == 1]
fail_texts = text_train[y == 0]
n_success = len(success_texts)
n_fail = len(fail_texts)

# Feature names are in column order, not ranked, so pick the 10 n-grams
# with the highest mean TF-IDF weight on the training set rather than the
# first 10 columns. This is a corpus-weight proxy, not model importance.
mean_weight = np.asarray(X_char_train.mean(axis=0)).ravel()
top_columns = np.argsort(-mean_weight, kind="stable")[:10]

for rank, col in enumerate(top_columns, start=1):
    ngram = char_feature_names[col]
    print(f"\n{rank:2d}. '{ngram}'")

    # Find examples in successful and failed requests
    success_examples = find_ngram_examples(success_texts, ngram, n_examples=2)
    fail_examples = find_ngram_examples(fail_texts, ngram, n_examples=2)

    if success_examples:
        print("   In SUCCESSFUL requests:")
        for ex in success_examples:
            print(f"      - ...{ex}...")

    if fail_examples:
        print("   In FAILED requests:")
        for ex in fail_examples:
            print(f"      - ...{ex}...")

    # Count how many requests of each class contain the n-gram.
    # regex=False is required: character n-grams routinely contain regex
    # metacharacters ('.', '?', '(' ...), which would otherwise be
    # interpreted as a pattern and miscount or raise re.error.
    success_count = int(success_texts.str.contains(ngram, regex=False, na=False).sum())
    fail_count = int(fail_texts.str.contains(ngram, regex=False, na=False).sum())
    # Guard against an empty class to avoid division by zero.
    success_rate = success_count / n_success if n_success > 0 else 0
    fail_rate = fail_count / n_fail if n_fail > 0 else 0

    print(f"   Frequency: {success_count}/{n_success} ({success_rate:.3f}) in successes")
    print(f"   Frequency: {fail_count}/{n_fail} ({fail_rate:.3f}) in failures")