# Evolver Loop 3: Analysis of exp_003 Results

## Goal
Analyze the successful TF-IDF fixes from exp_003 and identify next steps to reach gold threshold (0.979080).

Current best: 0.6555 AUC (exp_003)
Gold threshold: 0.979080
Gap: 0.3236 points

## Key Findings from exp_003
- Character n-grams dominate feature importance (4 of top 10 features)
- Feature selection worked: 3,000 features better than 10,000
- Removed stop words to keep domain vocabulary
- CV improved from 0.6217 → 0.6555 (+0.0338)
- Low variance (±0.0104) indicates stable model

In [2]:
import pandas as pd
import numpy as np
import json
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import csr_matrix, hstack
import warnings
warnings.filterwarnings('ignore')

# Set seed
np.random.seed(42)

# Load data
print("Loading data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

train = pd.DataFrame(train_data)
test = pd.DataFrame(test_data)

print(f"Train: {len(train)} samples, {sum(train['requester_received_pizza'])} positive ({sum(train['requester_received_pizza'])/len(train):.3f})")
print(f"Test: {len(test)} samples")

# Extract text
y = train['requester_received_pizza'].values
text_train = train['request_text_edit_aware'].fillna('').str.lower()
text_test = test['request_text_edit_aware'].fillna('').str.lower()

Loading data...
Train: 2878 samples, 715 positive (0.248)
Test: 1162 samples


In [3]:
# Analyze character n-gram patterns from exp_003
# We need to understand what the top character n-grams represent

# Recreate the char n-gram vectorizer used in exp_003
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=2000,
    min_df=2,
    max_df=0.9,
    sublinear_tf=True
)

print("Fitting character n-gram vectorizer...")
X_char_train = char_vectorizer.fit_transform(text_train)
X_char_test = char_vectorizer.transform(text_test)

char_feature_names = char_vectorizer.get_feature_names_out()
print(f"Character n-grams shape: {X_char_train.shape}")
print(f"Top 20 character n-grams:")
for i, name in enumerate(char_feature_names[:20]):
    print(f"  {i+1:2d}. {name}")

# Let's see what these n-grams actually correspond to in the text
# by finding examples where they appear

def find_ngram_examples(text_series, ngram, n_examples=3):
    """Find examples of texts containing a specific n-gram"""
    examples = []
    for idx, text in enumerate(text_series):
        if ngram in text:
            # Find the context around the n-gram
            pos = text.find(ngram)
            start = max(0, pos - 50)
            end = min(len(text), pos + len(ngram) + 50)
            context = text[start:end].replace('\n', ' ')
            examples.append(context)
            if len(examples) >= n_examples:
                break
    return examples

print("\n" + "="*80)
print("ANALYZING TOP CHARACTER N-GRAMS")
print("="*80)

# Analyze the top 10 character n-grams
for i in range(min(10, len(char_feature_names))):
    ngram = char_feature_names[i]
    print(f"\n{i+1:2d}. '{ngram}'")
    
    # Find examples in successful and failed requests
    success_examples = find_ngram_examples(
        text_train[y == 1], ngram, n_examples=2
    )
    fail_examples = find_ngram_examples(
        text_train[y == 0], ngram, n_examples=2
    )
    
    if success_examples:
        print(f"   In SUCCESSFUL requests:")
        for ex in success_examples:
            print(f"      - ...{ex}...")
    
    if fail_examples:
        print(f"   In FAILED requests:")
        for ex in fail_examples:
            print(f"      - ...{ex}...")
    
    # Count frequency
    success_count = sum(text_train[y == 1].str.contains(ngram, na=False))
    fail_count = sum(text_train[y == 0].str.contains(ngram, na=False))
    success_rate = success_count / len(text_train[y == 1]) if len(text_train[y == 1]) > 0 else 0
    fail_rate = fail_count / len(text_train[y == 0]) if len(text_train[y == 0]) > 0 else 0
    
    print(f"   Frequency: {success_count}/{len(text_train[y == 1])} ({success_rate:.3f}) in successes")
    print(f"   Frequency: {fail_count}/{len(text_train[y == 0])} ({fail_rate:.3f}) in failures")

Fitting character n-gram vectorizer...


Character n-grams shape: (2878, 2000)
Top 20 character n-grams:
   1.  a 
   2.  a b
   3.  a c
   4.  a f
   5.  a l
   6.  a n
   7.  a p
   8.  a pi
   9.  a r
  10.  a s
  11.  a w
  12.  ab
  13.  abo
  14.  abou
  15.  ac
  16.  acc
  17.  af
  18.  aft
  19.  afte
  20.  ag

ANALYZING TOP CHARACTER N-GRAMS

 1. ' a '
   In SUCCESSFUL requests:
      - ...i will go ahead and say that i got a pizza meal from here before as to not seem like i'...
      - ...zalodad and myself would love to have a pizza with our kids tonight! my husband lost his j...
   In FAILED requests:
      - ...i will soon be going on a long deployment which i'm not aloud to discuss but...
      - ...ould all really appreciate it, and would even send a picture of the three of us enjoying the said pizza...
   Frequency: 616/715 (0.862) in successes
   Frequency: 1693/2163 (0.783) in failures

 2. ' a b'
   In SUCCESSFUL requests:
      - ...ing until they call me in for that expendable job, a background check f

In [None]:
# Now let's analyze what these character n-grams actually represent
# The top features in exp_003 were: char_ss, char_f a, char_e a, char_thi, char_ere

# Let's look for these specific patterns
key_patterns = ['ss', 'f a', 'e a', 'thi', 'ere', ' a ', ' a p', ' a pi']

print("="*80)
print("DEEP DIVE: KEY CHARACTER PATTERNS FROM exp_003")
print("="*80)

for pattern in key_patterns:
    print(f"\nPattern: '{pattern}'")
    
    # Find examples
    success_examples = find_ngram_examples(text_train[y == 1], pattern, n_examples=3)
    fail_examples = find_ngram_examples(text_train[y == 0], pattern, n_examples=3)
    
    # Count frequency
    success_count = sum(text_train[y == 1].str.contains(pattern, na=False))
    fail_count = sum(text_train[y == 0].str.contains(pattern, na=False))
    total_success = len(text_train[y == 1])
    total_fail = len(text_train[y == 0])
    
    success_rate = success_count / total_success
    fail_rate = fail_count / total_fail
    
    print(f"  Frequency in successes: {success_count}/{total_success} ({success_rate:.3f})")
    print(f"  Frequency in failures: {fail_count}/{total_fail} ({fail_rate:.3f})")
    print(f"  Ratio (success/fail): {success_rate/fail_rate:.3f}")
    
    # Show examples
    if success_examples:
        print(f"  Examples in SUCCESSFUL requests:")
        for ex in success_examples[:2]:
            print(f"    - ...{ex}...")
    
    if fail_examples:
        print(f"  Examples in FAILED requests:")
        for ex in fail_examples[:2]:
            print(f"    - ...{ex}...")
    
    # What words contain this pattern?
    # Extract all words containing this pattern from successful requests
    success_words = Counter()
    for text in text_train[y == 1]:
        words = text.split()
        for word in words:
            if pattern in word:
                success_words[word] += 1
    
    if success_words:
        print(f"  Common words with '{pattern}' in successes:")
        for word, count in success_words.most_common(5):
            print(f"    - '{word}' (x{count})")