# Evolver Loop 4 Analysis: Understanding exp_003 Failure

**Objective**: Analyze why the enhanced keyword features experiment (exp_003) degraded performance from 0.6253 to 0.6196.

**Key questions**:
1. Why did count-based keywords perform worse than binary indicators?
2. What can we learn about feature engineering effectiveness?
3. What should we try next based on this failure?

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Load data
# JSON text is UTF-8 by specification; the encoding is passed explicitly so
# the read does not depend on the platform's locale default.
train_path = '/home/data/train.json'
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
train_df = pd.DataFrame(train_data)

# Quick sanity check: dataset size, class balance and a peek at the text columns.
print(f"Training samples: {len(train_df)}")
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.3f}")
print("\nFirst few rows:")
print(train_df[['request_title', 'request_text', 'requester_received_pizza']].head())

In [None]:
# Analyze keyword patterns in successful vs failed requests
def extract_keywords(text, keywords):
    """Count whole-word occurrences of each keyword in ``text``.

    Matching is case-insensitive: the text and every keyword are lower-cased
    before comparison, and regex word boundaries are used so that a keyword
    only matches as a complete word (``pay`` does not match ``payment``).

    Args:
        text: The string to search. A missing value (``None`` / NaN, as
            detected by ``pd.isna``) yields a zero count for every keyword.
        keywords: Iterable of keyword strings.

    Returns:
        A dict mapping each keyword, exactly as given, to its occurrence
        count in ``text``.
    """
    # Imported once per call (not once per keyword); kept local so the
    # function works even when the surrounding module never imports ``re``.
    import re

    if pd.isna(text):
        return {kw: 0 for kw in keywords}

    text_lower = text.lower()
    # The keyword is lower-cased too, otherwise a mixed-case keyword could
    # never match the lower-cased text. re.escape keeps any regex
    # metacharacters in a keyword literal.
    return {
        kw: len(re.findall(r'\b' + re.escape(kw.lower()) + r'\b', text_lower))
        for kw in keywords
    }

# exp_003 keyword vocabulary: the original set followed by the newly added terms
original_keywords = ['thanks', 'thank', 'please', 'because', 'pay', 'forward']
new_keywords = ['appreciate', 'grateful', 'children', 'family', 'need', 'help', 'desperate', 'hungry']
all_keywords = [*original_keywords, *new_keywords]

print("Analyzing keyword patterns...")

# Title and body are analysed together as a single text field
title_part = train_df['request_title'].fillna('')
body_part = train_df['request_text'].fillna('')
train_df['full_text'] = title_part + ' ' + body_part

# One record per request: per-keyword counts plus the outcome label
keyword_data = [
    {**extract_keywords(full_text, all_keywords), 'requester_received_pizza': outcome}
    for full_text, outcome in zip(train_df['full_text'], train_df['requester_received_pizza'])
]

keyword_df = pd.DataFrame(keyword_data)

# Success rate of requests that mention a keyword vs. those that do not
print("\n" + "="*60)
print("KEYWORD ANALYSIS: Success rates by presence/absence")
print("="*60)

# The overall positive rate is the same for every keyword row
baseline_rate = train_df['requester_received_pizza'].mean()

results = []
for kw in all_keywords:
    present = keyword_df.loc[keyword_df[kw] > 0]
    absent = keyword_df.loc[keyword_df[kw] == 0]

    # Skip keywords that appear in too few requests to give a stable rate
    if len(present) <= 20:
        continue

    rate_with = present['requester_received_pizza'].mean()
    rate_without = absent['requester_received_pizza'].mean()
    results.append({
        'keyword': kw,
        'present_count': len(present),
        'absent_count': len(absent),
        'success_rate_present': rate_with,
        'success_rate_absent': rate_without,
        'lift': rate_with - rate_without,
        'baseline': baseline_rate
    })

results_df = pd.DataFrame(results).sort_values('lift', ascending=False)
print(results_df.round(4))

In [None]:
# How often does each keyword occur per request, and how does success vary with it?
print("\n" + "="*60)
print("KEYWORD FREQUENCY DISTRIBUTION")
print("="*60)

outcome = keyword_df['requester_received_pizza']

for kw in all_keywords:
    kw_counts = keyword_df[kw]
    freq_dist = kw_counts.value_counts().sort_index()

    share_zero = (kw_counts == 0).mean() * 100
    share_one = (kw_counts == 1).mean() * 100
    share_multi = (kw_counts >= 2).mean() * 100

    print(f"\n{kw.upper()}:")
    print(f"  Mean occurrences: {kw_counts.mean():.3f}")
    print(f"  Max occurrences: {kw_counts.max()}")
    print(f"  % with 0 occurrences: {share_zero:.1f}%")
    print(f"  % with 1 occurrence: {share_one:.1f}%")
    print(f"  % with 2+ occurrences: {share_multi:.1f}%")

    # Per-count breakdown over the five smallest observed counts (zero is skipped)
    for val, count in freq_dist.iloc[:5].items():
        if val <= 0:
            continue
        success_rate = outcome[kw_counts == val].mean()
        print(f"    {val} occurrence(s): {count} samples, {success_rate:.1%} success rate")

# Correlation of the raw keyword count with the outcome label
print("\n" + "="*60)
print("CORRELATION ANALYSIS: Keyword count vs success")
print("="*60)

correlations = [
    {'keyword': kw, 'correlation': keyword_df[kw].corr(outcome)}
    for kw in all_keywords
]

corr_df = pd.DataFrame(correlations).sort_values('correlation', ascending=False)
print(corr_df.round(4))

In [None]:
# Does a presence flag or a raw count track the outcome more closely?
print("\n" + "="*60)
print("BINARY vs COUNT: Which approach works better?")
print("="*60)

# Per keyword: absolute correlation with the label for each encoding
target = keyword_df['requester_received_pizza']
comparison_results = []

for kw in all_keywords:
    occurrences = keyword_df[kw]

    # Presence/absence encoding
    presence_flag = (occurrences > 0).astype(int)
    binary_corr = abs(presence_flag.corr(target))

    # Occurrence-count encoding
    count_corr = abs(occurrences.corr(target))

    # The count encoding is the label unless binary is strictly stronger
    if binary_corr > count_corr:
        winner = "BINARY"
    else:
        winner = "COUNT"

    comparison_results.append({
        'keyword': kw,
        'binary_correlation': binary_corr,
        'count_correlation': count_corr,
        'better_approach': winner,
        'difference': abs(binary_corr - count_corr),
        'mean_occurrences': occurrences.mean()
    })

comparison_df = pd.DataFrame(comparison_results).sort_values('difference', ascending=False)
print(comparison_df.round(4))

# Tally of which encoding won
print(f"\n{'='*60}")
print("SUMMARY: Binary vs Count Performance")
print(f"{'='*60}")
binary_better = comparison_df['better_approach'].eq('BINARY').sum()
count_better = comparison_df['better_approach'].eq('COUNT').sum()
print(f"Keywords where BINARY is better: {binary_better}/{len(all_keywords)}")
print(f"Keywords where COUNT is better: {count_better}/{len(all_keywords)}")

# Split by how common the keyword is, to see whether rarity explains the winner
summary_columns = ['keyword', 'mean_occurrences', 'better_approach']

print(f"\nKeywords with very low frequency (mean < 0.1):")
low_freq = comparison_df[comparison_df['mean_occurrences'] < 0.1]
print(low_freq[summary_columns].round(4))

print(f"\nKeywords with higher frequency (mean >= 0.1):")
high_freq = comparison_df[comparison_df['mean_occurrences'] >= 0.1]
print(high_freq[summary_columns].round(4))

In [None]:
# Analyze text preprocessing issues
# `re` is needed at cell level below; the only other import of it in this
# notebook is local to extract_keywords, so it is imported here explicitly.
import re

print("\n" + "="*60)
print("TEXT PREPROCESSING ANALYSIS")
print("="*60)

# Sample some text to see what patterns exist
sample_texts = train_df['full_text'].sample(5, random_state=42).tolist()

print("Sample raw texts:")
for i, text in enumerate(sample_texts, 1):
    print(f"\n--- Sample {i} ---")
    # Truncate long texts to 300 characters for display
    print(text[:300] + "..." if len(text) > 300 else text)

# Check for Reddit-specific patterns
print("\n" + "="*60)
print("REDDIT PATTERN DETECTION")
print("="*60)

reddit_patterns = {
    'r/ references': r'r/[a-zA-Z0-9_]+',
    'u/ references': r'u/[a-zA-Z0-9_]+',
    'URLs': r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    'Markdown links': r'\[.*?\]\(.*?\)',
    # TLD class is [A-Za-z]; a '|' inside a character class is a literal
    # pipe, not alternation, so it must not appear here.
    'Email addresses': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
}

# Compile once instead of re-resolving every pattern for every text
compiled_patterns = {
    pattern_name: re.compile(pattern_regex)
    for pattern_name, pattern_regex in reddit_patterns.items()
}

pattern_counts = defaultdict(int)     # pattern name -> number of texts with at least one match
pattern_examples = defaultdict(list)  # pattern name -> up to 3 example matches

for text in train_df['full_text']:
    if pd.isna(text):
        continue
    for pattern_name, pattern in compiled_patterns.items():
        matches = pattern.findall(text)
        if matches:
            pattern_counts[pattern_name] += 1
            examples = pattern_examples[pattern_name]
            if len(examples) < 3:
                # Top up to at most three stored examples per pattern
                examples.extend(matches[:3 - len(examples)])

print("Reddit pattern frequencies:")
for pattern_name, count in pattern_counts.items():
    print(f"  {pattern_name}: {count} texts ({count/len(train_df)*100:.1f}%)")
    if pattern_examples[pattern_name]:
        print(f"    Examples: {pattern_examples[pattern_name][:2]}")

# Check for common abbreviations that might affect keyword matching
print("\n" + "="*60)
print("COMMON ABBREVIATIONS")
print("="*60)

# Each abbreviation is listed once so it is not reported twice
abbreviations = ['pls', 'plz', 'thx', 'thnx', 'ty', 'pl', 'pizza', 'thanx']
for abbr in abbreviations:
    # Raw f-string: in a normal string literal '\b' is the backspace
    # character, which would silently disable the word-boundary match.
    boundary_pattern = rf'\b{re.escape(abbr)}\b'
    # Counts the number of texts containing the abbreviation at least once
    count = train_df['full_text'].str.contains(boundary_pattern, case=False, na=False).sum()
    if count > 0:
        print(f"  '{abbr}': {count} occurrences ({count/len(train_df)*100:.1f}%)")