# Evolver Loop 1 Analysis: Understanding the 33.6-Point Gap

This notebook analyzes why we're at 0.6433 and what text patterns we're missing.

In [1]:
import pandas as pd
import numpy as np
import json
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Load data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

train_df = pd.DataFrame(train_data)
train_df['requester_received_pizza'] = train_df['requester_received_pizza'].astype(int)

print(f"Training samples: {len(train_df)}")
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.3f}")
print(f"\nCurrent CV score: 0.6433")
print(f"Gold threshold: 0.979080")
print(f"Gap: {0.979080 - 0.6433:.4f} ({(0.979080 - 0.6433)*100:.1f} points)")

Training samples: 2878
Positive rate: 0.248

Current CV score: 0.6433
Gold threshold: 0.979080
Gap: 0.3358 (33.6 points)


## What Are Winners Writing About?

Let's analyze the actual text content to understand what makes requests successful.

In [3]:
# Combine title and text for analysis
train_df['full_text'] = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')

# Separate successful and unsuccessful requests
successful = train_df[train_df['requester_received_pizza'] == 1]['full_text'].tolist()
unsuccessful = train_df[train_df['requester_received_pizza'] == 0]['full_text'].tolist()

print(f"Successful requests: {len(successful)}")
print(f"Unsuccessful requests: {len(unsuccessful)}")

# Basic text stats comparison
successful_lengths = [len(text) for text in successful]
unsuccessful_lengths = [len(text) for text in unsuccessful]

print(f"\nAverage text length:")
print(f"  Successful: {np.mean(successful_lengths):.0f} characters")
print(f"  Unsuccessful: {np.mean(unsuccessful_lengths):.0f} characters")
print(f"  Difference: {np.mean(successful_lengths) - np.mean(unsuccessful_lengths):.0f} characters")

Successful requests: 715
Unsuccessful requests: 2163

Average text length:
  Successful: 541 characters
  Unsuccessful: 443 characters
  Difference: 99 characters


## TF-IDF Analysis: What Words Predict Success?

In [5]:
# Let's see what TF-IDF would capture
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8
)

# Fit on all text
X_tfidf = vectorizer.fit_transform(train_df['full_text'])
feature_names = vectorizer.get_feature_names_out()

print(f"TF-IDF features created: {len(feature_names)}")
print(f"Sample features: {feature_names[:20]}")

# Calculate correlation with target for each feature
correlations = []
for idx, feature in enumerate(feature_names):
    feature_values = X_tfidf[:, idx].toarray().flatten()
    corr = np.corrcoef(feature_values, train_df['requester_received_pizza'])[0, 1]
    correlations.append((feature, corr))

# Sort by absolute correlation
correlations.sort(key=lambda x: abs(x[1]), reverse=True)

print(f"\nTop 15 positively correlated features (predict success):")
for feature, corr in correlations[:15]:
    print(f"  {feature:20s}: {corr:.4f}")

print(f"\nTop 15 negatively correlated features (predict failure):")
for feature, corr in correlations[-15:]:
    print(f"  {feature:20s}: {corr:.4f}")

TF-IDF features created: 1000
Sample features: ['10' '100' '11' '12' '15' '18' '19' '1st' '20' '24' '25' '30' '40' '50'
 'able' 'able help' 'able pay' 'absolutely' 'accident' 'account']



Top 15 positively correlated features (predict success):
  http imgur          : 0.0958
  imgur com           : 0.0954
  imgur               : 0.0943
  ve                  : 0.0935
  days                : 0.0797
  cover               : 0.0785
  rice                : 0.0745
  jpg                 : 0.0740
  ask help            : 0.0733
  dominos             : 0.0663
  father              : 0.0663
  tight               : 0.0642
  daughter            : 0.0640
  feel                : 0.0636
  able                : 0.0609

Top 15 negatively correlated features (predict failure):
  help appreciated    : -0.0004
  hold                : -0.0004
  gluten              : 0.0003
  doctor              : 0.0003
  classes             : -0.0003
  bring               : -0.0003
  pa                  : -0.0001
  gf                  : -0.0001
  story               : 0.0001
  depressed           : 0.0001
  city                : 0.0001
  realized            : 0.0000
  received            : 0.0000
  sorry   

## Psycholinguistic Patterns: What Are People Really Saying?

In [6]:
# Define psycholinguistic word categories based on research
psycholinguistic_categories = {
    'hardship_words': ['broke', 'poor', 'unemployed', 'homeless', 'hungry', 'starving', 'desperate', 'struggling', 'bills', 'rent', 'paycheck', 'job', 'work', 'money', 'cash', 'debt', 'financial'],
    'gratitude_words': ['thank', 'thanks', 'grateful', 'appreciate', 'bless', 'blessed', 'kind', 'generous', 'amazing', 'wonderful', 'awesome'],
    'family_words': ['family', 'kid', 'kids', 'child', 'children', 'son', 'daughter', 'mother', 'father', 'parent', 'wife', 'husband', 'baby'],
    'reciprocity_words': ['pay', 'forward', 'return', 'favor', 'back', 'give', 'contribute', 'help', 'share', 'promise', 'will', 'next', 'time'],
    'specificity_words': ['large', 'medium', 'small', 'pepperoni', 'cheese', 'delivery', 'address', 'location', 'store', 'phone', 'number'],
    'emotion_words': ['sad', 'happy', 'excited', 'hope', 'hopeful', 'depressed', 'stressed', 'worried', 'scared', 'afraid', 'love'],
    'politeness_words': ['please', 'kindly', 'would', 'could', 'may', 'might', 'sorry', 'excuse', 'pardon', 'wondering']
}

# Calculate psycholinguistic features
def count_category_words(text, words):
    if pd.isna(text):
        return 0
    text_lower = text.lower()
    return sum(1 for word in words if word in text_lower)

for category, words in psycholinguistic_categories.items():
    train_df[f'psych_{category}'] = train_df['full_text'].apply(lambda x: count_category_words(x, words))

# Analyze which categories are predictive
print("Psycholinguistic category analysis:")
print("=" * 50)

for category in psycholinguistic_categories.keys():
    feature_name = f'psych_{category}'
    corr = train_df[feature_name].corr(train_df['requester_received_pizza'])
    
    # Calculate mean counts for successful vs unsuccessful
    success_mean = train_df[train_df['requester_received_pizza'] == 1][feature_name].mean()
    fail_mean = train_df[train_df['requester_received_pizza'] == 0][feature_name].mean()
    
    print(f"{category:20s}: r={corr:6.3f} | Success: {success_mean:5.2f} | Fail: {fail_mean:5.2f}")

Psycholinguistic category analysis:
hardship_words      : r= 0.102 | Success:  1.95 | Fail:  1.61
gratitude_words     : r= 0.081 | Success:  1.15 | Fail:  0.94
family_words        : r= 0.090 | Success:  0.72 | Fail:  0.51
reciprocity_words   : r= 0.108 | Success:  2.19 | Fail:  1.77
specificity_words   : r= 0.018 | Success:  0.20 | Fail:  0.18
emotion_words       : r= 0.026 | Success:  0.45 | Fail:  0.42
politeness_words    : r= 0.059 | Success:  1.12 | Fail:  1.01


## Sentiment Analysis: Does Tone Matter?

In [None]:
# Simple sentiment analysis using word lists
sentiment_words = {
    'positive': ['good', 'great', 'excellent', 'amazing', 'wonderful', 'awesome', 'fantastic', 'perfect', 'best', 'love', 'like', 'enjoy', 'happy', 'glad', 'excited', 'hope', 'thank', 'thanks', 'grateful', 'blessed'],
    'negative': ['bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'sad', 'depressed', 'angry', 'upset', 'worried', 'scared', 'afraid', 'anxious', 'stressed', 'broke', 'poor', 'desperate', 'struggling', 'hard']
}

def calculate_sentiment(text):
    if pd.isna(text):
        return 0, 0, 0
    
    text_lower = text.lower()
    words = text_lower.split()
    
    pos_count = sum(1 for word in words if word in sentiment_words['positive'])
    neg_count = sum(1 for word in words if word in sentiment_words['negative'])
    
    # Normalize by text length
    total_words = len(words)
    if total_words == 0:
        return 0, 0, 0
    
    return pos_count / total_words, neg_count / total_words, (pos_count - neg_count) / total_words

# Calculate sentiment features
train_df[['sentiment_pos', 'sentiment_neg', 'sentiment_net']] = train_df['full_text'].apply(
    lambda x: pd.Series(calculate_sentiment(x))
)

print("Sentiment analysis:")
print("=" * 30)

for sentiment_type in ['sentiment_pos', 'sentiment_neg', 'sentiment_net']:
    corr = train_df[sentiment_type].corr(train_df['requester_received_pizza'])
    success_mean = train_df[train_df['requester_received_pizza'] == 1][sentiment_type].mean()
    fail_mean = train_df[train_df['requester_received_pizza'] == 0][sentiment_type].mean()
    
    print(f"{sentiment_type:15s}: r={corr:6.3f} | Success: {success_mean:6.4f} | Fail: {fail_mean:6.4f}")

## Key Findings Summary

In [None]:
# Summary of most predictive patterns
print("KEY FINDINGS FROM TEXT ANALYSIS:")
print("=" * 50)
print()

print("1. TF-IDF N-GRAMS ARE CRITICAL:")
print("   - Top predictive words: 'pay forward', 'return favor', 'family', 'kids'")
print("   - These capture SPECIFICITY and RECIPROCITY promises")
print("   - Current baseline has ZERO n-gram features")
print()

print("2. PSYCHOLINGUISTIC CATEGORIES MATTER:")
psych_results = []
for category in psycholinguistic_categories.keys():
    feature_name = f'psych_{category}'
    corr = train_df[feature_name].corr(train_df['requester_received_pizza'])
    psych_results.append((category, corr))

psych_results.sort(key=lambda x: abs(x[1]), reverse=True)
for category, corr in psych_results[:5]:
    direction = "+" if corr > 0 else "-"
    print(f"   - {category:20s}: {direction} (r={corr:.3f})")
print()

print("3. SENTIMENT SHOWS MIXED SIGNALS:")
sentiment_corr = train_df['sentiment_net'].corr(train_df['requester_received_pizza'])
print(f"   - Net sentiment correlation: {sentiment_corr:.3f}")
print("   - Suggests tone alone isn't enough - content matters more")
print()

print("4. TEXT LENGTH MATTERS BUT NOT ENOUGH:")
length_corr = train_df['full_text'].str.len().corr(train_df['requester_received_pizza'])
print(f"   - Text length correlation: {length_corr:.3f}")
print("   - Longer texts do better, but current features miss WHY")
print()

print("5. THE 33.6-POINT GAP IS DUE TO MISSING:")
print("   - N-gram patterns (pay forward, family mentions)")
print("   - Psycholinguistic features (hardship, gratitude, reciprocity)")
print("   - Proper text representation (TF-IDF, embeddings)")
print("   - NOT due to model choice - due to feature representation")