# Loop 4 Analysis: Understanding Character N-gram Patterns

**Goal**: Analyze what the top character n-grams actually represent to guide further feature engineering

**Focus**: Map character n-grams back to actual text patterns to understand what's working

In [1]:
import pandas as pd
import numpy as np
import json
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Read the train and test request records (JSON, one record per row)
print("Loading training data...")
train_df, test_df = (
    pd.read_json(path, orient='records')
    for path in ('/home/data/train.json', '/home/data/test.json')
)

# Report dataset dimensions
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Report class balance of the target column
print(f"Target distribution: {train_df['requester_received_pizza'].value_counts().to_dict()}")
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.3f}")

Loading training data...


Train shape: (2878, 32)
Test shape: (1162, 17)
Target distribution: {False: 2163, True: 715}
Positive rate: 0.248


## Analyze Top Character N-grams from exp_003

The evaluator noted these top character n-grams:
- char_ss (likely from "bless", "blessing", "pass"; note that "pizza" contains "zz", not "ss")
- char_f a (likely from "for a", "from a")
- char_e a (likely from "please", "get a", "have a")
- char_thi (likely from "this", "thing")
- char_ere (likely from "here", "there", "where")

Let's map these back to actual text patterns.

In [2]:
# Function to extract character n-grams
def get_char_ngrams(text, n_range=(3,5)):
    """Return all character n-grams of `text` for every size in `n_range`.

    The text is lowercased and each space is replaced by an underscore so
    that n-grams spanning a word boundary remain visible (e.g. 'i_w').

    Args:
        text: String to break into n-grams.
        n_range: Inclusive (min_n, max_n) pair of n-gram lengths.

    Returns:
        A list of n-gram strings, ordered by n-gram length first and by
        position in the text second. Lengths longer than the text
        contribute nothing.
    """
    normalized = text.lower().replace(' ', '_')
    smallest, largest = n_range[0], n_range[1]
    return [
        normalized[start:start + size]
        for size in range(smallest, largest + 1)
        for start in range(len(normalized) - size + 1)
    ]

# Sanity-check the extractor on a few hand-written requests
sample_texts = [
    "I would really appreciate a pizza, I'm so hungry",
    "Please help, my family is struggling",
    "I can pay back next week",
    "Thank you for reading this"
]

# Show each text followed by its first ten n-grams
for text in sample_texts:
    ngrams = get_char_ngrams(text)
    print(f"\nText: {text}", f"Sample n-grams: {ngrams[:10]}", sep="\n")


Text: I would really appreciate a pizza, I'm so hungry
Sample n-grams: ['i_w', '_wo', 'wou', 'oul', 'uld', 'ld_', 'd_r', '_re', 'rea', 'eal']

Text: Please help, my family is struggling
Sample n-grams: ['ple', 'lea', 'eas', 'ase', 'se_', 'e_h', '_he', 'hel', 'elp', 'lp,']

Text: I can pay back next week
Sample n-grams: ['i_c', '_ca', 'can', 'an_', 'n_p', '_pa', 'pay', 'ay_', 'y_b', '_ba']

Text: Thank you for reading this
Sample n-grams: ['tha', 'han', 'ank', 'nk_', 'k_y', '_yo', 'you', 'ou_', 'u_f', '_fo']


In [3]:
# Compare character n-gram usage between granted and denied requests
print("Analyzing character n-gram patterns...")

# Split the request bodies by outcome
successful_texts = train_df.loc[
    train_df['requester_received_pizza'] == 1, 'request_text_edit_aware'
].tolist()
failed_texts = train_df.loc[
    train_df['requester_received_pizza'] == 0, 'request_text_edit_aware'
].tolist()

print(f"Successful texts: {len(successful_texts)}")
print(f"Failed texts: {len(failed_texts)}")

# Work on a capped random subset of each class (the full dataset would be
# too slow); the seed is fixed so both draws are repeatable
sample_size = 500
np.random.seed(42)
successful_sample = np.random.choice(
    successful_texts,
    size=min(sample_size, len(successful_texts)),
    replace=False,
)
failed_sample = np.random.choice(
    failed_texts,
    size=min(sample_size, len(failed_texts)),
    replace=False,
)

# Flatten the n-grams of every sampled text into one list per class
print("\nExtracting n-grams from successful requests...")
success_ngrams = []
for text in successful_sample:
    success_ngrams += get_char_ngrams(text)

print("Extracting n-grams from failed requests...")
failed_ngrams = []
for text in failed_sample:
    failed_ngrams += get_char_ngrams(text)

# Tally how often each n-gram occurs in each class
success_counts = Counter(success_ngrams)
failed_counts = Counter(failed_ngrams)

print(f"Unique n-grams in successful: {len(success_counts)}")
print(f"Unique n-grams in failed: {len(failed_counts)}")

Analyzing character n-gram patterns...
Successful texts: 715
Failed texts: 2163

Extracting n-grams from successful requests...


Extracting n-grams from failed requests...


Unique n-grams in successful: 81229
Unique n-grams in failed: 70838


In [None]:
# Rank n-grams by the share of their occurrences that come from successful requests
print("\n" + "="*60)
print("TOP N-GRAMS BY SUCCESS RATIO")
print("="*60)

# Only keep n-grams seen at least min_count times across both samples
min_count = 5
ratio_columns = ['ngram', 'success', 'failed', 'total', 'success_ratio']
ratios = []

# Iterate in sorted order: a plain set of strings is ordered by per-process
# randomized hashes, which would make the row order differ between kernel runs
for ngram in sorted(success_counts.keys() | failed_counts.keys()):
    success = success_counts.get(ngram, 0)
    failed = failed_counts.get(ngram, 0)
    total = success + failed

    if total >= min_count:
        # Success ratio = proportion of occurrences found in successful texts
        # (total >= min_count > 0, so the division is always safe)
        ratios.append({
            'ngram': ngram,
            'success': success,
            'failed': failed,
            'total': total,
            'success_ratio': success / total
        })

# Passing the columns explicitly keeps the frame well-formed (and sortable)
# even when no n-gram reaches min_count
ratio_df = pd.DataFrame(ratios, columns=ratio_columns)

# Many n-grams tie on the ratio (e.g. exactly 1.0 or 0.0); break ties by
# support, then alphabetically, so the ranking is reproducible
ratio_df = ratio_df.sort_values(
    ['success_ratio', 'total', 'ngram'],
    ascending=[False, False, True],
)

print("\nTop 20 n-grams most associated with SUCCESS:")
print(ratio_df.head(20)[ratio_columns].to_string(index=False))

# Separate view for the failure side: lowest ratio first, and among ties
# the n-grams with the most occurrences first
failure_ranked = ratio_df.sort_values(
    ['success_ratio', 'total', 'ngram'],
    ascending=[True, False, True],
)

print("\n\nTop 20 n-grams most associated with FAILURE:")
print(failure_ranked.head(20)[ratio_columns].to_string(index=False))

In [None]:
# Analyze specific patterns mentioned by evaluator
print("\n" + "="*60)
print("ANALYZING SPECIFIC PATTERNS")
print("="*60)

# Patterns to investigate. Spaces are written as '_' to match the
# normalization applied by get_char_ngrams (so "a p" must be spelled 'a_p').
# Each pattern is listed once.
patterns = ['ss', 'f_a', 'e_a', 'thi', 'ere', '_a_p', 'pl', 'ea', 'zza', 'hun', 'ple', 'for']


def count_pattern_occurrences(texts, pattern):
    """Count overlapping occurrences of `pattern` across all `texts`.

    Each text is lowercased and its spaces replaced with '_', the same
    normalization get_char_ngrams applies. Counting directly in the text
    works for a pattern of any length, including 2-character patterns such
    as 'ss' that the default 3-5 character n-gram extraction never emits.

    Args:
        texts: Iterable of strings.
        pattern: Non-empty substring to look for (with '_' standing for a space).

    Returns:
        Total number of (possibly overlapping) matches.
    """
    occurrences = 0
    for text in texts:
        normalized = text.lower().replace(' ', '_')
        position = normalized.find(pattern)
        while position != -1:
            occurrences += 1
            # Advance by one character so overlapping matches are counted,
            # as they would be by a sliding n-gram window
            position = normalized.find(pattern, position + 1)
    return occurrences


print("\nAnalyzing evaluator's suggested patterns:")
for pattern in patterns:
    success = count_pattern_occurrences(successful_sample, pattern)
    failed = count_pattern_occurrences(failed_sample, pattern)
    total = success + failed
    ratio = success / total if total > 0 else 0
    print(f"  {pattern:6s}: success={success:3d}, failed={failed:3d}, ratio={ratio:.3f}")

# Find n-grams containing 'pizza' or 'please' or 'help'.
# Search the n-grams of BOTH classes so that n-grams seen only in failed
# requests are not dropped; sorted() keeps the order reproducible.
print("\n\nN-grams related to 'pizza', 'please', 'help':")
all_ngrams = sorted(success_counts.keys() | failed_counts.keys())
pizza_related = [n for n in all_ngrams if 'piz' in n or 'zza' in n]
please_related = [n for n in all_ngrams if 'ple' in n or 'eas' in n]
help_related = [n for n in all_ngrams if 'hel' in n]

related_groups = [
    ('Pizza', 'pizza', pizza_related),
    ('Please', 'please', please_related),
    ('Help', 'help', help_related),
]

for title, _, related in related_groups:
    print(f"  {title}-related n-grams: {len(related)} unique")

# Show most common ones (combined count over both classes; ties broken
# alphabetically)
for _, label, related in related_groups:
    if related:
        ranked = sorted(
            ((n, success_counts.get(n, 0) + failed_counts.get(n, 0)) for n in related),
            key=lambda item: (-item[1], item[0]),
        )
        print(f"  Top {label} n-grams: {ranked[:5]}")

In [None]:
# Find examples of texts containing top n-grams
print("\n" + "="*60)
print("EXAMPLE TEXTS WITH TOP N-GRAMS")
print("="*60)

# Get top n-grams by success ratio (with sufficient frequency)
top_ngrams = ratio_df[(ratio_df['total'] >= 10) & (ratio_df['success_ratio'] >= 0.7)].head(10)

# Number of characters of context shown on each side of a match
context_chars = 60

print("\nTop predictive n-grams (>=10 occurrences, >=70% success rate):")
for _, row in top_ngrams.iterrows():
    ngram = row['ngram']
    print(f"\n  N-gram: '{ngram}' (success ratio: {row['success_ratio']:.3f}, count: {row['total']})")

    # Find example texts containing this n-gram
    examples_found = 0
    for text in successful_sample[:100]:  # Check first 100 successful texts
        lowered = text.lower()
        # A substring search on the normalized text is equivalent to testing
        # membership in get_char_ngrams(text), without rebuilding the whole
        # n-gram list for every (n-gram, text) pair
        position = lowered.replace(' ', '_').find(ngram)
        if position == -1:
            continue

        # Show the text AROUND the match rather than the start of the request,
        # so the example reveals which words produce the n-gram. Replacing
        # ' ' with '_' is one-for-one, so `position` also indexes `lowered`.
        start = max(0, position - context_chars)
        end = position + len(ngram) + context_chars
        snippet = ' '.join(lowered[start:end].split())  # collapse newlines/tabs
        prefix = "..." if start > 0 else ""
        suffix = "..." if end < len(lowered) else ""
        print(f"    Example: {prefix}{snippet}{suffix}")

        examples_found += 1
        if examples_found >= 2:  # Show 2 examples max
            break

    if examples_found == 0:
        print("    (No examples found in sample)")

In [None]:
# Analyze what 'ss' pattern represents
print("\n" + "="*60)
print("DEEP DIVE: 'ss' PATTERN ANALYSIS")
print("="*60)


def collect_ss_words(texts):
    """Return every word containing 'ss' found in `texts`.

    Words are lowercased, split on whitespace and stripped of leading and
    trailing punctuation, so that e.g. "bless", "bless." and "(bless" are
    counted as the same word.

    Args:
        texts: Iterable of strings.

    Returns:
        A list of matching words, with repeats, in order of appearance.
    """
    edge_punctuation = '.,!?;:"\'()[]{}*-_'
    found = []
    for text in texts:
        for word in text.lower().split():
            word = word.strip(edge_punctuation)
            if 'ss' in word:
                found.append(word)
    return found


# Find words containing 'ss' in successful and failed texts
ss_words_success = collect_ss_words(successful_sample)
ss_words_failed = collect_ss_words(failed_sample)

ss_success_counts = Counter(ss_words_success)
ss_failed_counts = Counter(ss_words_failed)

print("\nTop words containing 'ss' in SUCCESSFUL requests:")
ss_success_df = pd.DataFrame(ss_success_counts.most_common(20), columns=['word', 'count'])
print(ss_success_df.to_string(index=False))

print("\nTop words containing 'ss' in FAILED requests:")
ss_failed_df = pd.DataFrame(ss_failed_counts.most_common(20), columns=['word', 'count'])
print(ss_failed_df.to_string(index=False))

# Calculate ratio for specific words
print("\n\nSuccess ratio for words containing 'ss':")
# Sorted so that the iteration order does not depend on string hashing
ss_words = sorted(ss_success_counts.keys() | ss_failed_counts.keys())
ss_ratios = []

for word in ss_words:
    success = ss_success_counts.get(word, 0)
    failed = ss_failed_counts.get(word, 0)
    total = success + failed
    if total >= 3:  # At least 3 occurrences
        ss_ratios.append({
            'word': word,
            'success': success,
            'failed': failed,
            'total': total,
            'ratio': success / total
        })

# Explicit columns keep the frame sortable when no word reaches the threshold;
# ties on the ratio are broken by support, then alphabetically
ss_ratio_df = pd.DataFrame(
    ss_ratios, columns=['word', 'success', 'failed', 'total', 'ratio']
).sort_values(['ratio', 'total', 'word'], ascending=[False, False, True])
print(ss_ratio_df.head(15).to_string(index=False))

In [None]:
# Save findings
print("\n" + "="*60)
print("SUMMARY OF KEY FINDINGS")
print("="*60)

findings = []

# Top predictive patterns
top_patterns = ratio_df.head(10)
for _, row in top_patterns.iterrows():
    findings.append(f"Character n-gram '{row['ngram']}' has {row['success_ratio']:.3f} success ratio ({row['success']}/{row['total']} occurrences)")

# Specific insights about 'ss'.
# Test the text itself: get_char_ngrams emits only 3- to 5-character strings
# by default, so checking whether the 2-character 'ss' is one of its outputs
# can never succeed and would always report zero matches.
ss_in_success = sum(1 for text in successful_sample if 'ss' in text.lower())
ss_in_failed = sum(1 for text in failed_sample if 'ss' in text.lower())
ss_total = ss_in_success + ss_in_failed
# Share of the 'ss'-containing sampled requests that were successful
ss_ratio = ss_in_success / ss_total if ss_total > 0 else 0

findings.append(
    f"'ss' pattern appears in {ss_in_success}/{len(successful_sample)} sampled successful "
    f"and {ss_in_failed}/{len(failed_sample)} sampled failed requests "
    f"(successful share of requests containing 'ss': {ss_ratio:.3f})"
)
# 'pizza' is deliberately not listed: it contains 'zz', not 'ss'
findings.append("'ss' pattern likely captures words like 'bless', 'blessing', 'pass', 'blessed'")

# Save to file (explicit UTF-8 so n-grams with non-ASCII characters can be
# written regardless of the platform's default encoding)
with open('/home/code/analysis_findings_loop4.txt', 'w', encoding='utf-8') as f:
    f.write("LOOP 4 ANALYSIS FINDINGS\n")
    f.write("="*60 + "\n\n")
    for i, finding in enumerate(findings, 1):
        f.write(f"{i}. {finding}\n")

print("\nKey findings saved to analysis_findings_loop4.txt")
for i, finding in enumerate(findings, 1):
    print(f"{i}. {finding}")