# Experiment 002: Linguistic Features from Stanford Research

This notebook implements linguistic features based on the Stanford ICWSM 2014 paper that analyzes this exact dataset.

**Features engineered:**
- Gratitude indicators (thank, thanks, grateful, appreciate)
- Evidentiality markers (URLs, numbers, evidence words)
- Reciprocity language (pay it forward, return the favor, etc.)
- Narrative indicators (length, pronouns, sentence count)
- Politeness markers (please, polite phrasing)
- Interaction features derived from the metadata

**Expected improvement:** +0.05 to +0.10 AUC over baseline (target: ~0.72-0.75)

In [None]:
import json
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load data: read the train and test JSON files from disk
print("Loading data...")
with open('/home/data/train.json', 'r') as train_file:
    train_data = json.load(train_file)

with open('/home/data/test.json', 'r') as test_file:
    test_data = json.load(test_file)

# Build one DataFrame per split from the loaded JSON
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Summarise the target column of the training split
target_col = train_df['requester_received_pizza']
print(f"Target distribution: {target_col.value_counts().to_dict()}")
print(f"Positive rate: {target_col.mean():.3f}")

In [None]:
# Define meta features (same as baseline), grouped by what they describe

# User activity (at_request only)
activity_cols = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
]

# Vote counts (at_request only)
vote_cols = [
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
]

# Temporal features
timestamp_cols = [
    'unix_timestamp_of_request',
    'unix_timestamp_of_request_utc',
]

# Account age
account_age_cols = [
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
]

# Final list keeps the group order above
meta_features = activity_cols + vote_cols + timestamp_cols + account_age_cols

print(f"Using {len(meta_features)} meta features")

In [None]:
# Engineer temporal features (same as baseline)
print("Engineering temporal features...")

# Derive the same three columns on each split: the parsed timestamp,
# then its hour and its day of week.
for split_df in (train_df, test_df):
    split_df['request_datetime'] = pd.to_datetime(split_df['unix_timestamp_of_request_utc'], unit='s')
    split_df['request_hour'] = split_df['request_datetime'].dt.hour
    split_df['request_dayofweek'] = split_df['request_datetime'].dt.dayofweek

# Start the running feature list: meta features followed by the engineered ones
engineered_features = ['request_hour', 'request_dayofweek']
all_features = [*meta_features, *engineered_features]

print(f"Total features so far: {len(all_features)}")

In [None]:
# ENGINEER LINGUISTIC FEATURES BASED ON STANFORD RESEARCH
print("Engineering linguistic features from Stanford research...")

def engineer_linguistic_features(df):
    """Add keyword- and regex-based text features to ``df`` and return it.

    The title (``request_title``) and body (``request_text_edit_aware``) are
    joined with a single space into ``combined_text`` (missing values are
    treated as empty strings). All features except the two per-field lengths
    are computed from that combined text.

    Columns written:

    - ``combined_text``: title + ' ' + body.
    - ``gratitude_count``, ``evidence_word_count``, ``reciprocity_count`` and
      ``narrative_<category>``: how many entries of the corresponding keyword
      list occur at least once as a case-insensitive *substring*. Each value
      is therefore bounded by the list length, and e.g. "thanks" hits both
      'thank' and 'thanks'.
    - ``has_url``: 1 if the text contains http, www, .com, .org or .net
      (case-insensitive), else 0.
    - ``number_count``: number of digit runs.
    - ``title_length``, ``text_length``, ``total_length``: character counts.
    - ``first_person_pronoun_count``: total number of whole-word occurrences
      of i / me / my / we / our / us.
    - ``sentence_count``: number of runs of '.', '!' or '?'.
    - ``paragraph_breaks``: number of double newlines.
    - ``please_count``: whole-word occurrences of "please".

    Args:
        df: DataFrame with ``request_title`` and ``request_text_edit_aware``
            columns.

    Returns:
        The same DataFrame object, modified in place with the columns above.
    """
    title = df['request_title'].fillna('')
    body = df['request_text_edit_aware'].fillna('')

    # Combine title and text for analysis
    df['combined_text'] = title + ' ' + body

    # Lower-case once and reuse it for every case-insensitive feature
    text_lower = df['combined_text'].str.lower()

    def count_listed_terms(terms):
        # Number of list entries that appear (as substrings) in each text
        return text_lower.apply(lambda text: sum(1 for term in terms if term in text))

    # 1. GRATITUDE INDICATORS
    gratitude_words = ['thank', 'thanks', 'grateful', 'appreciate', 'appreciation', 'gratitude']
    df['gratitude_count'] = count_listed_terms(gratitude_words)

    # 2. EVIDENTIALITY MARKERS
    # URLs, numbers, specific details
    # Raw string: '\.' in a normal string literal is an invalid escape sequence
    df['has_url'] = df['combined_text'].str.contains(
        r'http|www|\.com|\.org|\.net', case=False, regex=True
    ).astype(int)
    df['number_count'] = df['combined_text'].str.count(r'\d+')

    evidence_words = ['proof', 'photo', 'picture', 'link', 'show', 'demonstrate', 'evidence']
    df['evidence_word_count'] = count_listed_terms(evidence_words)

    # 3. RECIPROCITY LANGUAGE
    reciprocity_phrases = [
        'pay it forward', 'return the favor', 'help others', 'give back',
        'help someone else', 'pass it on', 'when i can', 'once i get'
    ]
    df['reciprocity_count'] = count_listed_terms(reciprocity_phrases)

    # 4. NARRATIVE INDICATORS
    # Length features
    df['title_length'] = title.str.len()
    df['text_length'] = body.str.len()
    df['total_length'] = df['title_length'] + df['text_length']

    # First-person pronoun count: count every whole-word occurrence.
    # (Testing each pronoun once with re.search would only say how many
    # different pronouns are present, capping the feature at 6.)
    first_person_pronouns = ['i', 'me', 'my', 'we', 'our', 'us']
    pronoun_pattern = r'\b(?:' + '|'.join(first_person_pronouns) + r')\b'
    df['first_person_pronoun_count'] = text_lower.str.count(pronoun_pattern)

    # Sentence count (periods, exclamation marks, question marks)
    df['sentence_count'] = df['combined_text'].str.count(r'[.!?]+')

    # Paragraph breaks (double newlines)
    df['paragraph_breaks'] = df['combined_text'].str.count(r'\n\n')

    # 5. POLITENESS MARKERS
    df['please_count'] = text_lower.str.count(r'\bplease\b')

    # 6. NARRATIVE CATEGORY (simple keyword-based classification)
    # Based on Stanford's 5 narrative types: desire, family, job, money, student
    narrative_keywords = {
        'family': ['family', 'kid', 'child', 'children', 'mom', 'dad', 'parent', 'brother', 'sister', 'wife', 'husband'],
        'job': ['job', 'work', 'employ', 'unemployed', 'laid off', 'fired', 'interview', 'hire'],
        'money': ['money', 'broke', 'poor', 'bills', 'rent', 'paycheck', 'debt', 'financial'],
        'student': ['student', 'school', 'college', 'university', 'class', 'tuition', 'textbook', 'dorm'],
        'desire': ['craving', 'want', 'wish', 'hope', 'desire', 'hungry', 'starving', 'appetite']
    }

    for category, keywords in narrative_keywords.items():
        df[f'narrative_{category}'] = count_listed_terms(keywords)

    return df

# Apply to both train and test
train_df = engineer_linguistic_features(train_df)
test_df = engineer_linguistic_features(test_df)

# Names of the linguistic columns that go into the model
linguistic_features = [
    'gratitude_count', 'has_url', 'number_count', 'evidence_word_count', 'reciprocity_count',
    'title_length', 'text_length', 'total_length', 'first_person_pronoun_count',
    'sentence_count', 'paragraph_breaks', 'please_count',
    'narrative_family', 'narrative_job', 'narrative_money', 'narrative_student', 'narrative_desire'
]

# Register only names that are not in the list yet, so re-running this cell
# does not append duplicate column names to all_features.
all_features.extend(
    [name for name in linguistic_features if name not in all_features]
)

print(f"Added {len(linguistic_features)} linguistic features")
print(f"Total features: {len(all_features)}")

In [None]:
# ENGINEER INTERACTION FEATURES
print("Engineering interaction features...")

def engineer_interaction_features(df):
    """Add ratio and composite features built from the requester meta columns.

    Writes seven columns onto ``df`` (in place) and returns the same object:
    ``comments_per_day``, ``posts_per_day``, ``comment_to_post_ratio``,
    ``activity_score``, ``subreddit_diversity``, ``raop_experience`` and
    ``vote_efficiency``. Every denominator has 1 added to it.
    """
    comments = df['requester_number_of_comments_at_request']
    posts = df['requester_number_of_posts_at_request']
    subreddits = df['requester_number_of_subreddits_at_request']
    net_votes = df['requester_upvotes_minus_downvotes_at_request']
    total_votes = df['requester_upvotes_plus_downvotes_at_request']
    raop_days = df['requester_days_since_first_post_on_raop_at_request']

    # Shared denominator for the per-day and RAOP ratios
    age_plus_one = df['requester_account_age_in_days_at_request'] + 1

    # Activity per day of account age
    df['comments_per_day'] = comments / age_plus_one
    df['posts_per_day'] = posts / age_plus_one

    # Comments relative to posts
    df['comment_to_post_ratio'] = comments / (posts + 1)

    # Weighted blend of comments, posts and net votes
    df['activity_score'] = comments * 0.3 + posts * 0.5 + net_votes * 0.2

    # Subreddits visited relative to total comments + posts
    df['subreddit_diversity'] = subreddits / (comments + posts + 1)

    # Days since first RAOP post relative to account age
    df['raop_experience'] = raop_days / age_plus_one

    # Net votes relative to total votes
    df['vote_efficiency'] = net_votes / (total_votes + 1)

    return df

# Apply to both train and test
train_df = engineer_interaction_features(train_df)
test_df = engineer_interaction_features(test_df)

# Names of the interaction columns that go into the model
interaction_features = [
    'comments_per_day', 'posts_per_day', 'comment_to_post_ratio',
    'activity_score', 'subreddit_diversity', 'raop_experience', 'vote_efficiency'
]

# Register only names that are not in the list yet, so re-running this cell
# does not append duplicate column names to all_features.
all_features.extend(
    [name for name in interaction_features if name not in all_features]
)

print(f"Added {len(interaction_features)} interaction features")
print(f"Total features: {len(all_features)}")

In [None]:
# Prepare data for modeling
print("Preparing data for modeling...")

# Feature matrices are copies, so filling them leaves train_df / test_df untouched
X = train_df[all_features].copy()
y = train_df['requester_received_pizza'].astype(int).values
X_test = test_df[all_features].copy()

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")

# Handle any missing values (fill with median)
# Medians come from the training split only and are applied to both splits.
# The result is reassigned on purpose: `X[col].fillna(..., inplace=True)` is a
# chained in-place call that, under pandas Copy-on-Write, acts on a temporary
# and leaves X unchanged.
train_medians = X.median()
X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print(f"Missing values in training: {X.isnull().sum().sum()}")
print(f"Missing values in test: {X_test.isnull().sum().sum()}")

In [None]:
# Cross-validation setup
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Model parameters (same as baseline for fair comparison)
# NOTE: 'class_weight' is an argument of the scikit-learn wrapper
# (LGBMClassifier), not a parameter of the native lgb.train API these params
# are passed to, so it did not rebalance the classes. It is left out here,
# which keeps the effective settings unchanged. To actually reweight the
# classes with the native API use 'is_unbalance': True or 'scale_pos_weight',
# and apply the same change to the baseline so the comparison stays fair.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

print(f"Training with {n_folds}-fold stratified CV...")
print(f"Total features: {len(all_features)}")
print(f"Model parameters: {params}")

In [None]:
# Train with cross-validation
# Per fold: fit on the training indices, early-stop on the held-out fold,
# store out-of-fold predictions and add this fold's share of the test average.
fold_scores = []
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_folds}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create LightGBM datasets
    # Named lgb_* so the raw `train_data` records loaded at the top of the
    # notebook are not overwritten by a Dataset object.
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val)

    # Train model
    # NOTE: the fold used for early stopping is also the fold that is scored
    # below, so the reported fold AUCs may be somewhat optimistic.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_val],
        valid_names=['val'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(0)
        ]
    )

    # Predictions at the early-stopped iteration
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Store predictions: OOF slots for this fold, running mean for test
    oof_predictions[val_idx] = val_pred
    test_predictions += test_pred / n_folds

    # Calculate fold score
    fold_score = roc_auc_score(y_val, val_pred)
    fold_scores.append(fold_score)
    print(f"Fold {fold + 1} ROC-AUC: {fold_score:.4f}")

# Overall CV score (mean and std of the per-fold AUCs)
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)
print(f"\n{'='*50}")
print(f"Cross-Validation ROC-AUC: {cv_score:.4f} ± {cv_std:.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in fold_scores]}")

# OOF score (single AUC over all out-of-fold predictions)
oof_score = roc_auc_score(y, oof_predictions)
print(f"OOF ROC-AUC: {oof_score:.4f}")

# Compare with baseline
baseline_score = 0.6691
improvement = cv_score - baseline_score
print(f"Improvement over baseline: {improvement:+.4f}")

In [None]:
# Feature importance analysis
# `model` is whatever the training loop left bound, i.e. the last fold's model.
feature_names = X.columns
feature_importance = model.feature_importance(importance_type='gain')
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values('importance', ascending=False)
)

print("Top 20 features by importance:")
print(importance_df.head(20).to_string(index=False))

# Categorize features by type: summed gain per feature group
is_meta = importance_df['feature'].isin(meta_features + engineered_features)
is_linguistic = importance_df['feature'].isin(linguistic_features)
is_interaction = importance_df['feature'].isin(interaction_features)

meta_importance = importance_df.loc[is_meta, 'importance'].sum()
linguistic_importance = importance_df.loc[is_linguistic, 'importance'].sum()
interaction_importance = importance_df.loc[is_interaction, 'importance'].sum()

# Denominator for the percentage shares
total_importance = importance_df['importance'].sum()

print(f"\n{'='*50}")
print("Feature importance by category:")
for label, group_gain in (
    ("Meta features", meta_importance),
    ("Linguistic features", linguistic_importance),
    ("Interaction features", interaction_importance),
):
    print(f"{label}: {group_gain:.2f} ({group_gain/total_importance*100:.1f}%)")

In [None]:
# Save predictions for submission
submission_path = '/home/submission/submission_002_linguistic_features.csv'
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})
submission_df.to_csv(submission_path, index=False)

divider = '=' * 50

print(f"\n{divider}")
print(f"Submission saved to: {submission_path}")
print(f"Submission shape: {submission_df.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")

# Summary: feature counts per group and the headline CV result
n_meta = len(meta_features) + len(engineered_features)
n_linguistic = len(linguistic_features)
n_interaction = len(interaction_features)

print(f"\n{divider}")
print("EXPERIMENT SUMMARY:")
print("Model: LightGBM with linguistic features")
print(f"Features: {len(all_features)} total")
print(f"  - Meta features: {n_meta}")
print(f"  - Linguistic features: {n_linguistic}")
print(f"  - Interaction features: {n_interaction}")
print(f"CV ROC-AUC: {cv_score:.4f} ± {cv_std:.4f}")
print(f"Improvement over baseline: {improvement:+.4f}")