# Experiment 003: Enhanced Keyword Features

**Objective**: Convert binary keyword indicators to COUNT features and add new high-lift keywords

**Expected gain**: +0.02-0.04 AUC based on strategy analysis

**Key improvements**:
- Convert binary keyword indicators to count features (frequency matters)
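- Add new high-lift keywords: 'appreciate', 'appreciation', 'grateful', 'children', 'family', 'need' (on top of the baseline keywords 'thanks', 'thank', 'please', 'because', 'pay', 'forward')
- Create keyword density features (count / (total_word_count + 1), where total_word_count covers title and body)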
- Maintain all safe features from honest baseline

In [None]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Single seed shared by NumPy, the CV splitter and the models so runs are repeatable
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

## Load Data

In [None]:
# Locations of the raw JSON dumps for both splits
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'

# Parse each file into a list of records
with open(train_path, 'r') as f:
    train_data = json.load(f)
with open(test_path, 'r') as f:
    test_data = json.load(f)

# One row per request
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Quick sanity report: split sizes and class balance of the target
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Target distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

## Enhanced Feature Engineering

**Key enhancement**: Convert binary keyword indicators to COUNT features

From the strategy analysis:
- 'forward' shows +0.0689 lift (31.7% vs 24.8% baseline)
- 'need' shows similar lift
- Frequency matters more than binary presence

In [None]:
def count_keyword_occurrences(text, keyword):
    """Return how many times `keyword` appears in `text` as a whole word.

    Matching ignores case and is anchored on word boundaries, so 'thank'
    does not match inside 'thanks'. Missing text (None / NaN) counts as 0.
    """
    if pd.isna(text):
        return 0
    # Escape the keyword so it is matched literally between the boundaries
    whole_word = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
    return sum(1 for _ in whole_word.finditer(text))

def engineer_enhanced_features(df, is_train=True):
    """Build the hand-crafted feature frame (text stats, metadata, keyword counts).

    The edit-aware request text is preferred whenever it is present, so that a
    frame carrying both text columns and a frame carrying only the edit-aware
    one are featurised from the same source column. ``request_text`` is used
    only as a fallback when the edit-aware column is absent.

    Args:
        df: DataFrame of requests. Must provide ``request_title``, at least one
            of ``request_text_edit_aware`` / ``request_text``,
            ``unix_timestamp_of_request``, ``post_was_edited`` and the
            ``requester_*_at_request`` columns read below.
        is_train: Currently unused; kept so existing callers keep working.

    Returns:
        DataFrame sharing ``df``'s index, holding a ``full_text`` column
        (title + body, intended for downstream vectorisation) followed by the
        numeric features.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    features = pd.DataFrame(index=df.index)

    # 1. Text source - prefer the edit-aware column so every frame that has it
    #    (regardless of whether it also has 'request_text') uses the same text
    if 'request_text_edit_aware' in df.columns:
        text_col = 'request_text_edit_aware'
    else:
        text_col = 'request_text'

    title = df['request_title'].fillna('')
    body = df[text_col].fillna('')
    features['full_text'] = title + ' ' + body

    # 2. Text length features (from honest baseline)
    features['text_length'] = body.str.len()
    features['title_length'] = title.str.len()
    features['total_text_length'] = features['text_length'] + features['title_length']

    # Word count features (whitespace-separated tokens)
    features['word_count'] = body.str.split().str.len()
    features['title_word_count'] = title.str.split().str.len()
    features['total_word_count'] = features['word_count'] + features['title_word_count']

    # 3. User activity features (SAFE - at request time only)
    activity_columns = (
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request',
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_in_raop_at_request',
        'requester_number_of_posts_on_raop_at_request',
    )
    for column in activity_columns:
        features[column] = df[column]

    # 4. Temporal features
    temporal_columns = (
        'requester_account_age_in_days_at_request',
        'requester_days_since_first_post_on_raop_at_request',
    )
    for column in temporal_columns:
        features[column] = df[column]

    # Parse the unix timestamp once and derive both calendar features from it
    request_time = pd.to_datetime(df['unix_timestamp_of_request'], unit='s')
    features['hour_of_request'] = request_time.dt.hour
    features['day_of_week'] = request_time.dt.dayofweek

    # 5. Enhanced keyword features - COUNT instead of binary
    # Original keywords from honest baseline
    keywords = ['thanks', 'thank', 'please', 'because', 'pay', 'forward']

    # New high-lift keywords from strategy analysis
    new_keywords = ['appreciate', 'grateful', 'children', 'family', 'need', 'appreciation']
    all_keywords = keywords + new_keywords

    for keyword in all_keywords:
        # Bind the keyword as a default so the lambda never depends on the
        # loop variable's later values
        features[f'keyword_{keyword}_count'] = features['full_text'].apply(
            lambda text, kw=keyword: count_keyword_occurrences(text, kw)
        )

    # 6. Keyword density features: count / (total_word_count + 1)
    #    The +1 keeps the ratio finite for requests with no words at all
    density_denominator = features['total_word_count'] + 1
    for keyword in all_keywords:
        count_col = f'keyword_{keyword}_count'
        features[f'keyword_{keyword}_density'] = features[count_col] / density_denominator

    # 7. Post was edited (binary feature); any value outside the mapping,
    #    including missing values, becomes 0
    features['post_was_edited'] = df['post_was_edited'].map({'True': 1, 'False': 0, True: 1, False: 0}).fillna(0)

    # 8. Additional meta features
    features['requester_number_of_subreddits_at_request'] = df['requester_number_of_subreddits_at_request']

    return features

## TF-IDF Vectorization

Same as honest baseline - 5000 features, unigrams+bigrams

In [None]:
# Build the hand-crafted feature frames first: both the TF-IDF input
# ('full_text') and the meta features below are read from them.
train_features = engineer_enhanced_features(train_df, is_train=True)
test_features = engineer_enhanced_features(test_df, is_train=False)

# Create TF-IDF features from text
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.95
)

# Fit TF-IDF on combined train and test text for consistency
# NOTE(review): vocabulary and IDF weights therefore also see the test text
# (no labels are involved); fit on the training text only if a strictly
# inductive setup is required.
combined_text = pd.concat([
    train_features['full_text'],
    test_features['full_text']
], axis=0)

tfidf.fit(combined_text)

# Transform text to TF-IDF features
train_tfidf = tfidf.transform(train_features['full_text'])
test_tfidf = tfidf.transform(test_features['full_text'])

print(f"TF-IDF features shape: {train_tfidf.shape}")

# Convert TF-IDF to DataFrame (dense); columns are named by position so they
# stay plain identifiers and line up between train and test
tfidf_feature_names = [f'tfidf_{i}' for i in range(train_tfidf.shape[1])]
train_tfidf_df = pd.DataFrame(train_tfidf.toarray(), columns=tfidf_feature_names, index=train_features.index)
test_tfidf_df = pd.DataFrame(test_tfidf.toarray(), columns=tfidf_feature_names, index=test_features.index)

# Combine TF-IDF with meta features (everything except the raw text column)
meta_features = [col for col in train_features.columns if col != 'full_text']
train_features_combined = pd.concat([train_features[meta_features], train_tfidf_df], axis=1)
test_features_combined = pd.concat([test_features[meta_features], test_tfidf_df], axis=1)

print(f"Combined train features shape: {train_features_combined.shape}")
print(f"Combined test features shape: {test_features_combined.shape}")

## Model Training with Cross-Validation

In [None]:
# Design matrix and integer-encoded target
X = train_features_combined
y = train_df['requester_received_pizza'].astype(int)

# Stratified folds keep the class ratio comparable across splits
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

# Out-of-fold predictions, fold-averaged test predictions and per-fold AUCs
fold_scores = []
train_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test_features_combined))

# Hyper-parameters shared by every fold's model
lgb_params = dict(
    random_state=RANDOM_SEED,
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=-1,
)

print(f"Starting {n_splits}-fold cross-validation with ENHANCED keyword features...")
print(f"Tracking {len([col for col in X.columns if 'keyword_' in col])} keyword features\n")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}/{n_splits}")

    # Carve out this fold's training and validation partitions
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    # Negative-to-positive ratio of the training partition, used to reweight positives
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    print(f"  Scale pos weight: {scale_pos_weight:.2f}")

    # Fit LightGBM; training stops once validation AUC stalls for 50 rounds
    model = lgb.LGBMClassifier(scale_pos_weight=scale_pos_weight, **lgb_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Score the held-out partition
    valid_pred = model.predict_proba(X_valid)[:, 1]
    fold_auc = roc_auc_score(y_valid, valid_pred)
    fold_scores.append(fold_auc)

    print(f"  Fold {fold + 1} AUC: {fold_auc:.4f}")

    # Keep the out-of-fold predictions at their original row positions
    train_predictions[valid_idx] = valid_pred

    # Each fold contributes an equal share to the averaged test prediction
    test_pred = model.predict_proba(test_features_combined)[:, 1]
    test_predictions += test_pred / n_splits

# Summarise the per-fold AUCs
cv_mean = np.mean(fold_scores)
cv_std = np.std(fold_scores)
print(f"\n{'='*50}")
print(f"CROSS-VALIDATION RESULTS")
print(f"{'='*50}")
print(f"Fold scores: {[f'{score:.4f}' for score in fold_scores]}")
print(f"Mean AUC: {cv_mean:.4f}")
print(f"Std AUC: {cv_std:.4f}")
print(f"Improvement over baseline: {cv_mean - 0.6253:.4f}")
print(f"{'='*50}")

## Feature Importance Analysis

In [None]:
# Gain-based importances are read from `model`, i.e. the model fitted in the final CV fold
feature_names = X.columns.tolist()
feature_importance = model.booster_.feature_importance(importance_type='gain')

# One row per feature, most important first
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 20 Most Important Features:")
print(importance_df.head(20))

# Split the ranking into TF-IDF columns and everything else
is_tfidf = importance_df['feature'].str.startswith('tfidf_')
tfidf_features = importance_df[is_tfidf]
meta_features = importance_df[~is_tfidf]

print(f"\nFeature type summary:")
print(f"TF-IDF features: {len(tfidf_features)} features, total importance: {tfidf_features['importance'].sum():.2f}")
print(f"Meta features: {len(meta_features)} features, total importance: {meta_features['importance'].sum():.2f}")

# Keyword count/density columns all carry the 'keyword_' marker in their name
is_keyword = importance_df['feature'].str.contains('keyword_')
keyword_features = importance_df[is_keyword]
print(f"\nKeyword features: {len(keyword_features)} features, total importance: {keyword_features['importance'].sum():.2f}")
print("\nTop 10 Keyword Features:")
print(keyword_features.head(10))

## Create Submission

In [None]:
# Pair every test request id with its fold-averaged predicted probability
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Positive rate observed in training, used below as a reference point
target_mean = train_df['requester_received_pizza'].mean()

print("Submission shape:", submission.shape)
print("\nFirst few rows:")
print(submission.head())

# Write the CSV without the index column
submission_path = '/home/submission/submission_003_enhanced_keywords.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Summary statistics of the predicted probabilities
print(f"\nPrediction distribution:")
print(f"Mean: {submission['requester_received_pizza'].mean():.4f}")
print(f"Std: {submission['requester_received_pizza'].std():.4f}")
print(f"Min: {submission['requester_received_pizza'].min():.4f}")
print(f"Max: {submission['requester_received_pizza'].max():.4f}")

# Average prediction next to the training positive rate
print(f"\nTarget distribution in training:")
print(f"Mean (positive rate): {target_mean:.4f}")
print(f"Our prediction mean: {submission['requester_received_pizza'].mean():.4f}")
print(f"Difference: {abs(submission['requester_received_pizza'].mean() - target_mean):.4f}")