# Experiment 003: Safe Flair Handling + Temporal Features

This experiment focuses on the highest priority from the strategy: SAFE FLAIR HANDLING.

**Strategy:**
- Target encode user flair with aggressive smoothing to prevent overfitting
- Add temporal features (hour, day of week, month) with cyclical encoding
- Use Reddit-aware text preprocessing for keyword features
- Combine with existing tabular features
- Use 5-fold stratified CV
- LightGBM for training

**Expected improvements:**
- Capture the extremely predictive flair signal safely
- Capture temporal patterns that are present in the data
- Better prediction distribution and confidence

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder
import lightgbm as lgb
import re
from datetime import datetime
import warnings
# Suppress all warnings so library chatter does not clutter the notebook output.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility.
# RANDOM_SEED is reused below for the CV splitter and the LightGBM `seed` parameter.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Load Data

In [None]:
# Load the raw train / test records and wrap them in DataFrames.
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'

# Pass the encoding explicitly: without it `open` falls back to the platform's
# locale encoding, which can mis-decode (or fail on) non-ASCII post text.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Training columns: {train_df.shape[1]}")
print(f"Test columns: {test_df.shape[1]}")

# Check flair distribution. Guarded because the flair column is optional:
# the flair encoder further down also tolerates it being absent.
if 'requester_user_flair' in train_df.columns:
    print("\nFlair distribution in training data:")
    print(train_df['requester_user_flair'].value_counts().head(10))
else:
    print("\nrequester_user_flair not found in training data")

## Reddit-Aware Text Preprocessing

In [None]:
def reddit_aware_preprocess(text):
    """Normalise a Reddit post into lowercase alphanumeric tokens.

    Steps, in order: lowercase; replace URLs, user mentions and subreddit
    mentions with placeholder words; unwrap markdown emphasis; replace every
    remaining non-alphanumeric character with a space; collapse whitespace.

    Args:
        text: Raw text. Missing values (None / NaN) and '' yield ''.

    Returns:
        The cleaned string: tokens made of [a-z0-9] separated by single
        spaces. URLs become ``url``, ``u/name`` becomes ``usermention`` and
        ``r/name`` becomes ``submention``.
    """
    if pd.isna(text) or text == '':
        return ''

    text = str(text).lower()

    # Placeholders are lowercase and underscore-free so that they survive the
    # final [^a-z0-9\s] cleanup (uppercase / underscored tokens would be wiped
    # out by it). 'submention' deliberately avoids embedding the word
    # 'subreddit', which contains the substring 'edit' and would be picked up
    # by naive substring keyword checks.

    # URLs go first so that paths such as .../r/pizza or .../u/name inside a
    # link are not mistaken for mentions.
    text = re.sub(r'https?://\S+', ' url ', text)

    # Mentions must start at a word boundary; otherwise ordinary text such as
    # "you/me" or "or/and" would be truncated to "yo" / "o".
    text = re.sub(r'\bu/\w+', ' usermention ', text)
    text = re.sub(r'\br/\w+', ' submention ', text)

    # Markdown emphasis: keep the wrapped words, drop the markers.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # **bold**
    text = re.sub(r'\*(.+?)\*', r'\1', text)      # *italic*
    text = re.sub(r'~~(.+?)~~', r'\1', text)      # ~~strikethrough~~

    # Everything that is not a lowercase letter, digit or whitespace becomes a space.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    # Collapse runs of whitespace and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()

# Combine request text and title.
# Use the same body field for both frames. Preferring 'request_text' for train
# while test uses 'request_text_edit_aware' would build the two splits from
# different text. NOTE(review): the edit-aware field is understood to strip
# after-the-fact "EDIT: got my pizza" style additions -- confirm against the
# dataset description.
text_col = (
    'request_text_edit_aware'
    if 'request_text_edit_aware' in train_df.columns
    else 'request_text'
)
test_text_col = (
    'request_text_edit_aware'
    if 'request_text_edit_aware' in test_df.columns
    else 'request_text'
)

train_df['combined_text'] = train_df[text_col].fillna('') + ' ' + train_df['request_title'].fillna('')
test_df['combined_text'] = test_df[test_text_col].fillna('') + ' ' + test_df['request_title'].fillna('')

# Apply preprocessing
train_df['combined_text_clean'] = train_df['combined_text'].apply(reddit_aware_preprocess)
test_df['combined_text_clean'] = test_df['combined_text'].apply(reddit_aware_preprocess)

print("Reddit-aware text preprocessing completed")
print(f"Sample cleaned text: {train_df['combined_text_clean'].iloc[0][:100]}...")

## Extract High-Impact Keyword Features

In [None]:
def extract_keyword_features(df, text_col):
    """Build keyword-group and length features from a text column.

    For each keyword group the output has:
      * ``keyword_<group>_count`` -- how many distinct keywords of the group
        appear in the text (not the number of occurrences);
      * ``has_<group>_keyword``   -- boolean, True when that count is > 0.
    Plus ``text_length`` (characters) and ``word_count`` (whitespace tokens).

    Keywords are matched case-insensitively at the START of a word, so stems
    still match inflections ('thank' -> 'thanks', 'edit' -> 'edited') but no
    longer fire inside unrelated words ('edit' in 'reddit' / 'credit',
    'work' in 'network').

    Args:
        df: DataFrame holding the text.
        text_col: Name of the text column in ``df``.

    Returns:
        DataFrame indexed like ``df`` with the columns described above.
    """
    # Keywords from EDA that indicate need/urgency, grouped by theme.
    keyword_groups = [
        ('need', ['hungry', 'broke', 'starving', 'desperate', 'struggling']),
        ('student', ['student', 'college', 'university', 'tuition', 'loan']),
        ('family', ['family', 'kids', 'children', 'baby', 'mother', 'father', 'parent']),
        ('job', ['job', 'work', 'unemployed', 'laid off', 'fired', 'paycheck']),
        ('gratitude', ['thank', 'appreciate', 'grateful', 'bless', 'kind']),
        ('edit', ['edit', 'update']),
    ]

    features = pd.DataFrame(index=df.index)

    # Lowercase once instead of once per keyword group.
    lowered = df[text_col].apply(lambda value: str(value).lower())

    # Number of distinct keywords of each group present in the text.
    for prefix, keywords in keyword_groups:
        patterns = [re.compile(r'\b' + re.escape(keyword)) for keyword in keywords]
        features[f'keyword_{prefix}_count'] = lowered.apply(
            lambda value, patterns=patterns: sum(
                1 for pattern in patterns if pattern.search(value)
            )
        )

    # Binary flags, added after all counts to keep the original column order.
    for prefix, _ in keyword_groups:
        features[f'has_{prefix}_keyword'] = features[f'keyword_{prefix}_count'] > 0

    # Text length features
    features['text_length'] = df[text_col].str.len()
    features['word_count'] = df[text_col].apply(lambda x: len(str(x).split()))

    return features

# Build the keyword features for both splits from the cleaned text column.
train_keyword_features, test_keyword_features = [
    extract_keyword_features(frame, 'combined_text_clean')
    for frame in (train_df, test_df)
]

print(f"Keyword features shape: {train_keyword_features.shape}")
print("Sample keyword features:", train_keyword_features.head(), sep="\n")

## Safe Flair Target Encoding

In [None]:
def safe_target_encode_flair(train_df, test_df, target_col, flair_col='requester_user_flair',
                             n_splits=5, random_state=42):
    """Target encode user flair without letting a row see its own label.

    Training rows receive OUT-OF-FOLD encodings: the encoder for each fold is
    fitted on the other folds only. Fitting and transforming the same rows
    (the previous behaviour) bakes each row's own target into its feature,
    which inflates validation scores. Test rows are encoded by an encoder
    fitted on the full training set.

    If ``flair_col`` is missing from either frame, neutral features are
    returned instead: the global target mean and no "perfect" flairs.

    Args:
        train_df: Training frame; must contain ``target_col``.
        test_df: Test frame.
        target_col: Name of the target column in ``train_df``.
        flair_col: Name of the flair column.
        n_splits: Number of stratified folds for the out-of-fold encoding.
        random_state: Seed for the fold shuffling.

    Returns:
        ``(train_encoded, test_encoded)``: DataFrames indexed like their
        inputs with columns ``flair_target_encoded`` and ``has_perfect_flair``.
    """
    # Neutral fallback when the flair column is absent from either frame.
    missing_in = [
        split_name
        for split_name, frame in (('train', train_df), ('test', test_df))
        if flair_col not in frame.columns
    ]
    if missing_in:
        print(f"WARNING: {flair_col} not found in {missing_in[0]} data. Creating dummy features.")

        # Global mean as the encoding carries no per-row information.
        global_mean = train_df[target_col].mean()
        train_encoded = pd.DataFrame(index=train_df.index)
        test_encoded = pd.DataFrame(index=test_df.index)
        for encoded in (train_encoded, test_encoded):
            encoded['flair_target_encoded'] = global_mean
            encoded['has_perfect_flair'] = 0
        return train_encoded, test_encoded

    # Prepare data
    X_train = train_df[[flair_col]].copy()
    X_test = test_df[[flair_col]].copy()
    y_train = train_df[target_col]

    def _new_encoder():
        # Strong smoothing shrinks rarely seen flairs toward the global mean.
        return TargetEncoder(smoothing=10.0, min_samples_leaf=5)

    # Out-of-fold encoding for the training rows.
    oof_values = np.full(len(train_df), np.nan)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fit_idx, holdout_idx in splitter.split(X_train, y_train):
        fold_encoder = _new_encoder()
        fold_encoder.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
        oof_values[holdout_idx] = (
            fold_encoder.transform(X_train.iloc[holdout_idx]).iloc[:, 0].to_numpy()
        )
    train_encoded = pd.DataFrame({'flair_target_encoded': oof_values}, index=train_df.index)

    # Test rows never contribute labels, so they can use the full-train fit.
    encoder = _new_encoder()
    encoder.fit(X_train, y_train)
    test_encoded = encoder.transform(X_test)
    test_encoded.columns = ['flair_target_encoded']

    # Also create a binary feature for perfect flairs (PIF and shroom)
    perfect_flairs = ['shroom', 'PIF']
    train_encoded['has_perfect_flair'] = train_df[flair_col].isin(perfect_flairs).astype(int)
    test_encoded['has_perfect_flair'] = test_df[flair_col].isin(perfect_flairs).astype(int)

    return train_encoded, test_encoded

# Encode flair for both splits against the pizza-received target.
train_flair_features, test_flair_features = safe_target_encode_flair(
    train_df,
    test_df,
    'requester_received_pizza',
)

print("Flair encoding completed", "Target encoded flair distribution:", sep="\n")
print(train_flair_features['flair_target_encoded'].describe())

# How many rows carry one of the "perfect" flairs in each split.
print(f"\nPerfect flair count in train: {train_flair_features['has_perfect_flair'].sum()}")
print(f"Perfect flair count in test: {test_flair_features['has_perfect_flair'].sum()}")

## Temporal Features with Cyclical Encoding

In [None]:
def extract_temporal_features(df):
    """Extract cyclical temporal features from Unix timestamps.

    Uses ``unix_timestamp_of_request`` when present, otherwise
    ``unix_timestamp_of_request_utc``. Hour, day of week and month are encoded
    as sine/cosine pairs so that the ends of each cycle sit next to each
    other; day of month is kept as a raw integer.

    The input frame is not modified.

    Args:
        df: DataFrame with a Unix-seconds timestamp column.

    Returns:
        DataFrame indexed like ``df`` with columns ``day_of_month``,
        ``hour_sin``, ``hour_cos``, ``day_of_week_sin``, ``day_of_week_cos``,
        ``month_sin``, ``month_cos``; or no columns at all when neither
        timestamp column exists.
    """
    features = pd.DataFrame(index=df.index)

    # First available timestamp column, in order of preference.
    timestamp_col = next(
        (
            col
            for col in ('unix_timestamp_of_request', 'unix_timestamp_of_request_utc')
            if col in df.columns
        ),
        None,
    )
    if timestamp_col is None:
        # If no request timestamp, skip temporal features
        print("No request timestamp found, skipping temporal features")
        return features

    # Keep the parsed datetimes local: writing a 'datetime' column back into
    # the caller's frame would be a hidden side effect.
    when = pd.to_datetime(df[timestamp_col], unit='s')

    features['day_of_month'] = when.dt.day

    # (name, raw component, cycle length); dayofweek is 0=Monday .. 6=Sunday.
    for name, values, period in (
        ('hour', when.dt.hour, 24),
        ('day_of_week', when.dt.dayofweek, 7),
        ('month', when.dt.month, 12),
    ):
        angle = 2 * np.pi * values / period
        features[f'{name}_sin'] = np.sin(angle)
        features[f'{name}_cos'] = np.cos(angle)

    return features

# Temporal features for each split (train first, then test).
train_temporal_features, test_temporal_features = (
    extract_temporal_features(train_df),
    extract_temporal_features(test_df),
)

print(f"Temporal features shape: {train_temporal_features.shape}")
if train_temporal_features.columns.empty:
    # The extractor returns a column-less frame when it finds no timestamp.
    print("No temporal features extracted (no timestamp column found)")
else:
    print("Sample temporal features:")
    print(train_temporal_features.head())

## Extract Tabular Features

In [None]:
def extract_tabular_features(df):
    """Extract and preprocess the tabular features that exist in ``df``.

    Known numeric columns are coerced to numbers (unparseable or missing
    values become 0); ``post_was_edited`` is cast to int.

    Columns that are absent from ``df`` are skipped rather than fabricated
    as all-zero columns. The feature-combination step keeps only columns
    present in both splits, so a column that exists in just one split is
    dropped there. Filling it with zeros instead would keep it in the model
    with real values in one split and constant zeros in the other.

    Args:
        df: Raw request DataFrame.

    Returns:
        DataFrame indexed like ``df`` containing the available features.
    """
    features = pd.DataFrame(index=df.index)

    # Numeric features that might exist in the dataset
    numeric_cols = [
        'requester_account_age_in_days_at_request',
        'requester_account_age_in_days_at_retrieval',
        'requester_number_of_comments_at_request',
        'requester_number_of_comments_at_retrieval',
        'requester_number_of_posts_at_request',
        'requester_number_of_posts_at_retrieval',
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_minus_downvotes_at_retrieval',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_retrieval',
        'number_of_upvotes_of_request_at_retrieval',
        'number_of_downvotes_of_request_at_retrieval',
        'request_number_of_comments_at_retrieval',
    ]

    # Only columns actually present are emitted (see docstring).
    for col in numeric_cols:
        if col in df.columns:
            features[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Binary feature, likewise only when the source column exists.
    if 'post_was_edited' in df.columns:
        features['post_was_edited'] = df['post_was_edited'].astype(int)

    return features

# Tabular features for each split (train first, then test).
train_tabular_features, test_tabular_features = [
    extract_tabular_features(frame) for frame in (train_df, test_df)
]

print(f"Tabular features shape: {train_tabular_features.shape}")
print("Sample tabular features:", train_tabular_features.head(), sep="\n")

## Combine All Features

In [None]:
# Stack the four feature families side by side for each split.
feature_sets = [
    train_keyword_features,
    train_flair_features,
    train_temporal_features,
    train_tabular_features,
]
test_sets = [
    test_keyword_features,
    test_flair_features,
    test_temporal_features,
    test_tabular_features,
]

train_features = pd.concat(feature_sets, axis=1)
test_features = pd.concat(test_sets, axis=1)

# Restrict both frames to the columns they share, in training-column order.
shared_mask = train_features.columns.isin(test_features.columns)
common_cols = list(train_features.columns[shared_mask])
train_features = train_features.loc[:, common_cols]
test_features = test_features.loc[:, common_cols]

print(f"Final train features shape: {train_features.shape}")
print(f"Final test features shape: {test_features.shape}")
print(f"Columns match: {list(train_features.columns) == list(test_features.columns)}")

# Target as integers for the classifier.
y = train_df['requester_received_pizza'].astype(int)
print(f"\nTarget distribution: {y.value_counts().to_dict()}")

## Model Training with Cross-Validation

In [None]:
# Fill any remaining NaN values
train_features = train_features.fillna(0)
test_features = test_features.fillna(0)

# Define cross-validation strategy
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_SEED)

# Model parameters are identical for every fold, so they are defined once.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 63,  # Increased from 31 for more capacity
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': RANDOM_SEED,
    'min_child_samples': 20  # Added to prevent overfitting
}

# Out-of-fold predictions for train, fold-averaged predictions for test.
oof_predictions = np.zeros(len(train_features))
test_predictions = np.zeros(len(test_features))
cv_scores = []

print(f"Starting {n_folds}-fold stratified cross-validation...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(train_features, y), start=1):
    print(f"\nFold {fold}/{n_folds}")

    X_train, X_valid = train_features.iloc[train_idx], train_features.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Fold-specific LightGBM datasets. Named so they do not overwrite the raw
    # JSON records held in `train_data`.
    fold_train_set = lgb.Dataset(X_train, label=y_train)
    fold_valid_set = lgb.Dataset(X_valid, label=y_valid, reference=fold_train_set)

    # Train with early stopping on the validation fold's AUC.
    model = lgb.train(
        params,
        fold_train_set,
        num_boost_round=1500,  # Upper bound; early stopping usually ends sooner
        valid_sets=[fold_valid_set],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=0)
        ]
    )

    # Predict on validation set
    valid_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    oof_predictions[valid_idx] = valid_pred

    # Calculate AUC for this fold
    fold_auc = roc_auc_score(y_valid, valid_pred)
    cv_scores.append(fold_auc)
    print(f"Fold {fold} AUC: {fold_auc:.4f}")

    # Each fold contributes an equal share of the final test prediction.
    test_pred = model.predict(test_features, num_iteration=model.best_iteration)
    test_predictions += test_pred / n_folds

# Calculate overall CV score
overall_auc = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_auc:.4f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

## Feature Importance Analysis

In [None]:
# Rank features by gain, using the model left over from the final CV fold.
gain_by_feature = model.feature_importance(importance_type='gain')
feature_importance = pd.DataFrame(
    {'feature': train_features.columns, 'importance': gain_by_feature}
).sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(feature_importance.head(20))

# Rows whose feature name mentions flair.
is_flair = feature_importance['feature'].str.contains('flair')
flair_importance = feature_importance[is_flair]
print(f"\nFlair feature importance:")
print(flair_importance)

# Rows for keyword counts and has_* flags (pattern is a regex alternation).
is_keyword = feature_importance['feature'].str.contains('keyword|has_')
keyword_importance = feature_importance[is_keyword]
print(f"\nTop keyword features:")
print(keyword_importance.head(10))

## Generate Submission

In [None]:
import os

# Create submission file: one averaged probability per test request.
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print("Submission shape:", submission.shape)
print("\nPrediction distribution:")
print(submission['requester_received_pizza'].describe())

# Save submission. Create the output directory first so a fresh environment
# does not fail on the write after the whole training run has finished.
submission_path = '/home/submission/submission_003_flair_temporal.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Verify submission format against the provided sample: same header, same ids.
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print("\nVerification:")
print(f"Columns match: {list(submission.columns) == list(sample_sub.columns)}")
print(f"Request IDs match: {set(submission['request_id']) == set(sample_sub['request_id'])}")