# Experiment 005: Remove Psycholinguistic Features

**Goal**: Remove redundant psycholinguistic features to reduce dimensionality from ~5,015 to ~5,000 features

**Hypothesis**: These features are redundant with the TF-IDF features and do not rank among the top 10 most important features for either LightGBM or Logistic Regression. Removing them will simplify the model without a loss in performance.

**Expected outcome**: CV AUC is maintained or improved relative to the current value of 0.6599.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator with a fixed value
np.random.seed(42)

# Read the train and test tables from CSV (train first, then test)
print("Loading data...")
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Report the table sizes and the mean of the target column
print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")
positive_rate = train_data['requester_received_pizza'].mean()
print(f"Positive rate: {positive_rate:.4f}")

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

# Load data
print("Loading data...")
# Decode as UTF-8 explicitly: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode non-ASCII characters in the JSON text.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# `train_data` / `test_data` keep the raw parsed JSON; the DataFrame views
# used for column access are `train_df` / `test_df`.
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.4f}")

In [None]:
def extract_features(df, is_train=True):
    """Build the dense (non-TF-IDF) feature matrix, without psycholinguistic features.

    Args:
        df: DataFrame providing the ``request_text_edit_aware`` text column
            and the ``*_at_request`` / timestamp metadata columns read below.
        is_train: Not used by this function; kept so existing call sites
            that pass it keep working.

    Returns:
        A ``(X_dense, feature_names)`` tuple. ``X_dense`` is the column-wise
        stack of every feature array (one row per row of ``df``) and
        ``feature_names`` lists the column labels in the same order.
    """
    text = df['request_text_edit_aware']
    features = []
    feature_names = []

    # 1. Text length features: characters, whitespace-separated tokens, and
    #    the number of runs of ".", "!" or "?" plus one.
    text_length = text.str.len()
    word_count = text.str.split().str.len()
    sentence_count = text.str.count(r'[.!?]+') + 1

    features.extend([text_length.values, word_count.values, sentence_count.values])
    feature_names.extend(['text_length', 'word_count', 'sentence_count'])

    # 2. Account age and activity features (at request time)
    features.extend([
        df['requester_account_age_in_days_at_request'].values,
        df['requester_days_since_first_post_on_raop_at_request'].values,
        df['requester_number_of_comments_in_raop_at_request'].values,
        df['requester_number_of_posts_on_raop_at_request'].values,
        df['requester_number_of_comments_at_request'].values,
        df['requester_number_of_posts_at_request'].values,
        df['requester_number_of_subreddits_at_request'].values,
    ])
    feature_names.extend([
        'account_age_days', 'days_since_first_raop_post', 'raop_comments',
        'raop_posts', 'total_comments', 'total_posts', 'subreddit_diversity',
    ])

    # 3. Upvotes and karma features (at request time)
    features.extend([
        df['requester_upvotes_plus_downvotes_at_request'].values,
        df['requester_upvotes_minus_downvotes_at_request'].values,
    ])
    feature_names.extend(['total_votes', 'net_karma'])

    # 4. Time features (at request time)
    features.extend([
        df['requester_hours_since_first_post_on_raop_at_request'].values,
        df['unix_timestamp_of_request_utc'].values,
    ])
    feature_names.extend(['hours_since_first_raop', 'timestamp_utc'])

    # 5. Subreddit features (at request time): 1 where the cell is non-null.
    # NOTE(review): if this column holds lists, an empty list is still
    # non-null and yields 1 - confirm that is the intended signal.
    features.append(df['requester_subreddits_at_request'].notna().astype(int).values)
    feature_names.append('has_subreddit_info')

    # 6. Visual evidence features (imgur links), matched case-insensitively
    imgur_pattern = r'imgur\.com'
    imgur_count = text.str.count(imgur_pattern, flags=re.IGNORECASE)
    features.append(imgur_count.values)
    feature_names.append('imgur_link_count')

    # 7. Key phrase features: case-insensitive whole-word occurrence counts
    key_phrases = [
        r'\bfor a\b',
        r'\bplease\b',
        r'\bthis\b',
        r'\bhere\b',
        r'\bthere\b',
        r'\bmy\b',
        r'\bwould appreciate\b',
        r'\bwould be grateful\b',
    ]

    for phrase in key_phrases:
        count = text.str.count(phrase, flags=re.IGNORECASE)
        features.append(count.values)
        # Build the label outside the f-string: a backslash inside an
        # f-string replacement expression is a SyntaxError before Python 3.12.
        label = phrase.replace('\\b', '').replace(' ', '_')
        feature_names.append(f'phrase_{label}')

    # Stack all features column-wise into a single 2-D array
    X_dense = np.column_stack(features)

    return X_dense, feature_names

In [None]:
# Extract dense features for train and test.
# Use the DataFrames (`train_df` / `test_df`): the JSON loading cell rebinds
# `train_data` / `test_data` to the raw `json.load` output, which does not
# support the column access that extract_features performs.
print("Extracting dense features...")
X_train_dense, dense_feature_names = extract_features(train_df, is_train=True)
X_test_dense, _ = extract_features(test_df, is_train=False)

print(f"Dense feature shape: {X_train_dense.shape}")
print(f"Number of dense features: {len(dense_feature_names)}")
print(f"Dense feature names: {dense_feature_names}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# 8. TF-IDF word n-grams (unigrams and bigrams)
print("Creating TF-IDF word n-grams...")
tfidf_word = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    norm='l2'
)

# Fit the vocabulary on the training text only, then apply it to the test
# text. Read the text from the DataFrames: `train_data` / `test_data` are
# rebound to the raw `json.load` output by the JSON loading cell and cannot be
# indexed by column name.
X_train_tfidf_word = tfidf_word.fit_transform(train_df['request_text_edit_aware'])
X_test_tfidf_word = tfidf_word.transform(test_df['request_text_edit_aware'])

print(f"TF-IDF word shape: {X_train_tfidf_word.shape}")
# NOTE(review): this prints the first 10 vocabulary entries, which are
# presumably in vocabulary order rather than ranked by importance - confirm.
print(f"Top word features: {tfidf_word.get_feature_names_out()[:10]}")

In [None]:
# 9. TF-IDF character n-grams (3 to 5 characters)
print("Creating TF-IDF character n-grams...")
tfidf_char = TfidfVectorizer(
    analyzer='char',
    max_features=2000,
    ngram_range=(3, 5),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    norm='l2'
)

# Fit the vocabulary on the training text only, then apply it to the test
# text. Read the text from the DataFrames: `train_data` / `test_data` are
# rebound to the raw `json.load` output by the JSON loading cell and cannot be
# indexed by column name.
X_train_tfidf_char = tfidf_char.fit_transform(train_df['request_text_edit_aware'])
X_test_tfidf_char = tfidf_char.transform(test_df['request_text_edit_aware'])

print(f"TF-IDF char shape: {X_train_tfidf_char.shape}")
# NOTE(review): this prints the first 10 vocabulary entries, which are
# presumably in vocabulary order rather than ranked by importance - confirm.
print(f"Top char features: {tfidf_char.get_feature_names_out()[:10]}")

In [None]:
# Scale dense features (statistics are fitted on the training rows only)
print("Scaling dense features...")
scaler = StandardScaler()
X_train_dense_scaled = scaler.fit_transform(X_train_dense)
X_test_dense_scaled = scaler.transform(X_test_dense)

# Combine all features
from scipy.sparse import csr_matrix

X_train_dense_sparse = csr_matrix(X_train_dense_scaled)
X_test_dense_sparse = csr_matrix(X_test_dense_scaled)

# Column order is dense | word TF-IDF | char TF-IDF.
# Request CSR explicitly: the cross-validation loop selects rows with
# `X_train[train_idx]`, and depending on the SciPy version hstack may
# otherwise return a COO matrix, which does not support indexing.
X_train = hstack(
    [X_train_dense_sparse, X_train_tfidf_word, X_train_tfidf_char],
    format='csr',
)
X_test = hstack(
    [X_test_dense_sparse, X_test_tfidf_word, X_test_tfidf_char],
    format='csr',
)

print(f"Final train shape: {X_train.shape}")
print(f"Final test shape: {X_test.shape}")

# Target: read from the DataFrame, because `train_data` is rebound to the raw
# `json.load` output by the JSON loading cell and cannot be indexed by column.
y = train_df['requester_received_pizza'].values
print(f"Target shape: {y.shape}")
print(f"Positive rate: {y.mean():.4f}")

In [None]:
# Stratified 5-fold split; shuffling is seeded so the folds are repeatable
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# LightGBM hyper-parameters shared by every fold
lgb_params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    scale_pos_weight=3.025,  # Handle class imbalance
    random_state=42,
)

print("Starting cross-validation...")
cv_scores = []
fold_predictions = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Row-select this fold's training and validation parts
    X_tr = X_train[train_idx]
    X_val = X_train[val_idx]
    y_tr = y[train_idx]
    y_val = y[val_idx]

    # Wrap both parts as LightGBM datasets; validation references training
    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    # Fit with early stopping monitored on the validation part
    model = lgb.train(
        params=lgb_params,
        train_set=train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
        valid_names=['val'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ],
    )

    # Score the held-out part at the best iteration found
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    fold_auc = roc_auc_score(y_val, val_pred)
    print(f"Fold {fold + 1} AUC: {fold_auc:.4f}")

    # Keep the fold score and its out-of-fold predictions
    cv_scores.append(fold_auc)
    fold_predictions.append((val_idx, val_pred))

# Aggregate the per-fold AUCs
cv_mean, cv_std = np.mean(cv_scores), np.std(cv_scores)
print(f"\nCV AUC: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in cv_scores]}")

In [None]:
# Train final model on full training data
print("Training final model on full data...")
train_set = lgb.Dataset(X_train, label=y)

# NOTE(review): this fit always runs the full 1000 rounds (no validation set,
# no early stopping), whereas the cross-validation folds stop early - so the
# reported CV AUC may not reflect this model. Confirm this is intended.
final_model = lgb.train(
    lgb_params,
    train_set,
    num_boost_round=1000,
    callbacks=[
        lgb.log_evaluation(100)
    ]
)

# Make predictions on test set
print("Making predictions on test set...")
test_predictions = final_model.predict(X_test)

# Create submission.
# Take the ids from `test_df`: `test_data` is rebound to the raw `json.load`
# output by the JSON loading cell and cannot be indexed by column name.
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print(f"Submission shape: {submission.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")

# Save submission
submission_path = '/home/code/submission_candidates/candidate_005.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

In [None]:
# Rank every feature by the final model's gain-based importance
print("\nTop 20 Feature Importances:")
# Names are concatenated dense | word TF-IDF | char TF-IDF
feature_labels = (
    dense_feature_names
    + list(tfidf_word.get_feature_names_out())
    + list(tfidf_char.get_feature_names_out())
)
gain_importance = final_model.feature_importance(importance_type='gain')
importance_df = pd.DataFrame({'feature': feature_labels, 'importance': gain_importance})
importance_df = importance_df.sort_values('importance', ascending=False)

print(importance_df.head(20).to_string(index=False))

# Persist the full ranking as CSV
importance_path = '/home/code/experiments/005_feature_importance.csv'
importance_df.to_csv(importance_path, index=False)
print(f"\nFeature importance saved to: {importance_path}")

# Experiment summary: feature counts and CV AUC against the exp_004 baseline
print(f"\n{'='*60}")
print(f"EXPERIMENT 005 SUMMARY")
print(f"{'='*60}")
print(f"Model: LightGBM")
print(f"Features: {X_train.shape[1]} total")
print(f"  - Dense metadata: {len(dense_feature_names)}")
print(f"  - TF-IDF word n-grams: {X_train_tfidf_word.shape[1]}")
print(f"  - TF-IDF char n-grams: {X_train_tfidf_char.shape[1]}")
print(f"Psycholinguistic features: REMOVED (9 features)")
print(f"CV AUC: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"Previous best (exp_004): 0.6599")
print(f"Change: {cv_mean - 0.6599:+.4f}")
print(f"{'='*60}")