# Experiment 005: Remove Psycholinguistic Features

**Goal**: Remove redundant psycholinguistic features to reduce dimensionality from ~5,015 to ~5,000 features

**Hypothesis**: These features are redundant with TF-IDF and not in top 10 importance for either LightGBM or Logistic Regression. Removing them will simplify the model without performance loss.

**Expected outcome**: Maintain or improve CV AUC (currently 0.6599)

In [4]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

# Load data
print("Loading data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.4f}")

Loading data...
Train shape: (2878, 32)
Test shape: (1162, 17)
Positive rate: 0.2484


In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

# Load data
print("Loading data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.4f}")

In [11]:
def extract_features(df, is_train=True):
    """Extract features WITHOUT psycholinguistic features"""
    features = []
    feature_names = []
    
    # 1. Text length features
    text_length = df['request_text_edit_aware'].str.len()
    word_count = df['request_text_edit_aware'].str.split().str.len()
    sentence_count = df['request_text_edit_aware'].str.count(r'[.!?]+') + 1
    
    features.extend([text_length.values, word_count.values, sentence_count.values])
    feature_names.extend(['text_length', 'word_count', 'sentence_count'])
    
    # 2. Account age and activity features (at request time)
    features.extend([
        df['requester_account_age_in_days_at_request'].values,
        df['requester_days_since_first_post_on_raop_at_request'].values,
        df['requester_number_of_comments_in_raop_at_request'].values,
        df['requester_number_of_posts_on_raop_at_request'].values,
        df['requester_number_of_comments_at_request'].values,
        df['requester_number_of_posts_at_request'].values,
        df['requester_upvotes_minus_downvotes_at_request'].values,
        df['requester_upvotes_plus_downvotes_at_request'].values
    ])
    feature_names.extend([
        'account_age_days', 'days_since_first_post', 'comments_in_raop', 
        'posts_in_raop', 'total_comments', 'total_posts', 
        'upvotes_minus_downvotes', 'upvotes_plus_downvotes'
    ])
    
    # 3. Subreddit-specific features
    features.extend([
        df['requester_number_of_subreddits_at_request'].values
    ])
    feature_names.extend(['number_of_subreddits'])
    
    # 4. Time-based features
    # Convert unix timestamps to datetime features
    request_time = pd.to_datetime(df['unix_timestamp_of_request_utc'], unit='s')
    features.extend([
        request_time.dt.hour.values,  # Hour of day
        request_time.dt.dayofweek.values,  # Day of week
        request_time.dt.day.values  # Day of month
    ])
    feature_names.extend(['request_hour', 'request_dayofweek', 'request_day'])
    
    # 5. Text-based features (non-psycholinguistic)
    # Count of numbers in text (might indicate specific amounts)
    number_count = df['request_text_edit_aware'].str.count(r'\d+')
    features.append(number_count.values)
    feature_names.append('number_count')
    
    # Count of dollar signs
    dollar_count = df['request_text_edit_aware'].str.count(r'\$')
    features.append(dollar_count.values)
    feature_names.append('dollar_count')
    
    # Has imgur link (visual evidence)
    has_imgur = df['request_text_edit_aware'].str.contains('imgur', case=False, na=False).astype(int)
    features.append(has_imgur.values)
    feature_names.append('has_imgur')
    
    # Return as numpy array
    return np.column_stack(features), feature_names

In [12]:
# Extract dense features for train and test
print("Extracting dense features...")
X_train_dense, dense_feature_names = extract_features(train_df, is_train=True)
X_test_dense, _ = extract_features(test_df, is_train=False)

print(f"Dense feature shape: {X_train_dense.shape}")
print(f"Number of dense features: {len(dense_feature_names)}")
print(f"Dense feature names: {dense_feature_names}")

Extracting dense features...
Dense feature shape: (2878, 18)
Number of dense features: 18
Dense feature names: ['text_length', 'word_count', 'sentence_count', 'account_age_days', 'days_since_first_post', 'comments_in_raop', 'posts_in_raop', 'total_comments', 'total_posts', 'upvotes_minus_downvotes', 'upvotes_plus_downvotes', 'number_of_subreddits', 'request_hour', 'request_dayofweek', 'request_day', 'number_count', 'dollar_count', 'has_imgur']


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# 8. TF-IDF word n-grams
print("Creating TF-IDF word n-grams...")
tfidf_word = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    norm='l2'
)

X_train_tfidf_word = tfidf_word.fit_transform(train_df['request_text_edit_aware'])
X_test_tfidf_word = tfidf_word.transform(test_df['request_text_edit_aware'])

print(f"TF-IDF word shape: {X_train_tfidf_word.shape}")
print(f"Top word features: {tfidf_word.get_feature_names_out()[:10]}")

Creating TF-IDF word n-grams...


TF-IDF word shape: (2878, 3000)
Top word features: ['00' '10' '100' '11' '12' '15' '15th' '18' '19' '1st']


In [14]:
# 9. TF-IDF character n-grams
print("Creating TF-IDF character n-grams...")
tfidf_char = TfidfVectorizer(
    analyzer='char',
    max_features=2000,
    ngram_range=(3, 5),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    norm='l2'
)

X_train_tfidf_char = tfidf_char.fit_transform(train_df['request_text_edit_aware'])
X_test_tfidf_char = tfidf_char.transform(test_df['request_text_edit_aware'])

print(f"TF-IDF char shape: {X_train_tfidf_char.shape}")
print(f"Top char features: {tfidf_char.get_feature_names_out()[:10]}")

Creating TF-IDF character n-grams...


TF-IDF char shape: (2878, 2000)
Top char features: [' a ' ' a b' ' a c' ' a f' ' a l' ' a n' ' a p' ' a pi' ' a r' ' a s']


In [15]:
# Scale dense features
print("Scaling dense features...")
scaler = StandardScaler()
X_train_dense_scaled = scaler.fit_transform(X_train_dense)
X_test_dense_scaled = scaler.transform(X_test_dense)

# Combine all features
from scipy.sparse import csr_matrix

X_train_dense_sparse = csr_matrix(X_train_dense_scaled)
X_test_dense_sparse = csr_matrix(X_test_dense_scaled)

X_train = hstack([X_train_dense_sparse, X_train_tfidf_word, X_train_tfidf_char])
X_test = hstack([X_test_dense_sparse, X_test_tfidf_word, X_test_tfidf_char])

print(f"Final train shape: {X_train.shape}")
print(f"Final test shape: {X_test.shape}")

# Target
y = train_df['requester_received_pizza'].values
print(f"Target shape: {y.shape}")
print(f"Positive rate: {y.mean():.4f}")

Scaling dense features...
Final train shape: (2878, 5018)
Final test shape: (1162, 5018)
Target shape: (2878,)
Positive rate: 0.2484


In [22]:
# Cross-validation setup
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Model parameters
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'scale_pos_weight': 3.025,  # Handle class imbalance
    'random_state': 42
}

print("Starting cross-validation...")
cv_scores = []
fold_predictions = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y)):
    print(f"\nFold {fold + 1}/{n_splits}")
    
    # Split data
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    # Create datasets
    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)
    
    # Train model
    model = lgb.train(
        lgb_params,
        train_set,
        num_boost_round=500,
        valid_sets=[val_set],
        valid_names=['val'],
        callbacks=[
            lgb.log_evaluation(100),
            lgb.early_stopping(50),
            lgb.record_evaluation({})
        ]
    )
    
    # Predict on validation set
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    val_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(val_score)
    
    print(f"Fold {fold + 1} AUC: {val_score:.4f}")

print(f"\nCV Results: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

Starting cross-validation...

Fold 1/5


Training until validation scores don't improve for 50 rounds


[100]	val's auc: 0.634797


Early stopping, best iteration is:
[145]	val's auc: 0.637607
Fold 1 AUC: 0.6376

Fold 2/5


Training until validation scores don't improve for 50 rounds


[100]	val's auc: 0.611832


Early stopping, best iteration is:
[113]	val's auc: 0.620698
Fold 2 AUC: 0.6207

Fold 3/5


Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[33]	val's auc: 0.670763
Fold 3 AUC: 0.6708

Fold 4/5


Training until validation scores don't improve for 50 rounds


[100]	val's auc: 0.6081


Early stopping, best iteration is:
[65]	val's auc: 0.617586
Fold 4 AUC: 0.6176

Fold 5/5


Training until validation scores don't improve for 50 rounds


[100]	val's auc: 0.63141


Early stopping, best iteration is:
[58]	val's auc: 0.65829
Fold 5 AUC: 0.6583

CV Results: 0.6410 ± 0.0208


In [23]:
# Train final model on full training data
print("Training final model on full data...")
train_set = lgb.Dataset(X_train, label=y)

final_model = lgb.train(
    lgb_params,
    train_set,
    num_boost_round=1000,
    callbacks=[
        lgb.log_evaluation(100)
    ]
)

# Make predictions on test set
print("Making predictions on test set...")
test_predictions = final_model.predict(X_test)

# Create submission
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print(f"Submission shape: {submission.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")

# Save submission
submission_path = '/home/code/submission_candidates/candidate_005.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

Training final model on full data...


Making predictions on test set...
Submission shape: (1162, 2)
Prediction range: [0.0000, 0.9946]
Mean prediction: 0.1040
Submission saved to: /home/code/submission_candidates/candidate_005.csv


In [24]:
# Feature importance analysis
print("\nTop 20 Feature Importances:")
importance_df = pd.DataFrame({
    'feature': dense_feature_names + list(tfidf_word.get_feature_names_out()) + list(tfidf_char.get_feature_names_out()),
    'importance': final_model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print(importance_df.head(20).to_string(index=False))

# Save feature importance
importance_path = '/home/code/experiments/005_feature_importance.csv'
importance_df.to_csv(importance_path, index=False)
print(f"\nFeature importance saved to: {importance_path}")

# Summary
print(f"\n{'='*60}")
print(f"EXPERIMENT 005 SUMMARY")
print(f"{'='*60}")
print(f"Model: LightGBM")
print(f"Features: {X_train.shape[1]} total")
print(f"  - Dense metadata: {len(dense_feature_names)}")
print(f"  - TF-IDF word n-grams: {X_train_tfidf_word.shape[1]}")
print(f"  - TF-IDF char n-grams: {X_train_tfidf_char.shape[1]}")
print(f"Psycholinguistic features: REMOVED (9 features)")
print(f"CV AUC: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"Previous best (exp_004): 0.6599")
print(f"Change: {cv_mean - 0.6599:+.4f}")
print(f"{'='*60}")


Top 20 Feature Importances:
                feature  importance
            text_length  921.294869
upvotes_minus_downvotes  720.686824
       comments_in_raop  702.540417
 upvotes_plus_downvotes  624.865646
         total_comments  444.988090
  days_since_first_post  415.146118
            request_day  365.047904
       account_age_days  329.146109
                    i w  312.800116
                   . i   309.919118
                     an  293.874672
                    ere  291.693144
                    e w  276.026127
                    aid  272.826835
                     my  262.587784
             word_count  259.931909
                    ien  253.576614
           request_hour  244.830690
                    ide  241.517045
                     ho  230.690708

Feature importance saved to: /home/code/experiments/005_feature_importance.csv

EXPERIMENT 005 SUMMARY
Model: LightGBM
Features: 5018 total
  - Dense metadata: 18
  - TF-IDF word n-grams: 3000
  - TF-IDF char n-gra