# Experiment 002: Adding TF-IDF Text Features

This experiment adds TF-IDF features to capture actual text content from request_text and request_title.

**Strategy:**
- Extract TF-IDF features from text (unigrams and bigrams)
- Limit to top 200 features to avoid sparsity
- Combine with existing tabular features
- Use 5-fold stratified CV
- LightGBM for training

**Expected improvements:**
- Capture semantic content beyond just length
- Identify key phrases that indicate need/urgency
- Better prediction distribution (less underconfident)

In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import re
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Load Data

In [2]:
# Load training data
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'

with open(train_path, 'r') as f:
    train_data = json.load(f)

with open(test_path, 'r') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Training columns: {train_df.shape[1]}")
print(f"Test columns: {test_df.shape[1]}")

Training samples: 2878
Test samples: 1162
Training columns: 32
Test columns: 17


## Text Preprocessing Function

In [3]:
def preprocess_text(text):
    """Simple text preprocessing for TF-IDF"""
    if pd.isna(text) or text == '':
        return ''
    
    # Convert to string
    text = str(text)
    
    # Lowercase
    text = text.lower()
    
    # Remove special characters but keep spaces
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Combine request text and title for richer features
text_col = 'request_text' if 'request_text' in train_df.columns else 'request_text_edit_aware'

train_df['combined_text'] = train_df[text_col].fillna('') + ' ' + train_df['request_title'].fillna('')
test_df['combined_text'] = test_df['request_text_edit_aware'].fillna('') + ' ' + test_df['request_title'].fillna('')

# Apply preprocessing
train_df['combined_text_clean'] = train_df['combined_text'].apply(preprocess_text)
test_df['combined_text_clean'] = test_df['combined_text'].apply(preprocess_text)

print("Text preprocessing completed")
print(f"Sample cleaned text: {train_df['combined_text_clean'].iloc[0][:100]}...")

Text preprocessing completed
Sample cleaned text: i will soon be going on a long deployment which i m not aloud to discuss but willing to give some in...


## Extract TF-IDF Features

In [4]:
# Extract TF-IDF features
# Limit to top features to avoid sparsity and overfitting
max_features = 200

vectorizer = TfidfVectorizer(
    max_features=max_features,
    ngram_range=(1, 2),  # unigrams and bigrams
    stop_words='english',
    min_df=5,  # ignore very rare terms
    max_df=0.8  # ignore very common terms
)

# Fit on training data and transform both train and test
tfidf_train = vectorizer.fit_transform(train_df['combined_text_clean'])
tfidf_test = vectorizer.transform(test_df['combined_text_clean'])

# Convert to DataFrame
tfidf_train_df = pd.DataFrame(
    tfidf_train.toarray(),
    columns=[f'tfidf_{i}' for i in range(max_features)],
    index=train_df.index
)

tfidf_test_df = pd.DataFrame(
    tfidf_test.toarray(),
    columns=[f'tfidf_{i}' for i in range(max_features)],
    index=test_df.index
)

print(f"TF-IDF train shape: {tfidf_train_df.shape}")
print(f"TF-IDF test shape: {tfidf_test_df.shape}")
print(f"Top TF-IDF terms: {vectorizer.get_feature_names_out()[:10]}")

TF-IDF train shape: (2878, 200)
TF-IDF test shape: (1162, 200)
Top TF-IDF terms: ['able' 'account' 'advance' 'afford' 'ago' 'amp' 'apartment' 'appreciate'
 'appreciated' 'area']


## Extract Tabular Features

In [5]:
def extract_tabular_features(df):
    """Extract and preprocess tabular features"""
    features = pd.DataFrame(index=df.index)
    
    # Numeric features that might exist in the dataset
    numeric_cols = [
        'requester_account_age_in_days_at_request',
        'requester_account_age_in_days_at_retrieval',
        'requester_number_of_comments_at_request',
        'requester_number_of_comments_at_retrieval',
        'requester_number_of_posts_at_request',
        'requester_number_of_posts_at_retrieval',
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_minus_downvotes_at_retrieval',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_retrieval',
        'number_of_upvotes_of_request_at_retrieval',
        'number_of_downvotes_of_request_at_retrieval',
        'request_number_of_comments_at_retrieval'
    ]
    
    # Add numeric features if they exist
    for col in numeric_cols:
        if col in df.columns:
            features[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        else:
            features[col] = 0  # Add column with zeros if it doesn't exist
    
    # Binary features
    if 'post_was_edited' in df.columns:
        features['post_was_edited'] = df['post_was_edited'].astype(int)
    else:
        features['post_was_edited'] = 0
    
    return features

# Extract tabular features
train_tabular_features = extract_tabular_features(train_df)
test_tabular_features = extract_tabular_features(test_df)

print(f"Tabular train shape: {train_tabular_features.shape}")
print(f"Tabular test shape: {test_tabular_features.shape}")

Tabular train shape: (2878, 14)
Tabular test shape: (1162, 14)


## Combine All Features

In [6]:
# Combine TF-IDF and tabular features
train_features = pd.concat([tfidf_train_df, train_tabular_features], axis=1)
test_features = pd.concat([tfidf_test_df, test_tabular_features], axis=1)

# Ensure both have same columns
train_features = train_features.reindex(columns=test_features.columns)

print(f"Final train features shape: {train_features.shape}")
print(f"Final test features shape: {test_features.shape}")
print(f"Columns match: {list(train_features.columns) == list(test_features.shape)}")

# Prepare target
y = train_df['requester_received_pizza'].astype(int)
print(f"Target distribution: {y.value_counts().to_dict()}")

Final train features shape: (2878, 214)
Final test features shape: (1162, 214)
Columns match: False
Target distribution: {0: 2163, 1: 715}


## Model Training with Cross-Validation

In [7]:
# Fill any remaining NaN values
train_features = train_features.fillna(0)
test_features = test_features.fillna(0)

# Define cross-validation strategy
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_SEED)

# Store predictions
oof_predictions = np.zeros(len(train_features))
test_predictions = np.zeros(len(test_features))
cv_scores = []

print(f"Starting {n_folds}-fold stratified cross-validation...")

fold = 1
for train_idx, valid_idx in skf.split(train_features, y):
    print(f"\nFold {fold}/{n_folds}")
    
    X_train, X_valid = train_features.iloc[train_idx], train_features.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)
    
    # Define parameters
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': RANDOM_SEED
    }
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0)
        ]
    )
    
    # Predict on validation set
    valid_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    oof_predictions[valid_idx] = valid_pred
    
    # Calculate AUC for this fold
    fold_auc = roc_auc_score(y_valid, valid_pred)
    cv_scores.append(fold_auc)
    print(f"Fold {fold} AUC: {fold_auc:.4f}")
    
    # Predict on test set
    test_pred = model.predict(test_features, num_iteration=model.best_iteration)
    test_predictions += test_pred / n_folds
    
    fold += 1

# Calculate overall CV score
overall_auc = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_auc:.4f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

Starting 5-fold stratified cross-validation...

Fold 1/5


Fold 1 AUC: 0.7935

Fold 2/5


Fold 2 AUC: 0.7724

Fold 3/5


Fold 3 AUC: 0.7851

Fold 4/5


Fold 4 AUC: 0.7611

Fold 5/5


Fold 5 AUC: 0.7956

Overall CV AUC: 0.7697
Mean CV AUC: 0.7815 ± 0.0131


## Feature Importance Analysis

In [8]:
# Get feature importance from the last fold model
feature_importance = pd.DataFrame({
    'feature': train_features.columns,
    'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_importance.head(15))

# Check how many TF-IDF features are in top features
tfidf_top_features = feature_importance[feature_importance['feature'].str.startswith('tfidf_')].head(10)
print(f"\nTop TF-IDF features:")
print(tfidf_top_features)

Top 15 most important features:
                                            feature   importance
212         request_number_of_comments_at_retrieval  2947.047934
201      requester_account_age_in_days_at_retrieval   744.801151
207  requester_upvotes_minus_downvotes_at_retrieval   731.755104
203       requester_number_of_comments_at_retrieval   536.245789
209   requester_upvotes_plus_downvotes_at_retrieval   502.315290
137                                       tfidf_137   481.312739
200        requester_account_age_in_days_at_request   466.218610
204            requester_number_of_posts_at_request   386.892970
210       number_of_upvotes_of_request_at_retrieval   375.567380
206    requester_upvotes_minus_downvotes_at_request   343.229799
205          requester_number_of_posts_at_retrieval   311.337381
202         requester_number_of_comments_at_request   294.601562
186                                       tfidf_186   271.277141
211     number_of_downvotes_of_request_at_retrieval   228.

## Generate Submission

In [None]:
# Create submission file
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print("Submission shape:", submission.shape)
print("\nPrediction distribution:")
print(submission['requester_received_pizza'].describe())

# Save submission
submission_path = '/home/submission/submission_002_tfidf.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Verify submission format
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print("\nVerification:")
print(f"Columns match: {list(submission.columns) == list(sample_sub.columns)}")
print(f"Request IDs match: {set(submission['request_id']) == set(sample_sub['request_id'])}")