# Enhanced Text Representation Experiment

This experiment focuses on improving text representation using:
1. **TruncatedSVD (LSA)**: Dimensionality reduction on TF-IDF matrix to capture latent semantics (100 components)
2. **Character n-grams**: Add 2-4 character n-grams to capture stylistic patterns
3. **Enhanced TF-IDF**: Better parameters and preprocessing

Based on the strategy, these techniques should capture more signal than basic TF-IDF.

In [1]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

print(f"Training samples: {len(df_train)}")
print(f"Test samples: {len(df_test)}")
print(f"Positive class rate: {df_train['requester_received_pizza'].mean():.3f}")

Loading data...
Training samples: 2878
Test samples: 1162
Positive class rate: 0.248


In [2]:
# Combine text features
df_train['combined_text'] = df_train['request_title'].fillna('') + ' ' + df_train['request_text_edit_aware'].fillna('')
df_test['combined_text'] = df_test['request_title'].fillna('') + ' ' + df_test['request_text_edit_aware'].fillna('')

# Basic text preprocessing function
def preprocess_text(text):
    """Clean text while preserving important patterns"""
    if pd.isna(text):
        return ""
    
    # Convert to string
    text = str(text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove Reddit usernames (e.g., /u/username)
    text = re.sub(r'/u/\w+', '', text)
    
    # Remove edit markers
    text = re.sub(r'EDIT:\s*', '', text)
    
    return text.strip()

# Apply preprocessing
df_train['combined_text_clean'] = df_train['combined_text'].apply(preprocess_text)
df_test['combined_text_clean'] = df_test['combined_text'].apply(preprocess_text)

print("Text preprocessing completed")

Text preprocessing completed


In [3]:
# Create enhanced numeric features
y = df_train['requester_received_pizza'].values

# Log transforms for count features
count_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request', 
    'requester_upvotes_plus_downvotes_at_request'
]

for feat in count_features:
    df_train[f'{feat}_log'] = np.log1p(df_train[feat])
    df_test[f'{feat}_log'] = np.log1p(df_test[feat])

# Ratios
df_train['upvotes_per_comment'] = df_train['requester_upvotes_plus_downvotes_at_request'] / (df_train['requester_number_of_comments_at_request'] + 1)
df_train['comments_per_post'] = df_train['requester_number_of_comments_at_request'] / (df_train['requester_number_of_posts_at_request'] + 1)
df_test['upvotes_per_comment'] = df_test['requester_upvotes_plus_downvotes_at_request'] / (df_test['requester_number_of_comments_at_request'] + 1)
df_test['comments_per_post'] = df_test['requester_number_of_comments_at_request'] / (df_test['requester_number_of_posts_at_request'] + 1)

# Account age in years
df_train['account_age_years'] = df_train['requester_account_age_in_days_at_request'] / 365.25
df_test['account_age_years'] = df_test['requester_account_age_in_days_at_request'] / 365.25

# Text length features
df_train['text_length'] = df_train['combined_text_clean'].str.len()
df_train['word_count'] = df_train['combined_text_clean'].str.split().str.len()
df_test['text_length'] = df_test['combined_text_clean'].str.len()
df_test['word_count'] = df_test['combined_text_clean'].str.split().str.len()

# Post edited flag (handle missing in test)
if 'post_was_edited' in df_test.columns:
    df_test['post_was_edited'] = df_test['post_was_edited'].astype(int)
    post_was_edited_flag = True
else:
    df_test['post_was_edited'] = 0
    post_was_edited_flag = False

numeric_features = [f'{feat}_log' for feat in count_features] + [
    'upvotes_per_comment', 'comments_per_post', 'account_age_years',
    'text_length', 'word_count'
]

if post_was_edited_flag:
    df_train['post_was_edited'] = df_train['post_was_edited'].astype(int)
    numeric_features.append('post_was_edited')

# Prepare numeric matrices
train_numeric = df_train[numeric_features].fillna(0).values
test_numeric = df_test[numeric_features].fillna(0).values

print(f"Created {len(numeric_features)} numeric features: {numeric_features}")
print(f"Train numeric shape: {train_numeric.shape}")
print(f"Test numeric shape: {test_numeric.shape}")

Created 8 numeric features: ['requester_number_of_comments_at_request_log', 'requester_number_of_posts_at_request_log', 'requester_upvotes_plus_downvotes_at_request_log', 'upvotes_per_comment', 'comments_per_post', 'account_age_years', 'text_length', 'word_count']
Train numeric shape: (2878, 8)
Test numeric shape: (1162, 8)


In [4]:
# Enhanced TF-IDF with character n-grams
print("Creating enhanced TF-IDF features...")

# Word-level TF-IDF
tfidf_word = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    min_df=3,
    max_df=0.9,
    sublinear_tf=True
)

# Character-level TF-IDF for stylistic patterns
tfidf_char = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 4),
    max_features=3000,
    lowercase=False,
    min_df=5,
    max_df=0.95,
    sublinear_tf=True
)

# Fit and transform
train_text_word = tfidf_word.fit_transform(df_train['combined_text_clean'])
test_text_word = tfidf_word.transform(df_test['combined_text_clean'])

train_text_char = tfidf_char.fit_transform(df_train['combined_text_clean'])
test_text_char = tfidf_char.transform(df_test['combined_text_clean'])

print(f"Word TF-IDF shape: {train_text_word.shape}")
print(f"Char TF-IDF shape: {train_text_char.shape}")

Creating enhanced TF-IDF features...


Word TF-IDF shape: (2878, 8000)
Char TF-IDF shape: (2878, 3000)


In [5]:
# Apply TruncatedSVD for dimensionality reduction
print("Applying TruncatedSVD for latent semantic analysis...")

# Reduce word TF-IDF to 100 components
svd = TruncatedSVD(n_components=100, random_state=42)
train_svd = svd.fit_transform(train_text_word)
test_svd = svd.transform(test_text_word)

print(f"SVD components shape: {train_svd.shape}")
print(f"Explained variance ratio: {svd.explained_variance_ratio_.sum():.4f}")

# Also reduce char TF-IDF to 50 components
svd_char = TruncatedSVD(n_components=50, random_state=42)
train_svd_char = svd_char.fit_transform(train_text_char)
test_svd_char = svd_char.transform(test_text_char)

print(f"Char SVD components shape: {train_svd_char.shape}")
print(f"Char explained variance ratio: {svd_char.explained_variance_ratio_.sum():.4f}")

Applying TruncatedSVD for latent semantic analysis...


SVD components shape: (2878, 100)
Explained variance ratio: 0.1785


Char SVD components shape: (2878, 50)
Char explained variance ratio: 0.2724


In [6]:
# Stratified CV setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store predictions
oof_predictions = np.zeros(len(df_train))
test_predictions = np.zeros(len(df_test))
cv_scores = []

print("Starting 5-fold CV with enhanced text representation...")

fold = 0
for train_idx, val_idx in skf.split(df_train, y):
    fold += 1
    print(f"\nFold {fold}/5")
    
    # Split data
    X_train_word, X_val_word = train_text_word[train_idx], train_text_word[val_idx]
    X_train_char, X_val_char = train_text_char[train_idx], train_text_char[val_idx]
    X_train_svd, X_val_svd = train_svd[train_idx], train_svd[val_idx]
    X_train_svd_char, X_val_svd_char = train_svd_char[train_idx], train_svd_char[val_idx]
    X_train_num, X_val_num = train_numeric[train_idx], train_numeric[val_idx]
    
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Combine all features
    X_train_combined = hstack([
        X_train_word, X_train_char, X_train_svd, X_train_svd_char, X_train_num
    ])
    X_val_combined = hstack([
        X_val_word, X_val_char, X_val_svd, X_val_svd_char, X_val_num
    ])
    
    print(f"  Training features shape: {X_train_combined.shape}")
    print(f"  Validation features shape: {X_val_combined.shape}")
    
    # Train model
    model = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        n_jobs=-1,
        C=0.5  # Slightly stronger regularization for high-dimensional data
    )
    
    model.fit(X_train_combined, y_train)
    
    # Predict
    val_pred = model.predict_proba(X_val_combined)[:, 1]
    oof_predictions[val_idx] = val_pred
    
    # Calculate score
    score = roc_auc_score(y_val, val_pred)
    cv_scores.append(score)
    print(f"  Fold {fold} AUC: {score:.4f}")
    
    # Predict on test for this fold
    test_combined = hstack([
        test_text_word, test_text_char, test_svd, test_svd_char, test_numeric
    ])
    fold_test_pred = model.predict_proba(test_combined)[:, 1]
    test_predictions += fold_test_pred

# Average test predictions across folds
test_predictions /= 5

# Overall CV score
overall_score = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
print(f"CV scores: {cv_scores}")
print(f"Mean ± Std: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

Starting 5-fold CV with enhanced text representation...

Fold 1/5
  Training features shape: (2302, 11158)
  Validation features shape: (576, 11158)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 1 AUC: 0.6559

Fold 2/5
  Training features shape: (2302, 11158)
  Validation features shape: (576, 11158)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 2 AUC: 0.6321

Fold 3/5
  Training features shape: (2302, 11158)
  Validation features shape: (576, 11158)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 3 AUC: 0.6623

Fold 4/5
  Training features shape: (2303, 11158)
  Validation features shape: (575, 11158)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Fold 4 AUC: 0.6166

Fold 5/5
  Training features shape: (2303, 11158)
  Validation features shape: (575, 11158)


  Fold 5 AUC: 0.6557

Overall CV AUC: 0.6450
CV scores: [0.6558729953649122, 0.632148452009884, 0.6623491981459649, 0.6166472416472417, 0.6557239057239057]
Mean ± Std: 0.6445 ± 0.0173


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Generate final predictions using full training data
print("Training final model on full data...")

# Combine all features for final model
train_combined = hstack([
    train_text_word, train_text_char, train_svd, train_svd_char, train_numeric
])
test_combined = hstack([
    test_text_word, test_text_char, test_svd, test_svd_char, test_numeric
])

# Train final model
final_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
    C=0.5
)

final_model.fit(train_combined, y)

# Generate final predictions
final_predictions = final_model.predict_proba(test_combined)[:, 1]

print(f"Final model trained on {train_combined.shape[1]} features")
print(f"Final predictions shape: {final_predictions.shape}")

Training final model on full data...


Final model trained on 11158 features
Final predictions shape: (1162,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Create submission file
submission = pd.DataFrame({
    'request_id': df_test['request_id'],
    'requester_received_pizza': final_predictions
})

# Ensure proper format
submission['requester_received_pizza'] = submission['requester_received_pizza'].astype(float)

# Save submission
submission_path = '/home/submission/submission_enhanced_text.csv'
submission.to_csv(submission_path, index=False)

print(f"Submission saved to {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Prediction range: {submission['requester_received_pizza'].min():.4f} to {submission['requester_received_pizza'].max():.4f}")
print("\nSubmission preview:")
print(submission.head())

Submission saved to /home/submission/submission_enhanced_text.csv
Submission shape: (1162, 2)
Prediction range: 0.1069 to 0.8835

Submission preview:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.601710
1   t3_roiuw                  0.467747
2   t3_mjnbq                  0.338807
3   t3_t8wd1                  0.518842
4  t3_1m4zxu                  0.565939
