# Linguistic Feature Engineering Experiment

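Based on the findings of the Stanford ICWSM 2014 paper on Reddit "Random Acts of Pizza" requests (Althoff et al., "How to Ask for a Favor: A Case Study on the Success of Altruistic Requests"), this experiment extracts linguistic features intended to predict whether a pizza request succeeds: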
- Need-based narratives (family hardship, job loss, financial strain, student status, medical issues)
- Gratitude expressions (thank you, thanks, appreciate, grateful)
- Evidential language (concrete details, numbers, dates, specific situations)
- Reciprocity promises (pay it forward, help others, contribute back)
- Status signals (Reddit karma, account age references)
- Sentiment analysis using VADER
- Readability metrics

In [5]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
from textstat import flesch_reading_ease, flesch_kincaid_grade, smog_index
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
warnings.filterwarnings('ignore')

# Read the raw request records for both splits and wrap each in a DataFrame.
print("Loading data...")
with open('/home/data/train.json', 'r') as train_file:
    train_data = json.load(train_file)
df_train = pd.DataFrame(train_data)

with open('/home/data/test.json', 'r') as test_file:
    test_data = json.load(test_file)
df_test = pd.DataFrame(test_data)

# Sanity summary: split sizes and the share of requests that received a pizza.
print(f"Training samples: {len(df_train)}")
print(f"Test samples: {len(df_test)}")
print(f"Positive class rate: {df_train['requester_received_pizza'].mean():.3f}")

Loading data...
Training samples: 2878
Test samples: 1162
Positive class rate: 0.248


In [6]:
# Build one free-text field per request: title, a single space, then the body.
# Missing titles or bodies are treated as empty strings.
for frame in (df_train, df_test):
    title = frame['request_title'].fillna('')
    body = frame['request_text_edit_aware'].fillna('')
    frame['combined_text'] = title + ' ' + body

# Lowercased copies are what the (lowercase) regex patterns are matched against.
train_text_lower, test_text_lower = (
    frame['combined_text'].str.lower() for frame in (df_train, df_test)
)

print("Text combined and lowercased for pattern matching")

Text combined and lowercased for pattern matching


In [7]:
# Define linguistic pattern dictionaries based on academic research
#
# Each dictionary maps a feature name to a regular expression that is matched
# against lowercased request text.
#
# NOTE: the next cell redefines all five dictionaries (with somewhat different
# keys and patterns) before any features are extracted, so the definitions in
# this cell are superseded by those.

# Need narratives - family hardship, job loss, financial strain, student status, medical issues
need_patterns = {
    'family_hardship': r'\b(family|child|children|kid|kids|baby|babies|mother|father|parent|parents|wife|husband|brother|sister)\b',
    'job_loss': r'\b(lost job|unemployed|laid off|fired|no job|out of work|jobless)\b',
    'financial_strain': r'\b(broke|poor|bills|rent|mortgage|utilities|electric|gas|water|heat|heating|money|cash|paycheck|pay check|pay day|payday)\b',
    'student_status': r'\b(student|college|university|school|class|tuition|loan|loans|textbook|textbooks)\b',
    'medical_issues': r'\b(hospital|doctor|sick|ill|injury|injured|medicine|medical|health|pain|surgery)\b',
    # "haven.t" uses "." so that any apostrophe character (or none typed as another symbol) matches.
    'food_insecurity': r'\b(hungry|starving|no food|empty fridge|empty stomach|haven.t eaten|haven.t ate)\b'
}

# Gratitude expressions
gratitude_patterns = {
    'thanks_words': r'\b(thank|thanks|thankful|grateful|appreciate|appreciation)\b',
    'thanks_phrases': r'\b(thank you|thanks in advance|thanks so much|thank you so much|really appreciate|truly grateful)\b'
}

# Reciprocity promises
reciprocity_patterns = {
    'pay_forward': r'\b(pay it forward|pay forward|forward the kindness|forward the generosity)\b',
    'help_others': r'\b(help others|help someone|help people|help another|contribute back|give back)\b',
    'future_reciprocity': r'\b(when i get paid|when i get money|when i.m back on my feet|once i.m stable|return the favor)\b'
}

# Status signals (Reddit-specific)
status_patterns = {
    'karma_mention': r'\b(karma|upvotes|downvotes|reputation)\b',
    'account_mention': r'\b(account|profile|user|reddit|member|joined)\b',
    'new_user': r'\b(new here|new to reddit|new account|first post|first time)\b'
}

# Evidential language - numbers, dates, specific details
evidential_patterns = {
    'numbers': r'\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten|first|second|third)\b',
    'time_refs': r'\b(days?|weeks?|months?|years?|today|tomorrow|yesterday|tonight|this morning|this afternoon)\b',
    # No leading \b before "\$": "$" is not a word character, so a word boundary
    # in front of it only exists when a letter/digit directly precedes it and
    # "$20" after a space (or at the start of the text) would never match.
    'money_amounts': r'(\$\d+|\b\d+ dollars|\b\d+ bucks|\b\d+ cents)\b',
    'specific_details': r'\b(specifically|exactly|precisely|particularly|especially|for example|for instance|like|such as)\b'
}

print("Pattern dictionaries defined")

Pattern dictionaries defined


In [8]:
# Function to flag pattern matches (presence/absence, not occurrence counts)
def count_patterns(text_series, pattern_dict):
    """Flag which texts match each regex in ``pattern_dict``.

    Despite the name, occurrences are not counted: each output cell is 1 when
    the pattern is found at least once in the text and 0 otherwise.

    Parameters
    ----------
    text_series : pd.Series
        Texts to search. Missing entries (None/NaN) are treated as "no match".
    pattern_dict : dict
        Maps an output column name to the regular expression to search for.

    Returns
    -------
    pd.DataFrame
        One 0/1 integer column per entry of ``pattern_dict``, in dictionary
        order.
    """
    results = {}
    for pattern_name, pattern_regex in pattern_dict.items():
        # str.contains yields a boolean per text; na=False maps missing texts to
        # False instead of NaN, which astype(int) would fail to convert.
        results[pattern_name] = text_series.str.contains(pattern_regex, regex=True, na=False).astype(int)
    return pd.DataFrame(results)

# Pattern dictionaries actually used for feature extraction.
# Keys become feature (column) names; values are regexes applied to lowercased text.

# Need narratives: family, employment, money, studies, health, hunger
need_patterns = {
    'family_hardship': r'\b(family|child|children|kid|kids|baby|babies|mother|father|parent|parents|wife|husband|brother|sister)\b',
    'job_loss': r'\b(lost job|unemployed|laid off|fired|no job|out of work|jobless)\b',
    'financial_strain': r'\b(broke|poor|bills|rent|mortgage|utilities|electric|gas|water|heat|heating|money|cash|paycheck|pay check|pay day|payday)\b',
    'student_status': r'\b(student|college|university|school|class|tuition|loan|loans|textbook|textbooks)\b',
    'medical_issues': r'\b(hospital|doctor|sick|ill|injury|injured|medicine|medical|health|pain|surgery)\b',
    'food_insecurity': r'\b(hungry|starving|no food|empty fridge|empty stomach|haven.t eaten|haven.t ate)\b'
}

# Gratitude: single words and common multi-word phrases
gratitude_patterns = {
    'thanks_words': r'\b(thank|thanks|thankful|grateful|appreciate|appreciation)\b',
    'thanks_phrases': r'\b(thank you|thanks so much|thank you so much|thanks a lot|thank you very much)\b'
}

# Reciprocity: promises to pay it forward or give back later
reciprocity_patterns = {
    'pay_forward': r'\b(pay it forward|pay forward|forward the kindness|forward this kindness)\b',
    'help_others': r'\b(help others|help someone else|help other people|give back|contribute back|return the favor)\b',
    'promise_future': r'\b(will help|will pay|will contribute|will give back|when i can|once i get|promise to)\b'
}

# Status signals: mentions of karma, the account itself, or being a redditor
status_patterns = {
    'karma_mentions': r'\b(karma|upvote|downvote|points|score)\b',
    'account_mentions': r'\b(account|new account|old account|long time|lurker|lurking)\b',
    'reddit_status': r'\b(redditor|reddit user|member of reddit)\b'
}

# Evidential language: numbers, calendar/time references, words signalling specifics
evidential_patterns = {
    'numbers': r'\b(\d+|one|two|three|four|five|six|seven|eight|nine|ten|first|second|third)\b',
    'dates_times': r'\b(january|february|march|april|may|june|july|august|september|october|november|december|monday|tuesday|wednesday|thursday|friday|saturday|sunday|today|tomorrow|yesterday|week|month|year)\b',
    'specifics': r'\b(specific|specifically|exact|exactly|precise|precisely|detail|details|detailed)\b'
}

# Apply every dictionary to the train texts first, then to the test texts.
print("Extracting need narrative features...")
train_need_features, test_need_features = (
    count_patterns(texts, need_patterns) for texts in (train_text_lower, test_text_lower)
)

print("Extracting gratitude features...")
train_gratitude_features, test_gratitude_features = (
    count_patterns(texts, gratitude_patterns) for texts in (train_text_lower, test_text_lower)
)

print("Extracting reciprocity features...")
train_reciprocity_features, test_reciprocity_features = (
    count_patterns(texts, reciprocity_patterns) for texts in (train_text_lower, test_text_lower)
)

print("Extracting status signal features...")
train_status_features, test_status_features = (
    count_patterns(texts, status_patterns) for texts in (train_text_lower, test_text_lower)
)

print("Extracting evidential language features...")
train_evidential_features, test_evidential_features = (
    count_patterns(texts, evidential_patterns) for texts in (train_text_lower, test_text_lower)
)

Extracting need narrative features...


Extracting gratitude features...
Extracting reciprocity features...


Extracting status signal features...
Extracting evidential language features...


In [9]:
# Sentiment analysis using VADER
# A single analyzer instance is created here; get_sentiment_scores below reads
# it as a module-level global.
print("Computing VADER sentiment scores...")
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_scores(text_series):
    """Score every text with the module-level VADER ``analyzer``.

    Returns a DataFrame with one row per input text (in iteration order),
    built from the dictionaries returned by ``analyzer.polarity_scores``.
    """
    return pd.DataFrame([analyzer.polarity_scores(text) for text in text_series])

# Score the original-case combined text of each split (train first, then test).
train_sentiment, test_sentiment = (
    get_sentiment_scores(frame['combined_text']) for frame in (df_train, df_test)
)

print(f"VADER sentiment features: {list(train_sentiment.columns)}")

Computing VADER sentiment scores...


VADER sentiment features: ['neg', 'neu', 'pos', 'compound']


In [10]:
# Readability metrics: Flesch reading ease, Flesch-Kincaid grade and SMOG index,
# computed with the textstat functions imported at the top of the notebook.
print("Computing readability metrics...")

def compute_readability(text_series):
    """Compute three readability scores for each text.

    Parameters
    ----------
    text_series : iterable of str
        Texts to score.

    Returns
    -------
    pd.DataFrame
        One row per text with the columns ``flesch_reading_ease``,
        ``flesch_kincaid_grade`` and ``smog_index``. If any of the three
        metrics raises for a text, all three are set to 0 for that row.
    """
    results = []
    for text in text_series:
        try:
            flesch = flesch_reading_ease(text)
            fk_grade = flesch_kincaid_grade(text)
            smog = smog_index(text)
        except Exception:
            # Best-effort fallback (the original intent was very short texts).
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop a long-running cell.
            flesch = 0
            fk_grade = 0
            smog = 0
        results.append({
            'flesch_reading_ease': flesch,
            'flesch_kincaid_grade': fk_grade,
            'smog_index': smog
        })
    return pd.DataFrame(results)

# Score the original-case combined text of each split (train first, then test).
train_readability, test_readability = (
    compute_readability(frame['combined_text']) for frame in (df_train, df_test)
)

print(f"Readability features: {list(train_readability.columns)}")

Computing readability metrics...


Readability features: ['flesch_reading_ease', 'flesch_kincaid_grade', 'smog_index']


In [11]:
# Stack every linguistic feature family side by side (column-wise).
print("Combining all linguistic features...")

train_feature_blocks = [
    train_need_features,
    train_gratitude_features,
    train_reciprocity_features,
    train_status_features,
    train_evidential_features,
    train_sentiment,
    train_readability,
]
test_feature_blocks = [
    test_need_features,
    test_gratitude_features,
    test_reciprocity_features,
    test_status_features,
    test_evidential_features,
    test_sentiment,
    test_readability,
]
train_linguistic_features = pd.concat(train_feature_blocks, axis=1)
test_linguistic_features = pd.concat(test_feature_blocks, axis=1)

print(f"Total linguistic features: {train_linguistic_features.shape[1]}")
print(f"Training shape: {train_linguistic_features.shape}")
print(f"Test shape: {test_linguistic_features.shape}")


def _abs_target_corr(column):
    """Absolute correlation of one feature column with the target; 0 when undefined (NaN)."""
    r = np.corrcoef(train_linguistic_features[column], df_train['requester_received_pizza'])[0, 1]
    return 0 if np.isnan(r) else abs(r)


# Rank features from strongest to weakest absolute correlation with the target.
correlations = sorted(
    ((col, _abs_target_corr(col)) for col in train_linguistic_features.columns),
    key=lambda pair: pair[1],
    reverse=True,
)
print("\nTop 10 features by absolute correlation:")
for feat, corr in correlations[:10]:
    print(f"  {feat}: {corr:.4f}")

Combining all linguistic features...
Total linguistic features: 24
Training shape: (2878, 24)
Test shape: (1162, 24)

Top 10 features by absolute correlation:
  numbers: 0.1108
  dates_times: 0.0924
  financial_strain: 0.0607
  thanks_words: 0.0606
  thanks_phrases: 0.0597
  neu: 0.0544
  compound: 0.0522
  job_loss: 0.0516
  student_status: 0.0496
  pay_forward: 0.0485


In [12]:
# Assemble the three model inputs: raw text, requester activity features, target.
print("Preparing text and numeric features...")

# Raw combined text, vectorised later with TF-IDF.
text_features = df_train['combined_text'].values
test_text_features = df_test['combined_text'].values

# Requester activity counts at request time (from the previous baseline).
count_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_plus_downvotes_at_request'
]

# Derive the same columns on both splits.
for frame in (df_train, df_test):
    # log1p of each raw count
    for feat in count_features:
        frame[f'{feat}_log'] = np.log1p(frame[feat])

    # Ratios; the +1 in the denominator avoids division by zero.
    frame['upvotes_per_comment'] = (
        frame['requester_upvotes_plus_downvotes_at_request']
        / (frame['requester_number_of_comments_at_request'] + 1)
    )
    frame['comments_per_post'] = (
        frame['requester_number_of_comments_at_request']
        / (frame['requester_number_of_posts_at_request'] + 1)
    )

    # Account age converted from days to years.
    frame['account_age_years'] = frame['requester_account_age_in_days_at_request'] / 365.25

numeric_features = [f'{feat}_log' for feat in count_features] + [
    'upvotes_per_comment',
    'comments_per_post',
    'account_age_years'
]

# Missing values are replaced by 0 before converting to plain arrays.
train_numeric = df_train[numeric_features].fillna(0).values
test_numeric = df_test[numeric_features].fillna(0).values

# Target
y = df_train['requester_received_pizza'].values

print(f"Numeric features shape: {train_numeric.shape}")
print(f"Linguistic features shape: {train_linguistic_features.shape}")
print(f"Target shape: {y.shape}")

Preparing text and numeric features...
Numeric features shape: (2878, 6)
Linguistic features shape: (2878, 24)
Target shape: (2878,)


In [13]:
# Buffers filled by the cross-validation loop: out-of-fold predictions,
# accumulated test predictions, and one AUC per fold.
oof_predictions = np.zeros(len(df_train))
test_predictions = np.zeros(len(df_test))
cv_scores = []

# 5 shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Word uni- and bigrams, English stop words removed, vocabulary capped at 5000
# terms; terms must occur in at least 2 documents and at most 95% of them.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    max_features=5000
)

# Logistic regression; 'balanced' class weights compensate for class imbalance.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1,
    random_state=42
)

print("Starting 5-fold CV with linguistic features...")

Starting 5-fold CV with linguistic features...


In [14]:
# Cross-validation loop
#
# Start from fresh buffers so that re-running this cell never accumulates onto
# the results of a previous run (test predictions are summed in place below and
# fold scores are appended).
oof_predictions = np.zeros(len(df_train))
test_predictions = np.zeros(len(df_test))
cv_scores = []

# Take the fold count from the splitter instead of repeating a literal.
n_splits = skf.get_n_splits()

# NOTE(review): the recorded run shows the solver stopping at max_iter in most
# folds. The dense numeric/linguistic columns are stacked unscaled next to the
# TF-IDF columns; consider scaling them (fit on the training fold only) here
# and in the final-model cell together.
for fold, (train_idx, val_idx) in enumerate(skf.split(df_train, y), start=1):
    print(f"\nFold {fold}/{n_splits}")

    # Split data
    X_train_text, X_val_text = text_features[train_idx], text_features[val_idx]
    X_train_num, X_val_num = train_numeric[train_idx], train_numeric[val_idx]
    X_train_ling, X_val_ling = train_linguistic_features.iloc[train_idx].values, train_linguistic_features.iloc[val_idx].values
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit TF-IDF on the training fold only, so the validation text does not
    # influence the vocabulary or IDF weights.
    X_train_text_tfidf = tfidf.fit_transform(X_train_text)
    X_val_text_tfidf = tfidf.transform(X_val_text)

    # Combine all features: TF-IDF columns, then numeric, then linguistic
    X_train_combined = hstack([X_train_text_tfidf, X_train_num, X_train_ling])
    X_val_combined = hstack([X_val_text_tfidf, X_val_num, X_val_ling])

    # Fit model
    model.fit(X_train_combined, y_train)

    # Out-of-fold predictions (probability of the positive class)
    val_pred = model.predict_proba(X_val_combined)[:, 1]
    oof_predictions[val_idx] = val_pred

    # Calculate score
    score = roc_auc_score(y_val, val_pred)
    cv_scores.append(score)
    print(f"Fold {fold} AUC: {score:.4f}")

    # Predict on test with this fold's vectoriser and model; summed here and
    # averaged after the loop.
    test_text_tfidf = tfidf.transform(test_text_features)
    test_combined = hstack([test_text_tfidf, test_numeric, test_linguistic_features.values])
    fold_test_pred = model.predict_proba(test_combined)[:, 1]
    test_predictions += fold_test_pred

# Average test predictions across folds
test_predictions /= n_splits

# Overall CV score
overall_score = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
print(f"CV scores: {cv_scores}")
print(f"Mean \u00b1 Std: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")


Fold 1/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1 AUC: 0.6104

Fold 2/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2 AUC: 0.6196

Fold 3/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3 AUC: 0.5913

Fold 4/5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 4 AUC: 0.6206

Fold 5/5


Fold 5 AUC: 0.6208

Overall CV AUC: 0.6118
CV scores: [0.6103942247129314, 0.6195997997383678, 0.591320919265492, 0.6206293706293705, 0.6208398083398083]
Mean ± Std: 0.6126 ± 0.0113


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Generate final predictions using full training data
print("Training final model on full data...")

# Build fresh, unfitted estimators with exactly the hyperparameters that were
# cross-validated (taken from `tfidf` and `model`), instead of repeating the
# settings by hand where they could silently drift from the CV configuration.
final_tfidf = TfidfVectorizer(**tfidf.get_params())

# Fit TF-IDF on all training text, then apply the same vocabulary to test
train_text_tfidf = final_tfidf.fit_transform(text_features)
test_text_tfidf = final_tfidf.transform(test_text_features)

# Combine all features: TF-IDF columns, then numeric, then linguistic
train_combined = hstack([train_text_tfidf, train_numeric, train_linguistic_features.values])
test_combined = hstack([test_text_tfidf, test_numeric, test_linguistic_features.values])

# Train final model
final_model = LogisticRegression(**model.get_params())

final_model.fit(train_combined, y)

# Generate final predictions (probability of the positive class)
final_predictions = final_model.predict_proba(test_combined)[:, 1]

print(f"Final model trained on {train_combined.shape[1]} features")
print(f"Final predictions shape: {final_predictions.shape}")

Training final model on full data...


Final model trained on 5030 features
Final predictions shape: (1162,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Destination of the submission CSV.
submission_path = '/home/submission/submission_linguistic_features.csv'

# Two columns: the request id and the predicted probability (stored as float).
submission = pd.DataFrame({
    'request_id': df_test['request_id'],
    'requester_received_pizza': final_predictions
}).astype({'requester_received_pizza': float})

# Write without the DataFrame index.
submission.to_csv(submission_path, index=False)

# Report where the file went and what it looks like.
print(f"Submission saved to {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Prediction range: {submission['requester_received_pizza'].min():.4f} to {submission['requester_received_pizza'].max():.4f}")
print("\nSubmission preview:")
print(submission.head())

Submission saved to /home/submission/submission_linguistic_features.csv
Submission shape: (1162, 2)
Prediction range: 0.0800 to 0.9226

Submission preview:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.582336
1   t3_roiuw                  0.448394
2   t3_mjnbq                  0.448839
3   t3_t8wd1                  0.469571
4  t3_1m4zxu                  0.524425
