In [2]:
import pandas as pd
import numpy as np
import json

# Read the training set (the file holds a single JSON array) into a DataFrame
train_df = pd.read_json('/home/data/train.json')

# Report the frame's dimensions and the names of its columns
train_shape, train_columns = train_df.shape, list(train_df.columns)
print(f"Training data shape: {train_shape}")
print(f"Columns: {train_columns}")

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

In [3]:
# Tally the outcome labels, then report the share of requests that received a pizza
target_counts = train_df['requester_received_pizza'].value_counts()
print("Target distribution:")
print(target_counts)

# Positive count divided by the total number of rows
success_rate = target_counts[True] / train_df.shape[0]
print(f"Success rate: {success_rate:.3f}")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64
Success rate: 0.248


In [4]:
# Overview of how many columns there are of each dtype
print("Data types:")
dtype_tally = train_df.dtypes.value_counts()
print(dtype_tally)

# Per-column null counts, showing only the columns that actually have nulls
print("\nMissing values:")
missing = train_df.isna().sum()
has_missing = missing > 0
print(missing[has_missing])

# Character lengths of the request body and title, stored as new columns on
# train_df. Note: once these columns exist, re-running this cell will include
# them in the dtype tally printed above.
print("\nText length statistics:")
length_columns = {
    'request_text_length': 'request_text',
    'request_title_length': 'request_title',
}
for length_col, text_col in length_columns.items():
    train_df[length_col] = train_df[text_col].str.len()
print(train_df[list(length_columns)].describe())

Data types:
int64      19
object      8
float64     4
bool        1
Name: count, dtype: int64

Missing values:
requester_user_flair    2163
dtype: int64

Text length statistics:
       request_text_length  request_title_length
count          2878.000000           2878.000000
mean            402.521543             71.572967
std             362.393727             36.233487
min               0.000000              7.000000
25%             182.000000             46.000000
50%             308.000000             64.000000
75%             503.750000             90.000000
max            4460.000000            272.000000


In [None]:
# Check key meta-data features correlation with target
import matplotlib.pyplot as plt

# Candidate numeric predictors to screen against the outcome
numeric_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'request_number_of_comments_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
]

# Encode the boolean target as 0/1 so Series.corr can be applied to it
target_numeric = train_df['requester_received_pizza'].astype(int)

# Correlate each listed feature with the target; names that are not columns
# of train_df are skipped without raising
correlations = {
    name: train_df[name].corr(target_numeric)
    for name in numeric_features
    if name in train_df.columns
}

# Rank by magnitude (sign ignored), strongest first
sorted_correlations = sorted(
    correlations.items(),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)
print("Feature correlations with target (sorted by absolute value):")
for feature, corr in sorted_correlations:
    print(f"{feature}: {corr:.3f}")

# Distribution of user flair; dropna=False counts requests with no flair as
# their own bucket instead of leaving them out.
print("\nUser flair distribution:")
print(train_df['requester_user_flair'].value_counts(dropna=False))

# Success rate per flair value ('count' = rows in the group, 'mean' = share of
# True outcomes). groupby() discards NaN keys by default, which would silently
# omit every request with a missing flair (2163 rows per the missing-value
# check); dropna=False keeps that group so this table covers the same rows as
# the distribution printed just above.
# NOTE(review): the number of missing flairs (2163) equals the number of
# unsuccessful requests (2163) in the target distribution — flair may encode
# the outcome itself; confirm before using it as a model feature.
print("\nSuccess rate by user flair:")
flair_success = (
    train_df
    .groupby('requester_user_flair', dropna=False)['requester_received_pizza']
    .agg(['count', 'mean'])
)
print(flair_success)