In [1]:
import json
import pandas as pd
import numpy as np

# Load the training data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_u

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)

# Check target distribution
print("Target distribution:")
print(df['requester_received_pizza'].value_counts())
print(f"\nPercentage of successful requests: {df['requester_received_pizza'].mean():.2%}")

# Check data types
print("\nData types:")
print(df.dtypes.value_counts())

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage of successful requests: 24.84%

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [3]:
# Explore text fields
print("Sample request title:")
print(df['request_title'].iloc[0])
print("\nSample request text (first 500 chars):")
print(df['request_text'].iloc[0][:500])

# Check text lengths
print("\nText length statistics:")
print(f"Title length - Mean: {df['request_title'].str.len().mean():.1f}, Max: {df['request_title'].str.len().max()}")
print(f"Text length - Mean: {df['request_text'].str.len().mean():.1f}, Max: {df['request_text'].str.len().max()}")

# Check some key numerical features
print("\nKey numerical features distribution:")
numerical_features = ['requester_account_age_in_days_at_request', 
                      'requester_number_of_comments_at_request',
                      'requester_number_of_posts_at_request',
                      'requester_upvotes_minus_downvotes_at_request']

for feature in numerical_features:
    print(f"\n{feature}:")
    print(df[feature].describe())

Sample request title:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Sample request text (first 500 chars):
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time to Afganistan.

Text length statistics:
Title length - Mean: 71.6, Max: 272
Text length - Mean: 402.5, Max: 4460

Key numerical features distribution:

requester_account_age_in_days_at_request:
count    2878.000000
mean      250.682364
std       301.838771
min         0.000000
25%         3.038877
50%       155.156377
75%       383.640090
max      2809.750787
Name: requester_account_age_in_days_at_request, dtype: float64

requester_number_of_comments_at_request:
count    2878.000000
mean      112.311327
std       192.017515
min         0.000000
25%         0.000000
50%        22.000000
75%       132.000000
max       981.000000
Name: requester_number_of_comments_at

In [4]:
# Check for missing values
print("Missing values:")
print(df.isnull().sum().sort_values(ascending=False).head(10))

# Check user flair distribution
print("\nUser flair distribution:")
print(df['requester_user_flair'].value_counts())

# Check correlation between flair and success
print("\nSuccess rate by user flair:")
print(df.groupby('requester_user_flair')['requester_received_pizza'].agg(['count', 'sum', 'mean']))

Missing values:
requester_user_flair                           2163
giver_username_if_known                           0
number_of_upvotes_of_request_at_retrieval         0
number_of_downvotes_of_request_at_retrieval       0
request_id                                        0
request_number_of_comments_at_retrieval           0
request_text                                      0
post_was_edited                                   0
request_title                                     0
requester_account_age_in_days_at_request          0
dtype: int64

User flair distribution:
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

Success rate by user flair:
                      count  sum  mean
requester_user_flair                  
PIF                      38   38   1.0
shroom                  677  677   1.0


In [5]:
# Check for data leakage - compare at_request vs at_retrieval features
print("Checking for potential data leakage...")

# Features that should be identical or very similar
leakage_pairs = [
    ('requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval'),
    ('requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval'),
    ('requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval'),
    ('requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_minus_downvotes_at_retrieval')
]

for req_feat, ret_feat in leakage_pairs:
    diff = (df[req_feat] - df[ret_feat]).abs()
    print(f"{req_feat} vs {ret_feat}:")
    print(f"  Max difference: {diff.max()}")
    print(f"  Mean difference: {diff.mean()}")
    print(f"  Samples with difference > 0: {(diff > 0).sum()}")
    print()

Checking for potential data leakage...
requester_account_age_in_days_at_request vs requester_account_age_in_days_at_retrieval:
  Max difference: 927.4474421296296
  Mean difference: 503.2615808745464
  Samples with difference > 0: 2878

requester_number_of_comments_at_request vs requester_number_of_comments_at_retrieval:
  Max difference: 1000
  Mean difference: 168.23835997220291
  Samples with difference > 0: 2486

requester_number_of_posts_at_request vs requester_number_of_posts_at_retrieval:
  Max difference: 998
  Mean difference: 19.294996525364837
  Samples with difference > 0: 2869

requester_upvotes_minus_downvotes_at_request vs requester_upvotes_minus_downvotes_at_retrieval:
  Max difference: 70785
  Mean difference: 1550.639680333565
  Samples with difference > 0: 2732



In [None]:
# Analyze temporal patterns
import matplotlib.pyplot as plt

# Convert timestamps to datetime
df['request_datetime'] = pd.to_datetime(df['unix_timestamp_of_request_utc'], unit='s')

# Extract temporal features
df['request_hour'] = df['request_datetime'].dt.hour
df['request_dayofweek'] = df['request_datetime'].dt.dayofweek
df['request_month'] = df['request_datetime'].dt.month

print("Success rate by hour of day:")
hourly_success = df.groupby('request_hour')['requester_received_pizza'].agg(['count', 'mean']).round(3)
print(hourly_success)

print("\nSuccess rate by day of week:")
daily_success = df.groupby('request_dayofweek')['requester_received_pizza'].agg(['count', 'mean']).round(3)
print(daily_success)