In [1]:
import json
import pandas as pd
import numpy as np

# Load the training data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)

# Check target distribution
print("Target distribution:")
print(df['requester_received_pizza'].value_counts())
print(f"\nClass balance: {df['requester_received_pizza'].mean():.3f} (positive rate)")

# Check for missing values
print(f"\nMissing values per column:")
missing = df.isnull().sum()
print(missing[missing > 0])

# Check data types
print(f"\nData types:")
print(df.dtypes.value_counts())

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Class balance: 0.248 (positive rate)

Missing values per column:
requester_user_flair    2163
dtype: int64

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [3]:
# Explore text features
print("Text feature examples:")
print(f"Request title (first sample): {df['request_title'].iloc[0][:100]}...")
print(f"Request text (first sample): {df['request_text'].iloc[0][:200]}...")
print(f"Request text length - min: {df['request_text'].str.len().min()}, max: {df['request_text'].str.len().max()}, mean: {df['request_text'].str.len().mean():.0f}")

# Explore user flair
print(f"\nUser flair distribution:")
print(df['requester_user_flair'].value_counts(dropna=False))

# Explore numeric features
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumeric features ({len(numeric_cols)}): {numeric_cols[:10]}...")

# Check correlation with target
correlations = df[numeric_cols].corrwith(df['requester_received_pizza']).abs().sort_values(ascending=False)
print(f"\nTop correlations with target:")
print(correlations.head(10))

Text feature examples:
Request title (first sample): [REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy....
Request text (first sample): I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time ...
Request text length - min: 0, max: 4460, mean: 403

User flair distribution:
requester_user_flair
None      2163
shroom     677
PIF         38
Name: count, dtype: int64

Numeric features (22): ['number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'request_number_of_comments_at_retrieval', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comment

In [4]:
# Explore subreddit features
print("Subreddit analysis:")
# Count of subreddits
subreddit_counts = df['requester_number_of_subreddits_at_request']
print(f"Number of subreddits - min: {subreddit_counts.min()}, max: {subreddit_counts.max()}, mean: {subreddit_counts.mean():.1f}")

# Success rate by subreddit count quantiles
df['subreddit_quantile'] = pd.qcut(subreddit_counts, q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
success_by_quantile = df.groupby('subreddit_quantile')['requester_received_pizza'].agg(['count', 'mean'])
print(f"\nSuccess rate by subreddit count quartile:")
print(success_by_quantile)

# Explore user flair impact
print(f"\nSuccess rate by user flair:")
flair_success = df.groupby('requester_user_flair')['requester_received_pizza'].agg(['count', 'mean'])
print(flair_success)

# Explore text length impact
df['text_length'] = df['request_text'].str.len()
df['text_length_quantile'] = pd.qcut(df['text_length'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
length_success = df.groupby('text_length_quantile')['requester_received_pizza'].agg(['count', 'mean'])
print(f"\nSuccess rate by text length quartile:")
print(length_success)

Subreddit analysis:
Number of subreddits - min: 0, max: 186, mean: 17.9

Success rate by subreddit count quartile:
                    count      mean
subreddit_quantile                 
Q1                    786  0.170483
Q2                    715  0.276923
Q3                    658  0.290274
Q4                    719  0.267038

Success rate by user flair:
                      count  mean
requester_user_flair             
PIF                      38   1.0
shroom                  677   1.0

Success rate by text length quartile:
                      count      mean
text_length_quantile                 
Q1                      721  0.166436
Q2                      720  0.201389
Q3                      717  0.271967
Q4                      720  0.354167


  success_by_quantile = df.groupby('subreddit_quantile')['requester_received_pizza'].agg(['count', 'mean'])
  length_success = df.groupby('text_length_quantile')['requester_received_pizza'].agg(['count', 'mean'])
