In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)

# Check target distribution
print("Target distribution:")
print(df['requester_received_pizza'].value_counts())
print(f"\nTarget balance: {df['requester_received_pizza'].mean():.3f}")

# Check data types
print("\nData types:")
print(df.dtypes.value_counts())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum().sum())

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Target balance: 0.248

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64

Missing values:
2163


In [3]:
# Check which columns have missing values
missing_counts = df.isnull().sum()
print("Columns with missing values:")
print(missing_counts[missing_counts > 0])

# Check text features
print("\nText features sample:")
print("Request title sample:")
print(df['request_title'].iloc[0])
print("\nRequest text sample (first 200 chars):")
print(df['request_text'].iloc[0][:200])

# Check text lengths
print("\nText length statistics:")
print(f"Title length - mean: {df['request_title'].str.len().mean():.1f}, max: {df['request_title'].str.len().max()}")
print(f"Text length - mean: {df['request_text'].str.len().mean():.1f}, max: {df['request_text'].str.len().max()}")

Columns with missing values:
requester_user_flair    2163
dtype: int64

Text features sample:
Request title sample:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Request text sample (first 200 chars):
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time 

Text length statistics:
Title length - mean: 71.6, max: 272
Text length - mean: 402.5, max: 4460


In [4]:
# Check key numerical features correlation with target
numerical_features = ['requester_account_age_in_days_at_request', 
                      'requester_number_of_comments_at_request',
                      'requester_number_of_posts_at_request',
                      'requester_upvotes_minus_downvotes_at_request',
                      'requester_number_of_subreddits_at_request',
                      'number_of_upvotes_of_request_at_retrieval',
                      'number_of_downvotes_of_request_at_retrieval',
                      'request_number_of_comments_at_retrieval']

print("Correlation with target:")
correlations = df[numerical_features + ['requester_received_pizza']].corr()['requester_received_pizza'].sort_values(ascending=False)
print(correlations)

# Check user flair distribution (excluding missing)
print("\nUser flair distribution (non-missing):")
print(df['requester_user_flair'].value_counts(dropna=False))

# Check success rate by flair
print("\nSuccess rate by user flair:")
print(df.groupby('requester_user_flair')['requester_received_pizza'].agg(['count', 'mean']).round(3))

Correlation with target:
requester_received_pizza                        1.000000
request_number_of_comments_at_retrieval         0.290709
number_of_upvotes_of_request_at_retrieval       0.090767
requester_number_of_subreddits_at_request       0.047001
requester_account_age_in_days_at_request        0.043374
requester_upvotes_minus_downvotes_at_request    0.043084
requester_number_of_posts_at_request            0.037975
requester_number_of_comments_at_request         0.036705
number_of_downvotes_of_request_at_retrieval     0.020646
Name: requester_received_pizza, dtype: float64

User flair distribution (non-missing):
requester_user_flair
None      2163
shroom     677
PIF         38
Name: count, dtype: int64

Success rate by user flair:
                      count  mean
requester_user_flair             
PIF                      38   1.0
shroom                  677   1.0
