In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data.
train_path = '/home/data/train.json'

# Read the JSON file. The encoding is passed explicitly: without it, open()
# decodes with the platform's locale encoding, which can fail or mis-decode
# non-ASCII request text on machines whose default is not UTF-8.
# NOTE(review): assumes the file is UTF-8 encoded (the usual JSON encoding) - confirm.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick look at the container: its size, its Python type, and the field
# names carried by the first record.
print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Build a DataFrame from the loaded records so the summaries below can use
# pandas column operations.
df = pd.DataFrame(train_data)

# Target column: how many requests fall in each class, and the mean of the
# column, reported as the success rate.
received_pizza = df['requester_received_pizza']
print("Target distribution:")
print(received_pizza.value_counts())
print(f"Success rate: {received_pizza.mean():.3f}")

# Total number of null cells over the whole frame (a single number, not a
# per-column breakdown).
n_missing_cells = df.isna().sum().sum()
print("\nMissing values:")
print(n_missing_cells)

# Number of columns per dtype.
dtype_counts = df.dtypes.value_counts()
print("\nData types:")
print(dtype_counts)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64
Success rate: 0.248

Missing values:
2163

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [3]:
# Examine the text fields: print the first request's body (first 500
# characters) and its title, each followed by a separator.
divider = "\n" + "="*80 + "\n"
request_text = df['request_text']
request_title = df['request_title']

print("Sample request text:")
print(request_text.iloc[0][:500])
print(divider)

print("Sample request title:")
print(request_title.iloc[0])
print(divider)

# Per-row character counts, computed once per column and reused for the
# mean/max figures below.
text_lengths = request_text.str.len()
title_lengths = request_title.str.len()
edit_aware_lengths = df['request_text_edit_aware'].str.len()

print("Text length statistics:")
print(f"Request text - Mean: {text_lengths.mean():.0f}, Max: {text_lengths.max()}")
print(f"Request title - Mean: {title_lengths.mean():.0f}, Max: {title_lengths.max()}")
print(f"Request text (edit aware) - Mean: {edit_aware_lengths.mean():.0f}")

# Mean and standard deviation for a handful of numeric columns.
print("\nKey numerical features:")
numerical_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'request_number_of_comments_at_retrieval',
]
for feat in numerical_features:
    column = df[feat]
    print(f"{feat}: mean={column.mean():.1f}, std={column.std():.1f}")

Sample request text:
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time to Afganistan.


Sample request title:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.


Text length statistics:
Request text - Mean: 403, Max: 4460
Request title - Mean: 72, Max: 272
Request text (edit aware) - Mean: 395

Key numerical features:
requester_number_of_comments_at_request: mean=112.3, std=192.0
requester_number_of_posts_at_request: mean=21.6, std=51.6
requester_upvotes_minus_downvotes_at_request: mean=1184.6, std=4198.3
request_number_of_comments_at_retrieval: mean=2.9, std=4.8


In [None]:
# Load test data to see what features are available.
# The path gets its own name, mirroring train_path in the training-load cell.
test_path = '/home/data/test.json'

# Explicit encoding: without it, open() decodes with the platform's locale
# encoding, which can fail or mis-decode non-ASCII text on non-UTF-8 systems.
# NOTE(review): assumes the file is UTF-8 encoded (the usual JSON encoding) - confirm.
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Number of test samples: {len(test_data)}")
print(f"Test sample keys: {list(test_data[0].keys())}")

# Check what's missing compared to train.
# df was built from every training record, so the test side is likewise
# collected from every test record rather than only the first one; a key that
# appears only in later records would otherwise be reported as missing.
train_keys = set(df.columns)
test_keys = set().union(*(sample.keys() for sample in test_data))

# Print the differences sorted: set iteration order for strings can change
# between kernel restarts, which would make the displayed output unstable.
print(f"\nFeatures in train but not in test: {sorted(train_keys - test_keys)}")
print(f"Features in test but not in train: {sorted(test_keys - train_keys)}")