In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data
train_path = "/home/data/train.json"

# Read the JSON file.
# The encoding is given explicitly: JSON text is UTF-8, and relying on the
# platform default (e.g. cp1252 on some systems) can raise UnicodeDecodeError
# or silently mis-decode non-ASCII characters in the request text.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
# Guard the peek at the first record so an empty file yields a readable
# message instead of an IndexError.
if train_data:
    print(f"First sample keys: {list(train_data[0].keys())}")
else:
    print("First sample keys: <no samples loaded>")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Turn the list of loaded records into a table for analysis
df = pd.DataFrame(train_data)

# Headline facts: table dimensions and balance of the target column
target = df['requester_received_pizza']
print("DataFrame shape:", df.shape)
print("\nTarget distribution:")
print(target.value_counts())
print(f"\nSuccess rate: {target.mean():.3f}")

# Per-column null counts; only the first ten columns are printed
null_counts = df.isnull().sum()
print("\nMissing values:")
print(null_counts.head(10))

DataFrame shape: (2878, 32)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Success rate: 0.248

Missing values:
giver_username_if_known                        0
number_of_downvotes_of_request_at_retrieval    0
number_of_upvotes_of_request_at_retrieval      0
post_was_edited                                0
request_id                                     0
request_number_of_comments_at_retrieval        0
request_text                                   0
request_text_edit_aware                        0
request_title                                  0
requester_account_age_in_days_at_request       0
dtype: int64


In [3]:
# Look at one raw example of the free-text fields, then summarise lengths
titles = df['request_title']
bodies = df['request_text']
divider = "\n" + "=" * 50

print("Sample request title:")
print(titles.iloc[0])
print(divider)
print("Sample request text (first 500 chars):")
print(bodies.iloc[0][:500])

# String length of every entry, computed once per column
title_lengths = titles.str.len()
body_lengths = bodies.str.len()
edit_aware_lengths = df['request_text_edit_aware'].str.len()

print(divider)
print("Text length statistics:")
print("Title length - mean:", title_lengths.mean(), "max:", title_lengths.max())
print("Text length - mean:", body_lengths.mean(), "max:", body_lengths.max())
print("Edit-aware text length - mean:", edit_aware_lengths.mean())

Sample request title:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Sample request text (first 500 chars):
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time to Afganistan.

Text length statistics:
Title length - mean: 71.57296733842946 max: 272
Text length - mean: 402.5215427380125 max: 4460
Edit-aware text length - mean: 394.5677553856845


In [4]:
# Numeric columns to summarise.
# NOTE(review): the last three names end in "_at_retrieval" rather than
# "_at_request" - presumably recorded after the request was posted; confirm
# before treating them as model inputs.
metadata_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
]

print("Metadata feature statistics:")
for feature in metadata_features:
    # Names that are not columns of the frame are skipped without a message
    if feature not in df.columns:
        continue
    values = df[feature]
    print(f"\n{feature}:")
    print(f"  mean: {values.mean():.2f}, std: {values.std():.2f}")
    print(f"  min: {values.min():.2f}, max: {values.max():.2f}")
    print(f"  median: {values.median():.2f}")

Metadata feature statistics:

requester_account_age_in_days_at_request:
  mean: 250.68, std: 301.84
  min: 0.00, max: 2809.75
  median: 155.16

requester_number_of_comments_at_request:
  mean: 112.31, std: 192.02
  min: 0.00, max: 981.00
  median: 22.00

requester_number_of_posts_at_request:
  mean: 21.61, std: 51.58
  min: 0.00, max: 867.00
  median: 4.00

requester_number_of_comments_in_raop_at_request:
  mean: 0.69, std: 3.75
  min: 0.00, max: 88.00
  median: 0.00

requester_number_of_posts_on_raop_at_request:
  mean: 0.07, std: 0.34
  min: 0.00, max: 5.00
  median: 0.00

requester_upvotes_minus_downvotes_at_request:
  mean: 1184.58, std: 4198.26
  min: -67.00, max: 155010.00
  median: 171.00

requester_upvotes_plus_downvotes_at_request:
  mean: 3988.57, std: 30127.47
  min: 0.00, max: 1286864.00
  median: 335.50

requester_number_of_subreddits_at_request:
  mean: 17.86, std: 21.78
  min: 0.00, max: 186.00
  median: 11.00

number_of_upvotes_of_request_at_retrieval:
  mean: 6.09, std