In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data.
# NOTE: this is an absolute path on the machine that produced the notebook;
# adjust it here if the data lives somewhere else.
train_path = '/home/data/train.json'
# Read the file explicitly as UTF-8 (the standard encoding for JSON) so that
# non-ASCII characters in the text fields decode the same way on every
# platform instead of depending on the locale's default encoding.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity check: number of records loaded and the fields of the first one.
print(f"Number of training samples: {len(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")
print("\nFirst sample preview:")
# Only the first five fields are printed to keep the preview short.
for key, value in list(train_data[0].items())[:5]:
    print(f"  {key}: {value}")

Number of training samples: 2878
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_u

In [2]:
# Put the loaded records into a DataFrame so they can be analysed by column
df = pd.DataFrame(train_data)

# Class balance of the target column: absolute counts and percentage shares
target = df['requester_received_pizza']
target_counts = target.value_counts()
target_pct = target.value_counts(normalize=True) * 100

print("Target Distribution:")
print(f"No pizza: {target_counts.loc[False]} ({target_pct.loc[False]:.1f}%)")
print(f"Pizza received: {target_counts.loc[True]} ({target_pct.loc[True]:.1f}%)")
# Ratio of negative to positive examples
imbalance = target_counts.loc[False] / target_counts.loc[True]
print(f"Class imbalance ratio: {imbalance:.2f}:1")

# Per-column count of missing entries; only columns with at least one are shown
print(f"\nMissing values:")
missing = df.isna().sum()
any_missing = missing.sum() > 0
print(missing.loc[missing > 0] if any_missing else "No missing values found")

Target Distribution:
No pizza: 2163 (75.2%)
Pizza received: 715 (24.8%)
Class imbalance ratio: 3.03:1

Missing values:
requester_user_flair    2163
dtype: int64


In [3]:
# Character-length summary statistics for each text field
print("Text Features Analysis:")
title_len_stats = df['request_title'].str.len().describe()
print(f"Request title length (chars): {title_len_stats}")
text_len_stats = df['request_text'].str.len().describe()
print(f"\nRequest text length (chars): {text_len_stats}")
edit_aware_len_stats = df['request_text_edit_aware'].str.len().describe()
print(f"\nRequest text edit-aware length (chars): {edit_aware_len_stats}")

# Print the first request of each outcome as a concrete illustration
separator = "\n" + "="*80

print(separator)
print("EXAMPLE SUCCESSFUL REQUEST:")
success_example = df.loc[df['requester_received_pizza'].eq(True)].iloc[0]
print(f"Title: {success_example['request_title']}")
success_preview = success_example['request_text'][:200]
print(f"Text (first 200 chars): {success_preview}...")

print(separator)
print("EXAMPLE UNSUCCESSFUL REQUEST:")
fail_example = df.loc[df['requester_received_pizza'].eq(False)].iloc[0]
print(f"Title: {fail_example['request_title']}")
fail_preview = fail_example['request_text'][:200]
print(f"Text (first 200 chars): {fail_preview}...")

Text Features Analysis:
Request title length (chars): count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64

Request text length (chars): count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text, dtype: float64

Request text edit-aware length (chars): count    2878.000000
mean      394.567755
std       351.922518
min         0.000000
25%       180.000000
50%       302.000000
75%       498.000000
max      4460.000000
Name: request_text_edit_aware, dtype: float64

EXAMPLE SUCCESSFUL REQUEST:
Title: [REQUEST] Not much food until tomorrow.
Text (first 200 chars): I will go ahead and say that I got a pizza meal from here before as to not seem like I'm scamming anyone. I have been promised 2 well-paying jobs and one 

In [4]:
# Analyze numerical/meta-data features
# NOTE(review): the *_at_retrieval columns are presumably measured after the
# request was made (possibly after its outcome was known) - confirm before
# treating them as model inputs.
numerical_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'request_number_of_comments_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval'
]

print("Numerical Features Summary:")
for feature in numerical_features:
    # Columns absent from the DataFrame are skipped rather than raising.
    if feature in df.columns:
        print(f"\n{feature}:")
        print(df[feature].describe())

# Analyze user flair distribution (missing flair is counted as its own entry)
print("\n" + "="*80)
print("User Flair Distribution:")
flair_counts = df['requester_user_flair'].value_counts(dropna=False)
print(flair_counts)

# Cross-tabulate flair with target.
# pd.crosstab leaves out rows whose key is missing, which would silently
# remove every request without a flair from the table even though the
# distribution printed above includes them. Give missing flair an explicit
# label so both views cover the same rows.
print("\nFlair vs Target:")
flair_with_missing = df['requester_user_flair'].fillna('(missing)')
flair_target = pd.crosstab(flair_with_missing, df['requester_received_pizza'], normalize='index') * 100
print(flair_target.round(1))

Numerical Features Summary:

requester_account_age_in_days_at_request:
count    2878.000000
mean      250.682364
std       301.838771
min         0.000000
25%         3.038877
50%       155.156377
75%       383.640090
max      2809.750787
Name: requester_account_age_in_days_at_request, dtype: float64

requester_number_of_comments_at_request:
count    2878.000000
mean      112.311327
std       192.017515
min         0.000000
25%         0.000000
50%        22.000000
75%       132.000000
max       981.000000
Name: requester_number_of_comments_at_request, dtype: float64

requester_number_of_posts_at_request:
count    2878.000000
mean       21.614663
std        51.580719
min         0.000000
25%         0.000000
50%         4.000000
75%        21.000000
max       867.000000
Name: requester_number_of_posts_at_request, dtype: float64

requester_number_of_subreddits_at_request:
count    2878.000000
mean       17.857192
std        21.784934
min         0.000000
25%         1.000000
50%        