In [1]:
import json
import pandas as pd
import numpy as np

# Load the training data.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII request text
# on machines whose default is not UTF-8.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity summary of what was loaded.
# Indexing [0] assumes a non-empty list of dict records.
print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Build a tabular view of the loaded records; `df` is what the later cells query.
df = pd.DataFrame(train_data)

# Target summary: raw class counts first, then the share of positive labels.
pizza_labels = df['requester_received_pizza']
positive_rate = pizza_labels.mean()
print("Target distribution:")
print(pizza_labels.value_counts())
print(f"\nClass balance: {positive_rate:.3f} (positive rate)")

# Null counts per column; only the first ten columns are printed to keep
# the output short, so the remaining columns are not covered by this check.
null_counts = df.isnull().sum()
print("\nMissing values per column:")
print(null_counts.iloc[:10])

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Class balance: 0.248 (positive rate)

Missing values per column:
giver_username_if_known                        0
number_of_downvotes_of_request_at_retrieval    0
number_of_upvotes_of_request_at_retrieval      0
post_was_edited                                0
request_id                                     0
request_number_of_comments_at_retrieval        0
request_text                                   0
request_text_edit_aware                        0
request_title                                  0
requester_account_age_in_days_at_request       0
dtype: int64


In [3]:
def _print_example(banner, row):
    """Print a separator line, the banner, the request title and the first 200 characters of its text."""
    print("\n" + "="*50)
    print(banner)
    print(f"Title: {row['request_title']}")
    print(f"Text (first 200 chars): {row['request_text'][:200]}...")


# Character-length summary statistics for each free-text column.
print("Text feature analysis:")
length_reports = [
    ("\nRequest title length (characters):", 'request_title'),
    ("\nRequest text length (characters):", 'request_text'),
    ("\nRequest text edit aware length (characters):", 'request_text_edit_aware'),
]
for heading, column in length_reports:
    print(heading)
    print(df[column].str.len().describe())

# One sample request per outcome: the first row carrying each label value.
got_pizza = df['requester_received_pizza']

successful = df.loc[got_pizza.eq(True)].iloc[0]
_print_example("EXAMPLE 1 (successful):", successful)

unsuccessful = df.loc[got_pizza.eq(False)].iloc[0]
_print_example("EXAMPLE 2 (unsuccessful):", unsuccessful)

Text feature analysis:

Request title length (characters):
count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64

Request text length (characters):
count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text, dtype: float64

Request text edit aware length (characters):
count    2878.000000
mean      394.567755
std       351.922518
min         0.000000
25%       180.000000
50%       302.000000
75%       498.000000
max      4460.000000
Name: request_text_edit_aware, dtype: float64

EXAMPLE 1 (successful):
Title: [REQUEST] Not much food until tomorrow.
Text (first 200 chars): I will go ahead and say that I got a pizza meal from here before as to not seem like I'm scamming anyone. I have been promised 2 well-paying jo

In [None]:
# Examine numerical features.
# NOTE(review): the last three columns are named "*_at_retrieval"; presumably
# they were measured after the request was posted and could leak outcome
# information — confirm before using them as model inputs.
numerical_cols = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
    'request_number_of_comments_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval'
]

# Per-column summary statistics (count, mean, std, quartiles, min/max).
print("Numerical feature statistics:")
for col in numerical_cols:
    print(f"\n{col}:")
    print(df[col].describe())

# Check categorical features
print("\n" + "="*60)
print("Categorical features:")
print(f"\nUser flair distribution:")
# dropna=False keeps missing flair as its own row. value_counts() omits
# NaN/None by default, and the earlier missing-value check only printed the
# first ten columns, so nulls in this column have not been inspected yet.
print(df['requester_user_flair'].value_counts(dropna=False))

print(f"\nPost edited distribution:")
print(df['post_was_edited'].value_counts())

# Correlation with target for numerical features
print("\n" + "="*60)
print("Correlation with target:")
target_col = 'requester_received_pizza'
# `correlations` keeps the full column of the correlation matrix (target row
# included) so any later use of the variable sees the same content as before.
correlations = df[numerical_cols + [target_col]].corr()[target_col].sort_values(ascending=False)
# The target's correlation with itself is always 1.0 and carries no
# information, so it is left out of the printed ranking.
print(correlations.drop(target_col))