In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data
train_path = "/home/data/train.json"

# Read the JSON file.
# The encoding is passed explicitly: without it, open() decodes with the
# platform's locale encoding, so the same file could load differently (or fail)
# on another machine. JSON text is normally UTF-8.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity checks on what was loaded: size, container type, and the
# field names of the first record.
print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Build a tabular view of the loaded records
df = pd.DataFrame(train_data)

# How many requests fall into each outcome class
target_counts = df['requester_received_pizza'].value_counts()
n_rows = len(df)
print("Target distribution:")
print(target_counts)

# Share of each class relative to the full frame
print("\nClass balance:")
positive_share = target_counts[True] / n_rows
print(f"Positive rate: {positive_share:.3f}")
negative_share = target_counts[False] / n_rows
print(f"Negative rate: {negative_share:.3f}")

# Null count per column, reporting only columns that have at least one null
print("\nMissing values per column:")
missing = df.isna().sum()
has_missing = missing.gt(0)
print(missing.loc[has_missing])

# Number of columns per dtype
print("\nData types:")
dtype_tally = df.dtypes.value_counts()
print(dtype_tally)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Class balance:
Positive rate: 0.248
Negative rate: 0.752

Missing values per column:
requester_user_flair    2163
dtype: int64

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [3]:
# Peek at the raw text fields and summarise their lengths
print("Text feature examples:")

# First title, truncated to 200 characters
print("\nRequest title example:")
first_title = df['request_title'].iloc[0]
print(first_title[:200])

# First request body, truncated to 300 characters
print("\nRequest text example:")
first_text = df['request_text'].iloc[0]
print(first_text[:300])

# Length of every request_text value, stored on the frame as 'text_length'
print("\nRequest text length statistics:")
df['text_length'] = df['request_text'].str.len()
text_length_summary = df['text_length'].describe()
print(text_length_summary)

# Same measurement for request_title, stored as 'title_length'
print("\nTitle length statistics:")
df['title_length'] = df['request_title'].str.len()
title_length_summary = df['title_length'].describe()
print(title_length_summary)

# Pairwise correlation matrix between text length and the outcome column
print("\nCorrelation between text length and success:")
length_vs_outcome = df[['text_length', 'requester_received_pizza']]
print(length_vs_outcome.corr())

Text feature examples:

Request title example:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Request text example:
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time to Afganistan.

Request text length statistics:
count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: text_length, dtype: float64

Title length statistics:
count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: title_length, dtype: float64

Correlation between text length and success:
                          text_length  requester_received_pizza
text_length                   1.00000                   0.13013
requester_r

In [None]:
# Explore key metadata features
# NOTE(review): 'request_number_of_comments_at_retrieval' is the only entry
# named "_at_retrieval" rather than "_at_request" — presumably it is measured
# after the outcome is known; confirm before treating it as a usable predictor.
metadata_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request', 
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
    'request_number_of_comments_at_retrieval'
]

# Fetch the target column once instead of on every loop iteration.
target = df['requester_received_pizza']

print("Key metadata features correlation with target:")
for feature in metadata_features:
    if feature not in df.columns:
        # Say so explicitly rather than silently omitting the feature, so a
        # typo or schema change is visible in the output.
        print(f"{feature}: column not found, skipped")
        continue
    corr = df[feature].corr(target)
    print(f"{feature}: {corr:.3f}")

# Explore user flair feature.
# value_counts() and groupby() both drop null keys by default, which would
# exclude every row with a missing flair from the tables below. dropna=False
# keeps those rows as their own group so the breakdown covers the whole frame.
# NOTE(review): if the null-flair group turns out to be (almost) entirely one
# outcome class, flair may be leaking the target — check before modelling.
print("\nUser flair distribution:")
flair_counts = df['requester_user_flair'].value_counts(dropna=False)
print(flair_counts)

# Check success rate by flair (row count and mean of the boolean target per group)
print("\nSuccess rate by user flair:")
flair_success = (
    df.groupby('requester_user_flair', dropna=False)['requester_received_pizza']
    .agg(['count', 'mean'])
)
print(flair_success)