In [2]:
import pandas as pd
import numpy as np
import json

# Load the training data - the file holds a single JSON array of records.
# Decode it explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII
# characters on systems whose default is not UTF-8.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Build the frame from the parsed records, then report its size and columns.
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

In [3]:
# Class balance of the target: count of rows per outcome, followed by the
# share of rows whose target is True.
received_pizza = train_df['requester_received_pizza']
outcome_counts = received_pizza.value_counts()
success_share = received_pizza.mean()

print("Target distribution:")
print(outcome_counts)
# NOTE: the printed value is a 0-1 fraction even though the label says
# "Percentage"; the label is kept as-is so existing output stays comparable.
print(f"\nPercentage of successful requests: {success_share:.3f}")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage of successful requests: 0.248


In [4]:
# Schema overview: number of columns per dtype, then the null count of
# every column that contains at least one null.
dtype_counts = train_df.dtypes.value_counts()
missing = train_df.isna().sum()

print("Data types:")
print(dtype_counts)
print("\nMissing values:")
print(missing.loc[missing.gt(0)])

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64

Missing values:
requester_user_flair    2163
dtype: int64


In [5]:
# Peek at the text columns using the first row only: the title in full,
# and the first 200 characters of each body-text column.
print("Text features sample:")

print("\nRequest title sample:")
first_title = train_df['request_title'].iloc[0]
print(first_title)

truncated_previews = (
    ("\nRequest text sample (first 200 chars):", 'request_text'),
    ("\nRequest text edit aware sample (first 200 chars):", 'request_text_edit_aware'),
)
for heading, column in truncated_previews:
    print(heading)
    first_value = train_df[column].iloc[0]
    print(first_value[:200])

Text features sample:

Request title sample:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Request text sample (first 200 chars):
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time 

Request text edit aware sample (first 200 chars):
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time 


In [6]:
# Summary statistics (describe()) for a hand-picked set of numeric columns
# measured at request time.
numeric_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
]

numeric_summary = train_df.loc[:, numeric_features].describe()

print("Key numeric features statistics:")
print(numeric_summary)

Key numeric features statistics:
       requester_account_age_in_days_at_request  \
count                               2878.000000   
mean                                 250.682364   
std                                  301.838771   
min                                    0.000000   
25%                                    3.038877   
50%                                  155.156377   
75%                                  383.640090   
max                                 2809.750787   

       requester_number_of_comments_at_request  \
count                              2878.000000   
mean                                112.311327   
std                                 192.017515   
min                                   0.000000   
25%                                   0.000000   
50%                                  22.000000   
75%                                 132.000000   
max                                 981.000000   

       requester_number_of_posts_at_request  \
count    