In [3]:
import pandas as pd
import numpy as np
import json

# Read the training set; the file is a single JSON array.
train_df = pd.read_json('/home/data/train.json')

# Gather the overview facts first, then report them in one place:
# frame dimensions, column names, and the class balance of the target.
frame_shape = train_df.shape
column_names = train_df.columns.tolist()
target_balance = train_df['requester_received_pizza'].value_counts(normalize=True)

print(f"Training data shape: {frame_shape}")
print(f"Columns: {column_names}")
print("\nTarget distribution:")
print(target_balance)
print("\nFirst few rows:")
# Bare last expression so the notebook renders the preview as a rich table.
train_df.head()

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
0,,2,5,0,t3_q8ycf,0,I will soon be going on a long deployment whic...,I will soon be going on a long deployment whic...,"[REQUEST] Oceanside, Ca. USA- US Marine getti...",0.0,...,False,[Random_Acts_Of_Pizza],3,3,7,7,,SDMarine,1330391255,1330391255
1,,2,4,0,t3_ixnia,20,"We would all really appreciate it, and would e...","We would all really appreciate it, and would e...",[REQUEST] Three (verified) medical students in...,99.526863,...,False,"[AskReddit, IAmA, TwoXChromosomes, circlejerk,...",491,883,1459,2187,,TheycallmeFoxJohnson,1311433992,1311430392
2,,1,2,1,t3_ndy6g,0,"It took a lot of courage to make this post, an...","It took a lot of courage to make this post, an...",(REQUEST) not home 4 the holidays &amp; would ...,0.0,...,False,[Random_Acts_Of_Pizza],1,1,3,3,,riverfrontmom,1323968350,1323968350
3,,1,1,1363315140,t3_1abbu1,32,I will go ahead and say that I got a pizza mea...,I will go ahead and say that I got a pizza mea...,[REQUEST] Not much food until tomorrow.,491.088264,...,True,"[Entroductions, RandomActsOfChristmas, RandomK...",25,21,165,195,shroom,Joeramos,1363304920,1363301320
4,,3,14,0,t3_kseg4,3,My '99 Jeep Cherokee I've had for 10 years now...,My '99 Jeep Cherokee I've had for 10 years now...,[Request] Had my car stolen today,369.417558,...,False,"[DetroitRedWings, DoesAnybodyElse, FoodPorn, K...",942,2043,1906,3483,,m4ngo,1317087833,1317084233


In [4]:
# Character-length summaries for each free-text column.
# Each entry pairs the heading to print with the column it describes.
length_reports = [
    ("\nRequest title length stats:", 'request_title'),
    ("\nRequest text length stats:", 'request_text'),
    ("\nRequest text (edit aware) length stats:", 'request_text_edit_aware'),
]

print("Text feature analysis:")
for heading, text_column in length_reports:
    print(heading)
    print(train_df[text_column].str.len().describe())

# Null counts for the columns the analysis relies on.
# Only nulls are counted here; zero-length strings are not treated as missing.
print("\nMissing values in key features:")
key_features = ['request_text', 'request_title', 'requester_received_pizza']
null_counts = train_df[key_features].isnull().sum()
total_rows = len(train_df)
for col, missing in null_counts.items():
    print(f"{col}: {missing} ({missing/total_rows*100:.2f}%)")

Text feature analysis:

Request title length stats:
count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64

Request text length stats:
count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text, dtype: float64

Request text (edit aware) length stats:
count    2878.000000
mean      394.567755
std       351.922518
min         0.000000
25%       180.000000
50%       302.000000
75%       498.000000
max      4460.000000
Name: request_text_edit_aware, dtype: float64

Missing values in key features:
request_text: 0 (0.00%)
request_title: 0 (0.00%)
requester_received_pizza: 0 (0.00%)


In [5]:
# Numeric columns to profile, grouped by when the value was measured.
request_time_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
]
# NOTE(review): by name, these are measured at data-retrieval time rather than
# when the request was posted — confirm they are legitimate predictors.
retrieval_time_features = [
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
]
numerical_features = request_time_features + retrieval_time_features

print("Numerical features statistics:")
feature_summary = train_df[numerical_features].describe()
print(feature_summary)

# Correlate every numeric column with the target and rank from highest to lowest.
# The target is part of the correlated frame, so its own entry appears in the
# ranking as well.
print("\nCorrelation with target (requester_received_pizza):")
target_column = 'requester_received_pizza'
correlation_matrix = train_df[numerical_features + [target_column]].corr()
correlations = correlation_matrix[target_column].sort_values(ascending=False)
print(correlations)

Numerical features statistics:
       requester_account_age_in_days_at_request  \
count                               2878.000000   
mean                                 250.682364   
std                                  301.838771   
min                                    0.000000   
25%                                    3.038877   
50%                                  155.156377   
75%                                  383.640090   
max                                 2809.750787   

       requester_number_of_comments_at_request  \
count                              2878.000000   
mean                                112.311327   
std                                 192.017515   
min                                   0.000000   
25%                                   0.000000   
50%                                  22.000000   
75%                                 132.000000   
max                                 981.000000   

       requester_number_of_posts_at_request  \
count      