In [2]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data
train_path = '/home/data/train.json'

# Read the JSON file - it's a JSON array, not line-delimited JSON
with open(train_path, 'r') as f:
    train_data = json.load(f)

# Convert to DataFrame
df_train = pd.DataFrame(train_data)
print(f"Training data shape: {df_train.shape}")
print(f"Columns: {df_train.columns.tolist()}")
print("\nFirst few rows:")
df_train.head()

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
0,,2,5,False,t3_q8ycf,0,I will soon be going on a long deployment whic...,I will soon be going on a long deployment whic...,"[REQUEST] Oceanside, Ca. USA- US Marine getti...",0.0,...,False,[Random_Acts_Of_Pizza],3,3,7,7,,SDMarine,1330391000.0,1330391000.0
1,,2,4,False,t3_ixnia,20,"We would all really appreciate it, and would e...","We would all really appreciate it, and would e...",[REQUEST] Three (verified) medical students in...,99.526863,...,False,"[AskReddit, IAmA, TwoXChromosomes, circlejerk,...",491,883,1459,2187,,TheycallmeFoxJohnson,1311434000.0,1311430000.0
2,,1,2,True,t3_ndy6g,0,"It took a lot of courage to make this post, an...","It took a lot of courage to make this post, an...",(REQUEST) not home 4 the holidays &amp; would ...,0.0,...,False,[Random_Acts_Of_Pizza],1,1,3,3,,riverfrontmom,1323968000.0,1323968000.0
3,,1,1,1363315140.0,t3_1abbu1,32,I will go ahead and say that I got a pizza mea...,I will go ahead and say that I got a pizza mea...,[REQUEST] Not much food until tomorrow.,491.088264,...,True,"[Entroductions, RandomActsOfChristmas, RandomK...",25,21,165,195,shroom,Joeramos,1363305000.0,1363301000.0
4,,3,14,False,t3_kseg4,3,My '99 Jeep Cherokee I've had for 10 years now...,My '99 Jeep Cherokee I've had for 10 years now...,[Request] Had my car stolen today,369.417558,...,False,"[DetroitRedWings, DoesAnybodyElse, FoodPorn, K...",942,2043,1906,3483,,m4ngo,1317088000.0,1317084000.0


In [3]:
# Analyze target distribution
target = 'requester_received_pizza'
print("Target distribution:")
print(df_train[target].value_counts())
print(f"\nPercentage of successful requests: {df_train[target].mean():.2%}")

# Check for class imbalance
print(f"\nClass imbalance ratio: {df_train[target].value_counts().min() / df_train[target].value_counts().max():.3f}")

# Check missing values
print(f"\nMissing values per column:")
missing = df_train.isnull().sum()
print(missing[missing > 0])

# Check data types
print(f"\nData types:")
print(df_train.dtypes.value_counts())

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage of successful requests: 24.84%

Class imbalance ratio: 0.331

Missing values per column:
requester_user_flair    2163
dtype: int64

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [4]:
# Analyze text features
print("Text feature analysis:")
print(f"Request title length (chars): {df_train['request_title'].str.len().describe()}")
print(f"\nRequest text length (chars): {df_train['request_text'].str.len().describe()}")
print(f"\nRequest text edit aware length (chars): {df_train['request_text_edit_aware'].str.len().describe()}")

# Check for unique values in categorical features
print(f"\nUnique values in requester_user_flair:")
print(df_train['requester_user_flair'].value_counts(dropna=False))

print(f"\nUnique values in post_was_edited:")
print(df_train['post_was_edited'].value_counts())

# Analyze numerical features that might be important
print(f"\nRequester account age at request (days):")
print(df_train['requester_account_age_in_days_at_request'].describe())

print(f"\nRequester number of comments at request:")
print(df_train['requester_number_of_comments_at_request'].describe())

Text feature analysis:
Request title length (chars): count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64

Request text length (chars): count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text, dtype: float64

Request text edit aware length (chars): count    2878.000000
mean      394.567755
std       351.922518
min         0.000000
25%       180.000000
50%       302.000000
75%       498.000000
max      4460.000000
Name: request_text_edit_aware, dtype: float64

Unique values in requester_user_flair:
requester_user_flair
None      2163
shroom     677
PIF         38
Name: count, dtype: int64

Unique values in post_was_edited:
post_was_edited
False           2423
True             241
1375324604.0       1
136631433

In [5]:
# Check correlation between features and target
correlation_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval'
]

print("Correlation with target (requester_received_pizza):")
for feature in correlation_features:
    if feature in df_train.columns:
        corr = df_train[feature].corr(df_train[target])
        print(f"{feature}: {corr:.4f}")

# Analyze success rate by user flair
print(f"\nSuccess rate by requester_user_flair:")
flair_success = df_train.groupby('requester_user_flair')[target].agg(['count', 'mean'])
print(flair_success)

# Analyze success rate by post_was_edited (clean the data first)
df_train['post_was_edited_clean'] = df_train['post_was_edited'].apply(lambda x: x if isinstance(x, bool) else True)
print(f"\nSuccess rate by post_was_edited:")
edited_success = df_train.groupby('post_was_edited_clean')[target].agg(['count', 'mean'])
print(edited_success)

Correlation with target (requester_received_pizza):
requester_account_age_in_days_at_request: 0.0434
requester_number_of_comments_at_request: 0.0367
requester_number_of_posts_at_request: 0.0380
requester_number_of_comments_in_raop_at_request: 0.1320
requester_number_of_posts_on_raop_at_request: 0.1327
requester_upvotes_minus_downvotes_at_request: 0.0431
requester_upvotes_plus_downvotes_at_request: 0.0332
requester_number_of_subreddits_at_request: 0.0470
number_of_upvotes_of_request_at_retrieval: 0.0908
number_of_downvotes_of_request_at_retrieval: 0.0206
request_number_of_comments_at_retrieval: 0.2907

Success rate by requester_user_flair:
                      count  mean
requester_user_flair             
PIF                      38   1.0
shroom                  677   1.0

Success rate by post_was_edited:
                       count      mean
post_was_edited_clean                 
False                   2423  0.221626
True                     455  0.391209
