In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 4040
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_u

In [2]:
# Convert to DataFrame for easier analysis
df_train = pd.DataFrame(train_data)

# Target distribution
print("Target distribution:")
print(df_train['requester_received_pizza'].value_counts())
print(f"\nSuccess rate: {df_train['requester_received_pizza'].mean():.3f}")

# Basic statistics on key features
print("\nKey numerical features:")
numeric_cols = ['requester_number_of_comments_at_request', 'requester_number_of_posts_at_request', 
                'requester_upvotes_plus_downvotes_at_request', 'requester_number_of_subreddits_at_request']
print(df_train[numeric_cols].describe())

Target distribution:
requester_received_pizza
False    3046
True      994
Name: count, dtype: int64

Success rate: 0.246

Key numerical features:
       requester_number_of_comments_at_request  \
count                              4040.000000   
mean                                115.098267   
std                                 193.318968   
min                                   0.000000   
25%                                   0.000000   
50%                                  24.000000   
75%                                 140.250000   
max                                 994.000000   

       requester_number_of_posts_at_request  \
count                           4040.000000   
mean                              21.601485   
std                               50.895060   
min                                0.000000   
25%                                0.000000   
50%                                5.000000   
75%                               22.000000   
max                        

In [3]:
# Text features analysis
print("Text features:")
print(f"Request title length (chars): {df_train['request_title'].str.len().describe()}")
print(f"\nRequest text length (chars): {df_train['request_text'].str.len().describe()}")
print(f"\nRequest text edit aware length (chars): {df_train['request_text_edit_aware'].str.len().describe()}")

# Check for missing values
print("\nMissing values:")
print(df_train.isnull().sum().sum())  # Should be 0 for JSON data

# Check unique values in categorical features
print(f"\nUnique user flair values: {df_train['requester_user_flair'].unique()}")
print(f"Post edited values: {df_train['post_was_edited'].unique()}")

# Check if giver username is useful
print(f"\nGiver username known (not N/A): {(df_train['giver_username_if_known'] != 'N/A').sum()}")
print(f"Total samples: {len(df_train)}")

Text features:
Request title length (chars): count    4040.000000
mean       71.899505
std        36.154216
min         7.000000
25%        46.000000
50%        65.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64

Request text length (chars): count    4040.000000
mean      405.243317
std       373.329466
min         0.000000
25%       182.000000
50%       307.000000
75%       510.000000
max      4460.000000
Name: request_text, dtype: float64

Request text edit aware length (chars): count    4040.000000
mean      397.111634
std       362.353943
min         0.000000
25%       180.000000
50%       301.000000
75%       502.250000
max      4460.000000
Name: request_text_edit_aware, dtype: float64

Missing values:
3046

Unique user flair values: [None 'shroom' 'PIF']
Post edited values: [False True 1342028318.0 1378419359.0 1344540616.0 1370925485.0
 1370641675.0 1346634254.0 1342808379.0 1366844439.0 1352047954.0
 1360772856.0 1367280954.0 1339642100.0 13