In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data.
# NOTE(review): this absolute path is specific to the current environment;
# adjust it if the data lives somewhere else.
train_path = "/home/data/train.json"

# Read the JSON file. The encoding is pinned to UTF-8 so that parsing does not
# depend on the platform's default locale encoding.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity check of what was loaded. Indexing with [0] assumes the file
# holds a non-empty list of records (dict-like objects).
print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Build a DataFrame from the parsed records so pandas can do the heavy lifting
df = pd.DataFrame(train_data)
pizza_outcome = df['requester_received_pizza']

# How are the labels split, and what fraction of requests succeeded?
print("Target distribution:")
print(pizza_outcome.value_counts())
print(f"\nClass balance: {pizza_outcome.mean():.3f}")

# Total number of null cells summed over every column of the frame
total_missing = df.isnull().sum().sum()
print("\nMissing values:")
print(total_missing)

# Peek at the title and body of the very first request
first_title = df['request_title'].iloc[0]
first_text = df['request_text'].iloc[0]
print(f"\nFirst request title: {first_title}")
print(f"\nFirst request text (first 200 chars): {first_text[:200]}")

# Summary statistics for a handful of request-time numeric columns
print("\nSample numerical features:")
numerical_cols = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_account_age_in_days_at_request',
]
numeric_summary = df[numerical_cols].describe()
print(numeric_summary)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Class balance: 0.248

Missing values:
2163

First request title: [REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

First request text (first 200 chars): I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time 

Sample numerical features:
       requester_number_of_comments_at_request  \
count                              2878.000000   
mean                                112.311327   
std                                 192.017515   
min                                   0.000000   
25%                                   0.000000   
50%                                  22.000000   
75%                                 132.000000   
max                                 981.000000   

       requester_number_of_posts_at_request  \
co

In [3]:
# Analyze text features more deeply: average character length of each text field
print("Text feature analysis:")
print(f"Average title length: {df['request_title'].str.len().mean():.1f} characters")
print(f"Average text length: {df['request_text'].str.len().mean():.1f} characters")
print(f"Average edit-aware text length: {df['request_text_edit_aware'].str.len().mean():.1f} characters")

# Compare success rates for posts whose text mentions 'EDIT' (case-insensitive).
# The mask is built once and reused instead of being recomputed per statistic.
# na=False treats a missing text as "no match", which keeps the mask strictly
# boolean so that indexing and `~` cannot fail on null values.
# Note: this is a substring match, so any word containing "edit" also counts.
has_edit = df['request_text'].str.contains('EDIT', case=False, na=False)
received = df['requester_received_pizza']
print(f"\nClass distribution by text presence:")
print(f"Posts with 'EDIT' in text: {has_edit.sum()}")
print(f"Success rate with 'EDIT': {received[has_edit].mean():.3f}")
print(f"Success rate without 'EDIT': {received[~has_edit].mean():.3f}")

# Check user flair distribution.
# NOTE(review): value_counts() and groupby() both drop missing keys by default,
# so requests without a flair are excluded from the two tables below. The
# success rates shown therefore describe only rows that have a flair.
print(f"\nUser flair distribution:")
print(df['requester_user_flair'].value_counts())
print(f"\nSuccess rate by flair:")
print(df.groupby('requester_user_flair')['requester_received_pizza'].agg(['count', 'mean']).round(3))

Text feature analysis:
Average title length: 71.6 characters
Average text length: 402.5 characters
Average edit-aware text length: 394.6 characters

Class distribution by text presence:
Posts with 'EDIT' in text: 339
Success rate with 'EDIT': 0.416
Success rate without 'EDIT': 0.226

User flair distribution:
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

Success rate by flair:
                      count  mean
requester_user_flair             
PIF                      38   1.0
shroom                  677   1.0


In [4]:
# Temporal view of the requests: convert the epoch-seconds column to datetimes
print("Temporal analysis:")
request_times = pd.to_datetime(df['unix_timestamp_of_request_utc'], unit='s')
df['request_date'] = request_times
print(f"Date range: {request_times.min()} to {request_times.max()}")

# Derive hour-of-day and day-of-week columns from the parsed timestamps
df['request_hour'] = request_times.dt.hour
df['request_dayofweek'] = request_times.dt.dayofweek

# Same aggregation (group size and mean of the target) for both groupings
summary_stats = ['count', 'mean']

print("\nSuccess rate by hour of day:")
hourly_success = (
    df.groupby('request_hour')['requester_received_pizza']
    .agg(summary_stats)
    .round(3)
)
# Only the first ten hour buckets are printed
print(hourly_success.head(10))

print("\nSuccess rate by day of week:")
daily_success = (
    df.groupby('request_dayofweek')['requester_received_pizza']
    .agg(summary_stats)
    .round(3)
)
print(daily_success)

# Pairwise correlation between the request-time and retrieval-time counters
print("\nCorrelation between request and retrieval metrics:")
req_ret_cols = [
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_at_retrieval',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_at_retrieval',
]
corr_matrix = df[req_ret_cols].corr()
print(corr_matrix)

Temporal analysis:
Date range: 2011-05-23 20:29:10 to 2013-10-09 18:51:12

Success rate by hour of day:
              count   mean
request_hour              
0               249  0.277
1               210  0.248
2               159  0.245
3               141  0.184
4                85  0.176
5                64  0.234
6                34  0.147
7                25  0.080
8                20  0.050
9                24  0.292

Success rate by day of week:
                   count   mean
request_dayofweek              
0                    392  0.258
1                    432  0.227
2                    476  0.239
3                    403  0.283
4                    396  0.253
5                    366  0.251
6                    413  0.232

Correlation between request and retrieval metrics:
                                           requester_number_of_comments_at_request  \
requester_number_of_comments_at_request                                   1.000000   
requester_number_of_comments_a