In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")
print(f"\nFirst sample preview:")
# Only the first 10 fields of the first record are shown to keep the output short.
for key, value in list(train_data[0].items())[:10]:
    print(f"  {key}: {value}")

Number of training samples: 2878
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_u

In [2]:
# Turn the list of raw records into a DataFrame so pandas can summarise it
df = pd.DataFrame(train_data)

# Class balance of the label column
received_pizza = df['requester_received_pizza']
print("Target distribution:", received_pizza.value_counts(), sep="\n")
print(f"Success rate: {received_pizza.mean():.3f}")

# Per-column null counts; only columns that actually contain nulls are printed
print("\nMissing values:")
missing = df.isna().sum()
print(missing.loc[missing > 0])

# How many columns there are of each dtype
print("\nData types:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64
Success rate: 0.248

Missing values:
requester_user_flair    2163
dtype: int64

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [3]:
# Summary statistics for the length of the free-text fields
print("Text feature analysis:")
text_lengths = {
    "Request title length (chars)": df['request_title'].str.len(),
    "Request text length (chars)": df['request_text'].str.len(),
    "Request text length (words)": df['request_text'].str.split().str.len(),
}
for label, lengths in text_lengths.items():
    print(f"{label}: {lengths.describe()}")

# Summary statistics and null counts for selected request-time numeric columns
print("\nKey numerical features:")
numerical_cols = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
]

for col in numerical_cols:
    column = df[col]
    print(f"\n{col}:")
    print(column.describe())
    print(f"Missing: {column.isna().sum()}")

# Flair value frequencies, with missing values counted as their own bucket
print("\nUser flair distribution:")
flair_counts = df['requester_user_flair'].value_counts(dropna=False)
print(flair_counts)

Text feature analysis:
Request title length (chars): count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64
Request text length (chars): count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text, dtype: float64
Request text length (words): count    2878.000000
mean       77.009034
std        69.290490
min         0.000000
25%        35.000000
50%        59.000000
75%        96.000000
max       854.000000
Name: request_text, dtype: float64

Key numerical features:

requester_account_age_in_days_at_request:
count    2878.000000
mean      250.682364
std       301.838771
min         0.000000
25%         3.038877
50%       155.156377
75%       383.640090
max      2809.750787
Name: requester_account_age_in_days_at_requ