In [1]:
import json
import pandas as pd
import numpy as np

# Load the training data.
# JSON is UTF-8 by specification, so the encoding is pinned explicitly rather
# than left to the platform's locale default, which could mis-decode any
# non-ASCII characters in the file.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Keys in first sample: {list(train_data[0].keys())}")

# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)
print(f"\nDataFrame shape: {df.shape}")

# Class balance of the target column: absolute counts first, then proportions.
print(f"\nTarget distribution:")
print(df['requester_received_pizza'].value_counts())
print(f"\nTarget proportion:")
print(df['requester_received_pizza'].value_counts(normalize=True))

Number of training samples: 2878
Keys in first sample: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requeste

In [2]:
# Summarize the frame: dtype counts, columns with nulls, text lengths,
# and the distinct values of two categorical-looking columns.
print("Feature types:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

print(f"\n{'=' * 50}")
print("Missing values:")
# Per-column null counts; only columns that actually contain nulls are shown.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

print(f"\n{'=' * 50}")
print("Text features analysis:")
# Character-length statistics for each free-text column, computed once per column.
for text_label, text_column in (
    ("Request title", "request_title"),
    ("Request text", "request_text"),
):
    char_counts = df[text_column].str.len()
    print(
        f"{text_label} length (chars): "
        f"min={char_counts.min()}, max={char_counts.max()}, mean={char_counts.mean():.1f}"
    )

print(f"\n{'=' * 50}")
print("Categorical features:")
# Distinct values per column, in order of first appearance.
for categorical_column in ("requester_user_flair", "post_was_edited"):
    print(f"Unique {categorical_column} values: {df[categorical_column].unique()}")

Feature types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64

Missing values:
requester_user_flair    2163
dtype: int64

Text features analysis:
Request title length (chars): min=7, max=272, mean=71.6
Request text length (chars): min=0, max=4460, mean=402.5

Categorical features:
Unique requester_user_flair values: [None 'shroom' 'PIF']
Unique post_was_edited values: [False True 1363315140.0 1349998232.0 1364413726.0 1342991229.0
 1351709165.0 1347224731.0 1347230374.0 1371296113.0 1352058582.0
 1347225835.0 1363914082.0 1373814714.0 1351652384.0 1375841082.0
 1338341728.0 1347404183.0 1345416005.0 1367630093.0 1363714853.0
 1367502383.0 1337276328.0 1352239751.0 1356307269.0 1350506322.0
 1346719766.0 1365368614.0 1370019785.0 1347159426.0 1349824194.0
 1351022929.0 1364351254.0 1351898842.0 1373919611.0 1338754342.0
 1374370516.0 1343265957.0 1373252156.0 1368628826.0 1379212134.0
 1358550745.0 1353164052.0 1369360852.0 1369613625.0 1340921147.0
 1