In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data.
# NOTE(review): this absolute path is environment-specific — adjust it if the
# dataset lives somewhere else on your machine.
train_path = "/home/data/train.json"

# Read the JSON file. The encoding is pinned to UTF-8 so that decoding does not
# depend on the platform's locale default when the text contains non-ASCII
# characters.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")

# Only peek at the first record when there is one; indexing an empty list
# would raise IndexError and hide the fact that the file was simply empty.
if train_data:
    first_sample = train_data[0]
    print(f"First sample keys: {list(first_sample.keys())}")
    print("\nFirst sample preview:")
    preview = json.dumps(first_sample, indent=2)
    # Append the ellipsis only when the preview was actually cut off.
    print(preview[:1000] + ("..." if len(preview) > 1000 else ""))
else:
    print("Training file contains no samples.")

Number of training samples: 2878
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_u

In [2]:
# Build a tabular view of the raw records so the columns can be inspected.
df = pd.DataFrame(train_data)

# Overall size of the table: (rows, columns).
print("DataFrame shape:", df.shape)

# Class balance of the label column: per-value counts, then the mean of the
# label (the fraction of True values when the column is boolean).
received = df['requester_received_pizza']
print("\nTarget distribution:")
print(received.value_counts())
print(f"\nTarget ratio: {received.mean():.4f}")

# Column dtypes — only the first 20 columns are shown.
print("\nData types:")
print(df.dtypes.iloc[:20])

# Per-column null counts — only the first 10 columns are shown.
print("\nMissing values:")
print(df.isna().sum().iloc[:10])

DataFrame shape: (2878, 32)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Target ratio: 0.2484

Data types:
giver_username_if_known                                  object
number_of_downvotes_of_request_at_retrieval               int64
number_of_upvotes_of_request_at_retrieval                 int64
post_was_edited                                          object
request_id                                               object
request_number_of_comments_at_retrieval                   int64
request_text                                             object
request_text_edit_aware                                  object
request_title                                            object
requester_account_age_in_days_at_request                float64
requester_account_age_in_days_at_retrieval              float64
requester_days_since_first_post_on_raop_at_request      float64
requester_days_since_first_post_on_raop_at_retrieval    float64
requ

In [4]:
# --- Text columns: average character and word counts -----------------------
request_text = df['request_text']
request_title = df['request_title']

print("Text feature analysis:")
print(f"Average request_text length: {request_text.str.len().mean():.1f} characters")
print(f"Average request_title length: {request_title.str.len().mean():.1f} characters")
print(f"Average request_text words: {request_text.str.split().str.len().mean():.1f} words")

# --- Categorical columns: distinct values and frequencies -------------------
# NOTE(review): the recorded output shows post_was_edited mixing booleans with
# large float values (possibly timestamps — confirm before using it as a flag).
print("\nCategorical features:")
print(f"Unique requester_user_flair values: {df['requester_user_flair'].unique()}")
print(f"post_was_edited distribution:\n{df['post_was_edited'].value_counts()}")

# --- Numeric columns: headline statistics -----------------------------------
print("\nNumerical feature statistics:")
# Keep every numeric column except the target, in case it was picked up as numeric.
numerical_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col != 'requester_received_pizza'
]
print(df[numerical_cols].describe().loc[['mean', 'std', 'min', 'max']])

Text feature analysis:
Average request_text length: 402.5 characters
Average request_title length: 71.6 characters
Average request_text words: 77.0 words

Categorical features:
Unique requester_user_flair values: [None 'shroom' 'PIF']
post_was_edited distribution:
post_was_edited
False           2423
True             241
1375324604.0       1
1366314331.0       1
1367280954.0       1
                ... 
1379372126.0       1
1378425306.0       1
1374109637.0       1
1358627245.0       1
1372729287.0       1
Name: count, Length: 216, dtype: int64

Numerical feature statistics:
      number_of_downvotes_of_request_at_retrieval  \
mean                                     2.428075   
std                                      3.035568   
min                                      0.000000   
max                                     47.000000   

      number_of_upvotes_of_request_at_retrieval  \
mean                                   6.090688   
std                                   10.501259   