In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data.
# The parsed content is indexed below as a sequence of dict records
# (train_data[0].keys()), i.e. one dict per request.
train_path = "/home/data/train.json"

# Read the JSON file. The encoding is passed explicitly so decoding does not
# depend on the platform's locale default (JSON interchange text is UTF-8).
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity summary: record count, container type, and the field names of
# the first record.
print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)

# Check target distribution
print("Target distribution:")
print(df['requester_received_pizza'].value_counts())
# .mean() of the True/False target is a fraction between 0 and 1; the `%`
# presentation type scales it by 100 so the value matches the "Percentage" label.
print(f"\nPercentage with pizza: {df['requester_received_pizza'].mean():.1%}")

# Check missing values
# NOTE: .head(10) limits the report to the first 10 columns only; columns
# further down the frame are not covered by this check.
print("\nMissing values:")
print(df.isnull().sum().head(10))

# Check data types (first 15 columns only)
print("\nData types:")
print(df.dtypes.head(15))

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage with pizza: 0.248

Missing values:
giver_username_if_known                        0
number_of_downvotes_of_request_at_retrieval    0
number_of_upvotes_of_request_at_retrieval      0
post_was_edited                                0
request_id                                     0
request_number_of_comments_at_retrieval        0
request_text                                   0
request_text_edit_aware                        0
request_title                                  0
requester_account_age_in_days_at_request       0
dtype: int64

Data types:
giver_username_if_known                                  object
number_of_downvotes_of_request_at_retrieval               int64
number_of_upvotes_of_request_at_retrieval                 int64
post_was_edited                                          object
request_id                                               object
request_number_of

In [None]:
# Explore text features: preview the first request, then summarise lengths.
divider = "\n" + "=" * 50  # blank line followed by a 50-character rule

print("Sample request title:")
first_title = df['request_title'].iloc[0]
print(first_title[:200])  # cap the preview at 200 characters
print(divider)

print("Sample request text:")
first_text = df['request_text'].iloc[0]
print(first_text[:300])  # cap the preview at 300 characters
print(divider)

# Text length statistics: character counts are stored as new columns on df.
print("Text length statistics:")
df['title_length'] = df['request_title'].str.len()
df['text_length'] = df['request_text'].str.len()

title_lengths = df['title_length']
text_lengths = df['text_length']
print(f"Title length - Mean: {title_lengths.mean():.1f}, Max: {title_lengths.max()}")
print(f"Text length - Mean: {text_lengths.mean():.1f}, Max: {text_lengths.max()}")

# Correlation of a few numerical columns with the target column.
# NOTE(review): the two *_at_retrieval columns look like they are measured
# after the request was posted -- confirm they are available at prediction
# time before treating them as model features.
print(divider)
print("Key numerical features correlation with target:")
numerical_features = [
    'number_of_upvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
]

target = df['requester_received_pizza']
for feature in numerical_features:
    corr = df[feature].corr(target)
    print(f"{feature}: {corr:.3f}")