In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data.
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, which can mis-read or reject non-ASCII text in
# the JSON file on machines whose default is not UTF-8.
train_path = "/home/data/train.json"
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
# Only peek at the first record when there is one; indexing an empty list
# would abort the cell with IndexError before anything useful is shown.
if train_data:
    print(f"First sample keys: {list(train_data[0].keys())}")
else:
    print("First sample keys: <no samples loaded>")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Wrap the loaded records in a DataFrame so the columns can be inspected.
df = pd.DataFrame(train_data)

# Class balance of the target column: raw counts, then the mean of the flag.
target_column = df['requester_received_pizza']
print("Target distribution:")
print(target_column.value_counts())
positive_rate = target_column.mean()
print(f"\nPercentage of positive samples: {positive_rate:.3f}")

# Null counts (absolute and as a share of all rows) for the text-like columns.
print("\nMissing values in key features:")
key_features = [
    'request_text',
    'request_title',
    'request_text_edit_aware',
    'requester_user_flair',
]
for feature in key_features:
    missing = df[feature].isna().sum()
    print(f"{feature}: {missing} ({missing/len(df)*100:.1f}%)")

# Character lengths of the request body and title, stored as new columns on df.
print("\nText length statistics:")
df['text_length'] = df['request_text'].str.len()
df['title_length'] = df['request_title'].str.len()
text_lengths = df['text_length']
title_lengths = df['title_length']
print(f"Request text - Mean: {text_lengths.mean():.0f}, Median: {text_lengths.median():.0f}")
print(f"Request title - Mean: {title_lengths.mean():.0f}, Median: {title_lengths.median():.0f}")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage of positive samples: 0.248

Missing values in key features:
request_text: 0 (0.0%)
request_title: 0 (0.0%)
request_text_edit_aware: 0 (0.0%)
requester_user_flair: 2163 (75.2%)

Text length statistics:
Request text - Mean: 403, Median: 308
Request title - Mean: 72, Median: 64


In [3]:
# Explore numerical features: per-column mean/median, then correlation of
# each column with the target.
numerical_features = [
    'requester_number_of_posts_at_request', 'requester_number_of_comments_at_request',
    'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request', 'requester_account_age_in_days_at_request',
    'request_number_of_comments_at_retrieval', 'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval'
]

# Resolve once which of the requested columns the frame actually contains, so
# the statistics loop and the correlation below operate on the same set.
# Previously only the loop skipped absent columns, while the correlation
# indexed with the full list and raised KeyError for the same column.
available_features = [name for name in numerical_features if name in df.columns]

print("Numerical feature statistics:")
for feature in available_features:
    mean_val = df[feature].mean()
    median_val = df[feature].median()
    print(f"{feature}:")
    print(f"  Mean: {mean_val:.2f}, Median: {median_val:.2f}")

# Check correlation with target (DataFrame.corr() with its default method),
# sorted from most positive to most negative. The target's correlation with
# itself is part of the result.
# NOTE(review): the *_at_retrieval columns are presumably measured after the
# request was resolved; confirm before treating them as usable predictors.
print("\nCorrelation with target:")
correlations = (
    df[available_features + ['requester_received_pizza']]
    .corr()['requester_received_pizza']
    .sort_values(ascending=False)
)
print(correlations)

Numerical feature statistics:
requester_number_of_posts_at_request:
  Mean: 21.61, Median: 4.00
requester_number_of_comments_at_request:
  Mean: 112.31, Median: 22.00
requester_upvotes_minus_downvotes_at_request:
  Mean: 1184.58, Median: 171.00
requester_upvotes_plus_downvotes_at_request:
  Mean: 3988.57, Median: 335.50
requester_number_of_subreddits_at_request:
  Mean: 17.86, Median: 11.00
requester_account_age_in_days_at_request:
  Mean: 250.68, Median: 155.16
request_number_of_comments_at_retrieval:
  Mean: 2.86, Median: 1.00
number_of_upvotes_of_request_at_retrieval:
  Mean: 6.09, Median: 4.00
number_of_downvotes_of_request_at_retrieval:
  Mean: 2.43, Median: 2.00

Correlation with target:
requester_received_pizza                        1.000000
request_number_of_comments_at_retrieval         0.290709
number_of_upvotes_of_request_at_retrieval       0.090767
requester_number_of_subreddits_at_request       0.047001
requester_account_age_in_days_at_request        0.043374
requester_up