In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data.
# NOTE(review): this absolute path is environment-specific — confirm it
# matches where the dataset lives before re-running on another machine.
train_path = '/home/data/train.json'

# Read the JSON file. The encoding is pinned to UTF-8 (the standard JSON
# interchange encoding) so decoding does not fall back to the platform's
# locale default, which can raise or garble non-ASCII request text.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Summarise what was loaded: the number of records, the field names of the
# first record, and a peek at its first five key/value pairs.
print(f"Number of training samples: {len(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")
print("\nFirst sample preview:")
for key, value in list(train_data[0].items())[:5]:
    print(f"  {key}: {value}")

Number of training samples: 2878
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_u

In [2]:
# Turn the loaded records into a DataFrame so they can be explored column-wise
df = pd.DataFrame(train_data)

# Overall dimensions of the table
print("DataFrame shape:", df.shape)

# Label column: show the per-class counts, then its mean as the positive rate
target = df['requester_received_pizza']
print("\nTarget distribution:")
print(target.value_counts())
print(f"\nPositive rate: {target.mean():.3f}")

# How many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()
print("\nData types:")
print(dtype_counts)

DataFrame shape: (2878, 32)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Positive rate: 0.248

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [3]:
# Inspect the free-text columns, starting with the first row as a sample
titles = df['request_title']
texts = df['request_text']

print("Text feature examples:")
print("\nRequest title example:")
print(titles.iloc[0][:200])  # first 200 characters only

print("\nRequest text example:")
print(texts.iloc[0][:300])  # first 300 characters only

# Store character counts of both text columns on df and summarise them
print("\nRequest text length statistics:")
df['request_text_length'] = texts.str.len()
df['request_title_length'] = titles.str.len()
print(df[['request_text_length', 'request_title_length']].describe())

# Report the null count (and its share of all rows) for a handful of columns
print("\nMissing values in key features:")
key_features = ['request_text', 'request_title', 'requester_account_age_in_days_at_request', 
                'requester_number_of_comments_at_request', 'requester_number_of_posts_at_request']
null_counts = df[key_features].isnull().sum()  # one pass over all listed columns
for feature, missing in null_counts.items():
    print(f"  {feature}: {missing} ({missing/len(df)*100:.1f}%)")

Text feature examples:

Request title example:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Request text example:
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time to Afganistan.

Request text length statistics:
       request_text_length  request_title_length
count          2878.000000           2878.000000
mean            402.521543             71.572967
std             362.393727             36.233487
min               0.000000              7.000000
25%             182.000000             46.000000
50%             308.000000             64.000000
75%             503.750000             90.000000
max            4460.000000            272.000000

Missing values in key features:
  request_text: 0 (0.0%)
  request_title: 0 (0.0%)
  requester_account_age_in_days_at_request: 0 (0.0%)
  requester_number_of_comments_at_requ