In [2]:
import pandas as pd
import numpy as np
import json

# Load the training data
train_path = "/home/data/train.json"

# The whole file is parsed with a single json.load call (no line-by-line
# fallback). JSON text is UTF-8 by specification, so the encoding is passed
# explicitly: without it, open() uses the platform's locale encoding, which
# can raise UnicodeDecodeError or silently mis-decode non-ASCII request text
# on systems whose default is not UTF-8.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Convert the parsed records to a DataFrame and report its size and columns
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

In [3]:
# Class balance of the target column: absolute counts first, then the same
# breakdown expressed as percentages of all rows.
target_counts = train_df['requester_received_pizza'].value_counts()
print("Target distribution:", target_counts, sep="\n")

# The leading "\n" in the label leaves a blank line between the two tables.
print(
    "\nPercentage:",
    train_df['requester_received_pizza'].value_counts(normalize=True) * 100,
    sep="\n",
)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage:
requester_received_pizza
False    75.156359
True     24.843641
Name: proportion, dtype: float64


In [4]:
# Explore text data characteristics
# Show the first request: its full title and at most the first 500
# characters of its body.
sample_title = train_df['request_title'].iat[0]
sample_text = train_df['request_text'].iat[0][:500]
print("Sample request title:", sample_title, sep="\n")
print("\nSample request text:", sample_text, sep="\n")

# Character counts per row, computed once per column and reused for
# the min / max / mean summary lines.
title_chars = train_df['request_title'].str.len()
text_chars = train_df['request_text'].str.len()
print("\nText lengths:")
print(f"Title length (chars): min={title_chars.min()}, max={title_chars.max()}, mean={title_chars.mean():.1f}")
print(f"Text length (chars): min={text_chars.min()}, max={text_chars.max()}, mean={text_chars.mean():.1f}")

# Check for missing values in text: null counts for the three text columns,
# gathered in one pass and then reported column by column.
null_counts = train_df[['request_title', 'request_text', 'request_text_edit_aware']].isnull().sum()
print("\nMissing values:")
print(f"request_title: {null_counts['request_title']}")
print(f"request_text: {null_counts['request_text']}")
print(f"request_text_edit_aware: {null_counts['request_text_edit_aware']}")

Sample request title:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Sample request text:
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time to Afganistan.

Text lengths:
Title length (chars): min=7, max=272, mean=71.6
Text length (chars): min=0, max=4460, mean=402.5

Missing values:
request_title: 0
request_text: 0
request_text_edit_aware: 0
