In [1]:
import json
import pandas as pd
import numpy as np

# Load the training data.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# rather than left to the platform's default locale encoding.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity check: number of records, container type, and the keys of the
# first record (used here as a stand-in for the schema of every record).
print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)

# Check basic info
print("DataFrame shape:", df.shape)
print("\nColumn names:")
for i, col in enumerate(df.columns):
    print(f"{i+1:2d}. {col}")

# Check target distribution
print("\n" + "="*50)
print("TARGET DISTRIBUTION")
print("="*50)
target_counts = df['requester_received_pizza'].value_counts()
print(target_counts)
print("\nClass percentages:")
print(df['requester_received_pizza'].value_counts(normalize=True) * 100)

# value_counts() sorts by descending frequency, so position 0 is the most
# frequent class and position 1 the next one. The Series is labelled by the
# target values, not by integers, so the positions are read with .iloc:
# plain target_counts[0] relies on the deprecated positional fallback of
# Series.__getitem__ and emits a FutureWarning.
majority_count = target_counts.iloc[0]
minority_count = target_counts.iloc[1]
print(f"\nClass imbalance ratio: {majority_count/minority_count:.2f}:1")

DataFrame shape: (2878, 32)

Column names:
 1. giver_username_if_known
 2. number_of_downvotes_of_request_at_retrieval
 3. number_of_upvotes_of_request_at_retrieval
 4. post_was_edited
 5. request_id
 6. request_number_of_comments_at_retrieval
 7. request_text
 8. request_text_edit_aware
 9. request_title
10. requester_account_age_in_days_at_request
11. requester_account_age_in_days_at_retrieval
12. requester_days_since_first_post_on_raop_at_request
13. requester_days_since_first_post_on_raop_at_retrieval
14. requester_number_of_comments_at_request
15. requester_number_of_comments_at_retrieval
16. requester_number_of_comments_in_raop_at_request
17. requester_number_of_comments_in_raop_at_retrieval
18. requester_number_of_posts_at_request
19. requester_number_of_posts_at_retrieval
20. requester_number_of_posts_on_raop_at_request
21. requester_number_of_posts_on_raop_at_retrieval
22. requester_number_of_subreddits_at_request
23. requester_received_pizza
24. requester_subreddits_at_reques

  print(f"\nClass imbalance ratio: {target_counts[0]/target_counts[1]:.2f}:1")


In [3]:
# Explore text features
separator = "=" * 50
print(separator)
print("TEXT FEATURES ANALYSIS")
print(separator)

# Add a character-count column (<column>_length) for each free-text field
for text_column in ('request_text', 'request_title', 'request_text_edit_aware'):
    df[f'{text_column}_length'] = df[text_column].str.len()

print("Text length statistics:")
text_len = df['request_text_length']
title_len = df['request_title_length']
print(f"Request text - Mean: {text_len.mean():.0f}, Median: {text_len.median():.0f}, Max: {text_len.max()}")
print(f"Request title - Mean: {title_len.mean():.0f}, Median: {title_len.median():.0f}, Max: {title_len.max()}")


def _show_example(header, row):
    """Print a header, then the title, a 200-character preview and the text length of one request row."""
    body = row['request_text']
    print(header)
    print(f"Title: {row['request_title']}")
    print(f"Text (first 200 chars): {body[:200]}...")
    print(f"Length: {len(body)} chars")


# Show some examples
print("\n" + separator)
print("EXAMPLE REQUESTS")
print(separator)

# First row whose target is True
success_example = df.loc[df['requester_received_pizza'].eq(True)].iloc[0]
_show_example("\n--- SUCCESSFUL REQUEST ---", success_example)

# First row whose target is False
fail_example = df.loc[df['requester_received_pizza'].eq(False)].iloc[0]
_show_example("\n--- UNSUCCESSFUL REQUEST ---", fail_example)

TEXT FEATURES ANALYSIS
Text length statistics:
Request text - Mean: 403, Median: 308, Max: 4460
Request title - Mean: 72, Median: 64, Max: 272

EXAMPLE REQUESTS

--- SUCCESSFUL REQUEST ---
Title: [REQUEST] Not much food until tomorrow.
Text (first 200 chars): I will go ahead and say that I got a pizza meal from here before as to not seem like I'm scamming anyone. I have been promised 2 well-paying jobs and one minimum wage + .40 to fill in the gaps of unem...
Length: 1028 chars

--- UNSUCCESSFUL REQUEST ---
Title: [REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.
Text (first 200 chars): I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time ...
Length: 214 chars
