In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data
train_path = '/home/data/train.json'

# Read the JSON file.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII text
# in the records on systems whose default is not UTF-8.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity summary of what was parsed: size, container type, and the
# field names present on the first record.
print(f"Number of records: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First record keys: {list(train_data[0].keys())}")

Number of records: 2878
Type of data: <class 'list'>
First record keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_re

In [2]:
# Wrap the parsed records in a DataFrame so pandas tooling can be used on them
df = pd.DataFrame(train_data)
print(f"DataFrame shape: {df.shape}")
print("\nColumn names:")
# Build the whole bullet list ("- <column>" per line) and emit it in one write;
# end="" avoids adding an extra newline after the last bullet.
column_bullets = "".join(f"- {name}\n" for name in df.columns)
print(column_bullets, end="")

DataFrame shape: (2878, 32)

Column names:
- giver_username_if_known
- number_of_downvotes_of_request_at_retrieval
- number_of_upvotes_of_request_at_retrieval
- post_was_edited
- request_id
- request_number_of_comments_at_retrieval
- request_text
- request_text_edit_aware
- request_title
- requester_account_age_in_days_at_request
- requester_account_age_in_days_at_retrieval
- requester_days_since_first_post_on_raop_at_request
- requester_days_since_first_post_on_raop_at_retrieval
- requester_number_of_comments_at_request
- requester_number_of_comments_at_retrieval
- requester_number_of_comments_in_raop_at_request
- requester_number_of_comments_in_raop_at_retrieval
- requester_number_of_posts_at_request
- requester_number_of_posts_at_retrieval
- requester_number_of_posts_on_raop_at_request
- requester_number_of_posts_on_raop_at_retrieval
- requester_number_of_subreddits_at_request
- requester_received_pizza
- requester_subreddits_at_request
- requester_upvotes_minus_downvotes_at_request

In [3]:
# Summarise how the target label is distributed across the training rows
target = 'requester_received_pizza'

# Absolute count per class, and the same breakdown expressed in percent
class_counts = df[target].value_counts()
class_percentages = df[target].value_counts(normalize=True) * 100
# Count of the rarest class divided by the count of the most common class
minority_to_majority = class_counts.min() / class_counts.max()

print("Target distribution:")
print(class_counts)
print("\nTarget percentages:")
print(class_percentages)
print(f"\nClass imbalance ratio: {minority_to_majority:.4f}")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Target percentages:
requester_received_pizza
False    75.156359
True     24.843641
Name: proportion, dtype: float64

Class imbalance ratio: 0.3306


In [4]:
# Overview of column dtypes, with a peek at the free-text fields
dtype_counts = df.dtypes.value_counts()

# First 100 characters of the title and body of the first record
title_preview = df['request_title'].iloc[0][:100]
text_preview = df['request_text'].iloc[0][:100]

# Split the columns by dtype.
# NOTE: "categorical" here means every object-dtype column, so free-text and
# identifier columns land in it too; "numerical" is everything that is not
# object dtype, so a bool column is counted along with the int/float ones.
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(exclude='object').columns

print("Feature types:")
print(dtype_counts)

print("\nSample of text features:")
print(f"Request title sample: {title_preview}...")
print(f"Request text sample: {text_preview}...")

print("\nSample of categorical features:")
print(f"Categorical columns: {list(categorical_cols)}")

print("\nSample of numerical features:")
print(f"Numerical columns count: {len(numerical_cols)}")

Feature types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64

Sample of text features:
Request title sample: [REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy....
Request text sample: I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some in...

Sample of categorical features:
Categorical columns: ['giver_username_if_known', 'post_was_edited', 'request_id', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_subreddits_at_request', 'requester_user_flair', 'requester_username']

Sample of numerical features:
Numerical columns count: 23
