In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data.
# The file is opened explicitly as UTF-8 so that decoding the JSON text does
# not depend on the platform's default locale encoding.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Sanity summary: number of records, the container type returned by the JSON
# parser, and the field names of the first record.
# NOTE(review): indexing train_data[0] assumes the file holds a non-empty list.
print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Turn the list of request records into a table, one row per record.
df = pd.DataFrame(train_data)

# Overall size of the table.
print("DataFrame shape:", df.shape)

# Label counts, followed by the fraction of positive labels.
pizza_flags = df['requester_received_pizza']
print("\nTarget distribution:")
print(pizza_flags.value_counts())
print(f"\nClass balance: {pizza_flags.mean():.3f} (positive rate)")

# Total number of null cells across every column of the frame.
n_missing_cells = df.isna().sum().sum()
print("\nMissing values:")
print(n_missing_cells)

DataFrame shape: (2878, 32)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Class balance: 0.248 (positive rate)

Missing values:
2163


In [3]:
# Null count per column; report only the columns that contain any nulls.
missing_counts = df.isna().sum()
print("Columns with missing values:")
print(missing_counts.loc[missing_counts > 0])

# Show the text fields of the first row. The title is printed in full,
# while the two body fields are cut to their first 200 characters.
print("\nText feature examples:")
print("Request title sample:")
print(df['request_title'].iloc[0])

text_samples = (
    ("\nRequest text sample (first 200 chars):", 'request_text'),
    ("\nRequest text edit aware sample (first 200 chars):", 'request_text_edit_aware'),
)
for heading, column in text_samples:
    print(heading)
    print(df[column].iloc[0][:200])

Columns with missing values:
requester_user_flair    2163
dtype: int64

Text feature examples:
Request title sample:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Request text sample (first 200 chars):
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time 

Request text edit aware sample (first 200 chars):
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time 


In [None]:
# Analyze numerical features
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical columns: {len(numerical_cols)}")
print(numerical_cols)

# Check correlation with target.
# The target holds True/False values at this point, and boolean columns are
# not selected by np.number, so it may be absent from numerical_cols. Indexing
# the correlation matrix by the target name would then raise a KeyError.
# An integer-cast copy of the target is therefore injected into the numeric
# sub-frame before correlating (this overwrites the column if it is already
# numeric, leaving the result unchanged in that case). df itself is not
# modified here.
target_as_int = df['requester_received_pizza'].astype(int)
numeric_with_target = df[numerical_cols].assign(requester_received_pizza=target_as_int)
# The target's own row (its correlation with itself) is part of the result.
target_corr = (
    numeric_with_target
    .corr()['requester_received_pizza']
    .abs()
    .sort_values(ascending=False)
)
print("\nTop correlations with target (absolute):")
print(target_corr.head(10))

# Analyze user flair distribution (categorical); dropna=False keeps the
# missing-flair rows as their own bucket in the counts.
print("\nUser flair distribution:")
print(df['requester_user_flair'].value_counts(dropna=False))

In [5]:
# Schema overview: all column names, then the dtypes of the first 15 columns.
print("Column names:")
print(list(df.columns))
print("\nData types:")
print(df.dtypes.iloc[:15])

# Inspect the label column: its dtype and the distinct values it takes.
target_name = 'requester_received_pizza'
print(f"\nTarget column type: {df[target_name].dtype}")
print(f"Unique values: {df[target_name].unique()}")

# Cast the label to integers in place on df and report its mean afterwards.
df[target_name] = df[target_name].astype(int)
print(f"After conversion - target mean: {df[target_name].mean():.3f}")

Column names:
['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'r