In [5]:
import pandas as pd
import numpy as np
import json

# Load the training data - it's a JSON array
train_df = pd.read_json('/home/data/train.json')
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {list(train_df.columns)}")

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

In [6]:
# Check target distribution
target_counts = train_df['requester_received_pizza'].value_counts()
print("Target distribution:")
print(target_counts)
print(f"\nPercentage:")
print(target_counts / len(train_df) * 100)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage:
requester_received_pizza
False    75.156359
True     24.843641
Name: count, dtype: float64


In [7]:
# Check data types and missing values
print("Data types:")
print(train_df.dtypes.value_counts())
print("\nMissing values:")
print(train_df.isnull().sum().sort_values(ascending=False).head(10))

Data types:
int64      19
object      8
float64     4
bool        1
Name: count, dtype: int64

Missing values:
requester_user_flair                           2163
giver_username_if_known                           0
number_of_upvotes_of_request_at_retrieval         0
number_of_downvotes_of_request_at_retrieval       0
request_id                                        0
request_number_of_comments_at_retrieval           0
request_text                                      0
post_was_edited                                   0
request_title                                     0
requester_account_age_in_days_at_request          0
dtype: int64


In [None]:
# Explore text features
print("Text feature lengths:")
print(f"request_title length - mean: {train_df['request_title'].str.len().mean():.1f}, max: {train_df['request_title'].str.len().max()}")
print(f"request_text length - mean: {train_df['request_text'].str.len().mean():.1f}, max: {train_df['request_text'].str.len().max()}")
print(f"request_text_edit_aware length - mean: {train_df['request_text_edit_aware'].str.len().mean():.1f}, max: {train_df['request_text_edit_aware'].str.len().max()}")

# Check unique values in categorical features
print("\nUnique values in key categorical features:")
print(f"post_was_edited: {train_df['post_was_edited'].value_counts().to_dict()}")
print(f"requester_user_flair (non-null): {train_df['requester_user_flair'].value_counts().to_dict()}")