In [2]:
import pandas as pd
import numpy as np
import json

# Load the training data.
train_path = '/home/data/train.json'

# The file is a single JSON array (not line-delimited JSON), so it is parsed
# with one json.load call. The encoding is pinned to UTF-8 because open()
# otherwise falls back to the locale's preferred encoding, which can fail on or
# garble non-ASCII text when the host locale is not UTF-8.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Build a DataFrame from the parsed data and report its shape and column names.
df = pd.DataFrame(train_data)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Dataset shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minus_down

In [3]:
# Class balance of the target column: raw counts first, then shares scaled to percent.
print("Target distribution:", df['requester_received_pizza'].value_counts(), sep="\n")
print(
    "\nPercentage distribution:",
    df['requester_received_pizza'].value_counts(normalize=True).mul(100),
    sep="\n",
)

# Dtypes of the first ten columns of df.
print("\nData types:", df.dtypes.head(10), sep="\n")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage distribution:
requester_received_pizza
False    75.156359
True     24.843641
Name: proportion, dtype: float64

Data types:
giver_username_if_known                         object
number_of_downvotes_of_request_at_retrieval      int64
number_of_upvotes_of_request_at_retrieval        int64
post_was_edited                                 object
request_id                                      object
request_number_of_comments_at_retrieval          int64
request_text                                    object
request_text_edit_aware                         object
request_title                                   object
requester_account_age_in_days_at_request       float64
dtype: object


In [4]:
# Null count per column, sorted largest first and limited to the top 15 entries.
print("Missing values:")
print(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .head(15)
)

# Store the .str.len() of the request text and title as new "<column>_length"
# columns on df, then summarize each of the two new columns.
print("\nText length statistics:")
for text_col in ('request_text', 'request_title'):
    df[text_col + '_length'] = df[text_col].str.len()
print(df['request_text_length'].describe())
print("\nTitle length stats:")
print(df['request_title_length'].describe())

Missing values:
requester_user_flair                                    2163
giver_username_if_known                                    0
number_of_upvotes_of_request_at_retrieval                  0
number_of_downvotes_of_request_at_retrieval                0
request_id                                                 0
request_number_of_comments_at_retrieval                    0
request_text                                               0
post_was_edited                                            0
request_title                                              0
requester_account_age_in_days_at_request                   0
requester_account_age_in_days_at_retrieval                 0
requester_days_since_first_post_on_raop_at_request         0
requester_days_since_first_post_on_raop_at_retrieval       0
requester_number_of_comments_at_request                    0
requester_number_of_comments_at_retrieval                  0
dtype: int64

Text length statistics:
count    2878.000000
mean      

In [None]:
# Frequency of each user-flair value; dropna=False keeps missing flair as its own row.
print("User flair distribution:")
print(df['requester_user_flair'].value_counts(dropna=False))

# Frequency of each distinct post_was_edited value.
print("\nPost edited distribution:")
print(df['post_was_edited'].value_counts())

# Numeric columns whose correlation with the target is reported below.
numerical_features = [
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
]

# One line per feature: Series.corr (default method) against the target column,
# formatted to four decimal places.
print("\nCorrelation with target:")
target_column = df['requester_received_pizza']
for column_name in numerical_features:
    coefficient = df[column_name].corr(target_column)
    print(f"{column_name}: {coefficient:.4f}")