In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Load the train and test splits from disk.
# JSON text is UTF-8, so the encoding is passed explicitly instead of relying
# on the platform default, which is not UTF-8 on every system and would
# mis-decode (or fail on) any non-ASCII characters in the files.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Quick sanity check: size of each split and the keys of the first train record.
print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
print(f"First train sample keys: {list(train_data[0].keys())}")

Train samples: 2878
Test samples: 1162
First train sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 

In [2]:
# Wrap the parsed JSON records in DataFrames so they can be inspected with pandas
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Report the dimensions of both frames, then list every train column name
print(f"Train DataFrame shape: {train_df.shape}")
print(f"Test DataFrame shape: {test_df.shape}")
print("\nTrain columns:", list(train_df.columns), sep="\n")

Train DataFrame shape: (2878, 32)
Test DataFrame shape: (1162, 17)

Train columns:
['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subre

In [3]:
# Target overview: class counts, positive rate, and number of missing labels
print("Target distribution:")
received_pizza = train_df['requester_received_pizza']
print(received_pizza.value_counts())
print(f"\nSuccess rate: {received_pizza.mean():.4f}")
print(f"\nMissing target values: {received_pizza.isnull().sum()}")

# Compare the column sets of the two frames in both directions
train_columns = set(train_df.columns)
test_columns = set(test_df.columns)
train_only_cols = train_columns - test_columns
test_only_cols = test_columns - train_columns

print(f"\nColumns only in train: {train_only_cols}")
print(f"Columns only in test: {test_only_cols}")

# Count how many train columns there are of each dtype
print("\nData types:", train_df.dtypes.value_counts(), sep="\n")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Success rate: 0.2484

Missing target values: 0

Columns only in train: {'requester_upvotes_minus_downvotes_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'requester_account_age_in_days_at_retrieval', 'requester_number_of_posts_on_raop_at_retrieval', 'number_of_downvotes_of_request_at_retrieval', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_user_flair', 'requester_number_of_comments_at_retrieval', 'post_was_edited', 'request_text', 'requester_number_of_posts_at_retrieval', 'requester_received_pizza', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_upvotes_plus_downvotes_at_retrieval', 'request_number_of_comments_at_retrieval'}
Columns only in test: set()

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [None]:
# Analyze text features: character-length statistics for the title and body.
print("Text feature analysis:")
# Each length Series is computed once and reused for min / max / mean.
title_len = train_df['request_title'].str.len()
text_len = train_df['request_text'].str.len()
print(f"Request title length (chars): min={title_len.min()}, max={title_len.max()}, mean={title_len.mean():.1f}")
print(f"Request text length (chars): min={text_len.min()}, max={text_len.max()}, mean={text_len.mean():.1f}")

# Check for nulls in text features
print(f"\nNull request titles: {train_df['request_title'].isnull().sum()}")
print(f"Null request texts: {train_df['request_text'].isnull().sum()}")

# Analyze categorical features
print("\nCategorical features:")
# value_counts() drops missing values by default; dropna=False keeps them so
# rows without a flair are counted as well. The counts are printed under their
# own header line instead of being interpolated into the label, which keeps
# the multi-line Series output aligned.
print("requester_user_flair value counts:")
print(train_df['requester_user_flair'].value_counts(dropna=False))
print(f"Unique giver_username_if_known values: {train_df['giver_username_if_known'].nunique()}")

# Analyze numerical features (np.number selects the integer and float columns)
numerical_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumerical features: {len(numerical_cols)}")
print(f"Sample numerical features: {numerical_cols[:10]}")