In [2]:
import pandas as pd
import numpy as np
import json

# Load the training data.
# The location is kept in a named constant so it can be changed in one place.
TRAIN_JSON_PATH = '/home/data/train.json'

# Decode explicitly as UTF-8: without an `encoding` argument, open() uses the
# platform's locale encoding, which can fail on (or silently mis-decode) any
# non-ASCII characters in the file on systems whose default is not UTF-8.
with open(TRAIN_JSON_PATH, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Convert the parsed JSON to a DataFrame for easier analysis, then report its
# dimensions and column names as a quick sanity check of the load.
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {list(train_df.columns)}")

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

In [3]:
# Class balance of the target column
pizza_outcome = train_df['requester_received_pizza']

print("Target distribution:")
print(pizza_outcome.value_counts())

# NOTE: the mean of the True/False target is a fraction between 0 and 1;
# the label below says "Percentage", but the value is not scaled by 100.
success_rate = pizza_outcome.mean()
print(f"\nPercentage of successful requests: {success_rate:.3f}")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage of successful requests: 0.248


In [4]:
# How many columns there are of each dtype
dtype_counts = train_df.dtypes.value_counts()
print("Data types:")
print(dtype_counts)

# One grand total of null cells over the whole frame (not a per-column breakdown)
total_missing = train_df.isna().sum().sum()
print("\nMissing values:")
print(total_missing)

# Split the columns into three groups: numeric, free text, and the remainder
numeric_features = list(train_df.select_dtypes(include=[np.number]).columns)
text_features = ['request_text', 'request_text_edit_aware', 'request_title']

# The remainder is every column that is neither numeric nor listed as text,
# kept in the frame's original column order.
already_grouped = set(numeric_features).union(text_features)
categorical_features = [col for col in train_df.columns if col not in already_grouped]

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Text features: {len(text_features)}")
print(f"Other features: {len(categorical_features)}")
print(f"Categorical features: {categorical_features}")

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64

Missing values:
2163

Numeric features: 22
Text features: 3
Other features: 7
Categorical features: ['giver_username_if_known', 'post_was_edited', 'request_id', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_user_flair', 'requester_username']


In [5]:
# Character-length summaries for the request body and title
for text_col in ('request_text', 'request_title'):
    length_col = f'{text_col}_length'
    # Missing text is replaced by an empty string, so it counts as length 0.
    # The lengths are stored as a new column on train_df.
    train_df[length_col] = train_df[text_col].fillna('').str.len()
    print(f"\n{text_col} length stats:")
    # Describe the stored column (not a temporary) so the summary is labelled
    # with the new column's name.
    print(train_df[length_col].describe())

# Numeric columns whose correlation with the target is reported below
key_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
]

received_pizza = train_df['requester_received_pizza']

print("\nCorrelation with target:")
for feature in key_features:
    # Features that are not columns of train_df are skipped without a message
    if feature not in train_df.columns:
        continue
    corr = train_df[feature].corr(received_pizza)
    print(f"{feature}: {corr:.3f}")


request_text length stats:
count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text_length, dtype: float64

request_title length stats:
count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title_length, dtype: float64

Correlation with target:
requester_account_age_in_days_at_request: 0.043
requester_number_of_comments_at_request: 0.037
requester_number_of_posts_at_request: 0.038
requester_upvotes_minus_downvotes_at_request: 0.043
