In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data.
# The file is opened explicitly as UTF-8 rather than with the platform's
# locale-dependent default encoding, so any non-ASCII characters in the file
# decode the same way on every machine.
train_path = "/home/data/train.json"
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity check of what was loaded: record count and the field names of
# the first record.
print(f"Number of training samples: {len(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")
print(f"\nFirst sample preview:")
# Only the first five fields of the first record are shown to keep the output short.
for key, value in list(train_data[0].items())[:5]:
    print(f"  {key}: {value}")

Number of training samples: 2878
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_u

In [2]:
# Put the loaded records into a DataFrame so the pandas tooling can be used.
df = pd.DataFrame(train_data)

print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")

# Class balance of the target column: raw counts first, then the same
# breakdown expressed as fractions of all rows.
for heading, as_fraction in (
    ("\nTarget distribution:", False),
    ("\nTarget percentage:", True),
):
    print(heading)
    print(df['requester_received_pizza'].value_counts(normalize=as_fraction))

DataFrame shape: (2878, 32)

Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minus_d

requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Target percentage:
requester_received_pizza
False    0.751564
True     0.248436
Name: proportion, dtype: float64


In [3]:
# Analyze text features: summary statistics of the string length of each text
# column, followed by a null count for the same columns.
text_columns = ['request_title', 'request_text', 'request_text_edit_aware']


def _report_lengths(heading, column):
    """Print `heading`, then describe() of the per-row string lengths of `column`.

    Returns the Series of lengths so it can be kept for later use.
    """
    print(heading)
    lengths = df[column].str.len()
    print(lengths.describe())
    return lengths


print("Text feature analysis:")
title_lengths = _report_lengths("\nRequest title length stats:", text_columns[0])
text_lengths = _report_lengths("\nRequest text length stats:", text_columns[1])
text_edit_lengths = _report_lengths("\nRequest text edit aware length stats:", text_columns[2])

# Check for missing values in key features
print("\nMissing values in text features:")
print(df[text_columns].isna().sum())

Text feature analysis:

Request title length stats:
count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64

Request text length stats:
count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text, dtype: float64

Request text edit aware length stats:
count    2878.000000
mean      394.567755
std       351.922518
min         0.000000
25%       180.000000
50%       302.000000
75%       498.000000
max      4460.000000
Name: request_text_edit_aware, dtype: float64

Missing values in text features:
request_title              0
request_text               0
request_text_edit_aware    0
dtype: int64


In [None]:
# Analyze numerical features: per-column summary statistics, then the
# correlation of each column with the target.
# NOTE(review): the last three names end in "_at_retrieval", which suggests they
# were recorded after the request was posted -- confirm they are legitimate
# predictors before carrying them into a model.
numerical_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
]

print("Numerical feature analysis:")
for col_name in numerical_features:
    print(f"\n{col_name}:")
    print(df[col_name].describe())

# Check correlation with target. The target column is included in the frame
# passed to corr(), so its own row (self-correlation) appears in the result.
print("\nCorrelation with target:")
target_name = 'requester_received_pizza'
corr_matrix = df[[*numerical_features, target_name]].corr()
correlations = corr_matrix[target_name].sort_values(ascending=False)
print(correlations)