In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data.
# The encoding is pinned to UTF-8 (the standard JSON interchange encoding)
# so decoding does not depend on the platform's locale default.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
# Peek at the first record's schema; guarded so an empty file produces a
# readable message instead of an IndexError.
if train_data:
    print(f"First sample keys: {list(train_data[0].keys())}")
else:
    print("First sample keys: [] (no samples loaded)")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Put the raw records into a DataFrame so pandas can summarise them
df = pd.DataFrame(train_data)

# Target column: per-class counts, then the mean of the boolean label
# (i.e. the fraction of positive samples)
target_col = df['requester_received_pizza']
print("Target distribution:")
print(target_col.value_counts())
positive_rate = target_col.mean()
print(f"\nClass balance: {positive_rate:.3f} positive rate")

# Null counts per column, showing only the columns that contain nulls
missing = df.isna().sum()
print("\nMissing values per column:")
print(missing.loc[missing > 0])

# Number of columns per dtype
dtype_counts = df.dtypes.value_counts()
print("\nData types:")
print(dtype_counts)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Class balance: 0.248 positive rate

Missing values per column:
requester_user_flair    2163
dtype: int64

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [3]:
# Length statistics for the free-text fields
print("Text feature characteristics:")
title_chars = df['request_title'].str.len().describe()
print(f"Request title length (chars): {title_chars}")
body_chars = df['request_text'].str.len().describe()
print(f"\nRequest text length (chars): {body_chars}")
# Word count = number of whitespace-separated tokens
body_words = df['request_text'].str.split().str.len().describe()
print(f"\nRequest text length (words): {body_words}")

# Value frequencies for the two categorical-looking columns
for heading, column in (
    ("\nUnique user flair values:", 'requester_user_flair'),
    ("\nPost edited distribution:", 'post_was_edited'),
):
    print(heading)
    print(df[column].value_counts())

# Summary statistics for one numerical feature
print("\nAccount age at request (days):")
account_age = df['requester_account_age_in_days_at_request']
print(account_age.describe())

Text feature characteristics:
Request title length (chars): count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64

Request text length (chars): count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text, dtype: float64

Request text length (words): count    2878.000000
mean       77.009034
std        69.290490
min         0.000000
25%        35.000000
50%        59.000000
75%        96.000000
max       854.000000
Name: request_text, dtype: float64

Unique user flair values:
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

Post edited distribution:
post_was_edited
False           2423
True             241
1375324604.0       1
1366314331.0       1
1367280954.0       1
                ... 

In [5]:
# Check correlation with target for numerical features
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical columns: {numerical_cols}")

# Defensive guard: a bool target is not selected by np.number, but an
# int-encoded target would be, and it must not be listed twice below.
if 'requester_received_pizza' in numerical_cols:
    numerical_cols.remove('requester_received_pizza')

print("Correlation with target (top 10):")
try:
    correlations = df[numerical_cols + ['requester_received_pizza']].corr()['requester_received_pizza'].drop('requester_received_pizza')
    # Ranked by magnitude; the printed values are absolute, so the sign of
    # each relationship is only available in `correlations`.
    correlations_abs = correlations.abs().sort_values(ascending=False)
    print(correlations_abs.head(10))
except Exception as e:
    # Best-effort: report the failure and keep going with the group summaries.
    print(f"Error calculating correlations: {e}")

# Check some specific features
print(f"\nSuccess rate by user flair:")
# dropna=False keeps the rows with no flair as their own group; by default
# groupby drops null keys, which would hide the majority of the samples
# (most rows have a missing flair per the missing-value summary).
flair_success = df.groupby('requester_user_flair', dropna=False)['requester_received_pizza'].agg(['count', 'mean'])
print(flair_success)

print(f"\nSuccess rate by post edited (boolean):")
# post_was_edited mixes real booleans with numeric (timestamp-like) values.
# Select booleans by type rather than with isin([True, False]), which would
# also match numeric 0/1 because True == 1 and False == 0.
is_bool_flag = df['post_was_edited'].map(lambda value: isinstance(value, (bool, np.bool_))).astype(bool)
edited_success = df[is_bool_flag].groupby('post_was_edited')['requester_received_pizza'].agg(['count', 'mean'])
print(edited_success)

Numerical columns: ['number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'request_number_of_comments_at_retrieval', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_minus_downvotes_at_retrieval', 'requester_upvotes_plus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_retrieval', 'unix_timestamp_of_request', 'unix