In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data
train_path = "/home/data/train.json"
with open(train_path, 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")
print(f"\nFirst sample preview:")
for key, value in list(train_data[0].items())[:5]:
    print(f"  {key}: {value}")

Number of training samples: 2878
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_u

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)
print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nTarget distribution:")
print(df['requester_received_pizza'].value_counts())
print(f"\nTarget percentage:")
print(df['requester_received_pizza'].value_counts(normalize=True))

DataFrame shape: (2878, 32)

Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minus_d

requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Target percentage:
requester_received_pizza
False    0.751564
True     0.248436
Name: proportion, dtype: float64


In [3]:
# Analyze text features
print("Text feature analysis:")
print(f"\nRequest title length stats:")
title_lengths = df['request_title'].str.len()
print(title_lengths.describe())

print(f"\nRequest text length stats:")
text_lengths = df['request_text'].str.len()
print(text_lengths.describe())

print(f"\nRequest text edit aware length stats:")
text_edit_lengths = df['request_text_edit_aware'].str.len()
print(text_edit_lengths.describe())

# Check for missing values in key features
print(f"\nMissing values in text features:")
print(df[['request_title', 'request_text', 'request_text_edit_aware']].isnull().sum())

Text feature analysis:

Request title length stats:
count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64

Request text length stats:
count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text, dtype: float64

Request text edit aware length stats:
count    2878.000000
mean      394.567755
std       351.922518
min         0.000000
25%       180.000000
50%       302.000000
75%       498.000000
max      4460.000000
Name: request_text_edit_aware, dtype: float64

Missing values in text features:
request_title              0
request_text               0
request_text_edit_aware    0
dtype: int64


In [4]:
# Analyze numerical features
numerical_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request', 
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval'
]

print("Numerical feature analysis:")
for feature in numerical_features:
    print(f"\n{feature}:")
    print(df[feature].describe())

# Check correlation with target
print("\nCorrelation with target:")
correlations = df[numerical_features + ['requester_received_pizza']].corr()['requester_received_pizza'].sort_values(ascending=False)
print(correlations)

Numerical feature analysis:

requester_account_age_in_days_at_request:
count    2878.000000
mean      250.682364
std       301.838771
min         0.000000
25%         3.038877
50%       155.156377
75%       383.640090
max      2809.750787
Name: requester_account_age_in_days_at_request, dtype: float64

requester_number_of_comments_at_request:
count    2878.000000
mean      112.311327
std       192.017515
min         0.000000
25%         0.000000
50%        22.000000
75%       132.000000
max       981.000000
Name: requester_number_of_comments_at_request, dtype: float64

requester_number_of_posts_at_request:
count    2878.000000
mean       21.614663
std        51.580719
min         0.000000
25%         0.000000
50%         4.000000
75%        21.000000
max       867.000000
Name: requester_number_of_posts_at_request, dtype: float64

requester_upvotes_minus_downvotes_at_request:
count      2878.000000
mean       1184.582349
std        4198.255486
min         -67.000000
25%           3.00000

In [5]:
# Analyze categorical features
categorical_features = ['post_was_edited', 'requester_user_flair']

print("Categorical feature analysis:")
for feature in categorical_features:
    print(f"\n{feature}:")
    print(df[feature].value_counts())
    print(f"\n{feature} by target:")
    print(pd.crosstab(df[feature], df['requester_received_pizza'], normalize='index'))

# Check requester_user_flair correlation with target
print("\n\nRequester user flair detailed analysis:")
flair_pivot = pd.crosstab(df['requester_user_flair'], df['requester_received_pizza'], margins=True)
print(flair_pivot)

print("\n\nSuccess rate by flair:")
flair_success = df.groupby('requester_user_flair')['requester_received_pizza'].agg(['count', 'sum', 'mean']).round(3)
flair_success.columns = ['total', 'successful', 'success_rate']
print(flair_success)

Categorical feature analysis:

post_was_edited:
post_was_edited
False           2423
True             241
1375324604.0       1
1366314331.0       1
1367280954.0       1
                ... 
1379372126.0       1
1378425306.0       1
1374109637.0       1
1358627245.0       1
1372729287.0       1
Name: count, Length: 216, dtype: int64

post_was_edited by target:


requester_received_pizza     False     True 
post_was_edited                             
False                     0.778374  0.221626
True                      0.572614  0.427386
1337134531.0              1.000000  0.000000
1337276328.0              1.000000  0.000000
1337657209.0              1.000000  0.000000
...                            ...       ...
1379817038.0              0.000000  1.000000
1380155455.0              0.000000  1.000000
1380335546.0              1.000000  0.000000
1380374432.0              1.000000  0.000000
1380909467.0              0.000000  1.000000

[216 rows x 2 columns]

requester_user_flair:
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

requester_user_flair by target:
requester_received_pizza  True
requester_user_flair          
PIF                        1.0
shroom                     1.0


Requester user flair detailed analysis:
requester_received_pizza  True  All
requester_user_flair               
PIF                   

In [6]:
# Let's check the actual values more carefully
print("Unique values in post_was_edited:")
print(df['post_was_edited'].unique()[:20])
print(f"\nData type: {df['post_was_edited'].dtype}")

# Check if there are mixed types
print(f"\nValue counts (top 10):")
print(df['post_was_edited'].value_counts().head(10))

# Clean the post_was_edited column - it seems to have timestamps instead of just True/False
# Let's check what values are actually boolean vs timestamps
df['post_was_edited_clean'] = df['post_was_edited'].apply(lambda x: True if x == True else False if x == False else True)

print(f"\nCleaned post_was_edited:")
print(df['post_was_edited_clean'].value_counts())

print(f"\nSuccess rate by cleaned post_was_edited:")
print(df.groupby('post_was_edited_clean')['requester_received_pizza'].mean())

Unique values in post_was_edited:
[False True 1363315140.0 1349998232.0 1364413726.0 1342991229.0
 1351709165.0 1347224731.0 1347230374.0 1371296113.0 1352058582.0
 1347225835.0 1363914082.0 1373814714.0 1351652384.0 1375841082.0
 1338341728.0 1347404183.0 1345416005.0 1367630093.0]

Data type: object

Value counts (top 10):
post_was_edited
False           2423
True             241
1375324604.0       1
1366314331.0       1
1367280954.0       1
1375380598.0       1
1353193218.0       1
1376170160.0       1
1373935175.0       1
1342896958.0       1
Name: count, dtype: int64

Cleaned post_was_edited:
post_was_edited_clean
False    2423
True      455
Name: count, dtype: int64

Success rate by cleaned post_was_edited:
post_was_edited_clean
False    0.221626
True     0.391209
Name: requester_received_pizza, dtype: float64
