In [1]:
import json
import pandas as pd
import numpy as np

# Load the training data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)

# Check target distribution
print("Target distribution:")
print(df['requester_received_pizza'].value_counts())
print(f"\nPercentage of positive samples: {df['requester_received_pizza'].mean():.4f}")

# Check for missing values in key fields
print("\nMissing values in key fields:")
key_fields = ['request_text', 'request_title', 'requester_user_flair', 'requester_number_of_comments_at_request']
for field in key_fields:
    missing = df[field].isnull().sum()
    print(f"{field}: {missing} ({missing/len(df)*100:.2f}%)")

# Check data types of numeric fields
numeric_fields = ['requester_number_of_comments_at_request', 'requester_number_of_posts_at_request', 
                  'requester_upvotes_minus_downvotes_at_request', 'requester_number_of_subreddits_at_request']
print("\nSample of numeric fields:")
print(df[numeric_fields].head())

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage of positive samples: 0.2484

Missing values in key fields:
request_text: 0 (0.00%)
request_title: 0 (0.00%)
requester_user_flair: 2163 (75.16%)
requester_number_of_comments_at_request: 0 (0.00%)

Sample of numeric fields:
   requester_number_of_comments_at_request  \
0                                        0   
1                                       40   
2                                        0   
3                                       46   
4                                      195   

   requester_number_of_posts_at_request  \
0                                     0   
1                                    11   
2                                     0   
3                                     1   
4                                    12   

   requester_upvotes_minus_downvotes_at_request  \
0                                             3   
1                           

In [3]:
# Analyze text data characteristics
print("Text data analysis:")
print(f"Average request title length (characters): {df['request_title'].str.len().mean():.1f}")
print(f"Average request text length (characters): {df['request_text'].str.len().mean():.1f}")
print(f"Average request text length (words): {df['request_text'].str.split().str.len().mean():.1f}")

# Check for empty text
empty_titles = (df['request_title'].str.len() == 0).sum()
empty_texts = (df['request_text'].str.len() == 0).sum()
print(f"\nEmpty titles: {empty_titles}")
print(f"Empty texts: {empty_texts}")

# Look at user flair distribution (non-missing)
flair_counts = df['requester_user_flair'].value_counts(dropna=False)
print(f"\nUser flair distribution:")
print(flair_counts)

# Check correlation between flair and target
print(f"\nSuccess rate by flair:")
flair_success = df.groupby('requester_user_flair')['requester_received_pizza'].agg(['count', 'mean']).round(4)
print(flair_success)

Text data analysis:
Average request title length (characters): 71.6
Average request text length (characters): 402.5
Average request text length (words): 77.0

Empty titles: 0
Empty texts: 71

User flair distribution:
requester_user_flair
None      2163
shroom     677
PIF         38
Name: count, dtype: int64

Success rate by flair:
                      count  mean
requester_user_flair             
PIF                      38   1.0
shroom                  677   1.0


In [None]:
# Analyze numeric features correlation with target
numeric_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request', 
    'requester_upvotes_minus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request'
]

print("Correlation with target:")
correlations = df[numeric_features + ['requester_received_pizza']].corr()['requester_received_pizza'].sort_values(ascending=False)
print(correlations.drop('requester_received_pizza'))

# Check distribution of some key features
print("\nDistribution of key features:")
for feature in ['requester_number_of_comments_at_request', 'requester_upvotes_minus_downvotes_at_request']:
    print(f"\n{feature}:")
    print(f"  Mean: {df[feature].mean():.2f}")
    print(f"  Median: {df[feature].median():.2f}")
    print(f"  Max: {df[feature].max():.2f}")
    print(f"  % zero: {(df[feature] == 0).mean()*100:.2f}%")

# Save key finding about user flair
print("\n" + "="*50)
print("KEY FINDING: User flair is highly predictive!")
print("- None: 0% success rate (2163 samples)")
print("- shroom: 100% success rate (677 samples)")  
print("- PIF: 100% success rate (38 samples)")
print("="*50)