In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data
train_path = '/home/data/train.json'

# Read the JSON file
with open(train_path, 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)

# Check target distribution
print("Target distribution:")
print(df['requester_received_pizza'].value_counts())
print(f"\nClass imbalance ratio: {df['requester_received_pizza'].mean():.4f}")

# Check missing values in key features
print("\nMissing values in key features:")
key_features = ['request_text', 'request_title', 'requester_user_flair', 'giver_username_if_known']
for feature in key_features:
    missing = df[feature].isnull().sum() if feature in df.columns else 0
    print(f"{feature}: {missing}")

# Check text length statistics
print("\nText length statistics:")
df['text_length'] = df['request_text'].str.len()
df['title_length'] = df['request_title'].str.len()
print(f"Request text - Mean: {df['text_length'].mean():.1f}, Median: {df['text_length'].median():.1f}")
print(f"Request title - Mean: {df['title_length'].mean():.1f}, Median: {df['title_length'].median():.1f}")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Class imbalance ratio: 0.2484

Missing values in key features:
request_text: 0
request_title: 0
requester_user_flair: 2163
giver_username_if_known: 0

Text length statistics:
Request text - Mean: 402.5, Median: 308.0
Request title - Mean: 71.6, Median: 64.0


In [3]:
# Analyze numerical features
numerical_features = [
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request'
]

print("Numerical feature statistics:")
for feature in numerical_features:
    if feature in df.columns:
        print(f"\n{feature}:")
        print(f"  Mean: {df[feature].mean():.2f}")
        print(f"  Median: {df[feature].median():.2f}")
        print(f"  Std: {df[feature].std():.2f}")
        print(f"  Min: {df[feature].min():.2f}")
        print(f"  Max: {df[feature].max():.2f}")

# Check correlation with target
print("\nCorrelation with target:")
for feature in numerical_features:
    if feature in df.columns:
        corr = df[feature].corr(df['requester_received_pizza'])
        print(f"{feature}: {corr:.4f}")

Numerical feature statistics:

number_of_upvotes_of_request_at_retrieval:
  Mean: 6.09
  Median: 4.00
  Std: 10.50
  Min: 0.00
  Max: 345.00

number_of_downvotes_of_request_at_retrieval:
  Mean: 2.43
  Median: 2.00
  Std: 3.04
  Min: 0.00
  Max: 47.00

request_number_of_comments_at_retrieval:
  Mean: 2.86
  Median: 1.00
  Std: 4.78
  Min: 0.00
  Max: 61.00

requester_account_age_in_days_at_request:
  Mean: 250.68
  Median: 155.16
  Std: 301.84
  Min: 0.00
  Max: 2809.75

requester_number_of_comments_at_request:
  Mean: 112.31
  Median: 22.00
  Std: 192.02
  Min: 0.00
  Max: 981.00

requester_number_of_posts_at_request:
  Mean: 21.61
  Median: 4.00
  Std: 51.58
  Min: 0.00
  Max: 867.00

requester_upvotes_minus_downvotes_at_request:
  Mean: 1184.58
  Median: 171.00
  Std: 4198.26
  Min: -67.00
  Max: 155010.00

requester_upvotes_plus_downvotes_at_request:
  Mean: 3988.57
  Median: 335.50
  Std: 30127.47
  Min: 0.00
  Max: 1286864.00

Correlation with target:
number_of_upvotes_of_request

In [4]:
# Analyze categorical features
print("Categorical feature analysis:")

# User flair distribution
if 'requester_user_flair' in df.columns:
    print("\nUser flair distribution:")
    flair_counts = df['requester_user_flair'].value_counts(dropna=False)
    print(flair_counts)
    
    # Success rate by flair
    print("\nSuccess rate by user flair:")
    flair_success = df.groupby('requester_user_flair')['requester_received_pizza'].agg(['count', 'mean'])
    print(flair_success)

# Post edited
if 'post_was_edited' in df.columns:
    print("\nPost edited distribution:")
    edited_counts = df['post_was_edited'].value_counts()
    print(edited_counts)
    
    print("\nSuccess rate by post edited:")
    edited_success = df.groupby('post_was_edited')['requester_received_pizza'].agg(['count', 'mean'])
    print(edited_success)

# Check text-based patterns
print("\nText-based patterns:")

# Check if text contains certain keywords
df['text_contains_please'] = df['request_text'].str.lower().str.contains('please', na=False)
df['text_contains_thank'] = df['request_text'].str.lower().str.contains('thank', na=False)
df['text_contains_story'] = df['request_text'].str.lower().str.contains('story|life|hard|difficult', na=False)

print("Success rate for 'please':", df[df['text_contains_please']]['requester_received_pizza'].mean())
print("Success rate for 'thank':", df[df['text_contains_thank']]['requester_received_pizza'].mean())
print("Success rate for story keywords:", df[df['text_contains_story']]['requester_received_pizza'].mean())

Categorical feature analysis:

User flair distribution:
requester_user_flair
None      2163
shroom     677
PIF         38
Name: count, dtype: int64

Success rate by user flair:
                      count  mean
requester_user_flair             
PIF                      38   1.0
shroom                  677   1.0

Post edited distribution:
post_was_edited
False           2423
True             241
1375324604.0       1
1366314331.0       1
1367280954.0       1
                ... 
1379372126.0       1
1378425306.0       1
1374109637.0       1
1358627245.0       1
1372729287.0       1
Name: count, Length: 216, dtype: int64

Success rate by post edited:
                 count      mean
post_was_edited                 
False             2423  0.221626
True               241  0.427386
1337134531.0         1  0.000000
1337276328.0         1  0.000000
1337657209.0         1  0.000000
...                ...       ...
1379817038.0         1  1.000000
1380155455.0         1  1.000000
1380335546.0  