In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)
print(f"DataFrame shape: {df.shape}")
print("\nColumn names:")
print(df.columns.tolist())

DataFrame shape: (2878, 32)

Column names:
['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_mi

In [3]:
# Analyze target distribution
target = df['requester_received_pizza']
print(f"Target distribution:")
print(target.value_counts())
print(f"\nTarget percentages:")
print(target.value_counts(normalize=True) * 100)

# Check for class imbalance
print(f"\nClass imbalance ratio: {target.value_counts().iloc[0] / target.value_counts().iloc[1]:.2f}")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Target percentages:
requester_received_pizza
False    75.156359
True     24.843641
Name: proportion, dtype: float64

Class imbalance ratio: 3.03


In [4]:
# Explore text features
print("Sample request title:")
print(df['request_title'].iloc[0][:200])
print("\n" + "="*50)
print("Sample request text:")
print(df['request_text'].iloc[0][:500])

# Check text lengths
df['title_length'] = df['request_title'].str.len()
df['text_length'] = df['request_text'].str.len()

print(f"\nText length statistics:")
print(f"Title - Mean: {df['title_length'].mean():.1f}, Max: {df['title_length'].max()}")
print(f"Text - Mean: {df['text_length'].mean():.1f}, Max: {df['text_length'].max()}")

Sample request title:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Sample request text:
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time to Afganistan.

Text length statistics:
Title - Mean: 71.6, Max: 272
Text - Mean: 402.5, Max: 4460


In [5]:
# Explore metadata features
metadata_cols = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request', 
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request'
]

print("Metadata feature statistics:")
for col in metadata_cols:
    if col in df.columns:
        print(f"{col}:")
        print(f"  Mean: {df[col].mean():.2f}, Std: {df[col].std():.2f}")
        print(f"  Min: {df[col].min():.2f}, Max: {df[col].max():.2f}")
        print()

Metadata feature statistics:
requester_account_age_in_days_at_request:
  Mean: 250.68, Std: 301.84
  Min: 0.00, Max: 2809.75

requester_number_of_comments_at_request:
  Mean: 112.31, Std: 192.02
  Min: 0.00, Max: 981.00

requester_number_of_posts_at_request:
  Mean: 21.61, Std: 51.58
  Min: 0.00, Max: 867.00

requester_upvotes_minus_downvotes_at_request:
  Mean: 1184.58, Std: 4198.26
  Min: -67.00, Max: 155010.00

requester_upvotes_plus_downvotes_at_request:
  Mean: 3988.57, Std: 30127.47
  Min: 0.00, Max: 1286864.00

requester_number_of_subreddits_at_request:
  Mean: 17.86, Std: 21.78
  Min: 0.00, Max: 186.00

requester_number_of_comments_in_raop_at_request:
  Mean: 0.69, Std: 3.75
  Min: 0.00, Max: 88.00

requester_number_of_posts_on_raop_at_request:
  Mean: 0.07, Std: 0.34
  Min: 0.00, Max: 5.00



In [6]:
# Analyze user flair distribution
print("User flair distribution:")
print(df['requester_user_flair'].value_counts())
print(f"\nUser flair percentages:")
print(df['requester_user_flair'].value_counts(normalize=True) * 100)

# Cross-tabulate flair with target
print("\nFlair vs Pizza Received:")
flair_pivot = pd.crosstab(df['requester_user_flair'], df['requester_received_pizza'], normalize='index') * 100
print(flair_pivot.round(2))

User flair distribution:
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

User flair percentages:
requester_user_flair
shroom    94.685315
PIF        5.314685
Name: proportion, dtype: float64

Flair vs Pizza Received:
requester_received_pizza   True
requester_user_flair           
PIF                       100.0
shroom                    100.0


In [7]:
# Check for missing/null values in flair
print("Missing values in user flair:")
print(df['requester_user_flair'].isnull().sum())

# Check unique values including potential nulls
print(f"\nUnique flair values: {df['requester_user_flair'].unique()}")

# Check if there are empty strings or 'None'
print(f"\nFlair value counts (including all):")
print(df['requester_user_flair'].value_counts(dropna=False))

Missing values in user flair:
2163

Unique flair values: [None 'shroom' 'PIF']

Flair value counts (including all):
requester_user_flair
None      2163
shroom     677
PIF         38
Name: count, dtype: int64
