In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data
train_path = "/home/data/train.json"

# Read the JSON file
with open(train_path, 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)

# Basic info about the dataset
print("Dataset shape:", df.shape)
print("\nColumn types:")
print(df.dtypes.value_counts())

# Check target distribution
print("\nTarget distribution:")
target_counts = df['requester_received_pizza'].value_counts()
print(target_counts)
print(f"Success rate: {target_counts[True] / len(df):.2%}")

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum().sum())

Dataset shape: (2878, 32)

Column types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64
Success rate: 24.84%

Missing values:
2163


In [3]:
# Explore text data characteristics
print("Text data exploration:")
print(f"Average request_text length: {df['request_text'].str.len().mean():.0f} characters")
print(f"Average request_title length: {df['request_title'].str.len().mean():.0f} characters")

# Sample some text
print("\nSample request title:")
print(df['request_title'].iloc[0])
print("\nSample request text (first 200 chars):")
print(df['request_text'].iloc[0][:200])

# Check edit-aware text
print("\nSample edit-aware text (first 200 chars):")
print(df['request_text_edit_aware'].iloc[0][:200])

# Check if edit_aware differs from original
print(f"\nNumber of samples where edit_aware differs from original: {(df['request_text'] != df['request_text_edit_aware']).sum()}")

Text data exploration:
Average request_text length: 403 characters
Average request_title length: 72 characters

Sample request title:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Sample request text (first 200 chars):
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time 

Sample edit-aware text (first 200 chars):
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time 

Number of samples where edit_aware differs from original: 106


In [5]:
# Explore numerical features
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical columns: {len(numerical_cols)}")

# Check if target is in numerical columns and remove it
if 'requester_received_pizza' in numerical_cols:
    numerical_cols.remove('requester_received_pizza')

# Check correlation with target
correlations = df[numerical_cols + ['requester_received_pizza']].corr()['requester_received_pizza'].sort_values(ascending=False)
print("\nTop correlations with target:")
print(correlations.head(10))

print("\nBottom correlations with target:")
print(correlations.tail(10))

Numerical columns: 22

Top correlations with target:
requester_received_pizza                                1.000000
requester_number_of_posts_on_raop_at_retrieval          0.462165
request_number_of_comments_at_retrieval                 0.290709
requester_number_of_comments_in_raop_at_retrieval       0.277129
requester_number_of_posts_on_raop_at_request            0.132712
requester_number_of_comments_in_raop_at_request         0.131965
requester_days_since_first_post_on_raop_at_retrieval    0.127262
requester_number_of_comments_at_retrieval               0.123016
requester_account_age_in_days_at_retrieval              0.118863
requester_days_since_first_post_on_raop_at_request      0.108662
Name: requester_received_pizza, dtype: float64

Bottom correlations with target:
requester_upvotes_plus_downvotes_at_retrieval    0.052447
requester_number_of_subreddits_at_request        0.047001
requester_account_age_in_days_at_request         0.043374
requester_upvotes_minus_downvotes_at_reque

In [7]:
# Explore categorical features
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()
print(f"Categorical/object columns: {len(categorical_cols)}")

# Check unique values for each categorical column
for col in categorical_cols:
    try:
        unique_count = df[col].nunique()
        print(f"{col}: {unique_count} unique values")
        if unique_count < 10:
            print(f"  Values: {df[col].value_counts().to_dict()}")
        print()
    except:
        print(f"{col}: Cannot compute nunique (likely contains lists)")
        print()

Categorical/object columns: 10
giver_username_if_known: 184 unique values

post_was_edited: 216 unique values

request_id: 2878 unique values

request_text: 2807 unique values

request_text_edit_aware: 2805 unique values

request_title: 2869 unique values

requester_received_pizza: 2 unique values
  Values: {False: 2163, True: 715}

requester_subreddits_at_request: Cannot compute nunique (likely contains lists)

requester_user_flair: 2 unique values
  Values: {'shroom': 677, 'PIF': 38}

requester_username: 2878 unique values



In [None]:
# Analyze user flair impact
print("User flair analysis:")
flair_analysis = df.groupby('requester_user_flair')['requester_received_pizza'].agg(['count', 'sum', 'mean'])
flair_analysis.columns = ['total_requests', 'successful_requests', 'success_rate']
print(flair_analysis)

# Check subreddit diversity
print("\nSubreddit diversity analysis:")
# Count number of subreddits per user
subreddit_counts = df['requester_subreddits_at_request'].apply(lambda x: len(x) if isinstance(x, list) else 0)
print(f"Average number of subreddits per user: {subreddit_counts.mean():.1f}")
print(f"Median: {subreddit_counts.median():.0f}")
print(f"Max: {subreddit_counts.max()}")

# Correlation with success
subreddit_success_corr = df['requester_received_pizza'].corr(subreddit_counts)
print(f"Correlation between subreddit count and success: {subreddit_success_corr:.3f}")