In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII request text.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Convert the parsed JSON to a DataFrame and report its shape and column names
df = pd.DataFrame(train_data)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Dataset shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minus_down

In [2]:
# Class balance of the target column
pizza_outcome = df['requester_received_pizza']
print("Target distribution:")
print(pizza_outcome.value_counts())
success_rate = pizza_outcome.mean()
print(f"\nPercentage of successful requests: {success_rate:.2%}")

# Number of columns per dtype
dtype_counts = df.dtypes.value_counts()
print("\nData types:")
print(dtype_counts)

# The ten columns with the most missing entries
missing_per_column = df.isnull().sum()
most_missing = missing_per_column.sort_values(ascending=False).head(10)
print("\nMissing values:")
print(most_missing)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage of successful requests: 24.84%

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64

Missing values:
requester_user_flair                           2163
giver_username_if_known                           0
number_of_upvotes_of_request_at_retrieval         0
number_of_downvotes_of_request_at_retrieval       0
request_id                                        0
request_number_of_comments_at_retrieval           0
request_text                                      0
post_was_edited                                   0
request_title                                     0
requester_account_age_in_days_at_request          0
dtype: int64


In [3]:
# Mean character length of each free-text column
print("Text feature analysis:")
for text_col in ('request_title', 'request_text', 'request_text_edit_aware'):
    mean_length = df[text_col].str.len().mean()
    print(f"Average {text_col} length: {mean_length:.1f} characters")

# Number of rows whose request_text repeats an earlier row
repeated_text_count = df['request_text'].duplicated().sum()
print(f"\nDuplicate request_text: {repeated_text_count}")

# Show the first title and the first 200 characters of the first request body
first_title = df['request_title'].iloc[0]
first_text = df['request_text'].iloc[0]
print("\nSample request_title:")
print(first_title)
print("\nSample request_text:")
print(f"{first_text[:200]}...")

# Value frequencies of the two categorical columns
flair_counts = df['requester_user_flair'].value_counts()
print("\nUnique values in requester_user_flair:")
print(flair_counts)

top_givers = df['giver_username_if_known'].value_counts().head()
print("\nUnique values in giver_username_if_known:")
print(top_givers)

Text feature analysis:
Average request_title length: 71.6 characters
Average request_text length: 402.5 characters
Average request_text_edit_aware length: 394.6 characters

Duplicate request_text: 71

Sample request_title:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Sample request_text:
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time ...

Unique values in requester_user_flair:
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

Unique values in giver_username_if_known:
giver_username_if_known
N/A            2670
mr_jeep           4
leftnewdigg       3
m2nu              3
thr               3
Name: count, dtype: int64


In [None]:
# Analyze numerical features
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical columns: {len(numerical_cols)}")
print(numerical_cols)

# Check correlations with target.
# The target column is boolean, so the np.number selection above does not
# contain it and indexing corr() by its name raised a KeyError. Correlate each
# numeric column against an integer (0/1) copy of the target instead.
target_as_int = df['requester_received_pizza'].astype(int)
correlations = (
    df[numerical_cols]
    .corrwith(target_as_int)
    .abs()
    .sort_values(ascending=False)
)
print("\nCorrelations with target (absolute values):")
print(correlations.head(15))

# Analyze temporal features (convert the epoch seconds once, then reuse)
request_times = pd.to_datetime(df['unix_timestamp_of_request'], unit='s')
print("\nTemporal analysis:")
print(f"Date range: {request_times.min()} to {request_times.max()}")

# Check for class imbalance in key features.
# Missing flair is turned into its own category; otherwise those rows are
# left out of the table and the comparison only covers users with a flair.
flair_or_missing = df['requester_user_flair'].fillna('(no flair)')
print("\nClass distribution by user flair:")
print(pd.crosstab(flair_or_missing, df['requester_received_pizza'], normalize='index'))

print("\nClass distribution by post_was_edited:")
print(pd.crosstab(df['post_was_edited'], df['requester_received_pizza'], normalize='index'))

In [5]:
# Confirm which column holds the target and what dtype it has
pizza_columns = [name for name in df.columns if 'pizza' in name.lower()]
print("Target column name check:")
print(pizza_columns)
print(f"Target column type: {df['requester_received_pizza'].dtype}")

# Integer copy of the boolean target, stored on df for correlation analysis
df['target'] = df['requester_received_pizza'].astype(int)

# Candidate feature columns: the dtype selection minus both target columns
target_columns = ('requester_received_pizza', 'target')
selected_cols = df.select_dtypes(include=[np.number, 'boolean']).columns.tolist()
numerical_cols = [col for col in selected_cols if col not in target_columns]
print(f"\nNumerical columns for analysis: {len(numerical_cols)}")

# Absolute correlation of each candidate column (and the target itself) with the target
corr_matrix = df[[*numerical_cols, 'target']].corr()
correlations = corr_matrix['target'].abs().sort_values(ascending=False)
print("\nCorrelations with target (absolute values):")
print(correlations.head(15))

Target column name check:
['requester_received_pizza']
Target column type: bool

Numerical columns for analysis: 22

Correlations with target (absolute values):
target                                                  1.000000
requester_number_of_posts_on_raop_at_retrieval          0.462165
request_number_of_comments_at_retrieval                 0.290709
requester_number_of_comments_in_raop_at_retrieval       0.277129
requester_number_of_posts_on_raop_at_request            0.132712
requester_number_of_comments_in_raop_at_request         0.131965
requester_days_since_first_post_on_raop_at_retrieval    0.127262
requester_number_of_comments_at_retrieval               0.123016
requester_account_age_in_days_at_retrieval              0.118863
requester_days_since_first_post_on_raop_at_request      0.108662
unix_timestamp_of_request                               0.099329
unix_timestamp_of_request_utc                           0.099327
number_of_upvotes_of_request_at_retrieval               0.0

In [None]:
# Missing-value analysis for requester_user_flair
flair = df['requester_user_flair']
flair_missing = flair.isnull()
print("Missing value analysis for requester_user_flair:")
print(f"Missing: {flair_missing.sum()} ({flair_missing.mean():.1%})")
print("Non-missing distribution:")
print(flair.value_counts())

# Check if flair is predictive when present: per-flair row count and mean of
# the 0/1 'target' column (i.e. the success rate)
flair_analysis = df.groupby('requester_user_flair')['target'].agg(['count', 'mean'])
print("\nSuccess rate by flair (when present):")
print(flair_analysis)

# Analyze giver_username_if_known; 'N/A' is the placeholder for "no known giver"
giver = df['giver_username_if_known']
giver_unknown = giver == 'N/A'
print("\nGiver username analysis:")
print(f"'N/A' values: {giver_unknown.sum()} ({giver_unknown.mean():.1%})")
# Count distinct usernames among known givers only; nunique() over the whole
# column also counted the 'N/A' placeholder as if it were a giver.
print(f"Known givers: {giver[~giver_unknown].nunique()} unique")

# Check if this is leakage: compare success rates with and without a known giver
success_with_giver = df.loc[~giver_unknown, 'target'].mean()
success_without_giver = df.loc[giver_unknown, 'target'].mean()
print(f"Success rate when giver known: {success_with_giver:.1%}")
print(f"Success rate when giver unknown: {success_without_giver:.1%}")