In [1]:
import json
import pandas as pd
import numpy as np

# Load the training data.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default, which open() falls back to when no encoding is given.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Quick sanity summary: record count, container type, and the field names
# of the first record.
print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Put the loaded records into a DataFrame so they can be inspected column-wise
df = pd.DataFrame(train_data)

# Overall dimensions of the table
print("Dataset shape:", df.shape)

# How the target label is split, followed by its mean (the share of True rows)
target = df['requester_received_pizza']
print("\nTarget variable distribution:")
print(target.value_counts())
positive_rate = target.mean()
print(f"\nClass balance: {positive_rate:.3f} (positive rate)")

# Total number of null cells: per-column null counts, summed over all columns
null_counts_by_column = df.isna().sum()
print("\nMissing values:")
print(null_counts_by_column.sum())

Dataset shape: (2878, 32)

Target variable distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Class balance: 0.248 (positive rate)

Missing values:
2163


In [3]:
# Per-column null counts, keeping only the columns that have at least one null
missing_counts = df.isna().sum()
missing_cols = missing_counts.loc[missing_counts > 0]
print("Columns with missing values:")
print(missing_cols)

# Number of columns of each dtype
print("\nData types:")
print(df.dtypes.value_counts())

# For a handful of label-like columns, report the number of distinct values
# and the first five rows of their value counts
print("\nUnique values in key categorical features:")
categorical_cols = ['requester_user_flair', 'post_was_edited', 'giver_username_if_known']
for col in categorical_cols:
    if col not in df.columns:
        continue  # nothing to report for a column this frame does not have
    print(f"\n{col}: {df[col].nunique()} unique values")
    print(df[col].value_counts().head())

Columns with missing values:
requester_user_flair    2163
dtype: int64

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64

Unique values in key categorical features:

requester_user_flair: 2 unique values
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

post_was_edited: 216 unique values
post_was_edited
False           2423
True             241
1375324604.0       1
1366314331.0       1
1367280954.0       1
Name: count, dtype: int64

giver_username_if_known: 184 unique values
giver_username_if_known
N/A            2670
mr_jeep           4
leftnewdigg       3
m2nu              3
thr               3
Name: count, dtype: int64


In [None]:
# Analyze text features
print("Text feature analysis:")
print(f"Request title length (chars): min={df['request_title'].str.len().min()}, max={df['request_title'].str.len().max()}, mean={df['request_title'].str.len().mean():.1f}")
print(f"Request text length (chars): min={df['request_text'].str.len().min()}, max={df['request_text'].str.len().max()}, mean={df['request_text'].str.len().mean():.1f}")

# Check correlation between text length and success
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
df[df['requester_received_pizza']==False]['request_text'].str.len().hist(bins=50, alpha=0.7, label='No pizza')
df[df['requester_received_pizza']==True]['request_text'].str.len().hist(bins=50, alpha=0.7, label='Got pizza')
plt.xlabel('Request text length (chars)')
plt.ylabel('Count')
plt.legend()
plt.title('Text length distribution by outcome')

plt.subplot(1, 2, 2)
df[df['requester_received_pizza']==False]['request_title'].str.len().hist(bins=30, alpha=0.7, label='No pizza')
df[df['requester_received_pizza']==True]['request_title'].str.len().hist(bins=30, alpha=0.7, label='Got pizza')
plt.xlabel('Request title length (chars)')
plt.ylabel('Count')
plt.legend()
plt.title('Title length distribution by outcome')
plt.tight_layout()
plt.show()