In [1]:
import pandas as pd
import numpy as np
import json

# Load the training data
train_path = '/home/data/train.json'

# Read the JSON file
with open(train_path, 'r') as f:
    train_data = json.load(f)

print(f"Number of records: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First record keys: {list(train_data[0].keys())}")

Number of records: 2878
Type of data: <class 'list'>
First record keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_re

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)
print(f"DataFrame shape: {df.shape}")
print("\nColumn names:")
for col in df.columns:
    print(f"- {col}")

DataFrame shape: (2878, 32)

Column names:
- giver_username_if_known
- number_of_downvotes_of_request_at_retrieval
- number_of_upvotes_of_request_at_retrieval
- post_was_edited
- request_id
- request_number_of_comments_at_retrieval
- request_text
- request_text_edit_aware
- request_title
- requester_account_age_in_days_at_request
- requester_account_age_in_days_at_retrieval
- requester_days_since_first_post_on_raop_at_request
- requester_days_since_first_post_on_raop_at_retrieval
- requester_number_of_comments_at_request
- requester_number_of_comments_at_retrieval
- requester_number_of_comments_in_raop_at_request
- requester_number_of_comments_in_raop_at_retrieval
- requester_number_of_posts_at_request
- requester_number_of_posts_at_retrieval
- requester_number_of_posts_on_raop_at_request
- requester_number_of_posts_on_raop_at_retrieval
- requester_number_of_subreddits_at_request
- requester_received_pizza
- requester_subreddits_at_request
- requester_upvotes_minus_downvotes_at_request

In [3]:
# Analyze target distribution
target = 'requester_received_pizza'
print("Target distribution:")
print(df[target].value_counts())
print(f"\nTarget percentages:")
print(df[target].value_counts(normalize=True) * 100)
print(f"\nClass imbalance ratio: {df[target].value_counts().min() / df[target].value_counts().max():.4f}")

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Target percentages:
requester_received_pizza
False    75.156359
True     24.843641
Name: proportion, dtype: float64

Class imbalance ratio: 0.3306


In [4]:
# Analyze feature types
print("Feature types:")
print(df.dtypes.value_counts())

print("\nSample of text features:")
print(f"Request title sample: {df['request_title'].iloc[0][:100]}...")
print(f"Request text sample: {df['request_text'].iloc[0][:100]}...")

print("\nSample of categorical features:")
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"Categorical columns: {list(categorical_cols)}")

print("\nSample of numerical features:")
numerical_cols = df.select_dtypes(exclude=['object']).columns
print(f"Numerical columns count: {len(numerical_cols)}")

Feature types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64

Sample of text features:
Request title sample: [REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy....
Request text sample: I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some in...

Sample of categorical features:
Categorical columns: ['giver_username_if_known', 'post_was_edited', 'request_id', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_subreddits_at_request', 'requester_user_flair', 'requester_username']

Sample of numerical features:
Numerical columns count: 23


In [5]:
# Check for missing values
print("Missing values per column:")
missing_counts = df.isnull().sum()
missing_pct = (missing_counts / len(df)) * 100
missing_df = pd.DataFrame({
    'missing_count': missing_counts,
    'missing_pct': missing_pct
}).sort_values('missing_count', ascending=False)

print(missing_df[missing_df['missing_count'] > 0])

print(f"\nTotal columns with missing values: {(missing_counts > 0).sum()}")
print(f"Total missing values: {missing_counts.sum()}")

Missing values per column:
                      missing_count  missing_pct
requester_user_flair           2163    75.156359

Total columns with missing values: 1
Total missing values: 2163


In [6]:
# Analyze key numerical features correlation with target
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Remove target from the list
if target in numerical_features:
    numerical_features.remove(target)

print(f"Analyzing {len(numerical_features)} numerical features")

# Calculate correlation with target
correlations = df[numerical_features + [target]].corr()[target].sort_values(ascending=False)
print("\nTop 10 features most correlated with target:")
print(correlations.head(11))  # 11 because target correlates with itself

print("\nBottom 10 features most correlated with target:")
print(correlations.tail(10))

Analyzing 22 numerical features

Top 10 features most correlated with target:
requester_received_pizza                                1.000000
requester_number_of_posts_on_raop_at_retrieval          0.462165
request_number_of_comments_at_retrieval                 0.290709
requester_number_of_comments_in_raop_at_retrieval       0.277129
requester_number_of_posts_on_raop_at_request            0.132712
requester_number_of_comments_in_raop_at_request         0.131965
requester_days_since_first_post_on_raop_at_retrieval    0.127262
requester_number_of_comments_at_retrieval               0.123016
requester_account_age_in_days_at_retrieval              0.118863
requester_days_since_first_post_on_raop_at_request      0.108662
number_of_upvotes_of_request_at_retrieval               0.090767
Name: requester_received_pizza, dtype: float64

Bottom 10 features most correlated with target:
requester_upvotes_plus_downvotes_at_retrieval    0.052447
requester_number_of_subreddits_at_request        0.04

In [7]:
# Analyze text length features
df['request_title_length'] = df['request_title'].str.len()
df['request_text_length'] = df['request_text'].str.len()
df['request_text_edit_aware_length'] = df['request_text_edit_aware'].str.len()

print("Text length statistics:")
print(df[['request_title_length', 'request_text_length', 'request_text_edit_aware_length']].describe())

# Check correlation of text lengths with target
length_correlations = df[['request_title_length', 'request_text_length', 'request_text_edit_aware_length', target]].corr()[target]
print(f"\nCorrelations with target:")
print(length_correlations)

Text length statistics:
       request_title_length  request_text_length  \
count           2878.000000          2878.000000   
mean              71.572967           402.521543   
std               36.233487           362.393727   
min                7.000000             0.000000   
25%               46.000000           182.000000   
50%               64.000000           308.000000   
75%               90.000000           503.750000   
max              272.000000          4460.000000   

       request_text_edit_aware_length  
count                     2878.000000  
mean                       394.567755  
std                        351.922518  
min                          0.000000  
25%                        180.000000  
50%                        302.000000  
75%                        498.000000  
max                       4460.000000  

Correlations with target:
request_title_length              0.014568
request_text_length               0.130130
request_text_edit_aware_length    