In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data
train_path = '/home/data/train.json'

# Read the JSON file
with open(train_path, 'r') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print(f"Type of data: {type(train_data)}")
print(f"First sample keys: {list(train_data[0].keys())}")

Number of training samples: 2878
Type of data: <class 'list'>
First sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subredd

In [2]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(train_data)
print(f"DataFrame shape: {df.shape}")
print("\nColumn names:")
for col in df.columns:
    print(f"- {col}")

DataFrame shape: (2878, 32)

Column names:
- giver_username_if_known
- number_of_downvotes_of_request_at_retrieval
- number_of_upvotes_of_request_at_retrieval
- post_was_edited
- request_id
- request_number_of_comments_at_retrieval
- request_text
- request_text_edit_aware
- request_title
- requester_account_age_in_days_at_request
- requester_account_age_in_days_at_retrieval
- requester_days_since_first_post_on_raop_at_request
- requester_days_since_first_post_on_raop_at_retrieval
- requester_number_of_comments_at_request
- requester_number_of_comments_at_retrieval
- requester_number_of_comments_in_raop_at_request
- requester_number_of_comments_in_raop_at_retrieval
- requester_number_of_posts_at_request
- requester_number_of_posts_at_retrieval
- requester_number_of_posts_on_raop_at_request
- requester_number_of_posts_on_raop_at_retrieval
- requester_number_of_subreddits_at_request
- requester_received_pizza
- requester_subreddits_at_request
- requester_upvotes_minus_downvotes_at_request

In [3]:
# Analyze target distribution
target_col = 'requester_received_pizza'
print("Target distribution:")
print(df[target_col].value_counts())
print(f"\nTarget distribution percentages:")
print(df[target_col].value_counts(normalize=True) * 100)

# Check for missing values
print(f"\nMissing values:")
print(df.isnull().sum().sum())

# Check data types
print(f"\nData types:")
print(df.dtypes.value_counts())

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Target distribution percentages:
requester_received_pizza
False    75.156359
True     24.843641
Name: proportion, dtype: float64

Missing values:
2163

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [4]:
# Analyze text features
print("Text feature analysis:")
print(f"\nRequest title length (characters):")
df['title_length'] = df['request_title'].str.len()
print(df['title_length'].describe())

print(f"\nRequest text length (characters):")
df['text_length'] = df['request_text'].str.len()
print(df['text_length'].describe())

print(f"\nRequest text edit aware length (characters):")
df['text_edit_length'] = df['request_text_edit_aware'].str.len()
print(df['text_edit_length'].describe())

# Check for empty text
print(f"\nEmpty titles: {(df['title_length'] == 0).sum()}")
print(f"Empty text: {(df['text_length'] == 0).sum()}")

Text feature analysis:

Request title length (characters):
count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: title_length, dtype: float64

Request text length (characters):
count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: text_length, dtype: float64

Request text edit aware length (characters):
count    2878.000000
mean      394.567755
std       351.922518
min         0.000000
25%       180.000000
50%       302.000000
75%       498.000000
max      4460.000000
Name: text_edit_length, dtype: float64

Empty titles: 0
Empty text: 71


In [5]:
# Analyze numerical features - account activity metrics
numerical_cols = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request'
]

print("Numerical feature statistics:")
for col in numerical_cols:
    print(f"\n{col}:")
    print(df[col].describe())

# Check correlation with target
correlations = df[numerical_cols + [target_col]].corr()[target_col].sort_values(ascending=False)
print(f"\nCorrelations with target:")
print(correlations)

Numerical feature statistics:

requester_account_age_in_days_at_request:
count    2878.000000
mean      250.682364
std       301.838771
min         0.000000
25%         3.038877
50%       155.156377
75%       383.640090
max      2809.750787
Name: requester_account_age_in_days_at_request, dtype: float64

requester_number_of_comments_at_request:
count    2878.000000
mean      112.311327
std       192.017515
min         0.000000
25%         0.000000
50%        22.000000
75%       132.000000
max       981.000000
Name: requester_number_of_comments_at_request, dtype: float64

requester_number_of_posts_at_request:
count    2878.000000
mean       21.614663
std        51.580719
min         0.000000
25%         0.000000
50%         4.000000
75%        21.000000
max       867.000000
Name: requester_number_of_posts_at_request, dtype: float64

requester_number_of_subreddits_at_request:
count    2878.000000
mean       17.857192
std        21.784934
min         0.000000
25%         1.000000
50%      

In [6]:
# Analyze categorical features
categorical_cols = ['post_was_edited', 'requester_user_flair']

print("Categorical feature analysis:")
for col in categorical_cols:
    print(f"\n{col}:")
    print(df[col].value_counts())
    if col != 'requester_user_flair':
        print(f"Success rate by {col}:")
        print(df.groupby(col)[target_col].mean())

# Check requester_user_flair distribution and success rates
print(f"\nrequester_user_flair distribution:")
flair_counts = df['requester_user_flair'].value_counts()
print(flair_counts)

print(f"\nSuccess rate by flair:")
flair_success = df.groupby('requester_user_flair')[target_col].mean()
print(flair_success)

Categorical feature analysis:

post_was_edited:
post_was_edited
False           2423
True             241
1375324604.0       1
1366314331.0       1
1367280954.0       1
                ... 
1379372126.0       1
1378425306.0       1
1374109637.0       1
1358627245.0       1
1372729287.0       1
Name: count, Length: 216, dtype: int64
Success rate by post_was_edited:
post_was_edited
False           0.221626
True            0.427386
1337134531.0    0.000000
1337276328.0    0.000000
1337657209.0    0.000000
                  ...   
1379817038.0    1.000000
1380155455.0    1.000000
1380335546.0    0.000000
1380374432.0    0.000000
1380909467.0    1.000000
Name: requester_received_pizza, Length: 216, dtype: float64

requester_user_flair:
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

requester_user_flair distribution:
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

Success rate by flair:
requester_user_flair
PIF       1.0
shroom    1.0
