In [2]:
import pandas as pd
import numpy as np
import json

# Load the training data
train_path = '/home/data/train.json'

# Try reading the JSON file as a single JSON object
with open(train_path, 'r') as f:
    data = json.load(f)

print(f"Type of data: {type(data)}")
if isinstance(data, list):
    train_df = pd.DataFrame(data)
    print(f"Training data shape: {train_df.shape}")
    print(f"Columns: {list(train_df.columns)}")
else:
    print(f"Data structure: {data.keys() if isinstance(data, dict) else 'Not a dict'}")

Type of data: <class 'list'>
Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_requ

In [3]:
# Check target distribution
print("Target distribution:")
print(train_df['requester_received_pizza'].value_counts())
print(f"\nPercentage distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True) * 100)

# Check for missing values
print(f"\nMissing values:")
print(train_df.isnull().sum().sum())

# Check data types
print(f"\nData types:")
print(train_df.dtypes.value_counts())

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64

Percentage distribution:
requester_received_pizza
False    75.156359
True     24.843641
Name: proportion, dtype: float64

Missing values:
2163

Data types:
int64      16
object      9
float64     6
bool        1
Name: count, dtype: int64


In [4]:
# Check text fields
print("Sample request text:")
print(train_df['request_text'].iloc[0][:500])
print("\n" + "="*50)
print("Sample request title:")
print(train_df['request_title'].iloc[0])

# Check text lengths
print(f"\nRequest text length stats:")
print(train_df['request_text'].str.len().describe())
print(f"\nRequest title length stats:")
print(train_df['request_title'].str.len().describe())

Sample request text:
I will soon be going on a long deployment which I'm not aloud to discuss but willing to give some info if you ask. Just wanna eat some of the stuff America has to offer before I leave for a long time to Afganistan.

Sample request title:
[REQUEST] Oceanside, Ca. USA-  US Marine getting ready to deploy.

Request text length stats:
count    2878.000000
mean      402.521543
std       362.393727
min         0.000000
25%       182.000000
50%       308.000000
75%       503.750000
max      4460.000000
Name: request_text, dtype: float64

Request title length stats:
count    2878.000000
mean       71.572967
std        36.233487
min         7.000000
25%        46.000000
50%        64.000000
75%        90.000000
max       272.000000
Name: request_title, dtype: float64


In [5]:
# Check user flair distribution
print("User flair distribution:")
print(train_df['requester_user_flair'].value_counts())
print(f"\nPercentage distribution:")
print(train_df['requester_user_flair'].value_counts(normalize=True) * 100)

# Check correlation between flair and success
print(f"\nSuccess rate by user flair:")
flair_success = train_df.groupby('requester_user_flair')['requester_received_pizza'].agg(['count', 'sum', 'mean'])
flair_success.columns = ['total_requests', 'successful_requests', 'success_rate']
print(flair_success)

User flair distribution:
requester_user_flair
shroom    677
PIF        38
Name: count, dtype: int64

Percentage distribution:
requester_user_flair
shroom    94.685315
PIF        5.314685
Name: proportion, dtype: float64

Success rate by user flair:
                      total_requests  successful_requests  success_rate
requester_user_flair                                                   
PIF                               38                   38           1.0
shroom                           677                  677           1.0


In [7]:
# Check numeric features
numeric_features = train_df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric features ({len(numeric_features)}):")
print(numeric_features)

# Check if target is in numeric features
if 'requester_received_pizza' in numeric_features:
    # Check correlation with target
    correlations = train_df[numeric_features].corr()['requester_received_pizza'].abs().sort_values(ascending=False)
    print(f"\nTop correlations with target:")
    print(correlations.head(10))
else:
    print("Target is not numeric, converting to numeric for correlation analysis")
    target_numeric = train_df['requester_received_pizza'].astype(int)
    numeric_df = train_df[numeric_features]
    numeric_df['target'] = target_numeric
    correlations = numeric_df.corr()['target'].abs().sort_values(ascending=False)
    print(f"\nTop correlations with target:")
    print(correlations.head(10))

Numeric features (22):
['number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'request_number_of_comments_at_retrieval', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_minus_downvotes_at_retrieval', 'requester_upvotes_plus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_retrieval', 'unix_timestamp_of_request', '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_df['target'] = target_numeric
