In [2]:
import json
import numpy as np

# Load your JSON data
with open('../data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open('../data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)
with open('../data/validation.json', 'r', encoding='utf-8') as f:
    validation_data = json.load(f)

# Combine to calculate global quantiles
all_data = train_data + test_data + validation_data

# Calculate engagement scores for all items
engagement_scores = []
for item in all_data:
    eng = item['engagement']
    # Your formula: (1×likes + 2×comments + 3×shares) / views
    score = (1 * eng['likes'] + 2 * eng['comments'] + 3 * eng['shares'] + 3 * eng['favorites']) / max(eng['views'], 1)
    engagement_scores.append(score)

# Calculate global quantile boundaries
p20, p40, p60, p80 = np.percentile(engagement_scores, [20, 40, 60, 80])

# Function to add scores and labels to a dataset
def add_engagement_info(data):
    for item in data:
        eng = item['engagement']
        
        # Calculate engagement score
        score = (1 * eng['likes'] + 2 * eng['comments'] + 3 * eng['shares']) / max(eng['views'], 1)
        item['engagement_score'] = float(score)
        
        # Assign label based on global quantiles
        if score <= p20:
            item['engagement_label'] = "Very Low"
        elif score <= p40:
            item['engagement_label'] = "Low"
        elif score <= p60:
            item['engagement_label'] = "Average"
        elif score <= p80:
            item['engagement_label'] = "High"
        else:
            item['engagement_label'] = "Very High"
    
    return data

# Add to all datasets
train_data = add_engagement_info(train_data)
test_data = add_engagement_info(test_data)
validation_data = add_engagement_info(validation_data)

# Save updated JSON files
with open('../data/train_with_eng.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, indent=2, ensure_ascii=False)
with open('../data/test_with_eng.json', 'w', encoding='utf-8') as f:
    json.dump(test_data, f, indent=2, ensure_ascii=False)
with open('../data/validation_with_eng.json', 'w', encoding='utf-8') as f:
    json.dump(validation_data, f, indent=2, ensure_ascii=False)

print(f"Quantile boundaries: p20={p20:.6f}, p40={p40:.6f}, p60={p60:.6f}, p80={p80:.6f}")
print(f"Updated files saved with 'engagement_score' and 'engagement_label' fields")

Quantile boundaries: p20=0.034876, p40=0.066469, p60=0.105164, p80=0.164659
Updated files saved with 'engagement_score' and 'engagement_label' fields


In [7]:
{key: value for key, value in train_data[0].items() if key not in ['engagement_score', 'engagement']}

{'image': '../cover/i29115.jpg',
 'title': "This...is it even a guy? Ten Thousand Pink Women's Clothing!",
 'tag': 'Daily Life',
 'description': "This trip down is really too tired, spent a lot of time on the road, and most of the day in various poses, really sore back... If you think it looks good, or if you think I'm really serious about it, I hope you'll long press the like button to give me a three-peat~!",
 'engagement_label': 'High'}