In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, matthews_corrcoef, roc_auc_score, 
                             confusion_matrix, classification_report, roc_curve, auc)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

# Fixed UTC instant that account age is measured against.
# TODO: decide whether this should be today's date instead of a pinned one.
REFERENCE_DATE = pd.Timestamp('2022-12-31', tz='UTC')

# Input file locations (relative to the working directory)
USER_FILE = '../data/twibot22/user.json'
LABEL_FILE = '../data/twibot22/label.csv'
SPLIT_FILE = '../data/twibot22/split.csv'

# Column order of the frame returned by load_data(); also used to give an
# empty result the expected columns.
_FEATURE_COLUMNS = (
    'id',
    'followers_count',
    'following_count',
    'tweet_count',
    'listed_count',
    'username_length',
    'account_age_days',
    'verified',
    'default_profile_image',
    'label',
    'split',
)


def _int_or_zero(value):
    """Convert a metric to int, treating a JSON ``null`` (None) as 0."""
    return 0 if value is None else int(value)


def _account_age_days(created_at, reference_date):
    """Return whole days from ``created_at`` to ``reference_date``, or None.

    Timestamps without a timezone are interpreted as UTC. Creation dates after
    ``reference_date`` are clamped to 0. None is returned when ``created_at``
    is missing, empty, or cannot be parsed, so the caller can tell "unknown"
    apart from a genuine age of 0.
    """
    if created_at is None:
        return None
    try:
        dt = pd.to_datetime(created_at)
        # An empty string parses to NaT; without this check the subtraction
        # below would produce NaN instead of a usable age.
        if pd.isna(dt):
            return None
        if dt.tz is None:
            dt = dt.tz_localize('UTC')
        return max(int((reference_date - dt).days), 0)
    except (ValueError, TypeError, OverflowError, AttributeError):
        # Unparseable / out-of-range / unexpected-type value: report "unknown".
        return None


def load_data(user_file=None, label_file=None, split_file=None, reference_date=None):
    """Build the per-user feature table from the user, label and split files.

    Parameters
    ----------
    user_file : path-like, optional
        JSON file holding a list of user objects. Defaults to ``USER_FILE``.
    label_file : path-like, optional
        CSV with ``id`` and ``label`` columns; the label ``'bot'`` is encoded
        as 1 and every other value as 0. Defaults to ``LABEL_FILE``.
    split_file : path-like, optional
        CSV with ``id`` and ``split`` columns. Defaults to ``SPLIT_FILE``.
    reference_date : timestamp-like, optional
        Instant that account age is measured against; a timezone-naive value
        is interpreted as UTC. Defaults to ``REFERENCE_DATE``.

    Returns
    -------
    pandas.DataFrame
        One row per user that appears in both the label and the split file,
        with the columns listed in ``_FEATURE_COLUMNS``. The columns are
        present even when no user matches.

    Notes
    -----
    Missing or ``null`` fields fall back to neutral values (0 / empty string).
    Users whose ``created_at`` is missing or unparseable get
    ``account_age_days = 0`` and are counted in a warning line.
    """
    user_file = USER_FILE if user_file is None else user_file
    label_file = LABEL_FILE if label_file is None else label_file
    split_file = SPLIT_FILE if split_file is None else split_file
    reference_date = pd.Timestamp(REFERENCE_DATE if reference_date is None else reference_date)
    if reference_date.tz is None:
        # Keep the subtraction against tz-aware creation dates well-defined.
        reference_date = reference_date.tz_localize('UTC')

    print("Loading labels...")
    df_labels = pd.read_csv(label_file)
    label_map = dict(zip(df_labels['id'].astype(str),
                         (df_labels['label'] == 'bot').astype(int)))
    print(f"Loaded {len(label_map)} labels")

    print("Loading splits...")
    df_split = pd.read_csv(split_file)
    split_map = dict(zip(df_split['id'].astype(str), df_split['split']))
    print(f"Loaded {len(split_map)} splits")

    print("Loading user data...")
    with open(user_file, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    data = []
    unknown_age_count = 0
    total = len(raw_data)
    for i, entry in enumerate(raw_data):
        # Control print
        if (i + 1) % 50000 == 0:
            print(f"  Processing {i+1}/{total}...")

        user_id = str(entry.get('id', ''))
        if user_id not in label_map or user_id not in split_map:
            continue

        # `or` fallbacks cover keys that are present but null, which
        # dict.get's default argument does not.
        metrics = entry.get('public_metrics') or {}
        username = entry.get('username') or ''
        img_url = entry.get('profile_image_url') or ''

        age_days = _account_age_days(entry.get('created_at'), reference_date)
        if age_days is None:
            unknown_age_count += 1
            age_days = 0

        # Raw features
        data.append({
            'id': user_id,
            'followers_count': _int_or_zero(metrics.get('followers_count', 0)),
            'following_count': _int_or_zero(metrics.get('following_count', 0)),
            'tweet_count': _int_or_zero(metrics.get('tweet_count', 0)),
            'listed_count': _int_or_zero(metrics.get('listed_count', 0)),
            'username_length': len(username),
            'account_age_days': age_days,
            'verified': 1 if entry.get('verified') is True else 0,
            'default_profile_image': 1 if 'default_profile_images' in img_url else 0,
            'label': label_map[user_id],
            'split': split_map[user_id],
        })

    if unknown_age_count:
        print(f"  Warning: {unknown_age_count} users had a missing or unparseable "
              f"'created_at'; account_age_days set to 0")

    df = pd.DataFrame(data, columns=list(_FEATURE_COLUMNS))
    print(f"Total: {len(df)} records")
    return df

# Build the feature table, then report how many rows fall in each split and
# in each class.
df = load_data()
split_counts = df['split'].value_counts()
label_counts = df['label'].value_counts()
print(f"\nDataset splits:\n{split_counts}\nClass distribution:\n{label_counts}")

Loading labels...
Loaded 1000000 labels
Loading splits...
Loaded 1000000 splits
Loading user data...
  Processing 50000/1000000...
  Processing 100000/1000000...
  Processing 150000/1000000...
  Processing 200000/1000000...
  Processing 250000/1000000...
  Processing 300000/1000000...
  Processing 350000/1000000...
  Processing 400000/1000000...
  Processing 450000/1000000...
  Processing 500000/1000000...
  Processing 550000/1000000...
  Processing 600000/1000000...
  Processing 650000/1000000...
  Processing 700000/1000000...
  Processing 750000/1000000...
  Processing 800000/1000000...
  Processing 850000/1000000...
  Processing 900000/1000000...
  Processing 950000/1000000...
  Processing 1000000/1000000...
Total: 1000000 records

Dataset splits:
split
train    700000
val      200000
test     100000
Name: count, dtype: int64
Class distribution:
label
0    860057
1    139943
Name: count, dtype: int64
