In [13]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

pd.set_option('display.max_columns', None)


In [14]:
df = pd.read_csv("TwiBot-22/parsed_users_full.csv")
labels = pd.read_csv("TwiBot-22/label.csv")

df = pd.merge(df, labels, on="id")
df['label'] = df['label'].map({'bot': 1, 'human': 0})

In [15]:
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df['account_age_days'] = (pd.Timestamp.now(tz='UTC') - df['created_at']).dt.days

df['description_len'] = df['description'].fillna('').apply(len)
df['username_len'] = df['username'].fillna('').apply(len)
df['followers_per_day'] = df['followers_count'] / (df['account_age_days'] + 1)
df['follower_following_ratio'] = df['followers_count'] / (df['following_count'] + 1)
df['tweets_per_day'] = df['tweet_count'] / (df['account_age_days'] + 1)

In [16]:
# Derived features
df['followers_to_tweets'] = df['followers_count'] / (df['tweet_count'] + 1)
df['follow_to_following'] = df['following_count'] / (df['followers_count'] + 1)
df['listed_per_follower'] = df['listed_count'] / (df['followers_count'] + 1)
df['description_density'] = df['description_len'] / (df['account_age_days'] + 1)

In [17]:
features = [
    'verified', 'followers_count', 'following_count', 'tweet_count', 'listed_count',
    'account_age_days', 'description_len', 'username_len',
    'followers_per_day', 'follower_following_ratio', 'tweets_per_day',
    'followers_to_tweets', 'follow_to_following', 'listed_per_follower',
    'description_density'
]
target = 'label'


X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [19]:
from xgboost import XGBClassifier

clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=6)  # weight ~ human/bot ratio
clf.fit(X_train_scaled, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [20]:
y_probs = clf.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_probs > 0.3).astype(int)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.58      0.72    172011
           1       0.26      0.91      0.40     27989

    accuracy                           0.62    200000
   macro avg       0.62      0.74      0.56    200000
weighted avg       0.88      0.62      0.68    200000



In [21]:
joblib.dump(clf, "models/rf_model.pkl")
joblib.dump(scaler, "models/feature_scaler.pkl")


['models/feature_scaler.pkl']

In [22]:
import ijson
from collections import defaultdict
import pandas as pd

In [23]:
def extract_tweet_features(path, limit_users=None):
    features = defaultdict(lambda: {
        "tweet_count": 0,
        "total_text_len": 0,
        "total_likes": 0,
        "total_retweets": 0,
        "media_tweet_count": 0,
        "mention_count": 0,
        "lang_counts": defaultdict(int)
    })

    with open(path, 'rb') as f:
        parser = ijson.items(f, 'item')
        for i, tweet in enumerate(parser):
            uid = "u" + str(tweet.get('author_id', 'unknown'))
            text = tweet.get('text', '')
            metrics = tweet.get('public_metrics', {})
            lang = tweet.get('lang', 'und')

            f = features[uid]
            f['tweet_count'] += 1
            f['total_text_len'] += len(text)
            f['total_likes'] += metrics.get('like_count', 0)
            f['total_retweets'] += metrics.get('retweet_count', 0)
            entities = tweet.get('entities') or {}
            f['media_tweet_count'] += int('media' in entities)
            f['mention_count'] += len(entities.get('user_mentions', []))
            f['lang_counts'][lang] += 1

            if limit_users and len(features) >= limit_users:
                break

    # Convert to DataFrame
    rows = []
    for uid, d in features.items():
        total = d['tweet_count']
        row = {
            "id": uid,
            "avg_tweet_len": d['total_text_len'] / total,
            "avg_likes": d['total_likes'] / total,
            "avg_retweets": d['total_retweets'] / total,
            "media_tweet_ratio": d['media_tweet_count'] / total,
            "mention_per_tweet": d['mention_count'] / total,
            "most_common_lang": max(d['lang_counts'], key=d['lang_counts'].get)
        }
        rows.append(row)

    return pd.DataFrame(rows)

In [25]:
df_tweets = extract_tweet_features(path='TwiBot-22/tweet_0.json')

In [26]:
df_full = pd.merge(df, df_tweets, on="id", how="left")

# Fill missing tweet-based features with 0 or defaults
df_full.fillna({
    'avg_tweet_len': 0,
    'avg_likes': 0,
    'avg_retweets': 0,
    'media_tweet_ratio': 0,
    'mention_per_tweet': 0,
    'most_common_lang': 'unknown'
}, inplace=True)


In [27]:
features = [
    'verified', 'followers_count', 'following_count', 'tweet_count', 'listed_count',
    'account_age_days', 'description_len', 'username_len',
    'followers_per_day', 'follower_following_ratio', 'tweets_per_day',
    'followers_to_tweets', 'follow_to_following', 'listed_per_follower', 'description_density',
    'avg_tweet_len', 'avg_likes', 'avg_retweets',
    'media_tweet_ratio', 'mention_per_tweet'
]

target = 'label'

X = df_full[features]
y = df_full[target]

In [None]:
scaler_new = StandardScaler()
X_train_scaled = scaler_new.fit_transform(X_train)
X_test_scaled = scaler_new.transform(X_test)

clf_new = XGBClassifier(use_label_encoder=False, eval_metric='logloss')  
clf_new.fit(X_train_scaled, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [31]:
y_probs = clf_new.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_probs > 0.3).astype(int)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92    172011
           1       0.50      0.49      0.50     27989

    accuracy                           0.86    200000
   macro avg       0.71      0.71      0.71    200000
weighted avg       0.86      0.86      0.86    200000



In [32]:
import joblib

joblib.dump(clf_new, "models/xgb_model.pkl")
joblib.dump(scaler, "models/feature_scaler.pkl")


['models/feature_scaler.pkl']

In [33]:
import os

print(os.listdir("models"))


['classifier.pkl', 'feature_scaler.pkl', 'rf_model.pkl', 'tfidf_vectorizer.pkl', 'xgb_model.pkl']
