Importing libraries

In [None]:
# Install necessary libraries
!pip install ijson bitsandbytes langchain langchain_huggingface datasets peft trl

# General Python and ML libraries
import numpy as np
import pandas as pd
import json
import ast
import re
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import random
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from transformers import (pipeline, AutoTokenizer, AutoModelForCausalLM, BertTokenizer,
                          TFBertForSequenceClassification, BitsAndBytesConfig, TFBertModel,
                          LlamaForCausalLM, LlamaTokenizer, TrainingArguments, logging)

# Libraries for specific frameworks (LangChain, PEFT, Hugging Face, etc.)
import bitsandbytes as bnb
from langchain import LLMChain
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from datasets import Dataset
from peft import LoraConfig, PeftConfig, prepare_model_for_kbit_training
from trl import SFTTrainer, setup_chat_format

# Machine Learning libraries
import tensorflow as tf
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Collecting ijson
  Downloading ijson-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.meta

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Load a subset of tweets from the dataset and export to Google Drive for future use

In [None]:
# # one time to generate a small subset of 100000 lines of tweets
# def process_large_json(file_path, num_lines=100000):
#   """Reads a large JSON file line by line and processes the data.

#   Args:
#     file_path: Path to the JSON file.
#     num_lines: The number of lines to process.
#   """
#   tweets = []
#   with open(file_path, 'r') as f:
#     objects = ijson.items(f, 'item')
#     count = 0
#     for tweet in objects:
#       if count < num_lines:
#           tweets.append(tweet)
#           count += 1
#       else:
#         break

#   return tweets

# file_path = '/content/drive/MyDrive/Twibot_22/tweet_8.json'
# tweets = process_large_json(file_path)

# tweets = pd.DataFrame(tweets)

# labels = pd.read_csv('/content/drive/MyDrive/Twibot_22/label.csv')
# split = pd.read_csv('/content/drive/MyDrive/Twibot_22/split.csv')

# split = split.rename(columns={'id': 'split_id'})

# tweets['user_id'] = 'u' + tweets['author_id'].astype(str)
# tweets_en = tweets[tweets['lang'] == 'en']
# tweets_labels = pd.merge(tweets_en, labels, left_on='user_id', right_on='id', how='inner')
# tweets_labels = pd.merge(tweets_labels, split, left_on='user_id', right_on='split_id', how='inner')
# # tweets_labels = tweets_labels[tweets_labels['split'].isin(['train', 'val'])]

# tweets_labels = tweets_labels.rename(columns={'id_x': 'id'})
# tweets_labels = tweets_labels.drop(['id_y', 'split_id'], axis=1)
# print(tweets_labels.shape)

# tweets_labels.to_csv('/content/drive/MyDrive/Twibot_22/tweet_8_subset_100k.csv', index=False)

In [None]:
tweets_labels = pd.read_csv('/content/tweet_8_subset_100k.csv')

In [None]:
tweets_labels.head()

In [None]:
tweets_labels.shape

In [None]:
tweets_labels.describe()

In [None]:
# Number of tweets assigned to each predefined dataset split.
split_sizes = {
    name: int((tweets_labels['split'] == name).sum())
    for name in ('train', 'val', 'test')
}
train_count = split_sizes['train']
val_count = split_sizes['val']
test_count = split_sizes['test']

print(f"Training count: {train_count}")
print(f"Validation count: {val_count}")
print(f"Test count: {test_count}")

In [None]:
labels = pd.read_csv('/content/label.csv')

# Bar chart of how many accounts carry each label (human vs bot).
fig, ax = plt.subplots(figsize=(6, 4))
labels['label'].value_counts().plot(kind='bar', ax=ax)
ax.set_xlabel('Label')
ax.set_ylabel('Count')
ax.set_title('Distribution of Labels (Human vs Bot)')
plt.show()

In [None]:
# Character length of every tweet, stored alongside the tweet itself.
tweets_labels['tweet_length'] = tweets_labels['text'].str.len()

# Histogram of tweet lengths across the whole subset.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(tweets_labels['tweet_length'], bins=20)
ax.set_xlabel('Tweet Length')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Tweet Lengths')
plt.show()

In [None]:
# full_text_bot_tweets = tweets_label['text'].tolist()
# for text in full_text_bot_tweets:
#   print(text)

Data Cleaning

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
  """Strip retweet markers, URLs, mentions, hashtags and punctuation from a tweet.

  Args:
    text: Raw tweet text. Any non-string value (for example the float NaN that
      pandas uses for a missing tweet) is treated as an empty tweet.

  Returns:
    The cleaned text with surrounding whitespace removed. Case is left
    untouched and nothing is stemmed here; `preprocess_tweet` does both.
  """
  if not isinstance(text, str):
    return ""
  text = re.sub(r'\bRT\b' , "", text)  # Remove the standalone retweet marker
  text = re.sub(r'http\S+', '', text)  # Remove URLs
  text = re.sub(r'@\S+', '', text)  # Remove mentions
  text = re.sub(r'#\S+', '', text)  # Remove hashtags
  text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
  return text.strip()

def _tweet_preprocessing_tools():
  """Build the stop-word set, tokenizer and stemmer once and reuse them afterwards."""
  tools = getattr(_tweet_preprocessing_tools, "_cache", None)
  if tools is None:
    tools = (
        set(stopwords.words('english')),
        TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False),
        PorterStemmer(),
    )
    # Cached on the function object so repeated calls (e.g. from DataFrame.apply)
    # do not reload the stop-word list for every tweet.
    _tweet_preprocessing_tools._cache = tools
  return tools

def preprocess_tweet(tweet):
  """Tokenize a tweet, drop stop words and non-alphanumeric tokens, and stem the rest.

  Tokenization uses NLTK's TweetTokenizer with handle stripping, length
  reduction and lower-casing enabled.

  Args:
    tweet: Tweet text (typically already passed through `clean_text`).

  Returns:
    The surviving stemmed tokens joined by single spaces.
  """
  stop_words, tokenizer, stemmer = _tweet_preprocessing_tools()
  tokens = tokenizer.tokenize(tweet)

  filtered_tokens = [stemmer.stem(w) for w in tokens if w not in stop_words and w.isalnum()]
  return " ".join(filtered_tokens)

def clean_and_preprocess(text):
  """Run the full text pipeline on one tweet.

  The tweet is first passed through `clean_text` and the result is then
  passed through `preprocess_tweet`.

  Args:
    text: Raw tweet text.

  Returns:
    The value returned by `preprocess_tweet` for the cleaned tweet.
  """
  return preprocess_tweet(clean_text(text))

Simple Tweets Models - Using only text data for predictions

In [None]:
# Keep only the columns the text-only models need. The explicit copy makes
# df_reduced an independent frame, so adding columns to it later neither
# triggers SettingWithCopyWarning nor touches tweets_labels.
df_reduced = tweets_labels[['user_id', 'text', 'label','split']].copy()

In [None]:
df_reduced.head()

In [None]:
# Add the cleaned-text column and drop the raw text in one expression.
# `assign` returns a new frame, so nothing is written onto a slice of
# tweets_labels (which is what raises SettingWithCopyWarning).
df_reduced = df_reduced.assign(
    processed_text=df_reduced['text'].apply(clean_and_preprocess)
).drop(columns=['text'])

df_reduced.head()

In [None]:
df_reduced.shape

In [None]:
label_counts = df_reduced['label'].value_counts()
print(label_counts)

In [None]:
# Split the data using df_reduced's own 'split' column (rather than a mask
# taken from another frame). Each part is copied because later cells add a
# 'text' prompt column to these frames.
X_train = df_reduced[df_reduced['split'] == 'train'].copy()
X_eval = df_reduced[df_reduced['split'] == 'val'].copy()
X_test = df_reduced[df_reduced['split'] == 'test'].copy()

In [None]:
X_test.head()

To address class imbalance, we employ sampling techniques to ensure balanced representation for model training.

In [None]:
# # Separate human and bot tweets
# human_tweets = df_reduced[df_reduced['label'] == 'human']
# bot_tweets = df_reduced[df_reduced['label'] == 'bot']

# # Determine the smaller number of human or bot tweets
# min_count = min(len(human_tweets), len(bot_tweets))

# # Sample the dataframes to have equal number of human and bot tweets
# sampled_human_tweets = human_tweets.sample(n=min_count, random_state=42)
# sampled_bot_tweets = bot_tweets.sample(n=min_count, random_state=42)

# # Combine the sampled human and bot tweets
# balanced_df = pd.concat([sampled_human_tweets, sampled_bot_tweets])

# # Shuffle the dataframe to randomize the order
# balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# print(balanced_df['label'].value_counts())

BERT Model: Utilized as a Baseline for Performance Evaluation.

In [None]:
# Load pre-trained BERT tokenizer and model
bert_basic_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_basic_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2) # 2 for binary classification (bot/human)

# Prepare data for BERT
def prepare_data(df, max_length=64):
    """Tokenize a split for BERT and encode its labels.

    Args:
        df: DataFrame with a 'processed_text' column (strings) and a 'label'
            column; the label 'bot' is encoded as 1, anything else as 0.
        max_length: Sequence length every text is padded or truncated to.

    Returns:
        A tuple (input_ids, attention_masks, labels) of TensorFlow tensors.
        The first two have one row per text and `max_length` columns.
    """
    texts = df['processed_text'].tolist()
    labels = [1 if label == 'bot' else 0 for label in df['label'].tolist()]  # Convert labels to 0 and 1

    # One batched tokenizer call replaces the per-text encode_plus loop, and
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    encoded = bert_basic_tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )

    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = tf.convert_to_tensor(labels)

    return input_ids, attention_masks, labels

# Tokenize and label-encode every split.
train_input_ids, train_attention_masks, train_labels = prepare_data(X_train)
val_input_ids, val_attention_masks, val_labels = prepare_data(X_eval)
test_input_ids, test_attention_masks, test_labels = prepare_data(X_test)

# The model is fed a dict keyed by input name; build it once per split.
train_inputs = {'input_ids': train_input_ids, 'attention_mask': train_attention_masks}
val_inputs = {'input_ids': val_input_ids, 'attention_mask': val_attention_masks}
test_inputs = {'input_ids': test_input_ids, 'attention_mask': test_attention_masks}

# Optimizer, loss (on raw logits) and accuracy metric for fine-tuning.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

bert_basic_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Fine-tune, monitoring the validation split after each epoch.
bert_basic_model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    batch_size=32,
    epochs=3
)

# Raw model outputs for every split.
bert_train_preds = bert_basic_model.predict(train_inputs)
bert_val_preds = bert_basic_model.predict(val_inputs)
bert_test_preds = bert_basic_model.predict(test_inputs)

# Loss and accuracy on the held-out test split.
bert_basic_loss, bert_basic_accuracy = bert_basic_model.evaluate(test_inputs, test_labels)
print(f'Loss: {bert_basic_loss}')
print(f'Accuracy: {bert_basic_accuracy}')

# Softmax over the test logits gives per-class probabilities.
logits = bert_test_preds.logits
probabilities = tf.nn.softmax(logits, axis=-1).numpy()

# Predict class 1 whenever its probability exceeds 0.5, otherwise class 0.
bot_probability = probabilities[:, 1]
y_pred_bert_basic = (bot_probability > 0.5).astype(int)

In [None]:
from sklearn.metrics import f1_score

# F1 score of the positive class (label 1) on the test split, computed from
# the thresholded BERT predictions (y_pred_bert_basic) of the training cell.
f1 = f1_score(test_labels.numpy(), y_pred_bert_basic)
print(f'F1 Score: {f1:.4f}')


In [None]:
# Free the memory held by the BERT baseline before the LLaMA model is loaded.
import gc

# Only the names listed here are removed. Deleting every global (the previous
# behaviour) also removed the imported libraries, the helper functions, the
# X_train/X_eval/X_test frames and `gc` itself, which broke every later cell.
# Predictions, labels and the bert_*_preds outputs are deliberately kept.
_BERT_ONLY_NAMES = (
    'bert_basic_model', 'bert_basic_tokenizer',
    'train_input_ids', 'train_attention_masks',
    'val_input_ids', 'val_attention_masks',
    'test_input_ids', 'test_attention_masks',
    'optimizer', 'loss', 'metric',
    'logits', 'probabilities',
)

for var in _BERT_ONLY_NAMES:
    # pop() with a default keeps the cell re-runnable once the names are gone.
    globals().pop(var, None)

# Collect the objects that are now unreferenced.
gc.collect()

LLAMA 3

In [None]:
# Load the LLaMA model and tokenizer
# model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that no credential is stored in the notebook. When the variable is unset
# the value is None and the libraries use their default credential lookup.
# Tokens that were previously committed here should be revoked.
access_token_rachit = os.environ.get("HF_TOKEN")

# 4-bit NF4 quantization with double quantization; compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", quantization_config=bnb_config, token=access_token_rachit)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=access_token_rachit)

Llama 3.1 training and fine tuning https://www.datacamp.com/tutorial/fine-tuning-llama-3-1

In [None]:
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Same credential source as the previous loading cell: the token comes from the
# HF_TOKEN environment variable; None falls back to the default lookup.
hf_token = os.environ.get("HF_TOKEN")

# 4-bit NF4 quantization without double quantization; compute in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# Rebinds `model` (and `tokenizer` below), replacing whatever was loaded earlier.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
    token=hf_token,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
def predict(test, model, tokenizer):
    """Generate a 'human' / 'bot' prediction for every prompt in `test`.

    Args:
        test: DataFrame whose 'text' column holds one prompt per row.
        model: Causal language model handed to the text-generation pipeline.
        tokenizer: Tokenizer matching `model`.

    Returns:
        A list with one entry per row: 'human' or 'bot' when that word appears
        in the text generated after the last "label:" marker, else 'none'.
    """
    categories = ["human", "bot"]

    # Build the generation pipeline once; it does not depend on the prompt.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=2,
                    temperature=0.1)

    y_pred = []
    for prompt in tqdm(test["text"].tolist()):
        result = pipe(prompt)
        # The generated text echoes the prompt, so keep only what follows the
        # final "label:" marker.
        answer = result[0]['generated_text'].split("label:")[-1].strip().lower()

        # First category mentioned in the answer wins; 'none' if neither appears.
        y_pred.append(next((category for category in categories if category in answer), "none"))

    return y_pred

In [None]:
def evaluate(y_true, y_pred):
    """Print accuracy, per-label accuracy, a classification report and a confusion matrix.

    Args:
        y_true: Ground-truth labels, as 'human' / 'bot' strings or as the
            integer codes 0 (human) / 1 (bot).
        y_pred: Predicted labels in either of the same two encodings. Anything
            else (for example the 'none' produced by `predict`) is encoded as
            -1 and therefore counts as a wrong prediction.

    Returns:
        None. All results are printed.
    """
    labels = ["human", "bot"]
    mapping = {label: idx for idx, label in enumerate(labels)}

    def encode(values):
        """Map label strings or already-encoded integers to class indices (-1 if unknown)."""
        codes = []
        for value in np.asarray(values).ravel().tolist():
            if isinstance(value, str):
                codes.append(mapping.get(value, -1))
            elif isinstance(value, int) and not isinstance(value, bool) and 0 <= value < len(labels):
                codes.append(value)
            else:
                codes.append(-1)
        return np.array(codes, dtype=int)

    y_true_mapped = encode(y_true)
    y_pred_mapped = encode(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')

    # Accuracy restricted to the rows of each true label. Unknown true labels
    # (-1) are skipped so they are never reported under a real label's name.
    for code in sorted(set(y_true_mapped.tolist())):
        if code < 0:
            continue
        rows = y_true_mapped == code
        label_accuracy = accuracy_score(y_true_mapped[rows], y_pred_mapped[rows])
        print(f'Accuracy for label {labels[code]}: {label_accuracy:.3f}')

    # Generate classification report
    class_indices = list(range(len(labels)))
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=class_indices)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=class_indices)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [None]:
# Define the prompt generation functions
def generate_prompt(data_point):
    """Build a labelled prompt: instruction line, the tweet text, then its label.

    Args:
        data_point: Mapping (e.g. a DataFrame row) with 'processed_text' and
            'label' entries.

    Returns:
        The three-line prompt with leading and trailing whitespace stripped.
    """
    prompt_lines = [
        "Classify each twitter account as Bot or Human based on their tweet.",
        f"text: {data_point['processed_text']}",
        f"label: {data_point['label']}",
    ]
    return "\n".join(prompt_lines).strip()

def generate_test_prompt(data_point):
    """Build an unlabelled prompt that ends right after the 'label:' marker.

    Args:
        data_point: Mapping (e.g. a DataFrame row) with a 'processed_text' entry.

    Returns:
        The prompt with leading and trailing whitespace stripped, so it ends
        with "label:" and no trailing space.
    """
    instruction = "Classify each twitter account as Bot or Human based on their tweet."
    tweet_line = "text: " + f"{data_point['processed_text']}"
    prompt = instruction + "\n" + tweet_line + "\n" + "label: "
    return prompt.strip()

# Generate prompts for training and evaluation data.
# `assign` returns new frames, so nothing is written onto a slice.
X_train = X_train.assign(text=X_train.apply(generate_prompt, axis=1))
X_eval = X_eval.assign(text=X_eval.apply(generate_prompt, axis=1))

# Generate test prompts and extract true labels.
# The prompt is added as a 'text' column instead of replacing the frame, so
# 'label' and 'processed_text' stay available to later cells and this cell can
# be re-run safely.
y_true = X_test['label']
X_test = X_test.assign(text=X_test.apply(generate_test_prompt, axis=1))

In [None]:
X_train.label.value_counts()

In [None]:
# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [None]:
y_pred_llama = predict(X_test, model, tokenizer)

In [None]:
evaluate(y_true, y_pred_llama)

Llama Training

In [None]:
# import bitsandbytes as bnb

# def find_all_linear_names(model):
#     cls = bnb.nn.Linear4bit
#     lora_module_names = set()
#     for name, module in model.named_modules():
#         if isinstance(module, cls):
#             names = name.split('.')
#             lora_module_names.add(names[0] if len(names) == 1 else names[-1])
#     if 'lm_head' in lora_module_names:  # needed for 16 bit
#         lora_module_names.remove('lm_head')
#     return list(lora_module_names)
# modules = find_all_linear_names(model)
# modules

In [None]:
# output_dir="/content/drive/MyDrive/Twibot_22/llama-3.1-fine-tuned-model"

# peft_config = LoraConfig(
#     lora_alpha=16,
#     lora_dropout=0,
#     r=64,
#     bias="none",
#     task_type="CAUSAL_LM",
#     target_modules=modules,
# )

# model.gradient_checkpointing_enable()
# model= prepare_model_for_kbit_training(model)
# peft_config.init_lora_weights = False
# model.add_adapter(peft_config)

# training_arguments = TrainingArguments(
#     output_dir=output_dir,                    # directory to save and repository id
#     num_train_epochs=1,                       # number of training epochs
#     per_device_train_batch_size=1,            # batch size per device during training
#     gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
#     gradient_checkpointing=True,              # use gradient checkpointing to save memory
#     optim="paged_adamw_32bit",
#     logging_steps=1,
#     learning_rate=2e-4,                       # learning rate, based on QLoRA paper
#     weight_decay=0.001,
#     fp16=True,
#     bf16=False,
#     max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
#     max_steps=-1,
#     warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
#     group_by_length=False,
#     lr_scheduler_type="cosine",               # use cosine learning rate scheduler
#     report_to="wandb",                  # report metrics to w&b
#     eval_strategy="steps",              # run evaluation every `eval_steps`
#     eval_steps = 0.2
# )

# trainer = SFTTrainer(
#     model=model,
#     args=training_arguments,
#     train_dataset=train_data,
#     eval_dataset=eval_data,
#     peft_config=peft_config,
#     dataset_text_field="text",
#     tokenizer=tokenizer,
#     max_seq_length=512,
#     packing=False,
#     dataset_kwargs={
#     "add_special_tokens": False,
#     "append_concat_token": False,
#     }
# )

In [None]:
# trainer.train()

In [None]:
# wandb.finish()
# model.config.use_cache = True

In [None]:
# # Save trained model and tokenizer
# trainer.save_model(output_dir)
# tokenizer.save_pretrained(output_dir)

In [None]:
# y_pred = predict(X_test, model, tokenizer)
# evaluate(y_true, y_pred)

Llama 3.1 in-context

In [None]:
# Define the prompt generation functions with k in-context samples
def generate_prompt_incontext(data_point, X_train, k=5, random_state=None):
    """Build a labelled prompt preceded by k randomly sampled labelled examples.

    Args:
        data_point: Mapping (e.g. a DataFrame row) with 'processed_text' and
            'label' entries; it becomes the final, labelled entry of the prompt.
        X_train: DataFrame with 'processed_text' and 'label' columns from which
            the examples are sampled without replacement.
        k: Number of examples to sample.
        random_state: Optional seed forwarded to `DataFrame.sample` for
            reproducible prompts. None keeps the sampling unseeded. The same
            seed always selects the same examples.

    Returns:
        The prompt with leading and trailing whitespace stripped.
    """
    # Sample k random examples from X_train
    examples = X_train.sample(n=k, random_state=random_state)

    # Examples use the same "text:" prefix as the query line and as the test
    # prompt, so every entry the model sees has one consistent format.
    example_prompts = "\n".join(
        f"text: {example_text}\nlabel: {example_label}"
        for example_text, example_label in zip(examples['processed_text'], examples['label'])
    )

    # Append the current data point at the end of the prompt
    prompt_lines = [
        "Classify each twitter account as Bot or Human based on their tweet, above we have attached few samples.",
        example_prompts,
        f"text: {data_point['processed_text']}",
        f"label: {data_point['label']}",
    ]
    return "\n".join(prompt_lines).strip()

def generate_test_prompt_incontext(data_point, X_train, k=5, random_state=None):
    """Build an unlabelled prompt preceded by k randomly sampled labelled examples.

    Args:
        data_point: Mapping (e.g. a DataFrame row) with a 'processed_text' entry.
        X_train: DataFrame with 'processed_text' and 'label' columns from which
            the examples are sampled without replacement.
        k: Number of examples to sample.
        random_state: Optional seed forwarded to `DataFrame.sample` for
            reproducible prompts. None keeps the sampling unseeded. The same
            seed always selects the same examples.

    Returns:
        The prompt with leading and trailing whitespace stripped, so it ends
        with "label:" ready for the model to complete.
    """
    # Sample k random examples from X_train
    examples = X_train.sample(n=k, random_state=random_state)
    example_prompts = "\n".join(
        f"text: {example_text}\nlabel: {example_label}"
        for example_text, example_label in zip(examples['processed_text'], examples['label'])
    )

    # Append the current test data point at the end of the prompt, with an empty label for prediction
    prompt_lines = [
        "Classify each twitter account as Bot or Human based on their tweet, above we have attached few samples.",
        example_prompts,
        f"text: {data_point['processed_text']}",
        "label:",
    ]
    return "\n".join(prompt_lines).strip()

# Generate prompts for training and evaluation data
# X_train['text'] = X_train.apply(lambda dp: generate_prompt_incontext(dp, X_train, k=5), axis=1)
# X_eval['text'] = X_eval.apply(lambda dp: generate_prompt_incontext(dp, X_train, k=5), axis=1)

# Generate test prompts and extract true labels.
# The test rows are taken from df_reduced rather than from X_test, because an
# earlier cell may already have replaced X_test with a prompt-only frame that
# has no 'label' or 'processed_text' column. This also keeps the cell re-runnable.
test_rows = df_reduced[df_reduced['split'] == 'test']
y_true = test_rows['label']
X_test = pd.DataFrame(test_rows.apply(lambda dp: generate_test_prompt_incontext(dp, X_train, k=5), axis=1), columns=["text"])

In [None]:
# Convert to datasets
# train_data = Dataset.from_pandas(X_train[["incontext_text"]])
# eval_data = Dataset.from_pandas(X_eval[["incontext_text"]])

In [None]:
y_pred_llama_incontext = predict(X_test, model, tokenizer)

In [None]:
evaluate(y_true, y_pred_llama_incontext)

In [None]:
evaluate(y_true, y_pred_bert_basic)

Old - Do not use or delete

## Enhanced Model

Feature engineering

In [None]:
tweets_labels.head()

In [None]:
tweets_labels.dtypes

Extracting public metrics to compute summary statistics at the user level. Extending tweet-level data with user-level insights

In [None]:
def _parse_public_metrics(raw):
    """Return the public-metrics dict for one tweet, or {} when it is missing or unparsable."""
    if isinstance(raw, dict):
        # Already parsed (the cell was run before); keep it as is.
        return raw
    if not isinstance(raw, str):
        # Missing values arrive as float NaN rather than as a string.
        return {}
    try:
        parsed = ast.literal_eval(raw)  # handles a bare None value natively
    except (ValueError, SyntaxError):
        return {}
    return parsed if isinstance(parsed, dict) else {}

tweets_labels['public_metrics'] = tweets_labels['public_metrics'].apply(_parse_public_metrics)

# Extract each value into separate columns; an absent or None count becomes 0.
for metric_name in ('retweet_count', 'reply_count', 'like_count', 'quote_count'):
    tweets_labels[metric_name] = tweets_labels['public_metrics'].apply(
        lambda metrics, key=metric_name: metrics.get(key) or 0
    )

In [None]:
metric_columns = ['retweet_count', 'reply_count', 'like_count', 'quote_count']

# Coerce every engagement count to a number (unparsable values become NaN),
# then replace the NaNs with 0.
numeric_metrics = tweets_labels[metric_columns].apply(pd.to_numeric, errors='coerce')
tweets_labels[metric_columns] = numeric_metrics.fillna(0)

In [None]:
tweets_labels.head()

In [None]:
# Convert 'created_at' to datetime objects
tweets_labels['created_at'] = pd.to_datetime(tweets_labels['created_at'])

# Access retweet count using the apply method
def get_retweet_count(x):
    """Look up x['public_metrics']['retweet_count'], returning 0 when it is unavailable.

    Args:
        x: A row-like mapping expected to hold a 'public_metrics' mapping.

    Returns:
        The stored retweet count, or 0 if either key is missing or a level is
        not subscriptable.
    """
    try:
        count = x['public_metrics']['retweet_count']
    except (KeyError, TypeError):
        count = 0
    return count

# Group by author and calculate the required metrics.
# One reference timestamp is taken up front so every window and every author is
# measured against the same instant (previously "now" was re-read inside each
# lambda for each group).
# NOTE(review): the windows are relative to the wall clock at run time; the
# sample rows shown in this notebook are dated 2021, so the last-day/week/month
# counts are likely all zero — consider anchoring to the newest tweet instead.
reference_time = pd.Timestamp.now(tz='UTC')
one_day_ago = reference_time - pd.DateOffset(days=1)
one_week_ago = reference_time - pd.DateOffset(weeks=1)
one_month_ago = reference_time - pd.DateOffset(months=1)

author_summary = tweets_labels.groupby('user_id').agg(
    total_tweets=('text', 'count'),
    last_day_tweets=('created_at', lambda x: (x >= one_day_ago).sum()),
    last_week_tweets=('created_at', lambda x: (x >= one_week_ago).sum()),
    last_month_tweets=('created_at', lambda x: (x >= one_month_ago).sum()),
    days_since_first_tweet=('created_at', lambda x: (reference_time - x.min()).days),
    days_since_last_tweet=('created_at', lambda x: (reference_time - x.max()).days),
    total_retweets=('retweet_count', 'sum'),
    total_replies=('reply_count', 'sum'),
    total_likes=('like_count', 'sum'),
    total_quotes=('quote_count', 'sum')
)

In [None]:
author_summary.head()

In [None]:
# Merge author_summary with tweets_labels on 'user_id'
tweets_labels_with_summary = pd.merge(tweets_labels, author_summary, on='user_id', how='left')

# Print the merged dataframe
tweets_labels_with_summary.head()

Clean Data

In [None]:
tweets_labels_with_summary['processed_text'] = tweets_labels_with_summary['text'].apply(clean_and_preprocess)

In [None]:
tweets_labels_with_summary = tweets_labels_with_summary.drop(['lang', 'attachments', 'author_id', 'conversation_id', 'geo', 'id', 'in_reply_to_user_id', 'reply_settings', 'source', 'referenced_tweets', 'withheld', 'public_metrics', 'text'], axis=1)


Using the BERT model (though not the ideal strategy) as it allows us to compare baselines. We appended additional feature tensors to BERT's text embeddings for enhanced representation.

BERT's accuracy using only text data and text with additional features is fluctuating a lot. Need to check for overfitting.

In [None]:
# Prepare LightGBM features
def prepare_lgbm_data(df):
    """Extract the numeric feature matrix and binary labels for LightGBM.

    Args:
        df: DataFrame holding the thirteen tweet-level and author-level count
            columns listed below, plus a 'label' column.

    Returns:
        A tuple (features, labels): `features` is a 2-D array with one row per
        input row and the columns in the order listed; `labels` is a list of
        ints, 1 where the label is 'bot' and 0 otherwise.
    """
    feature_columns = [
        'retweet_count', 'reply_count', 'like_count', 'quote_count',
        'total_tweets', 'last_day_tweets', 'last_week_tweets', 'last_month_tweets',
        'days_since_first_tweet', 'days_since_last_tweet',
        'total_retweets', 'total_replies', 'total_likes',
    ]
    feature_matrix = df[feature_columns].values
    binary_labels = [int(label == 'bot') for label in df['label'].tolist()]
    return feature_matrix, binary_labels

# Preparing data
# NOTE(review): prepare_bert_data is not defined in the visible part of this
# notebook and its outputs are not used in this cell — confirm whether this
# call is still needed.
train_input_ids, train_attention_ids = prepare_bert_data(X_train['processed_text'].tolist())

train_lgbm_features, train_lgbm_labels = prepare_lgbm_data(X_train)
val_lgbm_features, val_lgbm_labels = prepare_lgbm_data(X_eval)
test_lgbm_features, test_lgbm_labels = prepare_lgbm_data(X_test)

# Train LightGBM model
train_data = lgb.Dataset(train_lgbm_features, label=train_lgbm_labels)
val_data = lgb.Dataset(val_lgbm_features, label=val_lgbm_labels)
test_data = lgb.Dataset(test_lgbm_features, label=test_lgbm_labels)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Early stopping is configured through a callback; the `early_stopping_rounds`
# keyword of lgb.train is not available in current LightGBM releases.
lgb_model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Get LightGBM predictions (probability of the positive class, 'bot')
lgb_train_preds = lgb_model.predict(train_lgbm_features)
lgb_val_preds = lgb_model.predict(val_lgbm_features)
lgb_test_preds = lgb_model.predict(test_lgbm_features)

def _bert_bot_probability(bert_preds):
    """Turn raw BERT logits into the probability of class 1 ('bot')."""
    return tf.nn.softmax(bert_preds.logits, axis=-1).numpy()[:, 1]

# Ensemble predictions by averaging BERT and LightGBM outputs. Both terms must
# be probabilities: averaging an unbounded logit with a [0, 1] probability and
# thresholding at 0.5 lets the BERT score dominate arbitrarily.
ensemble_train_preds = (_bert_bot_probability(bert_train_preds) + lgb_train_preds) / 2
ensemble_val_preds = (_bert_bot_probability(bert_val_preds) + lgb_val_preds) / 2
ensemble_test_preds = (_bert_bot_probability(bert_test_preds) + lgb_test_preds) / 2

# Convert predictions to binary labels (0 or 1)
ensemble_train_labels = (ensemble_train_preds > 0.5).astype(int)
ensemble_val_labels = (ensemble_val_preds > 0.5).astype(int)
ensemble_test_labels = (ensemble_test_preds > 0.5).astype(int)

# Evaluate ensemble model
train_accuracy = accuracy_score(train_lgbm_labels, ensemble_train_labels)
val_accuracy = accuracy_score(val_lgbm_labels, ensemble_val_labels)
test_accuracy = accuracy_score(test_lgbm_labels, ensemble_test_labels)

print(f"Training Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {val_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

bert_ensemble_lgbm_accuracy = test_accuracy

In [None]:
evaluate(y_true, ensemble_test_labels)

In [None]:
# y_true holds 'human' / 'bot' strings. The BERT and ensemble predictions are
# 0/1 integers, so they are compared against an integer encoding of y_true
# (bot = 1); comparing them to the strings directly would always give 0.
y_true_labels = np.asarray(y_true)
y_true_binary = (y_true_labels == 'bot').astype(int)

basic_bert_accuracy = (np.asarray(y_pred_bert_basic) == y_true_binary).mean()
llama_3_accuracy = (np.asarray(y_pred_llama) == y_true_labels).mean()
llama_3_incontext_accuracy = (np.asarray(y_pred_llama_incontext) == y_true_labels).mean()
bert_ensemble_lgbm_accuracy = (np.asarray(ensemble_test_labels) == y_true_binary).mean()

accuracies = [basic_bert_accuracy, llama_3_accuracy, llama_3_incontext_accuracy, bert_ensemble_lgbm_accuracy]
model_names = ['BERT', 'LLAMA 3 Accuracy', 'LLAMA 3 with Incontext Examples Accuracy', 'BERT Ensemble with LGBM']

# Create a bar graph
plt.figure(figsize=(8, 6))
plt.bar(model_names, accuracies)

# Add labels and title
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Comparison of Model Accuracies')

# The model names are long; tilt them so they do not overlap.
plt.xticks(rotation=20, ha='right')

# Show the plot
plt.show()

WORKING BELOW IS DONE AFTER PROJECT STATUS REPORT SUBMISSION

In [None]:
tweets_labels = pd.read_csv('/content/tweet_8_subset_100k.csv')

In [None]:
tweets_labels.head()

Unnamed: 0,attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld,user_id,label,split
0,,730877400662212609,,1391411268519616518,2021-05-09 15:14:32+00:00,"{'hashtags': [], 'symbols': [], 'user_mentions...",,t1391411268519616518,,en,False,"{'retweet_count': 1, 'reply_count': None, 'lik...",,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",Brand new R. Missing interview with Spain's @m...,,u730877400662212609,human,val
1,,730877400662212609,,1390722859757096969,2021-05-07 17:39:02+00:00,"{'hashtags': [], 'symbols': [], 'user_mentions...",,t1390722859757096969,,en,False,"{'retweet_count': 1, 'reply_count': None, 'lik...",,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @GavinHavery: Kicking off tonight's Radio B...,,u730877400662212609,human,val
2,,730877400662212609,,1390722788554645506,2021-05-07 17:38:45+00:00,"{'hashtags': [], 'symbols': [], 'user_mentions...",,t1390722788554645506,,en,False,"{'retweet_count': 3, 'reply_count': None, 'lik...",,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @puddlegum: R. Missing - @rmissingmusic - r...,,u730877400662212609,human,val
3,,730877400662212609,,1390417994325577733,2021-05-06 21:27:37+00:00,"{'hashtags': [], 'symbols': [], 'user_mentions...",,t1390417994325577733,,en,False,"{'retweet_count': 2, 'reply_count': None, 'lik...",,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @BigTakeoverMag: Album Premiere: Crimeless ...,,u730877400662212609,human,val
4,,730877400662212609,,1390292998584520711,2021-05-06 13:10:55+00:00,"{'hashtags': [{'text': 'ALBUMPREMIERE', 'indic...",,t1390292998584520711,,en,False,"{'retweet_count': 2, 'reply_count': None, 'lik...",,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @JenStratosphere: MY #ALBUMPREMIERE of atmo...,,u730877400662212609,human,val


In [None]:
# Tweets per predefined split; a split with no rows is reported as 0.
split_counts = tweets_labels['split'].value_counts().reindex(['train', 'val', 'test'], fill_value=0)
train_count, val_count, test_count = (int(count) for count in split_counts)

print(f"Training count: {train_count}")
print(f"Validation count: {val_count}")
print(f"Test count: {test_count}")

Training count: 4208
Validation count: 660
Test count: 256


In [None]:
num_rows = len(tweets_labels)
num_splits = 3
split_size = num_rows // num_splits

# Write the dataset out as consecutive chunks of `split_size` rows; the final
# chunk also takes whatever rows are left over from the integer division.
for i in range(num_splits):
    start_idx = i * split_size
    is_last_chunk = (i == num_splits - 1)
    end_idx = num_rows if is_last_chunk else start_idx + split_size
    split_file_path = f'tweet_subset_split_{i+1}.csv'  # Output filenames
    tweets_labels.iloc[start_idx:end_idx].to_csv(split_file_path, index=False)
    print(f"Saved: {split_file_path}")

Saved: tweet_subset_split_1.csv
Saved: tweet_subset_split_2.csv
Saved: tweet_subset_split_3.csv


In [None]:
tweets_1 = pd.read_csv('/content/tweet_subset_split_1.csv')
tweets_2 = pd.read_csv('/content/tweet_subset_split_2.csv')
tweets_3 = pd.read_csv('/content/tweet_subset_split_3.csv')

split = pd.read_csv('/content/split.csv')

labels = pd.read_csv('/content/label.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/tweet_subset_split_1.csv'

In [None]:
labels = pd.read_csv('/content/label.csv')
labels.head()

Unnamed: 0,id,label
0,u1217628182611927040,human
1,u2664730894,human
2,u1266703520205549568,human
3,u1089159225148882949,human
4,u36741729,bot


In [None]:
split.head()

Unnamed: 0,id,split
0,u2664730894,train
1,u1089159225148882949,train
2,u36741729,train
3,u1679822588,train
4,u1519144464,train


In [None]:
# Mount Google Drive at /content/drive; later cells read their inputs from
# paths under /content/drive/MyDrive/. This call needs the Colab runtime and
# an interactive authorisation step.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
import ijson
import json

from decimal import Decimal

def process_and_split_large_json(file_path, output_prefix, num_splits=3, split_size=30000):
    """Stream a large JSON array and write its leading elements to smaller files.

    The input is parsed incrementally with ijson (element by element, not line
    by line), so the whole file is never held in memory. Consecutive runs of
    ``split_size`` elements are written to ``{output_prefix}_split_1.json``,
    ``{output_prefix}_split_2.json``, and so on. Reading stops as soon as
    ``num_splits`` files have been written or the input is exhausted; the last
    file may therefore hold fewer than ``split_size`` elements.

    Args:
        file_path: Path to a JSON file whose top-level value is an array.
        output_prefix: Prefix for the output files.
        num_splits: Maximum number of output files; values below 1 write nothing.
        split_size: Number of elements per output file; must be at least 1.

    Raises:
        ValueError: If ``split_size`` is less than 1.
    """
    # Local import so the function does not depend on another cell's imports.
    from itertools import islice

    if split_size < 1:
        raise ValueError(f"split_size must be at least 1, got {split_size}")

    def convert_to_serializable(obj):
        """Convert non-serializable objects to a serializable format."""
        # ijson yields Decimal for non-integer numbers; json cannot encode it.
        if isinstance(obj, Decimal):
            return float(obj)  # or str(obj) if you prefer
        raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")

    def save_split(batch, index):
        """Write one batch of elements to its numbered output file."""
        output_file = f"{output_prefix}_split_{index}.json"
        with open(output_file, 'w') as out_f:
            json.dump(batch, out_f, indent=2, default=convert_to_serializable)
        print(f"Saved: {output_file}")

    # Binary mode: ijson parses bytes, and handing it a text-mode handle is
    # deprecated and forces an extra decode/encode round trip.
    with open(file_path, 'rb') as f:
        objects = ijson.items(f, 'item')

        for file_count in range(1, num_splits + 1):
            # islice pulls at most split_size elements, so nothing beyond the
            # final split is parsed and then thrown away.
            tweets = list(islice(objects, split_size))
            if not tweets:
                break  # input exhausted before all splits were filled
            save_split(tweets, file_count)


# Example usage
# Example usage: write up to three files of at most 30000 tweets each,
# named tweet_subset_split_<n>.json, from tweet_0.json on the mounted Drive.
process_and_split_large_json(
    '/content/drive/MyDrive/Twibot_22/tweet_0.json',  # Replace with your file path
    'tweet_subset',
    num_splits=3,
    split_size=30000,
)

Saved: tweet_subset_split_1.json
Saved: tweet_subset_split_2.json
Saved: tweet_subset_split_3.json


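The code below iterates over the tweet subset files, joins each one with the label and split files, and then writes processed train, validation and test tweet files. Training data is capped at 5 randomly sampled tweets per user (applied separately to each input file); validation and test data are only de-duplicated.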

In [None]:
def process_large_json(file_path, num_lines=100000):
    """Load the leading elements of a large JSON array without reading it all.

    The file is parsed incrementally with ijson, one array element at a time.

    Args:
      file_path: Path to a JSON file whose top-level value is an array.
      num_lines: Maximum number of array elements to load. Despite the name,
        this counts parsed elements, not text lines. ``None`` loads every
        element; values below 1 load nothing.

    Returns:
      A list with the parsed elements, in file order.
    """
    # Local import so the function does not depend on another cell's imports.
    from itertools import islice

    limit = None if num_lines is None else max(num_lines, 0)

    # Binary mode: ijson parses bytes, and handing it a text-mode handle is
    # deprecated and forces an extra decode/encode round trip.
    with open(file_path, 'rb') as f:
        # islice stops after `limit` elements, so no extra element is parsed.
        return list(islice(ijson.items(f, 'item'), limit))

In [None]:
import os
import pandas as pd
import json
import random

# Tweet subset files produced by process_and_split_large_json. They are written
# relative to the working directory, so they are read from there as well.
files = ['tweet_subset_split_1.json', 'tweet_subset_split_2.json', 'tweet_subset_split_3.json']

labels = pd.read_csv('/content/drive/MyDrive/Twibot_22/label.csv')
split = pd.read_csv('/content/drive/MyDrive/Twibot_22/split.csv')

# Both CSVs key on 'id'; renaming one keeps the two merges below unambiguous.
split = split.rename(columns={'id': 'split_id'})

train_file = 'tweets_train.json'
val_file = 'tweets_val.json'
test_file = 'tweets_test.json'

# Remove files if they already exist to start fresh. The same relative paths
# are used for deleting, writing and reading back, so the cell does not depend
# on the working directory being /content.
for file_path in [train_file, val_file, test_file]:
    if os.path.exists(file_path):
        os.remove(file_path)

# One list of records per processed input file, for each split
train_data = []
val_data = []
test_data = []

# Process each tweet file
for file_name in files:
    tweets = process_large_json(file_name)

    tweets = pd.DataFrame(tweets)

    print(f"Processing {file_name}, shape: {tweets.shape}")

    # label.csv / split.csv ids carry a 'u' prefix; build the matching key.
    tweets['user_id'] = 'u' + tweets['author_id'].astype(str)
    tweets = tweets[tweets['lang'] == 'en']
    tweets = tweets[['user_id', 'text']]

    # Merge with labels and split data
    tweets_labels = pd.merge(tweets, labels, left_on='user_id', right_on='id', how='inner')
    tweets_labels = pd.merge(tweets_labels, split, left_on='user_id', right_on='split_id', how='inner')
    tweets_labels = tweets_labels.drop(['id', 'split_id'], axis=1)

    # Filter data by split (train, val, test)
    tweets_labels_train = tweets_labels[tweets_labels['split'].isin(['train'])].drop_duplicates()
    tweets_labels_val = tweets_labels[tweets_labels['split'].isin(['val'])].drop_duplicates()
    tweets_labels_test = tweets_labels[tweets_labels['split'].isin(['test'])].drop_duplicates()

    # Select 5 random tweets per user in the training data only.
    # Sampling each group explicitly (instead of groupby(...).apply(...)) picks
    # the same rows in the same order, but keeps the 'user_id' column on every
    # pandas version and avoids the warning about apply operating on the
    # grouping columns.
    # NOTE(review): the cap is applied per input file, so a user whose tweets
    # span several files can contribute more than 5 training tweets overall;
    # val/test are not capped at all. Confirm this is intended.
    if not tweets_labels_train.empty:
        tweets_labels_train = pd.concat(
            [
                user_tweets.sample(n=min(5, len(user_tweets)), random_state=42)
                for _, user_tweets in tweets_labels_train.groupby('user_id')
            ],
            ignore_index=True,
        )

    print('train', tweets_labels_train.shape)
    print('val', tweets_labels_val.shape)
    print('test', tweets_labels_test.shape)

    # Append the data to the lists
    train_data.append(tweets_labels_train.to_dict(orient='records'))
    val_data.append(tweets_labels_val.to_dict(orient='records'))
    test_data.append(tweets_labels_test.to_dict(orient='records'))

    print(f"Processed and added data for {file_name}")
    print()

# Save each split's data as JSON (flattening the per-file record lists)
with open(train_file, 'w') as f_train:
    json.dump([item for sublist in train_data for item in sublist], f_train, indent=4)

with open(val_file, 'w') as f_val:
    json.dump([item for sublist in val_data for item in sublist], f_val, indent=4)

with open(test_file, 'w') as f_test:
    json.dump([item for sublist in test_data for item in sublist], f_test, indent=4)

print(f"Saved train data to {train_file}")
print(f"Saved val data to {val_file}")
print(f"Saved test data to {test_file}")

# Optionally, read back the files and check if everything is correct
for file_path in [train_file, val_file, test_file]:
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
            print(f"Read {file_path}, total records:", len(data))
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers invalid
        # JSON and decoding errors. The check is best-effort, so only report.
        print(f"Error reading {file_path}: {e}")


Processing tweet_subset_split_1.json, shape: (30000, 17)


  tweets_labels_train = tweets_labels_train.groupby('user_id').apply(lambda x: x.sample(n=min(5, len(x)), random_state=42)).reset_index(drop=True)


train (2761, 4)
val (3655, 4)
test (4363, 4)
Processed and added data for tweet_subset_split_1.json

Processing tweet_subset_split_2.json, shape: (30000, 17)


  tweets_labels_train = tweets_labels_train.groupby('user_id').apply(lambda x: x.sample(n=min(5, len(x)), random_state=42)).reset_index(drop=True)


train (2835, 4)
val (4000, 4)
test (3746, 4)
Processed and added data for tweet_subset_split_2.json

Processing tweet_subset_split_3.json, shape: (30000, 17)


  tweets_labels_train = tweets_labels_train.groupby('user_id').apply(lambda x: x.sample(n=min(5, len(x)), random_state=42)).reset_index(drop=True)


train (2759, 4)
val (4300, 4)
test (4000, 4)
Processed and added data for tweet_subset_split_3.json

Saved train data to tweets_train.json
Saved val data to tweets_val.json
Saved test data to tweets_test.json
Read tweets_train.json, total records: 8355
Read tweets_val.json, total records: 11955
Read tweets_test.json, total records: 12109


In [None]:
import json

train_file = 'tweets_train.json'
val_file = 'tweets_val.json'
test_file = 'tweets_test.json'

# Report how many records each exported split file holds; a missing or
# malformed file is reported and the loop moves on to the next one.
for split_name, file_name in (('Train', train_file), ('Validation', val_file), ('Test', test_file)):
    try:
        with open(file_name, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"{split_name} file '{file_name}' not found.")
    except json.JSONDecodeError:
        print(f"Error reading {file_name}: Invalid JSON format.")
    else:
        # Only reached when the file was opened and parsed successfully.
        print(f"{split_name} file '{file_name}' contains {len(data)} rows.")


Train file 'tweets_train.json' contains 8355 rows.
Validation file 'tweets_val.json' contains 11955 rows.
Test file 'tweets_test.json' contains 12109 rows.


In [None]:
# Display the first five rows of df.
df.head(n=5)

Unnamed: 0,user_id,text,split,label
0,u1459391889489072130,RT @BhaveshDagla: #जालोर जिले के बागोडा़ तहसील...,test,human
1,u221777661,RT @quinta: Pare proprio ci stiano dando dentr...,test,human
2,u221777661,RT @Kevin2600: These are what I think are must...,test,human
3,u221777661,RT @FPFabrizioPhD: Quello che è successo quest...,test,human
4,u221777661,RT @LatestAnonPress: BREAKING: #Anonymous inva...,test,human
