In [1]:
import pandas as pd
import random
import numpy as np
from datetime import datetime
import pytz

In [2]:
user_df = pd.read_json("Data/user.json")
split_df = pd.read_csv("Data/split.csv")
label_df = pd.read_csv("Data/label.csv")

In [3]:
testing_id_list = split_df[split_df['split']=='test']['id']

In [4]:
user_df_label = pd.merge(user_df,label_df,on='id')

In [5]:
def classify_account_age(age):
    if age == 0:
        return "account created less than a year ago"
    elif age == 1:
        return "account is 1 year old"
    else:
        return f"account is {age} years old"

In [6]:
conditions = [
    (user_df_label['verified'] & user_df_label['protected']),            
    (user_df_label['verified'] & ~user_df_label['protected']),           
    (~user_df_label['verified'] & user_df_label['protected']),          
    (~user_df_label['verified'] & ~user_df_label['protected'])        
]

# Define corresponding values
choices = [
    'verified and protected',
    'verified but not protected',
    'protected but not verified',
    'neither verified nor protected'
]

In [7]:
current_date = datetime.now(pytz.UTC)
user_df_label['account_age'] = user_df_label['created_at'].apply(lambda x: (current_date - x).days / 365.25)
user_df_label['account_age'] = user_df_label['account_age'].apply(lambda x: np.nan if x > 19 else x)
user_df_label['account_age'] = user_df_label.groupby('label')['account_age'].transform(lambda grp: grp.fillna(grp.mean()))
user_df_label['account_age']= user_df_label['account_age'].astype(int)
user_df_label['account_age'] = user_df_label['account_age'].apply(classify_account_age)
user_df_label['followers_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('followers_count', 0))
user_df_label['following_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('following_count', 0))
user_df_label['tweet_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('tweet_count', 0))
user_df_label['listed_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('listed_count', 0))
user_df_label['status'] = np.select(conditions, choices, default='Unknown')
user_df_label['has_custom_profile_image'] = user_df_label['profile_image_url'].apply(lambda x: "does not have a profile picture" if 'default_profile_images' in x else "has a profile picture")
user_df_label.drop(["protected","verified","created_at","public_metrics","entities","location","withheld","pinned_tweet_id","profile_image_url","url"],axis=1,inplace=True)

In [8]:
user_df_label.head(5)

Unnamed: 0,description,id,name,username,label,account_age,followers_count,following_count,tweet_count,listed_count,status,has_custom_profile_image
0,Theoretical Computer Scientist. See also https...,u1217628182611927040,Boaz Barak,boazbaraktcs,human,account is 4 years old,7316,215,3098,69,neither verified nor protected,has a profile picture
1,creative _,u2664730894,olawale 💨,wale_io,human,account is 10 years old,123,1090,1823,0,neither verified nor protected,has a profile picture
2,👽,u1266703520205549568,panagiota_.b,b_panagiota,human,account is 4 years old,3,62,66,0,neither verified nor protected,has a profile picture
3,mama to maya. ABIM research pathway fellow @UV...,u1089159225148882949,"Jacqueline Hodges, MD MPH",jachodges_md,human,account is 5 years old,350,577,237,1,neither verified nor protected,has a profile picture
4,Father / SWT Alumnus / Longhorn Fan,u36741729,Matthew Stubblefield,Matthew_Brody,bot,account is 15 years old,240,297,3713,8,protected but not verified,has a profile picture


In [9]:
def get_user_data_prompt(feature_list):
    user_metadata_text = f"""
    The name of the user is '{feature_list['name']}' with an account username '{feature_list['username']}'. The description for the user profile is '{feature_list['description']}'. The {feature_list['account_age']} and some of the metric of the user is as follows:
    The user has {feature_list['followers_count']} followers and follows {feature_list['following_count']} accounts. 
    The user has posted {feature_list['tweet_count']} tweets and is a member of {feature_list['listed_count']} public list.
    The user account is {feature_list['status']} and it {feature_list['has_custom_profile_image']} associated with it.
    """
    return user_metadata_text

In [10]:
def get_user_data_with_label(feature_list):
    user_metadata_text = f"""
    The name of the user is '{feature_list['name']}' with an account username '{feature_list['username']}'. The description for the user profile is '{feature_list['description']}'. The {feature_list['account_age']} and some of the metric of the user is as follows:
    The user has {feature_list['followers_count']} followers and follows {feature_list['following_count']} accounts. 
    The user has posted {feature_list['tweet_count']} tweets and is a member of {feature_list['listed_count']} public list.
    The user account is {feature_list['status']} and it {feature_list['has_custom_profile_image']} associated with it.

    The account is actually a {feature_list['label']}
    """
    return user_metadata_text

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM

hf_token = "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"

# Replace with the model identifier
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load tokenizer and model with the token
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=hf_token, device_map="auto", torch_dtype="auto")

  from .autonotebook import tqdm as notebook_tqdm
2024-12-16 08:49:04.577566: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734367745.202452    6353 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734367745.540036    6353 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-16 08:49:06.515612: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading checkpoint shards: 100%|██████████| 4/4 [02:29<00:00, 37.34s/it]


In [12]:
from transformers import pipeline

llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [13]:
def parse_output(op):
    if "human" in op:
        return "human"
    else:
        return "bot"

In [14]:
def get_user_samples(user_df_label,num_samples=10):
    sampled_df = user_df_label.sample(n=num_samples, random_state=42)
    all_prompts = []
    for _, row in sampled_df.iterrows():
        # Generate the user data prompt
        user_data_prompt = get_user_data_with_label(row)
        
        # Append the generated prompt to the list
        all_prompts.append(user_data_prompt)
    
    # Join all prompts into a single string, separated by newlines
    incontext_example = "\n\n".join(all_prompts)
    
    return incontext_example

In [17]:
import logging
from datasets import Dataset
from joblib import Parallel, delayed
import math

# Configure logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s', 
    level=logging.INFO  # You can change to DEBUG for more verbose logs
)

def prepare_prompt(test):
    """Prepares a single prompt for the given test user."""
    try:
        feature_list = user_df_label.set_index('id').loc[test]
        incontext_info = get_user_samples(user_df_label)
        prompt_text = get_user_data_prompt(feature_list)
    
        prompt = f"""
        You are a clever AI agent which can discern between genuine and fake twitter profiles. 
        You will be provided with the accounts metadata information along with several examples of genuine and fake users.
        Please use these to classify the following twitter user as "human" or "bot"
    
        Relevant_Examples : {incontext_info}
    
        Metadata_Info: {prompt_text}
    
        Your output must be the label either <human> or <bot>. Do not write any explanation or reasoning.
        """
    
        return {
            "id": test,
            "ground_truth": feature_list["label"],
            "prompt": prompt
        }
    except Exception as e:
        logging.error(f"Error preparing prompt for user ID {test}: {e}")
        return None

def process_batch(batch):
    """Processes a batch of test IDs."""
    results = []
    for test in batch:
        result = prepare_prompt(test)
        if result is not None:
            results.append(result)
    return results

def parallel_prepare_prompts(test_ids, batch_size=64, n_jobs=-1):
    """Prepare prompts in parallel for a list of test IDs."""
    total = len(test_ids)
    total_batches = math.ceil(total / batch_size)
    
    logging.info(f"Starting prompt preparation for {total} test IDs in {total_batches} batches of size {batch_size}.")
    
    # Split the test_ids into chunks of batch_size
    batches = [test_ids[i:i + batch_size] for i in range(0, total, batch_size)]
    
    results = Parallel(n_jobs=n_jobs, backend="multiprocessing")(
        delayed(process_batch)(batch) for batch in batches
    )
    
    # Flatten the list of lists of results
    valid_results = [res for batch_results in results for res in batch_results if res is not None]
    
    logging.info(f"Successfully prepared {len(valid_results)} out of {total} prompts.")
    return valid_results

# Generate dataset
data = parallel_prepare_prompts(testing_id_list[:10000])

if data:  # Only save if data is successfully prepared
    dataset = Dataset.from_list(data)
    dataset_path = "Data/user_meta_dataset"
    try:
        dataset.save_to_disk(dataset_path)
        logging.info(f"Dataset successfully saved to {dataset_path}")
    except Exception as e:
        logging.error(f"Failed to save dataset to {dataset_path}: {e}")
else:
    logging.warning("No data was prepared. Dataset will not be saved.")


2024-12-16 08:54:24,140 - INFO - Starting prompt preparation for 10000 test IDs in 157 batches of size 64.
2024-12-16 09:02:44,531 - INFO - Successfully prepared 10000 out of 10000 prompts.
Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 359261.31 examples/s]
2024-12-16 09:02:45,011 - INFO - Dataset successfully saved to Data/user_meta_dataset


In [18]:
# Function to process batches
BATCH_SIZE = 64

def process_batch(batch):
    outputs = llm_pipeline(
        batch["prompt"],
        max_new_tokens=5,
        do_sample=True,
        truncation=True,
        return_full_text=False,
        temperature=0.1,
        pad_token_id=tokenizer.eos_token_id
    )
    cleaned_outputs = [parse_output(output[0]["generated_text"]) for output in outputs]
    # print(cleaned_outputs)
    # cleaned_series = np.array(cleaned_outputs).reshape(1,5)                         
    return {"predictions": cleaned_outputs}

# Process dataset in batches
batched_results = dataset.map(
    process_batch,
    batched=True,
    batch_size=BATCH_SIZE,
    remove_columns=["prompt"]  # Keep only necessary columns
)

# Extract results
predictions = batched_results["predictions"]
ground_truths = batched_results["ground_truth"]

Map:   5%|▌         | 512/10000 [02:04<38:35,  4.10 examples/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Map: 100%|██████████| 10000/10000 [40:49<00:00,  4.08 examples/s]


In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

def calculate_metrics(ground_truth, predictions):
    # Initialize the LabelEncoder
    label_encoder = LabelEncoder()
    
    # Fit the encoder to the unique labels in ground_truth (and predictions)
    ground_truth_encoded = label_encoder.fit_transform(ground_truth)
    predictions_encoded = label_encoder.transform(predictions)  # Transform predictions
    
    # Accuracy calculation
    accuracy = accuracy_score(ground_truth_encoded, predictions_encoded)
    
    # Precision calculation
    precision = precision_score(ground_truth_encoded, predictions_encoded, average='weighted')
    
    # Recall calculation
    recall = recall_score(ground_truth_encoded, predictions_encoded, average='weighted')
    
    # F1 Score calculation
    f1 = f1_score(ground_truth_encoded, predictions_encoded, average='weighted')
    
    return accuracy, precision, recall, f1


accuracy, precision, recall, f1 = calculate_metrics(ground_truths, predictions)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.7669
Precision: 0.58813561
Recall: 0.7669
F1 Score: 0.6657259720414285


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
