In [1]:
import pandas as pd

In [2]:
import lmod
await lmod.purge(force=True)
await lmod.load('jdk/17.0.5')

In [3]:
import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

spark= SparkSession.builder.appName("Our First Spark Example").config("spark.executor.memory", "108g").config("spark.driver.memory", "72g").getOrCreate()

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/21 19:13:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
from pyspark.sql import functions as F
from pyspark.sql import Window

In [5]:
graph_data = spark.read.parquet("Data/graph_data_user.parquet")

In [11]:
#graph_data = graph_data.filter((graph_data.relation == "followers") | (graph_data.relation == "following"))

In [12]:
#graph_data.show()

+--------------------+---------+--------------------+
|           source_id| relation|           target_id|
+--------------------+---------+--------------------+
| u980749991491682304|followers|u1480979504696864775|
|          u105387876|following|          u402576793|
|          u148520716|following|           u59653593|
|u1276438425457967110|following|u1389155636693381120|
|u1445432327367237638|following| u848348952084828160|
|u1445432327367237638|following| u850507814023942144|
|u1078324065532764166|following| u781676758932262912|
|         u3185512585|following|           u19376807|
|          u246495872|following|           u66780587|
|         u2704715387|followers|u1483909830159085571|
|         u4311016395|following|         u1281203928|
|         u3254857712|following| u876448351268380672|
|u1108086217088724992|following|          u213169506|
|         u2750548111|following|           u25030647|
|           u33746386|following|           u22730752|
|u1093530573140824064|follow

In [6]:
graph_data.count()

3743634

In [14]:
relation_count = graph_data.groupBy("relation").count()

In [15]:
relation_count.show()



+---------+-------+
| relation|  count|
+---------+-------+
|followers|1116655|
|following|2626979|
+---------+-------+



                                                                                

In [16]:
unique_count_source = graph_data.select("source_id").distinct().count()
print(f"Number of unique values in the 'source_id' column: {unique_count_source}")
unique_count_target = graph_data.select("target_id").distinct().count()
print(f"Number of unique values in the 'target_id' column: {unique_count_target}")

                                                                                

Number of unique values in the 'source_id' column: 10009




Number of unique values in the 'target_id' column: 693720


                                                                                

In [18]:
#graph_data.coalesce(1).write.parquet("Data/SparkOutput",mode = "overwrite")

                                                                                

In [7]:
user_data = spark.read.option("multiline", "true").json("Data/user.json")

                                                                                

In [8]:
user_ids = [row.id for row in user_data.select("id").collect()]

# Filter graph_data using the list of user_ids
graph_data = graph_data.filter(
    (graph_data.source_id.isin(user_ids)) & (graph_data.target_id.isin(user_ids))
)


                                                                                

In [10]:
graph_data.select("source_id").distinct().count()

24/11/21 03:38:04 WARN DAGScheduler: Broadcasting large task binary with size 142.4 MiB
24/11/21 03:38:36 WARN DAGScheduler: Broadcasting large task binary with size 142.4 MiB
                                                                                

10009

In [9]:
graph_data.count()

24/11/21 03:35:20 WARN DAGScheduler: Broadcasting large task binary with size 83.6 MiB
                                                                                

3743634

In [21]:
user_df

Unnamed: 0,id,features
0,1,"[feature_a, feature_b]"
1,2,"[feature_b,feature_c]"
2,3,"[feature_c,feature_a]"
3,4,"[feature_d,feature_e]"


# User Structural Info

In [1]:
import pandas as pd
import random
from collections import defaultdict, deque
import numpy as np
from datetime import datetime
import pytz

In [2]:
graph_data = pd.read_parquet("Data/graph_data_user.parquet")
user_df = pd.read_json("Data/user.json")
split_df = pd.read_csv("Data/split.csv")
label_df = pd.read_csv("Data/label.csv")

In [3]:
testing_id_list = split_df[split_df['split']=='test']['id']

In [4]:
user_df_label = pd.merge(user_df,label_df,on='id')

In [5]:
def build_graph_no_rel(relation_df):
    """
    Build an adjacency list graph from the relation dataset.
    """
    graph = defaultdict(set)
    
    for _, row in relation_df.iterrows():
        if row['relation'] == 'following':
            graph[row['source_id']].add(row['target_id'])
        elif row['relation'] == 'followers':
            graph[row['target_id']].add(row['source_id'])
    
    return graph

In [6]:
graph_no_rel = build_graph_no_rel(graph_data)

In [7]:
def build_graph(relation_df):
    """
    Build an adjacency list graph from the relation dataset with the type of relation.
    """
    graph = defaultdict(lambda: {'following': set(), 'followers': set()})
    
    for _, row in relation_df.iterrows():
        if row['relation'] == 'following':
            graph[row['source_id']]['following'].add(row['target_id'])
        elif row['relation'] == 'followers':
            graph[row['target_id']]['followers'].add(row['source_id'])
    
    return graph

In [8]:
graph = build_graph(graph_data)

In [9]:
def classify_account_age(age):
    if age == 0:
        return "account created less than a year ago"
    elif age == 1:
        return "account is 1 year old"
    else:
        return f"account is {age} years old"

In [10]:
conditions = [
    (user_df_label['verified'] & user_df_label['protected']),            
    (user_df_label['verified'] & ~user_df_label['protected']),           
    (~user_df_label['verified'] & user_df_label['protected']),          
    (~user_df_label['verified'] & ~user_df_label['protected'])        
]

# Define corresponding values
choices = [
    'verified and protected',
    'verified but not protected',
    'protected but not verified',
    'neither verified nor protected'
]

In [11]:
current_date = datetime.now(pytz.UTC)
user_df_label['account_age'] = user_df_label['created_at'].apply(lambda x: (current_date - x).days / 365.25)
user_df_label['account_age'] = user_df_label['account_age'].apply(lambda x: np.nan if x > 19 else x)
user_df_label['account_age'] = user_df_label.groupby('label')['account_age'].transform(lambda grp: grp.fillna(grp.mean()))
user_df_label['account_age']= user_df_label['account_age'].astype(int)
user_df_label['account_age'] = user_df_label['account_age'].apply(classify_account_age)
user_df_label['followers_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('followers_count', 0))
user_df_label['following_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('following_count', 0))
user_df_label['tweet_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('tweet_count', 0))
user_df_label['listed_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('listed_count', 0))
user_df_label['status'] = np.select(conditions, choices, default='Unknown')
user_df_label['has_custom_profile_image'] = user_df_label['profile_image_url'].apply(lambda x: "does not have a profile picture" if 'default_profile_images' in x else "has a profile picture")
user_df_label.drop(["protected","verified","created_at","public_metrics","entities","location","withheld","pinned_tweet_id","profile_image_url","url"],axis=1,inplace=True)

In [12]:
user_df_label.head(5)

Unnamed: 0,description,id,name,username,label,account_age,followers_count,following_count,tweet_count,listed_count,status,has_custom_profile_image
0,Theoretical Computer Scientist. See also https...,u1217628182611927040,Boaz Barak,boazbaraktcs,human,account is 4 years old,7316,215,3098,69,neither verified nor protected,has a profile picture
1,creative _,u2664730894,olawale 💨,wale_io,human,account is 10 years old,123,1090,1823,0,neither verified nor protected,has a profile picture
2,👽,u1266703520205549568,panagiota_.b,b_panagiota,human,account is 4 years old,3,62,66,0,neither verified nor protected,has a profile picture
3,mama to maya. ABIM research pathway fellow @UV...,u1089159225148882949,"Jacqueline Hodges, MD MPH",jachodges_md,human,account is 5 years old,350,577,237,1,neither verified nor protected,has a profile picture
4,Father / SWT Alumnus / Longhorn Fan,u36741729,Matthew Stubblefield,Matthew_Brody,bot,account is 15 years old,240,297,3713,8,protected but not verified,has a profile picture


In [13]:
def get_controlled_neighbors(user_id, graph, user_df, max_count=8):
    """
    Get controlled number of followers and following for a given user while maintaining a bot-human ratio.
    
    Parameters:
    - user_id: ID of the user to fetch neighbors for.
    - graph: The adjacency list representing the graph, with types of relations.
    - user_df: DataFrame containing user features, including 'label' (bot or human).
    - max_count: Maximum number of followers or following to fetch (default is 8).
    
    Returns:
    - A dictionary with two keys ('followers', 'following'), each containing a list of user IDs.
    """
    # Extract labels (bot/human) for users
    label_dict = user_df.set_index('id')['label'].to_dict()  # {user_id: 'bot'/'human'}
    
    def select_users(user_set, target_count):
        """
        Select a subset of users while maintaining a bot-to-human ratio close to 1.
        """
        if not user_set:
            return []
        
        # Separate bots and humans
        bots = [user for user in user_set if label_dict.get(user) == 'bot']
        humans = [user for user in user_set if label_dict.get(user) == 'human']
        
        # Calculate the ideal ratio of bots to humans
        total = min(len(bots) + len(humans), target_count)
        bot_target = total // 2
        human_target = total - bot_target
        
        # Select users based on the targets
        selected_bots = random.sample(bots, min(bot_target, len(bots)))
        selected_humans = random.sample(humans, min(human_target, len(humans)))
        
        # Combine and return the selection
        return selected_bots + selected_humans
    
    # Get followers and following
    followers = graph.get(user_id, {}).get('followers', set())
    following = graph.get(user_id, {}).get('following', set())
    
    # Select up to `max_count` followers and following while maintaining bot-human ratio
    selected_followers = select_users(followers, max_count)
    selected_following = select_users(following, max_count)
    
    return {
        'followers': selected_followers,
        'following': selected_following
    }


In [14]:
def get_controlled_k_hop_neighbors(user_id, hops, graph, user_df):
    """
    Get controlled k-hop neighbors for a given user with specified number of users per hop.
    The method is optimized for large datasets by using efficient data structures and avoiding
    unnecessary memory usage.
    
    Parameters:
    - user_id: ID of the user to start from.
    - hops: List of integers specifying how many neighbors to sample at each hop (e.g., [3, 2]).
    - graph: The adjacency list representing the graph.
    - user_df: DataFrame containing user features.
    
    Returns:
    - List of selected neighbors (user_id, features) per hop.
    """
    # Extract only the relevant columns for user features
    user_features_columns = user_df.columns.difference(['id'])  # Exclude the 'id' column
    user_features_dict = user_df.set_index('id')[user_features_columns].to_dict(orient='index')

    # BFS to find neighbors with controlled sample size at each hop
    visited = set()
    queue = deque([(user_id, 0)])  # (current_node, current_depth)
    hop_neighbors = []  # Will hold neighbors for each hop

    # For each hop level, select a specific number of neighbors
    while queue:
        current_node, depth = queue.popleft()

        if depth < len(hops):  # We still have hops left to process
            num_neighbors_at_depth = hops[depth]  # How many neighbors to select at this hop
            neighbors_at_depth = set()

            # Efficiently fetch neighbors from the graph
            for neighbor in graph.get(current_node, []):
                if neighbor not in visited:
                    visited.add(neighbor)
                    neighbors_at_depth.add(neighbor)

            # Randomly select from neighbors_at_depth if needed
            if len(neighbors_at_depth) > num_neighbors_at_depth:
                selected_neighbors = random.sample(list(neighbors_at_depth), num_neighbors_at_depth)
            else:
                selected_neighbors = list(neighbors_at_depth)

            # Add to the list of neighbors for this hop
            hop_neighbors.append((depth + 1, selected_neighbors))

            # Add selected neighbors to the queue for the next hop
            for neighbor in selected_neighbors:
                queue.append((neighbor, depth + 1))

    # Fetch features for all selected neighbors in a memory-efficient way
    selected_ids = set(neighbor for _, neighbors in hop_neighbors for neighbor in neighbors)
    # selected_features = {user_id: user_features_dict[user_id] for user_id in selected_ids if user_id in user_features_dict}

    return hop_neighbors

In [15]:
def get_user_data_prompt(feature_list):
    user_metadata_text = f"""
    The name of the user is '{feature_list['name']}' with an account username '{feature_list['username']}'. The desciption for the user profile is '{feature_list['description']}'. The {feature_list['account_age']} and some of the metric of the user is as follows:
    The user has {feature_list['followers_count']} followers and follows {feature_list['following_count']} accounts. 
    The user has posted {feature_list['tweet_count']} tweets and is a member of {feature_list['listed_count']} public list.
    The user account is {feature_list['status']} and it {feature_list['has_custom_profile_image']} associated with it.
    """
    return user_metadata_text

In [16]:
def network_text_no_rel(hop_neighbors):
    structural_info = f""""""
    for hop in hop_neighbors:
        hop_k, id_list = hop
        structural_info += f"At hop {hop_k} the user is related to these other users :\n"
        for _id in id_list:
            feature_list = user_df_label.set_index('id').loc[_id]
            label = feature_list['label']
            meta_text = get_user_data_prompt(feature_list)
            structural_info += f"{meta_text}"
            structural_info += f"Label: {label}\n"
        structural_info += "\n"
        structural_info+= "---------------------------------------------------------------------------------------------------"
        structural_info += "\n"
    return structural_info

In [17]:
def network_text(neighbors_dict):
    structural_info = f""""""
    followers_list = neighbors_dict['followers']
    following_list = neighbors_dict['following']
    if len(followers_list)>0:
        structural_info += f"Few of the accounts which follows the user are:\n"
        for follower_id in followers_list:
            feature_list = user_df_label.set_index('id').loc[follower_id]
            label = feature_list['label']
            meta_text = get_user_data_prompt(feature_list)
            structural_info += f"{meta_text}"
            structural_info += f"Label: {label}\n"
    else:
        structural_info += f"Could not find the accounts info which follows the user"
    structural_info += "\n"
    structural_info+= "---------------------------------------------------------------------------------------------------"
    structural_info += "\n"
    if len(following_list)>0:
        structural_info += f"Few of the accounts which user is following are:\n"
        for following_id in following_list:
            feature_list = user_df_label.set_index('id').loc[following_id]
            label = feature_list['label']
            meta_text = get_user_data_prompt(feature_list)
            structural_info += f"{meta_text}"
            structural_info += f"Label: {label}\n"
    else:
        structural_info += f"Could not find the accounts info which user is following"
    structural_info += "\n"
    structural_info+= "---------------------------------------------------------------------------------------------------"
    structural_info += "\n"
    return structural_info

In [18]:
neighbors_dict = get_controlled_neighbors('u1493928389669015552',graph=graph,user_df=user_df_label)

In [19]:
print(network_text(neighbors_dict))

Could not find the accounts info which follows the user
---------------------------------------------------------------------------------------------------
Could not find the accounts info which user is following
---------------------------------------------------------------------------------------------------



# Setting up LLM 

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM

hf_token = "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"

# Replace with the model identifier
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load tokenizer and model with the token
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=hf_token, device_map="auto", torch_dtype="auto")

  from .autonotebook import tqdm as notebook_tqdm
2024-12-16 09:33:17.328476: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734370397.955316   26309 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734370398.105734   26309 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-16 09:33:19.100152: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading checkpoint shards: 100%|██████████| 4/4 [02:27<00:00, 36.78s/it]


In [21]:
from transformers import pipeline

llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [22]:
def parse_output(op):
    if "human" in op:
        return "human"
    else:
        return "bot"

In [23]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
import logging
from datasets import Dataset
from joblib import Parallel, delayed
import math

# Configure logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s', 
    level=logging.INFO  # You can change to DEBUG for more verbose logs
)

def prepare_prompt(test):
    """Prepares a single prompt for the given test user."""
    try:
        feature_list = user_df_label.set_index('id').loc[test]
        neighbors_dict = get_controlled_neighbors(test, graph, user_df_label)
        structural_info = network_text(neighbors_dict)
        prompt_text = get_user_data_prompt(feature_list)
    
        prompt = f"""
        You are a clever AI agent which can discern between genuine and fake social media profiles. 
        You will be provided with the subset of account's structural information (follower/following relations) if available along with the metadata information.
        Please use these to classify the following twitter user as "human" or "bot"
    
        Structural_Info: {structural_info}
    
        Metadata_Info: {prompt_text}
    
        Your output must be the label either <human> or <bot>. Do not write any explanation or reasoning.
        """
    
        return {
            "id": test,
            "ground_truth": feature_list["label"],
            "prompt": prompt
        }
    except Exception as e:
        logging.error(f"Error preparing prompt for user ID {test}: {e}")
        return None

def process_batch(batch):
    """Processes a batch of test IDs."""
    results = []
    for test in batch:
        result = prepare_prompt(test)
        if result is not None:
            results.append(result)
    return results

def parallel_prepare_prompts(test_ids, batch_size=64, n_jobs=-1):
    """Prepare prompts in parallel for a list of test IDs."""
    total = len(test_ids)
    total_batches = math.ceil(total / batch_size)
    
    logging.info(f"Starting prompt preparation for {total} test IDs in {total_batches} batches of size {batch_size}.")
    
    # Split the test_ids into chunks of batch_size
    batches = [test_ids[i:i + batch_size] for i in range(0, total, batch_size)]
    
    results = Parallel(n_jobs=n_jobs, backend="multiprocessing")(
        delayed(process_batch)(batch) for batch in batches
    )
    
    # Flatten the list of lists of results
    valid_results = [res for batch_results in results for res in batch_results if res is not None]
    
    logging.info(f"Successfully prepared {len(valid_results)} out of {total} prompts.")
    return valid_results

# Generate dataset
data = parallel_prepare_prompts(testing_id_list[:10000])

if data:  # Only save if data is successfully prepared
    dataset = Dataset.from_list(data)
    dataset_path = "Data/user_rel_dataset"
    try:
        dataset.save_to_disk(dataset_path)
        logging.info(f"Dataset successfully saved to {dataset_path}")
    except Exception as e:
        logging.error(f"Failed to save dataset to {dataset_path}: {e}")
else:
    logging.warning("No data was prepared. Dataset will not be saved.")


2024-12-16 09:35:58,132 - INFO - Starting prompt preparation for 10000 test IDs in 157 batches of size 64.
2024-12-16 10:34:59,119 - INFO - Successfully prepared 10000 out of 10000 prompts.
Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 131896.35 examples/s]
2024-12-16 10:34:59,748 - INFO - Dataset successfully saved to Data/user_rel_dataset


In [25]:
# Function to process batches
BATCH_SIZE = 64

def process_batch(batch):
    outputs = llm_pipeline(
        batch["prompt"],
        max_new_tokens=5,
        do_sample=True,
        truncation=True,
        return_full_text=False,
        temperature=0.1,
        pad_token_id=tokenizer.eos_token_id
    )
    cleaned_outputs = [parse_output(output[0]["generated_text"]) for output in outputs]
    # print(cleaned_outputs)
    # cleaned_series = np.array(cleaned_outputs).reshape(1,5)                         
    return {"predictions": cleaned_outputs}

# Process dataset in batches
batched_results = dataset.map(
    process_batch,
    batched=True,
    batch_size=BATCH_SIZE,
    remove_columns=["prompt"]  # Keep only necessary columns
)

# Extract results
predictions = batched_results["predictions"]
ground_truths = batched_results["ground_truth"]

Map:   6%|▋         | 640/10000 [04:09<1:00:08,  2.59 examples/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Map: 100%|██████████| 10000/10000 [51:46<00:00,  3.22 examples/s]


In [26]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

def calculate_metrics(ground_truth, predictions):
    # Initialize the LabelEncoder
    label_encoder = LabelEncoder()
    
    # Fit the encoder to the unique labels in ground_truth (and predictions)
    ground_truth_encoded = label_encoder.fit_transform(ground_truth)
    predictions_encoded = label_encoder.transform(predictions)  # Transform predictions
    
    # Accuracy calculation
    accuracy = accuracy_score(ground_truth_encoded, predictions_encoded)
    
    # Precision calculation
    precision = precision_score(ground_truth_encoded, predictions_encoded, average='weighted')
    
    # Recall calculation
    recall = recall_score(ground_truth_encoded, predictions_encoded, average='weighted')
    
    # F1 Score calculation
    f1 = f1_score(ground_truth_encoded, predictions_encoded, average='weighted')
    
    return accuracy, precision, recall, f1


accuracy, precision, recall, f1 = calculate_metrics(ground_truths, predictions)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.7576
Precision: 0.6556306245527956
Recall: 0.7576
F1 Score: 0.6717251845564474
