In [1]:
import lmod
# Purge the current module environment, then load JDK 17.0.5.
# NOTE(review): presumably the JDK is loaded here because the Spark session
# created below needs a JVM — confirm against the cluster setup.
await lmod.purge(force=True)
await lmod.load('jdk/17.0.5')

In [2]:
import os
import sys

# Make the PySpark workers and the driver use the same interpreter as this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, FloatType
from transformers import AutoTokenizer, AutoModel
import torch

# Step 1: Initialize PySpark session
# The builder chain is wrapped in parentheses so no backslash continuations
# are needed; the configuration values are unchanged.
spark = (
    SparkSession.builder
    .appName("TweetEmbeddings")
    .config("spark.executor.memory", "96g")
    .config("spark.driver.memory", "64g")
    .getOrCreate()
)
spark

  from .autonotebook import tqdm as notebook_tqdm
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/16 21:51:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read the training tweets Parquet file into a Spark DataFrame.
df = spark.read.parquet("Data/Tweet_SingleFile_Split/tweets_train_0.parquet")

In [5]:
# Print the first five rows as a quick sanity check of the loaded data.
df.show(5)

                                                                                

+--------------------+--------------------+-----+-----+
|             user_id|                text|label|split|
+--------------------+--------------------+-----+-----+
|         u1000012406|@Lesbian_Moses @_...|human|train|
|u1000087449933107200|@PWilliams101 @se...|human|train|
|u1000130044897918977|RT @cricketcrocke...|human|train|
|u1000130044897918977|RT @GlennsTheorem...|human|train|
|u1000130044897918977|@ike_onwuka Thank...|human|train|
+--------------------+--------------------+-----+-----+
only showing top 5 rows



In [6]:
# Sentence-embedding checkpoint shared by the tokenizer and the encoder.
EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_ID)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_ID)

2024-12-16 17:21:29.586060: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734398489.597811     702 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734398489.601363     702 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-16 17:21:29.614873: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
def compute_embedding(text):
    """Embed one tweet as the mean of the encoder's last-layer token vectors.

    Parameters
    ----------
    text : str or None
        Tweet text. When this function runs as a Spark UDF, SQL NULL values
        arrive as ``None``.

    Returns
    -------
    list[float] or None
        The mean-pooled embedding as a plain Python list, or ``None`` when
        ``text`` is ``None`` (which becomes a null cell in the output column).
    """
    # A null text would make the tokenizer raise and fail the whole Spark
    # task; emit a null embedding for that row instead.
    if text is None:
        return None

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
    # Average over the token axis, drop the size-1 batch axis (assumes a single
    # string was passed), and convert to a list.
    return token_states.mean(dim=1).squeeze().tolist()

In [8]:
# Wrap compute_embedding as a Spark UDF whose declared return type is an array of floats.
# NOTE(review): the UDF references the module-level tokenizer/model — presumably
# these get serialized to every Python worker; confirm that is acceptable.
embedding_udf = udf(compute_embedding, ArrayType(FloatType()))

In [9]:
# Add an `embedding` column derived from `text` via the UDF.
df_with_embeddings = df.withColumn("embedding", embedding_udf(col("text")))

In [11]:
# Write tweets plus embeddings to Parquet, replacing any existing output at this path.
df_with_embeddings.write.mode("overwrite").parquet("Data/TrainEmbeddings/embeddings_with_tweets.parquet")

2024-12-16 17:22:13.118592: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-16 17:22:13.131027: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-16 17:22:13.131141: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734398533.133038    1224 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734398533.137262    1224 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
E0000 00:00:17343

In [12]:
# Read the test tweets Parquet file into a Spark DataFrame.
test_data = spark.read.parquet("Data/Tweet_SingleFile_Split/tweets_test_0.parquet")

In [18]:
# Restrict the test set to 10,000 rows.
# NOTE(review): limit() is applied without an explicit ordering — presumably the
# chosen rows are not guaranteed to be the same on every run; confirm if
# reproducibility matters.
test_data_subset =  test_data.limit(10000)

In [19]:
# Add an `embedding` column to the test subset using the same UDF as for training.
test_df_with_embeddings = test_data_subset.withColumn("embedding", embedding_udf(col("text")))

In [20]:
# Write the embedded test subset to Parquet, replacing any existing output at this path.
test_df_with_embeddings.write.mode("overwrite").parquet("Data/TestEmbeddings/embeddings_with_tweets.parquet")

2024-12-16 19:24:48.718612: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-16 19:24:48.718612: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-16 19:24:48.718635: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-16 19:24:48.718653: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-16 19:24:48.719468: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for 

In [4]:
# Reload the training tweets with embeddings from the Data/TrainEmbeddings Parquet output.
train_data_embed = spark.read.parquet("Data/TrainEmbeddings/embeddings_with_tweets.parquet")

                                                                                

In [16]:
# Count rows per label to inspect the class balance.
value_counts = train_data_embed.groupBy('label').count()

In [17]:
# Run the aggregation and print the per-label counts.
value_counts.show()



+-----+-------+
|label|  count|
+-----+-------+
|human|7826451|
|  bot| 336488|
+-----+-------+



                                                                                

In [19]:
# Split the training embeddings into one DataFrame per class.
humans_df = train_data_embed.filter(train_data_embed.label == 'human')
bots_df = train_data_embed.filter(train_data_embed.label == 'bot')

In [21]:
# Down-sample each class towards a target size (about 40,000 humans and 20,000 bots).
# sample() takes a fraction, not a row count, so the resulting sizes are approximate.
# - The fraction is clamped to 1.0: sampling without replacement rejects
#   fractions above 1, which would happen if a class had fewer rows than its target.
# - A fixed seed makes the drawn subset reproducible across runs.
SAMPLE_SEED = 42
sampled_humans = humans_df.sample(False, min(1.0, 40000 / humans_df.count()), seed=SAMPLE_SEED)
sampled_bots = bots_df.sample(False, min(1.0, 20000 / bots_df.count()), seed=SAMPLE_SEED)

In [22]:
# Combine the two down-sampled classes into a single training set. Both sides
# derive from train_data_embed, so their columns line up.
weighted_train_data_embed = sampled_humans.union(sampled_bots)

In [24]:
# Persist the re-balanced training set, replacing any existing output at this path.
weighted_train_data_embed.write.mode("overwrite").parquet("Data/BalancedTrainEmbedding/embeddings_with_tweets.parquet")

                                                                                

In [4]:
# Load the balanced training embeddings.
# NOTE(review): this rebinds `train_data_embed`, a name an earlier cell used for
# the full training set read from Data/TrainEmbeddings.
train_data_embed = spark.read.parquet("Data/BalancedTrainEmbedding/embeddings_with_tweets.parquet")

In [5]:
# Load the embedded test subset written to Data/TestEmbeddings.
test_data_embed = spark.read.parquet("Data/TestEmbeddings/embeddings_with_tweets.parquet")

In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Collect the Spark DataFrame into a pandas DataFrame; the similarity search
# defined below works on it with NumPy and scikit-learn.
df_pandas = train_data_embed.toPandas()

def get_most_similar_tweets(query_embedding, df=None, num_results=10):
    """Return the labelled tweets in ``df`` most similar to ``query_embedding``.

    Parameters
    ----------
    query_embedding : sequence of float
        Embedding of the tweet to find neighbours for.
    df : pandas.DataFrame, optional
        Frame with ``embedding``, ``text`` and ``label`` columns. Defaults to
        the module-level ``df_pandas``, looked up at call time so the function
        does not keep pointing at a frame that has since been replaced.
    num_results : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    str
        One ``"Tweet: <text> (Label: <label>)"`` line per neighbour, ordered
        from most to least similar by cosine similarity. Empty string when
        ``num_results`` is not positive or ``df`` has no rows.
    """
    if df is None:
        df = df_pandas

    # Guard the degenerate cases: a slice of the form [-0:] selects *every*
    # element (so asking for 0 results used to return all tweets), and
    # np.stack raises on an empty sequence.
    if num_results <= 0 or len(df) == 0:
        return ""

    embeddings = np.stack(df['embedding'].values)  # Stack the list of embeddings into a NumPy array
    texts = df['text'].values  # Get the list of tweet texts
    labels = df['label'].values  # Get the list of labels

    # Compute cosine similarity between the query embedding and all embeddings in the DataFrame
    similarities = cosine_similarity([query_embedding], embeddings)[0]

    # Indices sorted from most to least similar, truncated to `num_results`
    top_indices = similarities.argsort()[::-1][:num_results]

    # Format the top similar tweets into a string
    return "\n".join(f"Tweet: {texts[i]} (Label: {labels[i]})" for i in top_indices)



                                                                                

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

import os

# Never commit access tokens to a notebook: read the Hugging Face token from
# the environment instead. When HF_TOKEN is unset this is None, and
# from_pretrained falls back to any locally cached login.
# NOTE(review): the token that was previously hardcoded here should be revoked.
hf_token = os.environ.get("HF_TOKEN")

# Replace with the model identifier
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load tokenizer and model with the token (`token` replaces the deprecated
# `use_auth_token` argument).
# NOTE(review): this rebinds `tokenizer` and `model`, names an earlier cell
# used for the sentence-embedding model.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token, device_map="auto", torch_dtype="auto")

2024-12-16 21:52:02.708319: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734414722.720262   64111 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734414722.725294   64111 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-16 21:52:02.745132: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.13s/it]


In [8]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [9]:
def parse_output(op):
    """Map raw generated text to one of the two class labels.

    Parameters
    ----------
    op : str or None
        Text produced by the language model.

    Returns
    -------
    str
        ``"human"`` if the text contains the word "human" in any letter case,
        otherwise ``"bot"`` (also the fallback for empty or missing text).
    """
    # Compare case-insensitively: a completion such as "Human" or "<HUMAN>"
    # would otherwise be counted as "bot".
    if op and "human" in op.lower():
        return "human"
    return "bot"

In [10]:
import logging
from datasets import Dataset
from joblib import Parallel, delayed
import math
import pandas as pd

# Configure logging: timestamp, level and message at INFO verbosity
# (switch the level to logging.DEBUG for more verbose logs).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def prepare_prompt(test):
    """Build the few-shot classification prompt for one test row.

    Parameters
    ----------
    test : dict-like
        A test row providing ``embedding``, ``text``, ``user_id`` and ``label``.

    Returns
    -------
    dict or None
        ``{"id", "ground_truth", "prompt"}`` on success. ``None`` if anything
        fails while building the prompt; the failure is logged rather than
        raised.
    """
    try:
        # Get similar tweets using the embedding from the current test row
        incontext_info = get_most_similar_tweets(test['embedding'])
        tweet = test['text']

        prompt = f"""
        You are a clever AI agent which can discern between genuine and fake twitter profiles. 
        You will be provided with the users tweet along with other tweets which are similar to users along with their labels.
        Please use these to classify the following twitter user as "human" or "bot"
    
        Relevant_Examples : {incontext_info}
    
        Tweet: {tweet}
    
        Your output must be the label either <human> or <bot>. Do not write any explanation or reasoning.
        """

        return {
            "id": test['user_id'],
            "ground_truth": test["label"],
            "prompt": prompt
        }
    except Exception as e:
        # Look the id up defensively: indexing test['user_id'] here could raise
        # again (for instance when the missing key is the reason we got here),
        # which would escape the handler and hide the original error.
        user_id = test.get('user_id', '<unknown>') if hasattr(test, 'get') else '<unknown>'
        logging.error(f"Error preparing prompt for user ID {user_id}: {e}")
        return None

def process_batch(batch):
    """Build prompts for every row in ``batch``, dropping rows that failed.

    ``prepare_prompt`` returns ``None`` for a row it could not handle; those
    entries are left out of the returned list.
    """
    prepared = (prepare_prompt(row) for row in batch)
    return [item for item in prepared if item is not None]

def parallel_prepare_prompts(df, batch_size=64, n_jobs=-1):
    """Build prompts for all rows of ``df`` using joblib worker processes.

    Parameters
    ----------
    df : DataFrame
        Frame supporting ``len()`` and ``.iloc`` slicing (it is used like a
        pandas DataFrame here); each slice is converted to a list of row dicts.
    batch_size : int, default 64
        Number of rows handed to each ``process_batch`` call.
    n_jobs : int, default -1
        Passed straight through to ``joblib.Parallel``.

    Returns
    -------
    list
        The non-``None`` results from every batch, flattened into one list.
    """
    total = len(df)
    total_batches = math.ceil(total / batch_size)

    logging.info(f"Starting prompt preparation for {total} test rows in {total_batches} batches of size {batch_size}.")

    # One delayed task per consecutive slice of `batch_size` rows.
    tasks = (
        delayed(process_batch)(df.iloc[start:start + batch_size].to_dict(orient='records'))
        for start in range(0, total, batch_size)
    )
    results = Parallel(n_jobs=n_jobs, backend="multiprocessing")(tasks)

    # Flatten the per-batch lists, skipping any None entries.
    valid_results = []
    for batch_results in results:
        valid_results.extend(res for res in batch_results if res is not None)

    logging.info(f"Successfully prepared {len(valid_results)} out of {total} prompts.")
    return valid_results

# Pull the columns needed for prompting out of the Spark DataFrame
# `test_data_embed` and into pandas for the prompt-building step.
df = test_data_embed.select("user_id", "text", "label", "embedding").toPandas()

# Generate dataset
data = parallel_prepare_prompts(df)

if not data:
    logging.warning("No data was prepared. Dataset will not be saved.")
else:
    # Only save if data is successfully prepared
    dataset = Dataset.from_list(data)
    dataset_path = "Data/user_text_dataset"
    try:
        dataset.save_to_disk(dataset_path)
        logging.info(f"Dataset successfully saved to {dataset_path}")
    except Exception as e:
        # A failed save is logged, not raised; `dataset` stays available in memory.
        logging.error(f"Failed to save dataset to {dataset_path}: {e}")


2024-12-16 21:52:23,033 - INFO - Starting prompt preparation for 10000 test rows in 157 batches of size 64.
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


KeyboardInterrupt: 

In [None]:
# Function to process batches
BATCH_SIZE = 64

def classify_batch(batch):
    """Run the LLM on one batch of prompts and map each completion to a label.

    Deliberately not named ``process_batch``: a function with that name is
    already defined for prompt preparation and is looked up by name inside
    ``parallel_prepare_prompts``; redefining it here would silently break a
    later re-run of the prompt preparation.

    Parameters
    ----------
    batch : mapping
        Batch passed in by ``dataset.map(..., batched=True)``; only its
        ``"prompt"`` entry is read.

    Returns
    -------
    dict
        ``{"predictions": [...]}`` with one parsed label per prompt.
    """
    outputs = llm_pipeline(
        batch["prompt"],
        max_new_tokens=5,
        do_sample=True,
        truncation=True,
        return_full_text=False,
        temperature=0.1,
        pad_token_id=tokenizer.eos_token_id
    )
    # Each element of `outputs` is a list of generations for one prompt; the
    # first one is parsed.
    return {"predictions": [parse_output(output[0]["generated_text"]) for output in outputs]}

# Process dataset in batches
batched_results = dataset.map(
    classify_batch,
    batched=True,
    batch_size=BATCH_SIZE,
    remove_columns=["prompt"]  # Keep only necessary columns
)

# Extract results
predictions = batched_results["predictions"]
ground_truths = batched_results["ground_truth"]

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

def calculate_metrics(ground_truth, predictions):
    """Compute accuracy and support-weighted precision, recall and F1.

    Parameters
    ----------
    ground_truth : sequence of str
        True labels.
    predictions : sequence of str
        Predicted labels, aligned with ``ground_truth``.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1)``.
    """
    ground_truth = list(ground_truth)
    predictions = list(predictions)

    # Fit the encoder on the labels from BOTH sequences. Fitting on the ground
    # truth alone makes transform() raise as soon as a predicted label does not
    # occur in the ground truth (e.g. an evaluation slice with a single class).
    label_encoder = LabelEncoder()
    label_encoder.fit(ground_truth + predictions)
    ground_truth_encoded = label_encoder.transform(ground_truth)
    predictions_encoded = label_encoder.transform(predictions)

    # Accuracy calculation
    accuracy = accuracy_score(ground_truth_encoded, predictions_encoded)

    # Precision calculation
    precision = precision_score(ground_truth_encoded, predictions_encoded, average='weighted')

    # Recall calculation
    recall = recall_score(ground_truth_encoded, predictions_encoded, average='weighted')

    # F1 Score calculation
    f1 = f1_score(ground_truth_encoded, predictions_encoded, average='weighted')

    return accuracy, precision, recall, f1


accuracy, precision, recall, f1 = calculate_metrics(ground_truths, predictions)

# Print one "<name>: <value>" line per metric.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value}")