In [1]:
import pandas as pd
import random
import numpy as np
from datetime import datetime
import pytz

In [2]:
user_df = pd.read_json("Data/user.json")
split_df = pd.read_csv("Data/split.csv")
label_df = pd.read_csv("Data/label.csv")

In [3]:
testing_id_list = split_df[split_df['split']=='test']['id']

In [4]:
user_df_label = pd.merge(user_df,label_df,on='id')

In [5]:
def classify_account_age(age):
    if age == 0:
        return "account created less than a year ago"
    elif age == 1:
        return "account is 1 year old"
    else:
        return f"account is {age} years old"

In [6]:
conditions = [
    (user_df_label['verified'] & user_df_label['protected']),            
    (user_df_label['verified'] & ~user_df_label['protected']),           
    (~user_df_label['verified'] & user_df_label['protected']),          
    (~user_df_label['verified'] & ~user_df_label['protected'])        
]

# Define corresponding values
choices = [
    'verified and protected',
    'verified but not protected',
    'protected but not verified',
    'neither verified nor protected'
]

In [7]:
current_date = datetime.now(pytz.UTC)
user_df_label['account_age'] = user_df_label['created_at'].apply(lambda x: (current_date - x).days / 365.25)
user_df_label['account_age'] = user_df_label['account_age'].apply(lambda x: np.nan if x > 19 else x)
user_df_label['account_age'] = user_df_label.groupby('label')['account_age'].transform(lambda grp: grp.fillna(grp.mean()))
user_df_label['account_age']= user_df_label['account_age'].astype(int)
user_df_label['account_age'] = user_df_label['account_age'].apply(classify_account_age)
user_df_label['followers_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('followers_count', 0))
user_df_label['following_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('following_count', 0))
user_df_label['tweet_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('tweet_count', 0))
user_df_label['listed_count'] = user_df_label['public_metrics'].apply(lambda x: x.get('listed_count', 0))
user_df_label['status'] = np.select(conditions, choices, default='Unknown')
user_df_label['has_custom_profile_image'] = user_df_label['profile_image_url'].apply(lambda x: "does not have a profile picture" if 'default_profile_images' in x else "has a profile picture")
user_df_label.drop(["protected","verified","created_at","public_metrics","entities","location","withheld","pinned_tweet_id","profile_image_url","url"],axis=1,inplace=True)

In [13]:
user_df_label = pd.merge(user_df_label,split_df,on='id')

In [17]:
user_train_df = user_df_label[user_df_label['split']=='train']

In [59]:
user_val_df = user_df_label[user_df_label['split']=='val']

In [18]:
user_train_df.shape

(700000, 13)

In [24]:
def get_user_data_prompt(feature_list):
    user_metadata_text = f"""
    The name of the user is '{feature_list['name']}' with an account username '{feature_list['username']}'. The description for the user profile is '{feature_list['description']}'. The {feature_list['account_age']} and some of the metric of the user is as follows:
    The user has {feature_list['followers_count']} followers and follows {feature_list['following_count']} accounts. 
    The user has posted {feature_list['tweet_count']} tweets and is a member of {feature_list['listed_count']} public list.
    The user account is {feature_list['status']} and it {feature_list['has_custom_profile_image']} associated with it.
    """
    return user_metadata_text

In [64]:
def prepare_finetuning_data(training_data):
    """Prepares data for fine-tuning with QLoRA."""
    data = []

    for i, row in training_data.iterrows():
        prompt_text = get_user_data_prompt(row)
        output = row['label']
 
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
                    You are a clever AI agent which can discern between genuine and fake twitter profiles.You will be provided with the accounts metadata information.
                    Please use these to classify the following twitter user as "human" or "bot". <|eot_id|>
                    
                    <|start_header_id|>user<|end_header_id|>
                    
                    ### MetaData Information:
                    {prompt_text}
                    
                    <|eot_id|>
                    
                    ### Response: <|start_header_id|>assistant<|end_header_id|>
                    {output}"""
        
        data.append({
            'text': prompt
        })
        
    return data


In [65]:
data = prepare_finetuning_data(user_train_df)

In [60]:
val_data = prepare_finetuning_data(user_val_df)

In [36]:
from transformers import AutoTokenizer, AutoModelForCausalLM

hf_token = "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"

# Replace with the model identifier
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load tokenizer and model with the token
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=hf_token, device_map="auto", torch_dtype="auto")

2024-12-15 16:12:19.283303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734307940.774302   20477 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734307941.335931   20477 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-15 16:12:23.247952: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading checkpoint shards: 100%|██████████| 4/4 [06:10<00:00, 92.50s/it] 


In [66]:
from datasets import Dataset
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True)

# Assuming you have a dataset of inputs and outputs
dataset = Dataset.from_list(data)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 700000/700000 [01:31<00:00, 7626.96 examples/s]


In [67]:
tokenized_dataset.save_to_disk("Data/tokenized_user_meta_dataset_sft")

Saving the dataset (4/4 shards): 100%|██████████| 700000/700000 [00:01<00:00, 599393.38 examples/s]


In [61]:
val_dataset = Dataset.from_list(val_data)
val_tokenized_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 200000/200000 [00:38<00:00, 5183.93 examples/s]


In [62]:
tokenized_dataset.save_to_disk("Data/tokenized_user_meta_dataset_sft_val_set")

Saving the dataset (5/5 shards): 100%|██████████| 700000/700000 [00:02<00:00, 270641.39 examples/s]


In [70]:
parquet_file_path = 'Data/Tweet_SingleFile_Split/tweets_train_0.parquet'

tweet_df_train = pd.read_parquet(parquet_file_path)

print(tweet_df_train.shape)

(8162939, 4)


In [71]:
sampled_tweet_df_train = pd.DataFrame(tweet_df_train.sample(n=700000))

In [74]:
def prepare_finetuning_tweet_data(training_data):
    """Prepares data for fine-tuning with QLoRA."""
    data = []

    for i, row in training_data.iterrows():
        text = row['text']
        output = row['label']
 
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
                    You are a clever AI agent which can discern between genuine and fake twitter profiles.You will be provided with the tweet posted by the user.
                    Please use these to classify the following twitter user as "human" or "bot". <|eot_id|>
                    
                    <|start_header_id|>user<|end_header_id|>
                    
                    ### Tweet:
                    {text}
                    
                    <|eot_id|>
                    
                    ### Response: <|start_header_id|>assistant<|end_header_id|>
                    {output}"""
        
        data.append({
            'text': prompt
        })
        
    return data


In [75]:
tweet_data_sft = prepare_finetuning_tweet_data(sampled_tweet_df_train)

In [80]:
tweet_dataset = Dataset.from_list(data)
tokenized_tweet_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 700000/700000 [01:28<00:00, 7934.65 examples/s]


In [81]:
tokenized_tweet_dataset.save_to_disk("Data/tokenized_user_tweet_dataset_sft")

Saving the dataset (4/4 shards): 100%|██████████| 700000/700000 [00:01<00:00, 443237.71 examples/s]
