# Prerequisites

## Install and Import

In [28]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install imbalanced-learn
!pip install emoji
!pip install transformers torch

import pandas as pd
import os
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from pandas import json_normalize
import json
import numpy as np
import emoji
from transformers import pipeline
from tqdm.notebook import tqdm

features = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
            'username_numeric', 'username_special', 'username_length', 'username_se',
            'screenname', 'screenname_uppercase', 'screenname_lowercase',
            'screenname_numeric', 'screenname_special', 'screenname_length',
            'screenname_se', 'screenname_emoji', 'screenname_hashtag',
            'screenname_word', 'description', 'description_length',
            'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
            'user_md_total_post', 'user_md_total_like', 'user_md_verified',
            'user_md_protected', 'post_md_like_mean',
            'post_md_like_std', 'post_md_retweet_mean', 'post_md_retweet_std',
            'post_md_reply_mean', 'post_md_reply_std', 'post_md_quote_mean',
            'post_md_quote_std', 'post_text_length_mean', 'post_text_length_std',
            'post_sentiment_score_mean', 'post_sentiment_score_std',
            'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
            'post_sentiment_numeric_prop_positive',
            'post_sentiment_numeric_prop_negative']

dataset_columns = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
                  'username_numeric', 'username_special', 'username_length', 'username_se',
                  'screenname', 'screenname_uppercase', 'screenname_lowercase',
                  'screenname_numeric', 'screenname_special', 'screenname_length',
                  'screenname_se', 'screenname_emoji', 'screenname_hashtag',
                  'screenname_word', 'description', 'description_length',
                  'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                  'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                  'user_md_protected', 'post_md_like_mean',
                  'post_md_like_std', 'post_md_retweet_mean', 'post_md_retweet_std',
                  'post_md_reply_mean', 'post_md_reply_std', 'post_md_quote_mean',
                  'post_md_quote_std', 'post_text_length_mean', 'post_text_length_std',
                  'post_sentiment_score_mean', 'post_sentiment_score_std',
                  'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                  'post_sentiment_numeric_prop_positive',
                  'post_sentiment_numeric_prop_negative', 'label']

feature_sets = {
    'username': ['username_uppercase', 'username_lowercase', 'username_numeric',
                 'username_special', 'username_length', 'username_se'],  # Add all username features
    'screenname': ['screenname_uppercase', 'screenname_lowercase',
                   'screenname_numeric', 'screenname_special', 'screenname_length',
                   'screenname_se',],  # Add all screenname features
    'description': ['description_length'],  # Add all description features
    'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                   'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                   'user_md_protected'],  # Add user metadata features
    'post_metadata': ['post_md_like_mean', 'post_md_like_std', 'post_md_retweet_mean',
                      'post_md_retweet_std', 'post_md_reply_mean', 'post_md_reply_std',
                      'post_md_quote_mean', 'post_md_quote_std'],  # Add post metadata features
    'post_text': ['post_text_length_mean', 'post_text_length_std', 'post_sentiment_score_mean',
                  'post_sentiment_score_std', 'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                  'post_sentiment_numeric_prop_positive', 'post_sentiment_numeric_prop_negative']  # Add post text features (like BERT embeddings)
}



# Preprocessing

## Feature Engineering

### Load Data

In [None]:
train_data = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_accounts.csv')
train_data_text = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_posts.csv')

In [None]:
train_data_text = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_UPDATED.csv')

user_id              object
username             object
post_md_like        float64
post_md_retweet     float64
post_md_reply       float64
post_md_quote       float64
post_text            object
post_text_length    float64
user_exists            bool
dtype: object


  train_data_text = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_UPDATED.csv')


### Text Data Sentiment Analysis

In [None]:
# Load the sentiment-analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

# Ensure no NaN values and strip whitespace
train_data_text['post_text'] = train_data_text['post_text'].fillna('').str.strip()

# Use tqdm to apply sentiment analysis with a progress bar
tqdm.pandas()  # Initialize tqdm for pandas

# Analyze sentiment for each post with progress tracking
sentiment_results = train_data_text['post_text'].progress_apply(sentiment_pipeline)

# Extract sentiment score
train_data_text['post_sentiment_score'] = sentiment_results.apply(lambda x: x[0]['score'])

# Map sentiment labels to numerical values
sentiment_mapping = {
    'POSITIVE': 1,
    'NEGATIVE': -1,
    'NEUTRAL': 0
}
train_data_text['post_sentiment_numeric'] = sentiment_results.apply(lambda x: sentiment_mapping.get(x[0]['label'], 0))

# # Select only the specified columns for the final dataset
# train_data_text = train_data_text[text_dataset_columns]

# Save the DataFrame to CSV
train_data_text.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_SENTIMENT.csv', index=False)


### Fill missing sentiment features

In [None]:
train_data_text_updated = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_posts_UPDATED.csv')

train_data_text_updated = train_data_text_updated.sort_values(by=['user_id'])

for index, row in train_data_text_updated.iterrows():
  if pd.isna(row['post_text']):
    train_data_text_updated.loc[index, ['post_sentiment_score', 'post_sentiment_numeric']] = [np.nan, np.nan]



train_data_text_updated.to_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_posts_UPDATED_2.csv', index=False)


### Aggregate Text Features

In [171]:
train_data_text_updated = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_posts_UPDATED_2.csv')

In [172]:
import pandas as pd
import numpy as np

# Sample post data DataFrame for demonstration
# Assuming your post data is loaded into a DataFrame named 'train_data_text_updated'

def safe_mean(series):
    # Return NaN if the series is empty
    if series.empty or series.isnull().all():
        return np.nan

    # Replace negative values with 0
    valid_values = series.where(series >= 0, 0)

    return valid_values.mean()  # Return the mean of the modified values

# Function to safely compute standard deviation, returning 0 if only one valid value exists
def safe_std(series):
    # Return NaN if the series is empty
    if series.empty or series.isnull().all():
        return np.nan

    # Replace negative values with 0
    valid_values = series.where(series >= 0, 0)

    if len(valid_values) <= 1:  # If there's only one value or none
        return 0.0  # Return 0 for std deviation in such cases

    return valid_values.std(ddof=0)  # Population std deviation
def safe_std_sentiment(series):
    if series.empty or series.isnull().all():
        return np.nan

    if len(series) <= 1:  # If there's only one value or none
        return 0.0  # Return 0 for std deviation in such cases

    return series.std(ddof=0)  # Population std deviation



# Aggregation functions including proportion of positive and negative sentiment
aggregation_functions = {
    'post_md_like': [safe_mean, safe_std],
    'post_md_retweet': [safe_mean, safe_std],
    'post_md_reply': [safe_mean, safe_std],
    'post_md_quote': [safe_mean, safe_std],
    'post_text_length': [safe_mean, safe_std],
    'post_sentiment_score': ['mean', safe_std_sentiment],
    'post_sentiment_numeric': ['mean', safe_std_sentiment,
        ('prop_positive', lambda x: (x == 1).mean() if x.notna().any() else np.nan),
        ('prop_negative', lambda x: (x == -1).mean() if x.notna().any() else np.nan)
    ]
}

# Group by user_id and aggregate using the functions
aggregated_df = train_data_text_updated.groupby('user_id').agg(aggregation_functions)

# Custom column renaming to handle function names and aggregation names
new_columns = []
for col in aggregated_df.columns:
    feature_name = col[0]
    aggregation_name = col[1]

    if callable(aggregation_name):  # If the second element is a function, name it based on the function
        aggregation_name = aggregation_name.__name__

    new_columns.append(f"{feature_name}_{aggregation_name}")

# Apply the new column names
aggregated_df.columns = new_columns

# Reset index to make user_id a column
aggregated_df.reset_index(inplace=True)

aggregated_df = aggregated_df.rename(columns={'post_md_like_safe_mean': 'post_md_like_mean',
                                              'post_md_like_safe_std': 'post_md_like_std',
                                              'post_md_retweet_safe_mean': 'post_md_retweet_mean',
                                              'post_md_retweet_safe_std': 'post_md_retweet_std',
                                              'post_md_reply_safe_mean': 'post_md_reply_mean',
                                              'post_md_reply_safe_std': 'post_md_reply_std',
                                              'post_md_quote_safe_mean': 'post_md_quote_mean',
                                              'post_md_quote_safe_std': 'post_md_quote_std',
                                              'post_text_length_safe_mean': 'post_text_length_mean',
                                              'post_text_length_safe_std': 'post_text_length_std',
                                              'post_sentiment_score_safe_std_sentiment': 'post_sentiment_score_std',
                                              'post_sentiment_numeric_safe_std_sentiment': 'post_sentiment_numeric_std'})

# Display the aggregated DataFrame
print(aggregated_df)

# Save the aggregated DataFrame to CSV
aggregated_df.to_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_posts_AGGREGATED.csv', index=False)


                                   user_id  post_md_like_mean  \
0     00362980-EF23-4D88-8A7D-DDFE75922846              0.400   
1     00381E27-59ED-4DD2-8F37-79029E835B7B           2673.875   
2     00C51755-857E-45DF-AE24-0824CF3B8B78              2.200   
3     01BDB544-B668-4B52-B3DA-11E7155BECCA            205.700   
4     0277D521-A9F8-48EC-B7F6-C628E7A045DF              1.300   
...                                    ...                ...   
1326  FEA4C9DF-0575-4647-8D79-E1DCC82E9896              1.000   
1327  FEAB21DC-8952-4B16-8EAE-469B9D1E6A52          41293.000   
1328  FF22A7B6-7E8A-4316-8F73-54E1A3EF87B8              0.000   
1329  FF9EA681-F708-473E-9D9D-0E0E871C881C             11.900   
1330  FFBF34E1-2AC2-4174-A0ED-63728C395F97              2.000   

      post_md_like_std  post_md_retweet_mean  post_md_retweet_std  \
0             1.356466              0.000000             0.000000   
1          6385.836680             56.500000           158.168028   
2           

### User Data

In [None]:
train_data['username'] = train_data['username'].str.strip()
train_data['screenname'] = train_data['screenname'].str.strip()
train_data['description'] = train_data['description'].str.strip()


dataset_columns = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
                   'username_numeric', 'username_special', 'username_length', 'username_se',
                   'screenname', 'screenname_uppercase', 'screenname_lowercase',
                   'screenname_numeric', 'screenname_special', 'screenname_length',
                   'screenname_se', 'screenname_emoji', 'screenname_hashtag',
                   'screenname_word', 'description', 'description_length',
                   'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                   'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                   'user_md_protected', 'label']


# Function to calculate string entropy
def calculate_entropy(s):
    if pd.isna(s):  # Handle null values
        return pd.NA

    # Strip leading and trailing whitespace
    s = s.strip()

    # Check if the stripped string is empty after removing whitespace
    if pd.isna(s):  # If the string is empty, return pd.NA
        return pd.NA

    # Calculate frequency of each character
    freq = {}
    for char in s:
        freq[char] = freq.get(char, 0) + 1

    # Calculate probability of each character
    probabilities = [count / len(s) for count in freq.values()]

    # Calculate entropy
    entropy = -sum(p * np.log2(p) for p in probabilities if p > 0)
    return entropy

# Function to convert to boolean or mapped float value
def to_boolean(value):
    if pd.isna(value):  # Handle null values
        return float('nan')  # Return NaN for missing values
    if isinstance(value, bool):
        return 1.0 if value else 0.0  # Map True to 1.0, False to 2.0

    if isinstance(value, str):
        value = value.strip()

    if value in [1, "1", "yes", "True", True]:  # Check for truthy values
        return 1.0  # True -> 1.0
    if value in [0, "0", "no", "False", False]:  # Check for falsy values
        return 0.0  # False -> 0.0
    return float('nan')  # Return NaN for unexpected values

# Username Features
train_data['username_uppercase'] = train_data['username'].apply(lambda x: sum(1 for c in x if c.isupper()) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['username_lowercase'] = train_data['username'].apply(lambda x: sum(1 for c in x if c.islower()) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['username_numeric'] = train_data['username'].apply(lambda x: sum(1 for c in x if c.isdigit()) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['username_special'] = train_data['username'].apply(lambda x: sum(1 for c in x if not c.isalnum()) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['username_length'] = train_data['username'].apply(lambda x: len(x) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['username_se'] = train_data['username'].apply(calculate_entropy).astype('Float64') # Change to Float64


# Screenname Features
train_data['screenname_uppercase'] = train_data['screenname'].apply(lambda x: sum(1 for c in x if c.isupper()) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['screenname_lowercase'] = train_data['screenname'].apply(lambda x: sum(1 for c in x if c.islower()) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['screenname_numeric'] = train_data['screenname'].apply(lambda x: sum(1 for c in x if c.isdigit()) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['screenname_special'] = train_data['screenname'].apply(lambda x: sum(1 for c in x if not c.isalnum()) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['screenname_length'] = train_data['screenname'].apply(lambda x: len(x) if pd.notnull(x) else pd.NA).astype('Int64')  # Change to Int64
train_data['screenname_se'] = train_data['screenname'].apply(calculate_entropy).astype('Float64') # Change to Float64
train_data['screenname_emoji'] = train_data['screenname'].apply(
    lambda x: sum(1 for c in x if c in emoji.EMOJI_DATA) if pd.notnull(x) else pd.NA).astype('Int64') # Change to Int64


# Description Features
train_data['description_length'] = train_data['description'].apply(lambda x: len(x) if pd.notnull(x) else pd.NA).astype('Int64') # Change to Int64

# User Metadata Features
train_data['user_md_protected'] = train_data['user_md_protected'] = train_data['user_md_protected'].map(to_boolean)

# print(train_data)

train_data = train_data[dataset_columns]

# Save the DataFrame to CSV
train_data.to_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_accounts_UPDATED.csv', index=False)

### Aggregate user and post features

In [173]:
user_train_data_labeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_accounts_UPDATED.csv')
post_train_data_labeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_posts_AGGREGATED.csv')

In [174]:
dataset_columns_aggregated = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
                    'username_numeric', 'username_special', 'username_length', 'username_se',
                    'screenname', 'screenname_uppercase', 'screenname_lowercase',
                    'screenname_numeric', 'screenname_special', 'screenname_length',
                    'screenname_se', 'screenname_emoji', 'screenname_hashtag',
                    'screenname_word', 'description', 'description_length',
                    'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                    'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                    'user_md_protected', 'post_md_like_mean', 'post_md_like_std',
                    'post_md_retweet_mean', 'post_md_retweet_std', 'post_md_reply_mean',
                    'post_md_reply_std', 'post_md_quote_mean', 'post_md_quote_std',
                    'post_text_length_mean', 'post_text_length_std',
                    'post_sentiment_score_mean', 'post_sentiment_score_std',
                    'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                    'post_sentiment_numeric_prop_positive',
                    'post_sentiment_numeric_prop_negative', 'label']

# Ensure both DataFrames have 'user_id' as the key for merging
combined_df = pd.merge(user_train_data_labeled, post_train_data_labeled, on='user_id', how='left')

combined_df = combined_df[dataset_columns_aggregated]

# Display the combined DataFrame
print(combined_df)

combined_df.to_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_accounts_AGGREGATED.csv', index=False)

                  user_id         username  username_uppercase  \
0               112805276  FrancoLostaunau                 2.0   
1              1050006840      joel_archie                 0.0   
2                 1140451          Anthony                 1.0   
3               219617448     SushmaSwaraj                 2.0   
4     1296330070177599488      ben76821215                 0.0   
...                   ...              ...                 ...   
3235                  NaN              NaN                 NaN   
3236                  NaN              NaN                 NaN   
3237                  NaN              NaN                 NaN   
3238                  NaN              NaN                 NaN   
3239                  NaN              NaN                 NaN   

      username_lowercase  username_numeric  username_special  username_length  \
0                   13.0               0.0               0.0             15.0   
1                   10.0               0.0   

### Unlabeled TikTok data feature engineering

In [None]:
tiktok_data = pd.read_excel('./drive/MyDrive/Datasets/2-Train_2/2020 TikTok Bots and Clickworker/CSV/accounts.xlsx')

tiktok_unlabeled = tiktok_data[tiktok_data['IsABot'].isna()]

print (tiktok_unlabeled)

                                         Id  IsAccountPrivate  IsVerified  \
1      9A60337F-64D3-468F-A20F-0017CD58C59A              True       False   
4      4026CA43-1F0D-42C7-AA7F-001E55ECB80A             False       False   
7      0CA73C78-D1DF-4508-86CC-0044299A2560             False       False   
8      CC93AC1E-C388-4FCC-8742-0046662A0B01             False       False   
9      763D9B3F-55C4-44C2-87FB-004BF1A97D40              True       False   
...                                     ...               ...         ...   
10114  F6582805-A196-436D-AF73-FFD82A8F1A6E             False       False   
10115  45A93F86-D044-4DC0-8127-FFD91AC7810C             False       False   
10116  B57EC2B8-F088-40FA-9D78-FFDDBB506C68             False       False   
10117  604FA15E-5515-4E4C-9A8D-FFEC1BE5A751             False       False   
10120  C6020ECB-91AF-4E84-8F5A-FFFAEFC433FB             False       False   

      HasProfilePicture  AnzahlFolgeIch  AnzahlFollower AnzahlLikes  \
1   

In [None]:
import pandas as pd

# Select the required TikTok features
tiktok_features = ['Id', 'VerhaeltnisFolgeIchProFollower', 'AnzahlPosts',
                   'NumberOfLikedVideos', 'IsVerified', 'IsAccountPrivate']

# Filter only rows whose 'Id' (user_id) contains letters
tiktok_data_with_letters = tiktok_unlabeled[tiktok_unlabeled['Id'].str.contains('[A-Za-z]', na=False)]

# Select the required columns
tiktok_data_selected = tiktok_data_with_letters[tiktok_features]

# Randomly sample 7,140 records
sampled_tiktok_data = tiktok_data_selected.sample(n=7100, random_state=42)

# Define the new feature names
features = ['user_id', 'user_md_follow_ratio', 'user_md_total_post',
            'user_md_total_like', 'user_md_verified', 'user_md_protected']

# Create a dictionary to map the TikTok columns to your desired feature names
rename_mapping = {
    'Id': 'user_id',
    'VerhaeltnisFolgeIchProFollower': 'user_md_follow_ratio',
    'AnzahlPosts': 'user_md_total_post',
    'NumberOfLikedVideos': 'user_md_total_like',
    'IsVerified': 'user_md_verified',
    'IsAccountPrivate': 'user_md_protected'
}

# Rename the columns
sampled_tiktok_data = sampled_tiktok_data.rename(columns=rename_mapping)

# Display the first few rows to verify
print(sampled_tiktok_data)

print (sampled_tiktok_data.dtypes)

# sampled_tiktok_data.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/tiktok_unlabeled_7140.csv', index=False)

                                   user_id  user_md_follow_ratio  \
3916  10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4             35.333333   
4898  5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC             52.250000   
9185  83C07A5E-0F5E-4519-A1FF-E87CEFC905D3             14.000000   
4459  CF33854C-298F-4C7F-AB77-6F7631B59FC8             17.500000   
8322  BB1A148A-380B-411C-B053-D3256C5BEF4C             31.727273   
...                                    ...                   ...   
187   B17C1427-0C93-4BB1-B0BD-0523B0161B79              6.222222   
4157  58E86FB3-27E8-4673-BC8B-673C02C8F9EF              2.947368   
4528  72BF16D0-2D7D-4659-8A17-712C18786C79             17.166667   
4908  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             35.090909   
8839  16DEF46E-65AD-4CB8-B144-DFE751386965             80.645161   

     user_md_total_post  user_md_total_like  user_md_verified  \
3916                 20                 0.0             False   
4898                  0                 0.0          

In [None]:
import pandas as pd

# Convert the 'user_md_total_post' column to numeric, forcing invalid values to NaN
sampled_tiktok_data['user_md_total_post'] = pd.to_numeric(sampled_tiktok_data['user_md_total_post'], errors='coerce')

# Explicitly cast to float (even if it could be integers)
sampled_tiktok_data['user_md_total_post'] = sampled_tiktok_data['user_md_total_post'].astype('float64')

# Confirm the conversion
print(sampled_tiktok_data.dtypes)

# Display the first few rows to verify
print(sampled_tiktok_data[['user_id', 'user_md_total_post']].head())


user_id                  object
user_md_follow_ratio    float64
user_md_total_post      float64
user_md_total_like      float64
user_md_verified           bool
user_md_protected          bool
dtype: object
                                   user_id  user_md_total_post
3916  10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4                20.0
4898  5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC                 0.0
9185  83C07A5E-0F5E-4519-A1FF-E87CEFC905D3                 8.0
4459  CF33854C-298F-4C7F-AB77-6F7631B59FC8                 2.0
8322  BB1A148A-380B-411C-B053-D3256C5BEF4C                 0.0


In [None]:
user_columns_unlabeled = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
                    'username_numeric', 'username_special', 'username_length', 'username_se',
                    'screenname', 'screenname_uppercase', 'screenname_lowercase',
                    'screenname_numeric', 'screenname_special', 'screenname_length',
                    'screenname_se', 'screenname_emoji', 'screenname_hashtag',
                    'screenname_word', 'description', 'description_length',
                    'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                    'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                    'user_md_protected']

post_columns_unlabeled = ['post_md_like_mean', 'post_md_like_std',
                    'post_md_retweet_mean', 'post_md_retweet_std', 'post_md_reply_mean',
                    'post_md_reply_std', 'post_md_quote_mean', 'post_md_quote_std',
                    'post_text_length_mean', 'post_text_length_std',
                    'post_sentiment_score_mean', 'post_sentiment_score_std',
                    'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                    'post_sentiment_numeric_prop_positive',
                    'post_sentiment_numeric_prop_negative']


# Function to calculate string entropy
def calculate_entropy(s):
    if pd.isna(s):  # Handle null values
        return pd.NA

    # Strip leading and trailing whitespace
    s = s.strip()

    # Check if the stripped string is empty after removing whitespace
    if pd.isna(s):  # If the string is empty, return pd.NA
        return pd.NA

    # Calculate frequency of each character
    freq = {}
    for char in s:
        freq[char] = freq.get(char, 0) + 1

    # Calculate probability of each character
    probabilities = [count / len(s) for count in freq.values()]

    # Calculate entropy
    entropy = -sum(p * np.log2(p) for p in probabilities if p > 0)
    return entropy

# Function to convert to boolean or mapped float value
def to_boolean(value):
    if pd.isna(value):  # Handle null values
        return float('nan')  # Return NaN for missing values
    if isinstance(value, bool):
        return 1.0 if value else 0.0  # Map True to 1.0, False to 2.0

    if isinstance(value, str):
        value = value.strip()

    if value in [1, "1", "yes", "True", True]:  # Check for truthy values
        return 1.0  # True -> 1.0
    if value in [0, "0", "no", "False", False]:  # Check for falsy values
        return 0.0  # False -> 0.0
    return float('nan')  # Return NaN for unexpected values

# User Metadata Features
sampled_tiktok_data['user_md_verified'] = sampled_tiktok_data['user_md_verified'] = sampled_tiktok_data['user_md_verified'].map(to_boolean)
sampled_tiktok_data['user_md_protected'] = sampled_tiktok_data['user_md_protected'] = sampled_tiktok_data['user_md_protected'].map(to_boolean)

for col in user_columns_unlabeled:
    if col not in sampled_tiktok_data.columns:
        sampled_tiktok_data[col] = np.nan  # Add missing columns and fill with NaN

sampled_tiktok_data = sampled_tiktok_data[user_columns_unlabeled]

print (sampled_tiktok_data)

print (sampled_tiktok_data.dtypes)

                                   user_id  username  username_uppercase  \
3916  10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN                 NaN   
4898  5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC       NaN                 NaN   
9185  83C07A5E-0F5E-4519-A1FF-E87CEFC905D3       NaN                 NaN   
4459  CF33854C-298F-4C7F-AB77-6F7631B59FC8       NaN                 NaN   
8322  BB1A148A-380B-411C-B053-D3256C5BEF4C       NaN                 NaN   
...                                    ...       ...                 ...   
187   B17C1427-0C93-4BB1-B0BD-0523B0161B79       NaN                 NaN   
4157  58E86FB3-27E8-4673-BC8B-673C02C8F9EF       NaN                 NaN   
4528  72BF16D0-2D7D-4659-8A17-712C18786C79       NaN                 NaN   
4908  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN                 NaN   
8839  16DEF46E-65AD-4CB8-B144-DFE751386965       NaN                 NaN   

      username_lowercase  username_numeric  username_special  username_length  \
3916  

In [None]:

# Load the TikTok posts data from the Excel file
tiktok_posts = pd.read_excel('./drive/MyDrive/Datasets/2-Train_2/2020 TikTok Bots and Clickworker/CSV/videos.xlsx')

# Specify the columns you want to extract from tiktok_posts
post_columns = ['AccountId', 'NumberOfLikes', 'NumberOfComments', 'VideoDescriptionLength']

# Filter to retain only the relevant columns
tiktok_posts_filtered = tiktok_posts[post_columns]

# Create an empty DataFrame to store the extracted data
extracted_post_data = pd.DataFrame()

# Iterate over each user_id in the sampled_tiktok_data
for user_id in sampled_tiktok_data['user_id']:
    # Filter the post data for the current user_id
    user_post_data = tiktok_posts_filtered[tiktok_posts_filtered['AccountId'] == user_id]

    # Limit to a maximum of 20 rows per user_id
    user_post_data_limited = user_post_data.head(20)

    # Append the limited data to the extracted_post_data DataFrame
    extracted_post_data = pd.concat([extracted_post_data, user_post_data_limited])

# Display the extracted post data for verification
print(extracted_post_data)

                                  AccountId  NumberOfLikes NumberOfComments  \
348    10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            3.0                0   
2480   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            0.0                0   
4292   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            2.0                0   
5117   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            2.0                0   
6828   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            1.0                0   
...                                     ...            ...              ...   
49390  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            3.0                0   
50489  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            1.0                0   
61918  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            2.0                0   
62203  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            0.0                0   
68168  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            1.0                0   

       VideoDescriptionLength  
348                

In [None]:
extracted_post_data = extracted_post_data.rename(columns={'AccountId': 'user_id',
                                                          'NumberOfLikes': 'post_md_like',
                                                          'NumberOfComments': 'post_md_reply',
                                                          'VideoDescriptionLength': 'post_text_length'})

print (extracted_post_data)

                                    user_id  post_md_like post_md_reply  \
348    10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           3.0             0   
2480   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           0.0             0   
4292   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           2.0             0   
5117   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           2.0             0   
6828   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           1.0             0   
...                                     ...           ...           ...   
49390  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           3.0             0   
50489  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           1.0             0   
61918  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           2.0             0   
62203  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           0.0             0   
68168  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           1.0             0   

       post_text_length  
348                 0.0  
2480                0.0  
4292                0

In [None]:
unlabeled_posts = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts.csv')

In [None]:
for col in unlabeled_posts.columns:
    if col not in extracted_post_data.columns:
        extracted_post_data[col] = np.nan  # Add missing columns and fill with NaN

extracted_post_data = extracted_post_data[unlabeled_posts.columns]

print (extracted_post_data)

                                    user_id  username  post_md_like  \
348    10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           3.0   
2480   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           0.0   
4292   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           2.0   
5117   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           2.0   
6828   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           1.0   
...                                     ...       ...           ...   
49390  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           3.0   
50489  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           1.0   
61918  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           2.0   
62203  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           0.0   
68168  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           1.0   

       post_md_retweet post_md_reply  post_md_quote  post_text  \
348                NaN             0            NaN        NaN   
2480           

In [None]:
combined_unlabeled_posts = pd.concat([unlabeled_posts, extracted_post_data], ignore_index=True)

print (combined_unlabeled_posts)

combined_unlabeled_posts.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_UPDATED.csv', index=False)

                                     user_id        username  post_md_like  \
0                                 u234450632  REALISE_innOV8           0.0   
1                       u1465728402372059144   DAIRInstitute           0.0   
2                                u2822124414    jaredliangtw           0.0   
3                                u2315526308    nimmirastogi           0.0   
4                                 u330562292   ubergasmonkey           1.0   
...                                      ...             ...           ...   
147156  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           3.0   
147157  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   
147158  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           2.0   
147159  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           0.0   
147160  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   

        post_md_retweet post_md_reply  post_md_quote  \
0      

In [None]:
import pandas as pd

# Step 1: Remove last 40 rows from the first 7140 rows and first 40 rows from the last 7140 rows
# Remove the last 40 rows from the first subset (up to row 7139)
removed_first_subset = unlabeled_users.iloc[7100:7140]

# Remove the first 40 rows from the last subset (starting from the last 7140 rows)
removed_last_subset = unlabeled_users.iloc[-7140:-7100]

# Step 2: Combine the removed rows into a separate DataFrame
removed_accounts = pd.concat([removed_first_subset, removed_last_subset], ignore_index=True)

# Display the removed accounts
print("Removed Accounts:")
print(removed_accounts)

unlabeled_users_filtered = unlabeled_users.drop(removed_first_subset.index).drop(removed_last_subset.index)

print (unlabeled_users_filtered)

Removed Accounts:
                 user_id        username  username_uppercase  \
0                    NaN             NaN                 NaN   
1                    NaN             NaN                 NaN   
2                    NaN             NaN                 NaN   
3                    NaN             NaN                 NaN   
4                    NaN             NaN                 NaN   
..                   ...             ...                 ...   
75  u1484652566885580801        yciejxla                 0.0   
76  u1124604567663120385   NanotechEvent                 2.0   
77            u392930821  GiorgioCafiero                 2.0   
78            u460107942     CustomPCMag                 4.0   
79   u827046914285830144  emanuelewolves                 0.0   

    username_lowercase  username_numeric  username_special  username_length  \
0                  NaN               NaN               NaN              NaN   
1                  NaN               NaN               

In [None]:
# Step 3: Get the unique user IDs of the removed accounts
removed_user_ids = removed_accounts['user_id'].unique()

# Step 4: Filter the 'unlabeled_posts' to remove posts related to the removed accounts
filtered_unlabeled_posts = combined_unlabeled_posts[~combined_unlabeled_posts['user_id'].isin(removed_user_ids)]

# Display the filtered posts
print("Filtered Unlabeled Posts:")
print(filtered_unlabeled_posts)

Filtered Unlabeled Posts:
                                     user_id        username  post_md_like  \
0                                 u234450632  REALISE_innOV8           0.0   
1                       u1465728402372059144   DAIRInstitute           0.0   
2                                u2822124414    jaredliangtw           0.0   
3                                u2315526308    nimmirastogi           0.0   
4                                 u330562292   ubergasmonkey           1.0   
...                                      ...             ...           ...   
147156  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           3.0   
147157  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   
147158  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           2.0   
147159  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           0.0   
147160  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   

        post_md_retweet post_md_reply

In [None]:
# Combine the sampled TikTok data with the filtered unlabeled accounts
combined_unlabeled_data = pd.concat([sampled_tiktok_data, unlabeled_users_filtered], ignore_index=True)

# Display the combined data
print("Combined Data:")
print(combined_unlabeled_data)

# Optionally, save the combined data to a CSV file
# combined_data.to_csv('combined_unlabeled_accounts.csv', index=False)


Combined Data:
                                    user_id        username  \
0      10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4             NaN   
1      5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC             NaN   
2      83C07A5E-0F5E-4519-A1FF-E87CEFC905D3             NaN   
3      CF33854C-298F-4C7F-AB77-6F7631B59FC8             NaN   
4      BB1A148A-380B-411C-B053-D3256C5BEF4C             NaN   
...                                     ...             ...   
21295                   u958807016520192008   HackYFutureBE   
21296                  u1486053324554227715   Empirerising4   
21297                  u1145483306563244032  _sheenavasquez   
21298                  u1265061521597648897      Ieomessiok   
21299                           u1010593116      SharkawyMD   

       username_uppercase  username_lowercase  username_numeric  \
0                     NaN                 NaN               NaN   
1                     NaN                 NaN               NaN   
2                     NaN  

In [None]:
# Check if user_id in filtered_unlabeled_posts exists in combined_unlabeled_data
filtered_unlabeled_posts['user_exists'] = filtered_unlabeled_posts['user_id'].isin(combined_unlabeled_data['user_id'])

# Filter rows where user_id exists in combined_unlabeled_data
posts_with_existing_users = filtered_unlabeled_posts[filtered_unlabeled_posts['user_exists']]

# Optionally, filter rows where user_id doesn't exist in combined_unlabeled_data (for validation or further actions)
posts_without_existing_users = filtered_unlabeled_posts[~filtered_unlabeled_posts['user_exists']]

# Display the filtered posts with existing users
print("Posts without existing users:")
print(posts_without_existing_users)

# Optionally, save the result to CSV
# posts_with_existing_users.to_csv('posts_with_existing_users.csv', index=False)


Posts without existing users:
                           user_id username  post_md_like  post_md_retweet  \
57366  to reserve at least 40% of…      140           NaN              NaN   

      post_md_reply  post_md_quote post_text  post_text_length  user_exists  
57366           NaN            NaN       NaN               NaN        False  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_unlabeled_posts['user_exists'] = filtered_unlabeled_posts['user_id'].isin(combined_unlabeled_data['user_id'])


In [None]:
# Filter and keep only rows where user_id exists in combined_unlabeled_data
filtered_unlabeled_posts_existing = filtered_unlabeled_posts[filtered_unlabeled_posts['user_id'].isin(combined_unlabeled_data['user_id'])]

# Save the filtered posts without the missing users if needed
# filtered_unlabeled_posts_existing.to_csv('filtered_unlabeled_posts_existing.csv', index=False)

# Overwrite the original DataFrame if desired
filtered_unlabeled_posts = filtered_unlabeled_posts_existing

print (filtered_unlabeled_posts)

filtered_unlabeled_posts.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_UPDATED.csv')

combined_unlabeled_data.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_accounts_UPDATED.csv')

                                     user_id        username  post_md_like  \
0                                 u234450632  REALISE_innOV8           0.0   
1                       u1465728402372059144   DAIRInstitute           0.0   
2                                u2822124414    jaredliangtw           0.0   
3                                u2315526308    nimmirastogi           0.0   
4                                 u330562292   ubergasmonkey           1.0   
...                                      ...             ...           ...   
147156  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           3.0   
147157  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   
147158  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           2.0   
147159  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           0.0   
147160  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   

        post_md_retweet post_md_reply  post_md_quote  \
0      

In [None]:
print (combined_unlabeled_data)

                                    user_id        username  \
0      10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4             NaN   
1      5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC             NaN   
2      83C07A5E-0F5E-4519-A1FF-E87CEFC905D3             NaN   
3      CF33854C-298F-4C7F-AB77-6F7631B59FC8             NaN   
4      BB1A148A-380B-411C-B053-D3256C5BEF4C             NaN   
...                                     ...             ...   
21295                   u958807016520192008   HackYFutureBE   
21296                  u1486053324554227715   Empirerising4   
21297                  u1145483306563244032  _sheenavasquez   
21298                  u1265061521597648897      Ieomessiok   
21299                           u1010593116      SharkawyMD   

       username_uppercase  username_lowercase  username_numeric  \
0                     NaN                 NaN               NaN   
1                     NaN                 NaN               NaN   
2                     NaN                 

In [None]:
unlabeled_post_sentiment = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_SENTIMENT.csv')

print (unlabeled_post_sentiment.dtypes)

user_id                    object
username                   object
post_md_like              float64
post_md_retweet           float64
post_md_reply             float64
post_md_quote             float64
post_text                  object
post_text_length          float64
user_exists                  bool
post_sentiment_score      float64
post_sentiment_numeric      int64
dtype: object


  unlabeled_post_sentiment = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_SENTIMENT.csv')


In [None]:
unlabeled_post_sentiment = unlabeled_post_sentiment.sort_values(by=['user_id'])

for index, row in unlabeled_post_sentiment.iterrows():
  if pd.isna(row['post_text']):
    unlabeled_post_sentiment.loc[index, ['post_sentiment_score', 'post_sentiment_numeric']] = [np.nan, np.nan]



unlabeled_post_sentiment.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_SENTIMENT.csv', index=False)


In [None]:
# unlabeled_post_sentiment = unlabeled_post_sentiment.drop(columns=['user_exists'])
unlabeled_post_sentiment.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_SENTIMENT.csv', index=False)

In [None]:
print (unlabeled_post_sentiment)

                                     user_id        username  post_md_like  \
145514  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN           2.0   
145516  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN           5.0   
145515  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN           2.0   
145519  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN           0.0   
145513  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN          11.0   
...                                      ...             ...           ...   
71485                    u999492441513148416  timevalueofbtc           0.0   
71054                    u999492441513148416  timevalueofbtc           0.0   
48037                    u999492441513148416  timevalueofbtc           0.0   
62112                    u999492441513148416  timevalueofbtc          14.0   
56582                    u999492441513148416  timevalueofbtc          51.0   

        post_md_retweet  post_md_reply  post_md_quote  \
145514

### Aggregate Unlabeled Text Data

In [None]:
# Sample post data DataFrame for demonstration
# Assuming your post data is loaded into a DataFrame named 'train_data_text_updated'

def safe_mean(series):
    # Return NaN if the series is empty
    if series.empty or series.isnull().all():
        return np.nan

    # Replace negative values with 0
    valid_values = series.where(series >= 0, 0)

    return valid_values.mean()  # Return the mean of the modified values

# Function to safely compute standard deviation, returning 0 if only one valid value exists
def safe_std(series):
    # Return NaN if the series is empty
    if series.empty or series.isnull().all():
        return np.nan

    # Replace negative values with 0
    valid_values = series.where(series >= 0, 0)

    if len(valid_values) <= 1:  # If there's only one value or none
        return 0.0  # Return 0 for std deviation in such cases

    return valid_values.std(ddof=0)  # Population std deviation

def safe_std_sentiment(series):
    if series.empty or series.isnull().all():
        return np.nan

    if len(series) <= 1:  # If there's only one value or none
        return 0.0  # Return 0 for std deviation in such cases

    return series.std(ddof=0)  # Population std deviation



# Aggregation functions including proportion of positive and negative sentiment
aggregation_functions = {
    'post_md_like': [safe_mean, safe_std],
    'post_md_retweet': [safe_mean, safe_std],
    'post_md_reply': [safe_mean, safe_std],
    'post_md_quote': [safe_mean, safe_std],
    'post_text_length': [safe_mean, safe_std],
    'post_sentiment_score': ['mean', safe_std_sentiment],
    'post_sentiment_numeric': ['mean', safe_std_sentiment,
        ('prop_positive', lambda x: (x == 1).mean() if x.notna().any() else np.nan),
        ('prop_negative', lambda x: (x == -1).mean() if x.notna().any() else np.nan)
    ]
}

# Group by user_id and aggregate using the functions
aggregated_df = unlabeled_post_sentiment.groupby('user_id').agg(aggregation_functions)

# Custom column renaming to handle function names and aggregation names
new_columns = []
for col in aggregated_df.columns:
    feature_name = col[0]
    aggregation_name = col[1]

    if callable(aggregation_name):  # If the second element is a function, name it based on the function
        aggregation_name = aggregation_name.__name__

    new_columns.append(f"{feature_name}_{aggregation_name}")

# Apply the new column names
aggregated_df.columns = new_columns

# Reset index to make user_id a column
aggregated_df.reset_index(inplace=True)

aggregated_df = aggregated_df.rename(columns={'post_md_like_safe_mean': 'post_md_like_mean',
                                              'post_md_like_safe_std': 'post_md_like_std',
                                              'post_md_retweet_safe_mean': 'post_md_retweet_mean',
                                              'post_md_retweet_safe_std': 'post_md_retweet_std',
                                              'post_md_reply_safe_mean': 'post_md_reply_mean',
                                              'post_md_reply_safe_std': 'post_md_reply_std',
                                              'post_md_quote_safe_mean': 'post_md_quote_mean',
                                              'post_md_quote_safe_std': 'post_md_quote_std',
                                              'post_text_length_safe_mean': 'post_text_length_mean',
                                              'post_text_length_safe_std': 'post_text_length_std',
                                              'post_sentiment_score_safe_std_sentiment': 'post_sentiment_score_std',
                                              'post_sentiment_numeric_safe_std_sentiment': 'post_sentiment_numeric_std'})

# Display the aggregated DataFrame
print(aggregated_df)

# Save the aggregated DataFrame to CSV
aggregated_df.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_AGGREGATED.csv', index=False)

                                   user_id  post_md_like_mean  \
0     0001B2D9-59A7-464E-9427-704DAB4677A7           6.500000   
1     0006C595-1722-4D1C-9438-E1698690D71F          12.000000   
2     0008C48D-E99A-4E6A-AF0C-86E8CE896D0B          12.950000   
3     000A53DD-11AB-471F-9992-77B9A32CF0F9           0.650000   
4     00118AEC-44A5-4965-BFB5-7D8698F5F1CA          11.950000   
...                                    ...                ...   
6859                   u994843692660862976           0.300000   
6860                   u995369593744035840         562.800000   
6861                              u9963832         211.285714   
6862                            u997150783         100.000000   
6863                   u999492441513148416          11.200000   

      post_md_like_std  post_md_retweet_mean  post_md_retweet_std  \
0            16.363068                   NaN                  NaN   
1             1.000000                   NaN                  NaN   
2           

In [None]:
unlabeled_accounts = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_accounts_UPDATED.csv')
unlabeled_posts = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_AGGREGATED.csv')

In [None]:
unlabeled_dataset_columns_aggregated = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
                    'username_numeric', 'username_special', 'username_length', 'username_se',
                    'screenname', 'screenname_uppercase', 'screenname_lowercase',
                    'screenname_numeric', 'screenname_special', 'screenname_length',
                    'screenname_se', 'screenname_emoji', 'screenname_hashtag',
                    'screenname_word', 'description', 'description_length',
                    'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                    'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                    'user_md_protected', 'post_md_like_mean', 'post_md_like_std',
                    'post_md_retweet_mean', 'post_md_retweet_std', 'post_md_reply_mean',
                    'post_md_reply_std', 'post_md_quote_mean', 'post_md_quote_std',
                    'post_text_length_mean', 'post_text_length_std',
                    'post_sentiment_score_mean', 'post_sentiment_score_std',
                    'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                    'post_sentiment_numeric_prop_positive',
                    'post_sentiment_numeric_prop_negative']

# Ensure both DataFrames have 'user_id' as the key for merging
combined_df = pd.merge(unlabeled_accounts, unlabeled_posts, on='user_id', how='left')

combined_df = combined_df[unlabeled_dataset_columns_aggregated]

# Display the combined DataFrame
print(combined_df)

combined_df.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_accounts_AGGREGATED.csv', index=False)

                                    user_id        username  \
0      10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4             NaN   
1      5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC             NaN   
2      83C07A5E-0F5E-4519-A1FF-E87CEFC905D3             NaN   
3      CF33854C-298F-4C7F-AB77-6F7631B59FC8             NaN   
4      BB1A148A-380B-411C-B053-D3256C5BEF4C             NaN   
...                                     ...             ...   
21295                   u958807016520192008   HackYFutureBE   
21296                  u1486053324554227715   Empirerising4   
21297                  u1145483306563244032  _sheenavasquez   
21298                  u1265061521597648897      Ieomessiok   
21299                           u1010593116      SharkawyMD   

       username_uppercase  username_lowercase  username_numeric  \
0                     NaN                 NaN               NaN   
1                     NaN                 NaN               NaN   
2                     NaN                 

# Training

## Import

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from scipy.stats import mode
from tqdm import tqdm
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

## Supervised Training

In [4]:
user_train_data_labeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_accounts_AGGREGATED.csv')

# print (user_train_data_labeled.columns)

# print (user_train_data_labeled.dtypes)

In [105]:
# user_train_data_labeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_accounts_AGGREGATED.csv')

# print (train_data)

features = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
            'username_numeric', 'username_special', 'username_length', 'username_se',
            'screenname', 'screenname_uppercase', 'screenname_lowercase',
            'screenname_numeric', 'screenname_special', 'screenname_length',
            'screenname_se', 'screenname_emoji', 'screenname_hashtag',
            'screenname_word', 'description', 'description_length',
            'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
            'user_md_total_post', 'user_md_total_like', 'user_md_verified',
            'user_md_protected', 'post_md_like_mean',
            'post_md_like_std', 'post_md_retweet_mean', 'post_md_retweet_std',
            'post_md_reply_mean', 'post_md_reply_std', 'post_md_quote_mean',
            'post_md_quote_std', 'post_text_length_mean', 'post_text_length_std',
            'post_sentiment_score_mean', 'post_sentiment_score_std',
            'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
            'post_sentiment_numeric_prop_positive',
            'post_sentiment_numeric_prop_negative']

### Initial training of model_1

In [106]:
# Assign the entire labeled dataset to X_train and y_train
X = user_train_data_labeled[features]  # All feature columns from the labeled dataset
y = user_train_data_labeled['label']    # Label column from the labeled dataset

# Split the dataset into training and testing sets (80% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Initialize an empty dictionary to store models and validation results
models_1 = {}

# Train separate models for each feature set
feature_sets = {
    'username': ['username_uppercase', 'username_lowercase', 'username_numeric',
                 'username_special', 'username_length', 'username_se'],  # Add all username features
    'screenname': ['screenname_uppercase', 'screenname_lowercase',
                   'screenname_numeric', 'screenname_special', 'screenname_length',
                   'screenname_se',],  # Add all screenname features
    'description': ['description_length'],  # Add all description features
    'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                   'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                   'user_md_protected'],  # Add user metadata features
    'post_metadata': ['post_md_like_mean', 'post_md_like_std', 'post_md_retweet_mean',
                      'post_md_retweet_std', 'post_md_reply_mean', 'post_md_reply_std',
                      'post_md_quote_mean', 'post_md_quote_std'],  # Add post metadata features
    'post_text': ['post_text_length_mean', 'post_text_length_std', 'post_sentiment_score_mean',
                  'post_sentiment_score_std', 'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                  'post_sentiment_numeric_prop_positive', 'post_sentiment_numeric_prop_negative']  # Add post text features (like BERT embeddings)
}

for feature_name, feature_columns in tqdm(feature_sets.items(), desc="Training Models", ncols=100):
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train[feature_columns], y_train)
    models_1[feature_name] = rf_model

Training Models: 100%|████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.27it/s]


In [None]:
import joblib

# Save the entire dictionary of models to a single file
filename = './drive/MyDrive/Datasets/Models/Supervised_Models_10-14.joblib'
joblib.dump(models_1, filename)

print("All models have been saved successfully as a whole.")


All models have been saved successfully as a whole.


In [None]:
models_2 = {}

feature_sets_2 = {
    'username': ['username_uppercase', 'username_lowercase', 'username_numeric',
                 'username_special', 'username_length', 'username_se'],  # Add all username features
    'screenname': ['screenname_uppercase', 'screenname_lowercase',
                   'screenname_numeric', 'screenname_special', 'screenname_length',
                   'screenname_se',],  # Add all screenname features
    'description': ['description_length'],  # Add all description features
    'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                   'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                   'user_md_protected']  # Add user metadata features
}

for feature_name, feature_columns in tqdm(feature_sets_2.items(), desc="Training Models", ncols=100):
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train[feature_columns], y_train)
    models_2[feature_name] = rf_model

Training Models: 100%|████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.85it/s]


In [None]:
print (models_1)

print (models_2)

{'username': RandomForestClassifier(random_state=42), 'screenname': RandomForestClassifier(random_state=42), 'description': RandomForestClassifier(random_state=42), 'user_metadata': RandomForestClassifier(random_state=42), 'post_metadata': RandomForestClassifier(random_state=42), 'post_text': RandomForestClassifier(random_state=42)}
{'username': RandomForestClassifier(random_state=42), 'screenname': RandomForestClassifier(random_state=42), 'description': RandomForestClassifier(random_state=42), 'user_metadata': RandomForestClassifier(random_state=42)}


### Validation with weighted voting

In [None]:
print (feature_sets)

{'username': ['username_uppercase', 'username_lowercase', 'username_numeric', 'username_special', 'username_length', 'username_se'], 'screenname': ['screenname_uppercase', 'screenname_lowercase', 'screenname_numeric', 'screenname_special', 'screenname_length', 'screenname_se'], 'description': ['description_length'], 'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio', 'user_md_total_post', 'user_md_total_like', 'user_md_verified', 'user_md_protected'], 'post_metadata': ['post_md_like_mean', 'post_md_like_std', 'post_md_retweet_mean', 'post_md_retweet_std', 'post_md_reply_mean', 'post_md_reply_std', 'post_md_quote_mean', 'post_md_quote_std'], 'post_text': ['post_text_length_mean', 'post_text_length_std', 'post_sentiment_score_mean', 'post_sentiment_score_std', 'post_sentiment_numeric_mean', 'post_sentiment_numeric_std', 'post_sentiment_numeric_prop_positive', 'post_sentiment_numeric_prop_negative']}


In [107]:
# Step 2: Validation Phase
# Initialize dictionaries to store weighted probabilities
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = .80

# Step 1: Generate predictions for the validation set (X_test) using individual models with weighted voting
for feature_name, model in models_1.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user) for X_test
    completeness = X_test[feature_columns].notnull().mean(axis=1)


    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict probabilities for X_test
    probas = model.predict_proba(X_test[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Step 2: Normalize the weighted probabilities
# Avoid division by zero in case no weights were assigned
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Step 3: Assign final predictions based on aggregated weighted probabilities
final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

# Step 4: Evaluation

# Evaluate the model's performance
accuracy = accuracy_score(y_test, final_predictions)
precision = precision_score(y_test, final_predictions, pos_label=True)
recall = recall_score(y_test, final_predictions, pos_label=True)
f1 = f1_score(y_test, final_predictions, pos_label=True)
mcc = matthews_corrcoef(y_test, final_predictions)

# Print evaluation results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'MCC: {mcc:.2f}')


Accuracy: 0.85
Precision: 0.83
Recall: 0.88
F1 Score: 0.85
MCC: 0.70


### Validation with weighted voting and Platt's scaling

In [43]:
from sklearn.calibration import CalibratedClassifierCV
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Step 2: Validation Phase
# Initialize dictionaries to store weighted probabilities
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

# Step 1: Apply Platt’s scaling (calibration) to each model before validation
calibrated_models = {}
for feature_name, model in models_1.items():
    # Apply Platt's scaling using CalibratedClassifierCV
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Assuming models are already trained
    calibrated_models[feature_name] = calibrated_model

# Step 2: Generate predictions for the validation set (X_test) using calibrated models with weighted voting
for feature_name, model in calibrated_models.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user) for X_test
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict calibrated probabilities for X_test
    probas = model.predict_proba(X_test[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Step 3: Normalize the weighted probabilities
# Avoid division by zero in case no weights were assigned
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Step 4: Assign final predictions based on aggregated weighted probabilities
final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

# Step 5: Evaluation

# Evaluate the model's performance
accuracy = accuracy_score(y_test, final_predictions)
precision = precision_score(y_test, final_predictions, pos_label=True)
recall = recall_score(y_test, final_predictions, pos_label=True)
f1 = f1_score(y_test, final_predictions, pos_label=True)
mcc = matthews_corrcoef(y_test, final_predictions)

# Print evaluation results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'MCC: {mcc:.2f}')


Accuracy: 0.85
Precision: 0.83
Recall: 0.89
F1 Score: 0.86
MCC: 0.71


## Supervised Training FINAL

### Import and prepare the dataset

In [175]:
user_train_data_labeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_accounts_AGGREGATED.csv')

# OPTIONAL Print current training dataset
# print (user_train_data_labeled)

In [176]:
# Step 1: Split the dataset into training and validation sets (90% train, 10% validation)
train_data, val_data = train_test_split(user_train_data_labeled, test_size=0.1, random_state=42, stratify=user_train_data_labeled['label'])

# Step 2: Save the datasets
train_data.to_csv('./drive/MyDrive/Datasets/FINAL/Labeled/initial_train_data.csv', index=False)
val_data.to_csv('./drive/MyDrive/Datasets/FINAL/Labeled/val_data.csv', index=False)


In [181]:

# Step 3: Split the features and labels for each dataset
X_train = train_data[features]  # All feature columns from the training dataset
y_train = train_data['label']    # Label column from the training dataset

X_test = val_data[features]  # All feature columns from the validation dataset
y_test = val_data['label']    # Label column from the validation dataset

### Supervised Learning with Hyperparameters FINAL

In [67]:
# Define the feature subsets for each model
feature_sets = {
    'username': ['username_uppercase', 'username_lowercase', 'username_numeric',
                 'username_special', 'username_length', 'username_se'],  # Add all username features
    'screenname': ['screenname_uppercase', 'screenname_lowercase',
                   'screenname_numeric', 'screenname_special', 'screenname_length',
                   'screenname_se'],  # Add all screenname features
    'description': ['description_length'],  # Add all description features
    'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                      'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                      'user_md_protected'],  # Add user metadata features
    'post_metadata': ['post_md_like_mean', 'post_md_like_std', 'post_md_retweet_mean',
                      'post_md_retweet_std', 'post_md_reply_mean', 'post_md_reply_std',
                      'post_md_quote_mean', 'post_md_quote_std'],  # Add post metadata features
    'post_text': ['post_text_length_mean', 'post_text_length_std', 'post_sentiment_score_mean',
                  'post_sentiment_score_std', 'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                  'post_sentiment_numeric_prop_positive', 'post_sentiment_numeric_prop_negative']  # Add post text features
}

# Define hyperparameter grid for RandomForest
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20],  # Depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples to split a node
    'min_samples_leaf': [1, 2, 5],    # Minimum samples in a leaf node
    'max_features': ['sqrt', 'log2']  # Number of features to consider at each split
}

# Initialize an empty dictionary to store models with tuned hyperparameters
tuned_models = {}

# Train and tune separate models for each feature subset
for feature_name, feature_columns in tqdm(feature_sets.items(), desc="Training Models", ncols=100):
    print(f"Training model for {feature_name} feature set...")

    # Initialize the RandomForestClassifier
    rf_model = RandomForestClassifier(random_state=42)

    # Perform Grid Search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

    # Fit the grid search to the training data (for the specific feature set)
    grid_search.fit(X_train[feature_columns], y_train)

    # Store the best model with tuned hyperparameters
    tuned_models[feature_name] = grid_search.best_estimator_

    # Print the best hyperparameters for the feature set
    print(f"Best hyperparameters for {feature_name}: {grid_search.best_params_}")

Training Models:   0%|                                                        | 0/6 [00:00<?, ?it/s]

Training model for username feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits


  _data = np.array(data, dtype=dtype, copy=copy,
Training Models:  17%|███████▊                                       | 1/6 [08:12<41:04, 492.96s/it]

Best hyperparameters for username: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Training model for screenname feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits


Training Models:  33%|███████████████▋                               | 2/6 [17:07<34:29, 517.30s/it]

Best hyperparameters for screenname: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 300}
Training model for description feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits


Training Models:  50%|███████████████████████▌                       | 3/6 [24:24<24:02, 480.75s/it]

Best hyperparameters for description: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Training model for user_metadata feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits


Training Models:  67%|███████████████████████████████▎               | 4/6 [36:12<19:00, 570.27s/it]

Best hyperparameters for user_metadata: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Training model for post_metadata feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits


Training Models:  83%|███████████████████████████████████████▏       | 5/6 [41:31<07:59, 479.91s/it]

Best hyperparameters for post_metadata: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Training model for post_text feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits


Training Models: 100%|███████████████████████████████████████████████| 6/6 [46:59<00:00, 469.92s/it]

Best hyperparameters for post_text: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}





### Save Models

In [70]:
# Save the entire dictionary of models to a single file
filename = './drive/MyDrive/Datasets/Models/10-09_models.joblib'

# Rename models if needed
joblib.dump(tuned_models, filename)

print("All models have been saved successfully as a whole.")

All models have been saved successfully as a whole.


### Validation on the validation set with Weighted Voting and Platt's Scaling FINAL

In [460]:
# Initialize arrays to accumulate weighted probabilities
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

# Initialize array for calbirated models
calibrated_models = {}

# Apply Platt's scaling to each model using CalibratedClassifierCV
for feature_name, model in models_1.items():
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Assuming models are already trained
    calibrated_models[feature_name] = calibrated_model

# Generate predictions for each model using calibrated models
for feature_name, model in calibrated_models.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user) for X_test
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict calibrated probabilities for X_test
    probas = model.predict_proba(X_test[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Normalize the weighted probabilities
# Avoid division by zero in case no weights were assigned
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Assign final predictions based on aggregated weighted probabilities
final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, final_predictions)
precision = precision_score(y_test, final_predictions, pos_label=True)
recall = recall_score(y_test, final_predictions, pos_label=True)
f1 = f1_score(y_test, final_predictions, pos_label=True)
mcc = matthews_corrcoef(y_test, final_predictions)

# Print evaluation results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'MCC: {mcc:.2f}')

Accuracy: 0.90
Precision: 0.93
Recall: 0.86
F1 Score: 0.89
MCC: 0.79


## Unsupervised Learning

### Predicting labels on unlabeled data without weighted voting

In [None]:
bot_prob_sum = np.zeros(len(train_data_unlabeled))
human_prob_sum = np.zeros(len(train_data_unlabeled))
model_count = len(models_1)
pseudo_labeled_data = train_data_unlabeled

# Step 1: Generate pseudo-labels for the unlabeled dataset using individual models
for feature_name, model in models_1.items():
    feature_columns = feature_sets[feature_name]

    # Predict probabilities for the unlabeled data
    probas = model.predict_proba(train_data_unlabeled[feature_columns])

    # Accumulate probabilities
    human_prob_sum += probas[:, 0]  # Human probabilities
    bot_prob_sum += probas[:, 1]     # Bot probabilities

# Calculate average bot probability across all models
avg_human_prob = human_prob_sum / model_count
avg_bot_prob = bot_prob_sum / model_count


# Step 3: Assign labels and confidence based on averaged probabilities
for i in range(len(avg_human_prob)):
    if avg_human_prob[i] > avg_bot_prob[i]:
        pseudo_label = 0  # Human
        confidence = avg_human_prob[i]
    else:
        pseudo_label = 1  # Bot
        confidence = avg_bot_prob[i]

    # Store the label and confidence in the DataFrame
    pseudo_labeled_data.at[i, 'predicted_label'] = pseudo_label
    pseudo_labeled_data.at[i, 'confidence'] = confidence

# Step 4: Filter accounts with high confidence
threshold = 0.75  # Define confidence threshold
high_confidence_data = pseudo_labeled_data[pseudo_labeled_data['confidence'] > threshold]

print (high_confidence_data)


# pseudo_labeled_data.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/pseudo_labeled_data.csv')
# high_confidence_data.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/high_confidence_data.csv')

                    user_id         username  username_uppercase  \
15815  u1427075542818430978  FernandezGeneby                 2.0   
17797  u1346450217936293890    StatusCurioso                 2.0   
18364  u1480367054490320903  AricaInformativ                 2.0   
18592   u868756669311754240     sheshmani124                 0.0   
19215            u345326048   MarketPlaceOTA                 5.0   
20205             u30138131              PPG                 3.0   
20813             u72158278       yakkopinky                 0.0   
20986             u74048201            DrPyo                 2.0   

       username_lowercase  username_numeric  username_special  \
15815                13.0               0.0               0.0   
17797                11.0               0.0               0.0   
18364                13.0               0.0               0.0   
18592                 9.0               3.0               0.0   
19215                 9.0               0.0               0.0 

### Predicting labels on unlabeled data with weighted voting

In [None]:
# Initialize arrays to accumulate weighted probabilities
bot_prob_sum = np.zeros(len(train_data_unlabeled))
human_prob_sum = np.zeros(len(train_data_unlabeled))
total_weights = np.zeros(len(train_data_unlabeled))  # To normalize the weighted sums
pseudo_labeled_data = train_data_unlabeled.copy()  # To avoid modifying the original DataFrame

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

# Step 1: Generate pseudo-labels for the unlabeled dataset using individual models
for feature_name, model in models_1.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user)
    completeness = train_data_unlabeled[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict probabilities for the unlabeled data
    probas = model.predict_proba(train_data_unlabeled[feature_columns])

    # Accumulate weighted probabilities
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights
    total_weights += weights

# Step 2: Normalize the weighted probabilities
# Avoid division by zero
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Step 3: Assign labels and confidence based on averaged probabilities
for i in range(len(avg_human_prob)):
    if avg_human_prob[i] > avg_bot_prob[i]:
        pseudo_label = 0  # Human
        confidence = avg_human_prob[i]
    else:
        pseudo_label = 1  # Bot
        confidence = avg_bot_prob[i]

    # Store the label and confidence in the DataFrame
    pseudo_labeled_data.at[i, 'predicted_label'] = pseudo_label
    pseudo_labeled_data.at[i, 'confidence'] = confidence

# Step 4: Filter accounts with high confidence
threshold = .90  # Define confidence threshold
high_confidence_data = pseudo_labeled_data[pseudo_labeled_data['confidence'] > threshold]

# Output high-confidence pseudo labels
print(high_confidence_data)

high_confidence_data.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/high_confidence_data.csv', index=False)

                                    user_id         username  \
1      5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC              NaN   
11     89D6BE9B-C006-4BC5-88F5-C9993CF1C680              NaN   
13     D8D0DABC-3E2E-4898-99E1-093908DC6E57              NaN   
19     C3AABDC0-8996-4682-97D2-7B7637AB4D5E              NaN   
24     356A44A6-00DA-4DA5-9E16-B21B81C5FC93              NaN   
...                                     ...              ...   
14154                                   NaN              NaN   
14186                                   NaN              NaN   
14188                                   NaN              NaN   
18364                  u1480367054490320903  AricaInformativ   
19215                            u345326048   MarketPlaceOTA   

       username_uppercase  username_lowercase  username_numeric  \
1                     NaN                 NaN               NaN   
11                    NaN                 NaN               NaN   
13                    NaN     

### Predicting labels on unlabeled data with weighted voting and Platt's scaling

In [104]:
# Load the entire dictionary of models from a file
filename = './drive/MyDrive/Datasets/Models/Tuned_Supervised_Models_10-15.joblib'
models_1 = joblib.load(filename)

print("All models have been loaded successfully.")

All models have been loaded successfully.


In [96]:
train_data_unlabeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_accounts_AGGREGATED.csv')

# train_data_unlabeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Outputs/user_train_data_labeled_AUGMENTED.csv')

In [97]:
from sklearn.calibration import CalibratedClassifierCV
import numpy as np

# Initialize arrays to accumulate weighted probabilities
bot_prob_sum = np.zeros(len(train_data_unlabeled))
human_prob_sum = np.zeros(len(train_data_unlabeled))
total_weights = np.zeros(len(train_data_unlabeled))  # To normalize the weighted sums
pseudo_labeled_data = train_data_unlabeled.copy()  # To avoid modifying the original DataFrame

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

# Step 1: Apply Platt’s scaling (calibration) to each model before generating pseudo-labels
calibrated_models = {}
for feature_name, model in models_1.items():
    # Apply Platt's scaling using CalibratedClassifierCV
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Fit the model with training data
    calibrated_models[feature_name] = calibrated_model

# Step 2: Generate pseudo-labels for the unlabeled dataset using calibrated models
for feature_name, model in calibrated_models.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user)
    completeness = train_data_unlabeled[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict calibrated probabilities for the unlabeled data
    probas = model.predict_proba(train_data_unlabeled[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Step 3: Normalize the weighted probabilities
# Avoid division by zero
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

for i in range(len(avg_human_prob)):
    completeness = train_data_unlabeled[feature_sets[feature_name]].notnull().mean(axis=1)[i]  # Feature completeness for current instance

    if avg_human_prob[i] > avg_bot_prob[i]:
        pseudo_label = 0  # Human
        confidence = avg_human_prob[i] * completeness  # Penalize confidence by feature completeness
    else:
        pseudo_label = 1  # Bot
        confidence = avg_bot_prob[i] * completeness  # Penalize confidence by feature completeness

    # Store the label and penalized confidence in the DataFrame
    pseudo_labeled_data.at[i, 'predicted_label'] = pseudo_label
    pseudo_labeled_data.at[i, 'confidence'] = confidence


In [98]:
print (pseudo_labeled_data)

                                    user_id        username  \
0      10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4             NaN   
1      5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC             NaN   
2      83C07A5E-0F5E-4519-A1FF-E87CEFC905D3             NaN   
3      CF33854C-298F-4C7F-AB77-6F7631B59FC8             NaN   
4      BB1A148A-380B-411C-B053-D3256C5BEF4C             NaN   
...                                     ...             ...   
21295                   u958807016520192008   HackYFutureBE   
21296                  u1486053324554227715   Empirerising4   
21297                  u1145483306563244032  _sheenavasquez   
21298                  u1265061521597648897      Ieomessiok   
21299                           u1010593116      SharkawyMD   

       username_uppercase  username_lowercase  username_numeric  \
0                     NaN                 NaN               NaN   
1                     NaN                 NaN               NaN   
2                     NaN                 

In [99]:
# Step 5: Filter accounts with high confidence
threshold = 0.75  # Define confidence threshold
high_confidence_data = pseudo_labeled_data[pseudo_labeled_data['confidence'] > threshold]

# Output high-confidence pseudo labels
print(high_confidence_data)

                    user_id        username  username_uppercase  \
14204            u108618303        Shrimaan                 1.0   
14260             u47968268   woodforbrains                 0.0   
14265            u185949244  yanagiyatoshio                 0.0   
14280   u952629310761742336     sia_siberia                 0.0   
14284   u870926983865282560      mr_enissay                 0.0   
...                     ...             ...                 ...   
20989  u1305945887563112448   Moody_Gaurav_                 2.0   
21127  u1004169917187481601       itsdreese                 0.0   
21160   u761262722378072064        ICHRI_Fa                 6.0   
21196   u831519910463434752  mostafatajzade                 0.0   
21212             u64132999            DKFZ                 4.0   

       username_lowercase  username_numeric  username_special  \
14204                 7.0               0.0               0.0   
14260                13.0               0.0               0.0   


In [78]:
pseudo_labeled_data.to_csv('./drive/MyDrive/Datasets/FINAL/Outputs/pseudo_labeled_data_PENALTY.csv', index=False)

In [56]:
high_confidence_data.to_csv('./drive/MyDrive/Datasets/FINAL/Outputs/high_confidence_data_PENALTY.csv', index=False)

In [100]:
augmented_data = high_confidence_data.copy()

augmented_data['label'] = high_confidence_data['predicted_label'].map(lambda x: True if x == 1 else False)

augmented_data = augmented_data.drop(columns=['predicted_label', 'confidence'])

augmented_data = augmented_data[dataset_columns]

print (augmented_data)

                    user_id        username  username_uppercase  \
14204            u108618303        Shrimaan                 1.0   
14260             u47968268   woodforbrains                 0.0   
14265            u185949244  yanagiyatoshio                 0.0   
14280   u952629310761742336     sia_siberia                 0.0   
14284   u870926983865282560      mr_enissay                 0.0   
...                     ...             ...                 ...   
20989  u1305945887563112448   Moody_Gaurav_                 2.0   
21127  u1004169917187481601       itsdreese                 0.0   
21160   u761262722378072064        ICHRI_Fa                 6.0   
21196   u831519910463434752  mostafatajzade                 0.0   
21212             u64132999            DKFZ                 4.0   

       username_lowercase  username_numeric  username_special  \
14204                 7.0               0.0               0.0   
14260                13.0               0.0               0.0   


In [101]:
user_train_data_labeled_augmented = pd.concat([user_train_data_labeled, augmented_data], ignore_index=True)

print (user_train_data_labeled_augmented)

user_train_data_labeled_augmented.to_csv('./drive/MyDrive/Datasets/FINAL/Outputs/user_train_data_labeled_AUGMENTED.csv', index=False)

                   user_id         username  username_uppercase  \
0                112805276  FrancoLostaunau                 2.0   
1               1050006840      joel_archie                 0.0   
2                  1140451          Anthony                 1.0   
3                219617448     SushmaSwaraj                 2.0   
4      1296330070177599488      ben76821215                 0.0   
...                    ...              ...                 ...   
3321  u1305945887563112448    Moody_Gaurav_                 2.0   
3322  u1004169917187481601        itsdreese                 0.0   
3323   u761262722378072064         ICHRI_Fa                 6.0   
3324   u831519910463434752   mostafatajzade                 0.0   
3325             u64132999             DKFZ                 4.0   

      username_lowercase  username_numeric  username_special  username_length  \
0                   13.0               0.0               0.0             15.0   
1                   10.0         

In [67]:
user_train_data_labeled_augmented = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Outputs/user_train_data_labeled_AUGMENTED.csv')

In [102]:
X_train_augmented = user_train_data_labeled_augmented.drop(columns=['label'])  # Features
y_train_augmented = user_train_data_labeled_augmented['label']  # Labels

# # print (X_train_augmented.dtypes)
# print (y_train_augmented)

# Step 3: Retrain the models
for feature_name, model in models_1.items():
    feature_columns = feature_sets[feature_name]

    # Retrain the model using the augmented data
    model.fit(X_train_augmented[feature_columns], y_train_augmented)

In [37]:
print (X_train_augmented)

                   user_id         username  username_uppercase  \
0                112805276  FrancoLostaunau                 2.0   
1               1050006840      joel_archie                 0.0   
2                  1140451          Anthony                 1.0   
3                219617448     SushmaSwaraj                 2.0   
4      1296330070177599488      ben76821215                 0.0   
...                    ...              ...                 ...   
3301  u1219384205047668738        codebeef_                 0.0   
3302             u74048201            DrPyo                 2.0   
3303  u1305945887563112448    Moody_Gaurav_                 2.0   
3304   u761262722378072064         ICHRI_Fa                 6.0   
3305   u831519910463434752   mostafatajzade                 0.0   

      username_lowercase  username_numeric  username_special  username_length  \
0                   13.0               0.0               0.0             15.0   
1                   10.0         

In [103]:
from sklearn.calibration import CalibratedClassifierCV
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Step 2: Validation Phase
# Initialize dictionaries to store weighted probabilities
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

# Step 1: Apply Platt’s scaling (calibration) to each model before validation
calibrated_models = {}
for feature_name, model in models_1.items():
    # Apply Platt's scaling using CalibratedClassifierCV
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Assuming models are already trained
    calibrated_models[feature_name] = calibrated_model

# Step 2: Generate predictions for the validation set (X_test) using calibrated models with weighted voting
for feature_name, model in calibrated_models.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user) for X_test
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict calibrated probabilities for X_test
    probas = model.predict_proba(X_test[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Step 3: Normalize the weighted probabilities
# Avoid division by zero in case no weights were assigned
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Step 4: Assign final predictions based on aggregated weighted probabilities
final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

# Step 5: Evaluation

# Evaluate the model's performance
accuracy = accuracy_score(y_test, final_predictions)
precision = precision_score(y_test, final_predictions, pos_label=True)
recall = recall_score(y_test, final_predictions, pos_label=True)
f1 = f1_score(y_test, final_predictions, pos_label=True)
mcc = matthews_corrcoef(y_test, final_predictions)

# Print evaluation results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'MCC: {mcc:.2f}')


Accuracy: 0.97
Precision: 0.98
Recall: 0.97
F1 Score: 0.97
MCC: 0.94


## Unsupervised Learning ITERATIVE

In [182]:
train_data_unlabeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_accounts_AGGREGATED.csv')
train_data_labeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/initial_train_data.csv')

In [183]:
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

completeness_threshold = 0.80
threshold = 0.80

# Function to iterate through unsupervised learning until stopping condition is met
def iterative_self_training(train_data_unlabeled, train_data_labeled, models_1, feature_sets, X_train, y_train, X_test, y_test, dataset_columns, threshold=0.75, completeness_threshold=0.80):
    iteration = 0
    improvements = True
    previous_confidently_labeled = 0

    while improvements:
        print(f"Iteration: {iteration + 1}")

        # Step 1: Initialize arrays for weighted probabilities
        bot_prob_sum = np.zeros(len(train_data_unlabeled))
        human_prob_sum = np.zeros(len(train_data_unlabeled))
        total_weights = np.zeros(len(train_data_unlabeled))
        pseudo_labeled_data = train_data_unlabeled.copy()

        # Step 2: Apply Platt's scaling (calibration) and generate pseudo-labels for the unlabeled dataset
        calibrated_models = {}
        for feature_name, model in models_1.items():
            # Apply Platt's scaling
            calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
            calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)
            calibrated_models[feature_name] = calibrated_model

        for feature_name, model in calibrated_models.items():
            feature_columns = feature_sets[feature_name]
            completeness = train_data_unlabeled[feature_columns].notnull().mean(axis=1)
            weights = np.where(completeness >= completeness_threshold, 1.0, completeness)
            probas = model.predict_proba(train_data_unlabeled[feature_columns])
            human_prob_sum += probas[:, 0] * weights
            bot_prob_sum += probas[:, 1] * weights
            total_weights += weights

        total_weights_safe = np.where(total_weights == 0, 1, total_weights)
        avg_human_prob = human_prob_sum / total_weights_safe
        avg_bot_prob = bot_prob_sum / total_weights_safe

        for i in range(len(avg_human_prob)):
            completeness = train_data_unlabeled[feature_sets[feature_name]].notnull().mean(axis=1)[i]
            if avg_human_prob[i] > avg_bot_prob[i]:
                pseudo_label = 0  # Human
                confidence = avg_human_prob[i] * completeness
            else:
                pseudo_label = 1  # Bot
                confidence = avg_bot_prob[i] * completeness

            pseudo_labeled_data.at[i, 'predicted_label'] = pseudo_label
            pseudo_labeled_data.at[i, 'confidence'] = confidence

        high_confidence_data = pseudo_labeled_data[pseudo_labeled_data['confidence'] > threshold]
        confidently_labeled_count = len(high_confidence_data)

        if confidently_labeled_count == previous_confidently_labeled:
            improvements = False  # Stop if no more improvements
        else:
            previous_confidently_labeled = confidently_labeled_count

        # Step 3: Augment the training data
        augmented_data = high_confidence_data.copy()
        augmented_data['label'] = high_confidence_data['predicted_label'].map(lambda x: True if x == 1 else False)
        augmented_data = augmented_data.drop(columns=['predicted_label', 'confidence'])
        augmented_data = augmented_data[dataset_columns]

        # Combine original labeled data with high-confidence pseudo-labeled data
        user_train_data_labeled_augmented = pd.concat([train_data_labeled, augmented_data], ignore_index=True)
        X_train_augmented = user_train_data_labeled_augmented.drop(columns=['label'])
        y_train_augmented = user_train_data_labeled_augmented['label']

        # Step 4: Retrain models with the augmented data
        for feature_name, model in models_1.items():
            feature_columns = feature_sets[feature_name]
            model.fit(X_train_augmented[feature_columns], y_train_augmented)

        # Step 5: Validate the performance on the validation set
        bot_prob_sum = np.zeros(len(X_test))
        human_prob_sum = np.zeros(len(X_test))
        total_weights = np.zeros(len(X_test))
        calibrated_models = {}

        for feature_name, model in models_1.items():
            calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
            calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)
            calibrated_models[feature_name] = calibrated_model

        for feature_name, model in calibrated_models.items():
            feature_columns = feature_sets[feature_name]
            completeness = X_test[feature_columns].notnull().mean(axis=1)
            weights = np.where(completeness >= completeness_threshold, 1.0, completeness)
            probas = model.predict_proba(X_test[feature_columns])
            human_prob_sum += probas[:, 0] * weights
            bot_prob_sum += probas[:, 1] * weights
            total_weights += weights

        total_weights_safe = np.where(total_weights == 0, 1, total_weights)
        avg_human_prob = human_prob_sum / total_weights_safe
        avg_bot_prob = bot_prob_sum / total_weights_safe
        final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

        accuracy = accuracy_score(y_test, final_predictions)
        precision = precision_score(y_test, final_predictions, pos_label=True)
        recall = recall_score(y_test, final_predictions, pos_label=True)
        f1 = f1_score(y_test, final_predictions, pos_label=True)
        mcc = matthews_corrcoef(y_test, final_predictions)

        print(f"Iteration {iteration + 1} Evaluation Metrics:")
        print(f'Accuracy: {accuracy:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')
        print(f'MCC: {mcc:.2f}')

        print(confidently_labeled_count)

        iteration += 1

        # Stop if all unlabeled data has been confidently labeled
        if confidently_labeled_count == len(train_data_unlabeled):
            print("All unlabeled data has been confidently labeled.")
            break

# Example usage:
iterative_self_training(train_data_unlabeled, train_data_labeled, models_1, feature_sets, X_train, y_train, X_test, y_test, dataset_columns)


Iteration: 1
Iteration 1 Evaluation Metrics:
Accuracy: 0.89
Precision: 0.85
Recall: 0.93
F1 Score: 0.89
MCC: 0.77
62
Iteration: 2
Iteration 2 Evaluation Metrics:
Accuracy: 0.89
Precision: 0.85
Recall: 0.94
F1 Score: 0.89
MCC: 0.78
178
Iteration: 3
Iteration 3 Evaluation Metrics:
Accuracy: 0.90
Precision: 0.87
Recall: 0.93
F1 Score: 0.90
MCC: 0.79
260
Iteration: 4
Iteration 4 Evaluation Metrics:
Accuracy: 0.90
Precision: 0.89
Recall: 0.93
F1 Score: 0.91
MCC: 0.81
318
Iteration: 5
Iteration 5 Evaluation Metrics:
Accuracy: 0.90
Precision: 0.90
Recall: 0.91
F1 Score: 0.90
MCC: 0.80
383
Iteration: 6
Iteration 6 Evaluation Metrics:
Accuracy: 0.90
Precision: 0.90
Recall: 0.90
F1 Score: 0.90
MCC: 0.80
474
Iteration: 7
Iteration 7 Evaluation Metrics:
Accuracy: 0.90
Precision: 0.91
Recall: 0.88
F1 Score: 0.89
MCC: 0.79
566
Iteration: 8
Iteration 8 Evaluation Metrics:
Accuracy: 0.90
Precision: 0.91
Recall: 0.88
F1 Score: 0.89
MCC: 0.79
680
Iteration: 9
Iteration 9 Evaluation Metrics:
Accuracy: 0.

# Model Import and Export

## Save

In [184]:
import joblib

# Save the entire dictionary of models to a single file
filename = './drive/MyDrive/Datasets/Models/Unsupervised_models_10-15.joblib'
joblib.dump(models_1, filename)

print("All models have been saved successfully as a whole.")


All models have been saved successfully as a whole.


## Loat

### Import Semi-Supervised Learning Model

In [459]:

filename = './drive/MyDrive/Datasets/Models/Unsupervised_models_10-15.joblib'
models_1 = joblib.load(filename)

print("All models have been loaded successfully.")


All models have been loaded successfully.


### Import Initial Model

In [177]:
filename = './drive/MyDrive/Datasets/Models/Initial_Models_10-15.joblib'
models_1 = joblib.load(filename)

print("All models have been loaded successfully.")

All models have been loaded successfully.


### Import Initial Supervised Learning Model

In [153]:
import joblib


# Load the entire dictionary of models from a file
filename = './drive/MyDrive/Datasets/Models/Tuned_Supervised_Models_10-15.joblib'
models_1 = joblib.load(filename)

print("All models have been loaded successfully.")

All models have been loaded successfully.


# Final Code

In [448]:
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
import joblib

filename = './drive/MyDrive/Datasets/Models/Unsupervised_models_10-15.joblib'
models_1 = joblib.load(filename)

print("All models have been loaded successfully.")


In [449]:
train_data_unlabeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_accounts_AGGREGATED.csv')
train_data_labeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/initial_train_data.csv')
val_data = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/val_data.csv')

X_train = train_data_labeled.drop(columns=['label'])
y_train = train_data_labeled['label']

X_test = val_data.drop(columns=['label'])
y_test = val_data['label']

In [450]:
# Placeholder: Number of high-confidence accounts to trigger retraining
retrain_threshold = 10  # For example, retrain after 1000 high-confidence accounts
high_confidence_accounts = []  # Store high-confidence predictions

# Initialize dictionaries to store probabilities for each feature subset
probabilities = {feature_name: [] for feature_name in models_1.keys()}

# Initialize arrays to accumulate probabilities
bot_prob_sum = np.zeros(len(X_test))  # Replace 'test_data' with your testing data
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

calibrated_models = {}
for feature_name, model in models_1.items():
    # Apply Platt's scaling using CalibratedClassifierCV
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Fit the model with training data
    calibrated_models[feature_name] = calibrated_model

In [224]:
print (probabilities)

{'username': [], 'screenname': [], 'description': [], 'user_metadata': [], 'post_metadata': [], 'post_text': []}


In [451]:
# Step 1: Process each model for its feature subset
probabilities = {feature_name: [] for feature_name in models_1.keys()}

for feature_name, model in calibrated_models.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (account)
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict probabilities for the test data
    probas = model.predict_proba(X_test[feature_columns])

    # Store probabilities for each feature subset
    probabilities[feature_name] = probas

    # Accumulate weighted probabilities
    human_prob_sum += probas[:, 0] * weights  # Probability of being human
    bot_prob_sum += probas[:, 1] * weights    # Probability of being a bot

    if feature_name == 'post_text':
        post_text_human_prob = probas[:, 0] * weights
        post_text_bot_prob = probas[:, 1] * weights
    elif feature_name == 'post_metadata':
        post_text_human_prob = probas[:, 0] * weights
        post_text_bot_prob = probas[:, 1] * weights

    # Accumulate total weights for normalization
    total_weights += weights

In [452]:
# Step 2: Normalize probabilities to avoid division by zero
total_weights_safe = np.where(total_weights == 0, 1, total_weights)  # Handle zero weights
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Calculate average probabilities for post (weighted by completeness if needed)
post_human_prob = (post_metadata_probas[:, 0] + post_text_probas[:, 0]) / 2
post_bot_prob = (post_metadata_probas[:, 1] + post_text_probas[:, 1]) / 2

# Add combined post probabilities to the dictionary
probabilities['post'] = np.column_stack([post_human_prob, post_bot_prob])

# Optionally remove individual post_metadata and post_text from the final output
del probabilities['post_metadata']
del probabilities['post_text']

In [453]:
# Step 3: Output final classification probabilities and assign final labels
final_predictions = []        # Stores final predicted labels (0 for Human, 1 for Bot)
final_probabilities = []      # Stores probabilities of both classes [Human_prob, Bot_prob]
confidence_scores = []        # Stores penalized confidence scores

for i in range(len(avg_human_prob)):
    completeness = max(0.0, X_test[feature_sets[feature_name]].notnull().mean(axis=1)[i])

    # Calculate and store the probabilities for both classes
    human_prob = avg_human_prob[i]
    bot_prob = avg_bot_prob[i]

    # Append both class probabilities to the list
    final_probabilities.append([human_prob, bot_prob])

    # Final classification: 0 (Human) or 1 (Bot)
    if human_prob > bot_prob:
        final_predictions.append(False)  # Human
        confidence = human_prob * completeness  # Penalize confidence by completeness
    else:
        final_predictions.append(True)  # Bot
        confidence = bot_prob * completeness  # Penalize confidence by completeness

    # Store the confidence score
    confidence_scores.append(confidence)



In [454]:
# Example Output
for i in range(len(final_predictions)):
    print(f"Index: {i + 1}")  # Print the index
    print(f"Prediction: {'Human' if final_predictions[i] == False else 'Bot'}")
    print(f"Probabilities - Human: {final_probabilities[i][0]:.4f}, Bot: {final_probabilities[i][1]:.4f}")
    print(f"Confidence Score: {confidence_scores[i]:.4f}\n")

    # Output feature probabilities
    print(f"Feature Probabilities:\nUsername - Human: {probabilities['username'][i][0]:.4f}, Bot: {probabilities['username'][i][1]:.4f}")
    print(f"Screenname - Human: {probabilities['screenname'][i][0]:.4f}, Bot: {probabilities['screenname'][i][1]:.4f}")
    print(f"Description - Human: {probabilities['description'][i][0]:.4f}, Bot: {probabilities['description'][i][1]:.4f}")
    print(f"User Metadata - Human: {probabilities['user_metadata'][i][0]:.4f}, Bot: {probabilities['user_metadata'][i][1]:.4f}")
    print(f"Post - Human: {probabilities['post'][i][0]:.4f}, Bot: {probabilities['post'][i][1]:.4f}")

    print("-" * 50)

Index: 1
Prediction: Bot
Probabilities - Human: 0.4721, Bot: 0.5279
Confidence Score: 0.0000

Feature Probabilities:
Username - Human: 0.4492, Bot: 0.5508
Screenname - Human: 0.4480, Bot: 0.5520
Description - Human: 0.4720, Bot: 0.5280
User Metadata - Human: 0.4721, Bot: 0.5279
Post - Human: 0.4764, Bot: 0.5236
--------------------------------------------------
Index: 2
Prediction: Bot
Probabilities - Human: 0.0105, Bot: 0.9895
Confidence Score: 0.0000

Feature Probabilities:
Username - Human: 0.4492, Bot: 0.5508
Screenname - Human: 0.4480, Bot: 0.5520
Description - Human: 0.4720, Bot: 0.5280
User Metadata - Human: 0.0105, Bot: 0.9895
Post - Human: 0.4764, Bot: 0.5236
--------------------------------------------------
Index: 3
Prediction: Human
Probabilities - Human: 0.7883, Bot: 0.2117
Confidence Score: 0.1971

Feature Probabilities:
Username - Human: 0.4492, Bot: 0.5508
Screenname - Human: 0.4480, Bot: 0.5520
Description - Human: 0.4720, Bot: 0.5280
User Metadata - Human: 0.5615, Bot

In [389]:
def augment_training_data(high_confidence_accounts, user_train_data_labeled, dataset_columns):
    # Updated to access 'predicted_label' from the nested 'account' dictionary
    high_confidence_df = pd.DataFrame(high_confidence_accounts).apply(lambda x: x['account'].to_dict(), axis=1, result_type='expand')

    # Drop confidence column and format the high-confidence data for training
    # Access 'predicted_label' from the correct level in the dictionary
    high_confidence_df['label'] = high_confidence_df.apply(lambda row: True if row.get('predicted_label', None) == 1 else False, axis=1)
    # Select desired columns, ensuring they exist in high_confidence_df
    high_confidence_df = high_confidence_df[[col for col in dataset_columns if col in high_confidence_df.columns]]

    # Augment the original labeled data with the new high-confidence data
    augmented_train_data = pd.concat([user_train_data_labeled, high_confidence_df], ignore_index=True)

    return augmented_train_data

def retrain_models(trained_models, augmented_train_data, feature_sets):
    X_train_augmented = augmented_train_data.drop(columns=['label'])  # Features
    y_train_augmented = augmented_train_data['label']  # Labels

    # Retrain each model using the augmented data
    for feature_name, model in trained_models.items():
        feature_columns = feature_sets[feature_name]
        model.fit(X_train_augmented[feature_columns], y_train_augmented)

In [455]:
high_confidence_accounts = []  # Store high-confidence predictions


# Step 4: Store accounts with high confidence
high_confidence_threshold = 0.75  # Set high-confidence threshold

for i, confidence in enumerate(confidence_scores):
    if confidence > high_confidence_threshold:
        high_confidence_accounts.append({
            'account': X_test.iloc[i],  # Store the account's features
            'predicted_label': final_predictions[i],  # The final classification
            'confidence': confidence  # The confidence score
        })

        print(f"Index: {i + 1}")  # Print the index
        print(f"Prediction: {'Human' if final_predictions[i] == 0 else 'Bot'}")
        print(f"Probabilities - Human: {final_probabilities[i][0]:.4f}, Bot: {final_probabilities[i][1]:.4f}")
        print(f"Confidence Score: {confidence_scores[i]:.4f}")
        print("-" * 50)

Index: 42
Prediction: Bot
Probabilities - Human: 0.1175, Bot: 0.8825
Confidence Score: 0.8825
--------------------------------------------------
Index: 48
Prediction: Bot
Probabilities - Human: 0.2426, Bot: 0.7574
Confidence Score: 0.7574
--------------------------------------------------
Index: 51
Prediction: Human
Probabilities - Human: 0.7639, Bot: 0.2361
Confidence Score: 0.7639
--------------------------------------------------
Index: 96
Prediction: Human
Probabilities - Human: 0.7579, Bot: 0.2421
Confidence Score: 0.7579
--------------------------------------------------
Index: 172
Prediction: Bot
Probabilities - Human: 0.2389, Bot: 0.7611
Confidence Score: 0.7611
--------------------------------------------------
Index: 210
Prediction: Human
Probabilities - Human: 0.8067, Bot: 0.1933
Confidence Score: 0.8067
--------------------------------------------------
Index: 244
Prediction: Bot
Probabilities - Human: 0.1875, Bot: 0.8125
Confidence Score: 0.8125
---------------------------

In [456]:
print (len(high_confidence_accounts))

11


In [457]:
# Check if retraining should be triggered
if len(high_confidence_accounts) >= retrain_threshold:
    print ("Retraining triggered")

    # Step 5: Augment the training data with high-confidence pseudo-labeled data
    train_data_labeled = augment_training_data(high_confidence_accounts, train_data_labeled, dataset_columns)

    # Retrain the models with the augmented data
    retrain_models(models_1, train_data_labeled, feature_sets)

    # Clear high-confidence accounts after retraining
    high_confidence_accounts = []

Retraining triggered


# Notes

### Ensemble Approach
* Change average to voting
  * Explore weighted voting
* Weighted Probabilities based on the completeness of the features

### Feature Engineering
* Explore feature importance methods for deeper understanding of the models

### Text data Processing
* Aggragate post feature per user
  * avg_likes, avg_sentiment_score, etc
  * Include aggregated topic labeling
    * Requires corpus, stick to aggregated features


### Update training dataset to include aggregated post features
* Features to use
  * mean likes - Average number of likes per post
  *
