# Prerequisites

## Install and Import

In [None]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install imbalanced-learn
!pip install emoji
!pip install transformers torch

^C
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.1.2-cp312-cp312-win_amd64.whl.metadata (59 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
Using cached numpy-2.1.2-cp312-cp312-win_amd64.whl (12.6 MB)
Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.1.2 pandas-2.2.3 pytz-2024.2 tzdata-2024.2


In [2]:
# Standard Library Imports
import os
import json

# Data Manipulation and Processing
import pandas as pd
import numpy as np
from pandas import json_normalize
from tqdm.notebook import tqdm

# Text Processing
import emoji
from transformers import pipeline

# Machine Learning Imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
import joblib

# Statistical Utilities
from scipy.stats import mode

features = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
            'username_numeric', 'username_special', 'username_length', 'username_se',
            'screenname', 'screenname_uppercase', 'screenname_lowercase',
            'screenname_numeric', 'screenname_special', 'screenname_length',
            'screenname_se', 'screenname_emoji', 'screenname_hashtag',
            'screenname_word', 'description', 'description_length',
            'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
            'user_md_total_post', 'user_md_total_like', 'user_md_verified',
            'user_md_protected', 'post_md_like_mean',
            'post_md_like_std', 'post_md_retweet_mean', 'post_md_retweet_std',
            'post_md_reply_mean', 'post_md_reply_std', 'post_md_quote_mean',
            'post_md_quote_std', 'post_text_length_mean', 'post_text_length_std',
            'post_sentiment_score_mean', 'post_sentiment_score_std',
            'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
            'post_sentiment_numeric_prop_positive',
            'post_sentiment_numeric_prop_negative']

dataset_columns = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
                  'username_numeric', 'username_special', 'username_length', 'username_se',
                  'screenname', 'screenname_uppercase', 'screenname_lowercase',
                  'screenname_numeric', 'screenname_special', 'screenname_length',
                  'screenname_se', 'screenname_emoji', 'screenname_hashtag',
                  'screenname_word', 'description', 'description_length',
                  'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                  'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                  'user_md_protected', 'post_md_like_mean',
                  'post_md_like_std', 'post_md_retweet_mean', 'post_md_retweet_std',
                  'post_md_reply_mean', 'post_md_reply_std', 'post_md_quote_mean',
                  'post_md_quote_std', 'post_text_length_mean', 'post_text_length_std',
                  'post_sentiment_score_mean', 'post_sentiment_score_std',
                  'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                  'post_sentiment_numeric_prop_positive',
                  'post_sentiment_numeric_prop_negative', 'label']

feature_sets = {
    'username': ['username_uppercase', 'username_lowercase', 'username_numeric',
                 'username_special', 'username_length', 'username_se'],  # Add all username features
    'screenname': ['screenname_uppercase', 'screenname_lowercase',
                   'screenname_numeric', 'screenname_special', 'screenname_length',
                   'screenname_se',],  # Add all screenname features
    'description': ['description_length'],  # Add all description features
    'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                   'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                   'user_md_protected'],  # Add user metadata features
    'post_metadata': ['post_md_like_mean', 'post_md_like_std', 'post_md_retweet_mean',
                      'post_md_retweet_std', 'post_md_reply_mean', 'post_md_reply_std',
                      'post_md_quote_mean', 'post_md_quote_std'],  # Add post metadata features
    'post_text': ['post_text_length_mean', 'post_text_length_std', 'post_sentiment_score_mean',
                  'post_sentiment_score_std', 'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                  'post_sentiment_numeric_prop_positive', 'post_sentiment_numeric_prop_negative']  # Add post text features (like BERT embeddings)
}

# Preprocessing

## Feature Engineering FINAL

### Load Data

In [156]:
input_data = pd.read_json('../data/input_data.json')

In [137]:
user_features = {
        'user_id': input_data['user_id'],
        'username': input_data['username'],
        'screenname': input_data['screenname'],
        'description': input_data['description'],
        'user_md_follower': input_data['user_md_follower'].astype('Float64'),
        'user_md_following': input_data['user_md_following'].astype('Float64'),
        'user_md_total_post': input_data['user_md_total_post'].astype('Float64'),
        'user_md_total_like': input_data['user_md_total_like'].astype('Float64'),
        'user_md_verified': input_data['user_md_verified'].astype('Float64'),  # Convert boolean to int
        'user_md_protected': input_data['user_md_protected'].astype('Float64')  # Convert boolean to int
    }

user_df = pd.DataFrame(user_features)

user_df = user_df.drop_duplicates()

post_features = input_data['posts'].apply(lambda x: pd.Series({
        'post_text': x['post_text'],  # Assuming this is a string
        'post_md_like': float(x['post_like']) if pd.notnull(x['post_like']) else pd.NA,  # Convert to Float64 with null check
        'post_md_retweet': float(x['post_retweet']) if pd.notnull(x['post_retweet']) else pd.NA,  # Convert to Float64 with null check
        'post_md_reply': float(x['post_reply']) if pd.notnull(x['post_reply']) else pd.NA,  # Convert to Float64 with null check
        'post_md_quote': float(x['post_quote']) if pd.notnull(x['post_quote']) else pd.NA,  # Convert to Float64 with null check
    })).astype({
        'post_md_like': 'Float64',
        'post_md_retweet': 'Float64',
        'post_md_reply': 'Float64',
        'post_md_quote': 'Float64',
    })

post_features['user_id'] = input_data['user_id']


### Engineer User Features

In [149]:
user_df['username'] = user_df['username'].str.strip()
user_df['screenname'] = user_df['screenname'].str.strip()
user_df['description'] = user_df['description'].str.strip()

# Function to calculate string entropy
def calculate_entropy(s):
    if pd.isna(s):  # Handle null values
        return pd.NA

    # Strip leading and trailing whitespace
    s = s.strip()

    # Check if the stripped string is empty after removing whitespace
    if pd.isna(s):  # If the string is empty, return pd.NA
        return pd.NA

    # Calculate frequency of each character
    freq = {}
    for char in s:
        freq[char] = freq.get(char, 0) + 1

    # Calculate probability of each character
    probabilities = [count / len(s) for count in freq.values()]

    # Calculate entropy
    entropy = -sum(p * np.log2(p) for p in probabilities if p > 0)
    return entropy

# Function to convert to boolean or mapped float value
def to_boolean(value):
    if pd.isna(value):  # Handle null values
        return float('nan')  # Return NaN for missing values
    if isinstance(value, bool):
        return 1.0 if value else 0.0  # Map True to 1.0, False to 2.0

    if isinstance(value, str):
        value = value.strip()

    if value in [1, "1", "yes", "True", True]:  # Check for truthy values
        return 1.0  # True -> 1.0
    if value in [0, "0", "no", "False", False]:  # Check for falsy values
        return 0.0  # False -> 0.0
    return float('nan')  # Return NaN for unexpected values

# Username Features
user_df['username_uppercase'] = user_df['username'].apply(lambda x: sum(1 for c in x if c.isupper()) if pd.notnull(x) else pd.NA).astype('Float64')  # Change to Int64
user_df['username_lowercase'] = user_df['username'].apply(lambda x: sum(1 for c in x if c.islower()) if pd.notnull(x) else pd.NA).astype('Float64')  # Change to Int64
user_df['username_numeric'] = user_df['username'].apply(lambda x: sum(1 for c in x if c.isdigit()) if pd.notnull(x) else pd.NA).astype('Float64')  # Change to Int64
user_df['username_special'] = user_df['username'].apply(lambda x: sum(1 for c in x if not c.isalnum()) if pd.notnull(x) else pd.NA).astype('Float64')  # Change to Int64
user_df['username_length'] = user_df['username'].apply(lambda x: len(x) if pd.notnull(x) else pd.NA).astype(float)  # Change to Int64
user_df['username_se'] = user_df['username'].apply(calculate_entropy).astype('Float64') # Change to Float64


# Screenname Features
user_df['screenname_uppercase'] = user_df['screenname'].apply(lambda x: sum(1 for c in x if c.isupper()) if pd.notnull(x) else pd.NA).astype('Float64')  # Change to Int64
user_df['screenname_lowercase'] = user_df['screenname'].apply(lambda x: sum(1 for c in x if c.islower()) if pd.notnull(x) else pd.NA).astype('Float64')  # Change to Int64
user_df['screenname_numeric'] = user_df['screenname'].apply(lambda x: sum(1 for c in x if c.isdigit()) if pd.notnull(x) else pd.NA).astype('Float64')  # Change to Int64
user_df['screenname_special'] = user_df['screenname'].apply(lambda x: sum(1 for c in x if not c.isalnum()) if pd.notnull(x) else pd.NA).astype('Float64')  # Change to Int64
user_df['screenname_length'] = user_df['screenname'].apply(lambda x: len(x) if pd.notnull(x) else pd.NA).astype('Float64')  # Change to Int64
user_df['screenname_se'] = user_df['screenname'].apply(calculate_entropy).astype(float) # Change to Float64
user_df['screenname_emoji'] = user_df['screenname'].apply(
    lambda x: sum(1 for c in x if c in emoji.EMOJI_DATA) if pd.notnull(x) else pd.NA).astype('Float64') # Change to Int64
user_df['screenname_hashtag'] = user_df['screenname'].apply(
    lambda x: sum(1 for c in x if c == '#') if pd.notnull(x) else pd.NA).astype('Float64')

# New feature: Count number of words (split by spaces) in the screenname
user_df['screenname_word'] = user_df['screenname'].apply(
    lambda x: len(x.split()) if pd.notnull(x) else pd.NA).astype('Float64')


# Description Features
user_df['description_length'] = user_df['description'].apply(lambda x: len(x) if pd.notnull(x) else pd.NA).astype('Float64') # Change to Int64

# User Metadata Features
# Calculate follow ratio with null checks and handle division by zero
user_df['user_md_follow_ratio'] = user_df.apply(
    lambda row: 0 if pd.isnull(row['user_md_follower']) or pd.isnull(row['user_md_following']) or row['user_md_following'] == 0 
    else row['user_md_follower'] / row['user_md_following'], axis=1
).astype('Float64')  # Convert to Float64

### Engineer Text Features

In [139]:
# Load the sentiment-analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

# Ensure no NaN values and strip whitespace
post_features['post_text'] = post_features['post_text'].fillna('').str.strip()

# Use tqdm to apply sentiment analysis with a progress bar
tqdm.pandas()  # Initialize tqdm for pandas

# Analyze sentiment for each post with progress tracking
sentiment_results = post_features['post_text'].progress_apply(sentiment_pipeline)

# Extract sentiment score
post_features['post_sentiment_score'] = sentiment_results.apply(lambda x: x[0]['score'])

# Map sentiment labels to numerical values
sentiment_mapping = {
    'POSITIVE': 1,
    'NEGATIVE': -1,
    'NEUTRAL': 0
}
post_features['post_sentiment_numeric'] = (sentiment_results.apply(lambda x: sentiment_mapping.get(x[0]['label'], 0))).astype('Float64')

post_features['post_text_length'] = post_features['post_text'].apply(len).astype('Float64')

# # Select only the specified columns for the final dataset
# train_data_text = train_data_text[text_dataset_columns]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


  0%|          | 0/2 [00:00<?, ?it/s]

### Aggregate Text Features

In [None]:
def safe_mean(series):
    # Return NaN if the series is empty
    if series.empty or series.isnull().all():
        return pd.na

    # Replace negative values with 0
    valid_values = series.where(series >= 0, 0)

    return valid_values.mean()  # Return the mean of the modified values

# Function to safely compute standard deviation, returning 0 if only one valid value exists
def safe_std(series):
    # Return NaN if the series is empty
    if series.empty or series.isnull().all():
        return pd.na

    # Replace negative values with 0
    valid_values = series.where(series >= 0, 0)

    if len(valid_values) <= 1:  # If there's only one value or none
        return 0.0  # Return 0 for std deviation in such cases

    return valid_values.std(ddof=0)  # Population std deviation
def safe_std_sentiment(series):
    if series.empty or series.isnull().all():
        return np.nan

    if len(series) <= 1:  # If there's only one value or none
        return 0.0  # Return 0 for std deviation in such cases

    return series.std(ddof=0)  # Population std deviation



# Aggregation functions including proportion of positive and negative sentiment
aggregation_functions = {
    'post_md_like': [safe_mean, safe_std],
    'post_md_retweet': [safe_mean, safe_std],
    'post_md_reply': [safe_mean, safe_std],
    'post_md_quote': [safe_mean, safe_std],
    'post_text_length': [safe_mean, safe_std],
    'post_sentiment_score': ['mean', safe_std_sentiment],
    'post_sentiment_numeric': ['mean', safe_std_sentiment,
        ('prop_positive', lambda x: (x == 1).mean() if x.notna().any() else np.nan),
        ('prop_negative', lambda x: (x == -1).mean() if x.notna().any() else np.nan)
    ]
}

# Group by user_id and aggregate using the functions
aggregated_df = post_features.groupby('user_id').agg(aggregation_functions)

# Custom column renaming to handle function names and aggregation names
new_columns = []
for col in aggregated_df.columns:
    feature_name = col[0]
    aggregation_name = col[1]

    if callable(aggregation_name):  # If the second element is a function, name it based on the function
        aggregation_name = aggregation_name.__name__

    new_columns.append(f"{feature_name}_{aggregation_name}")

# Apply the new column names
aggregated_df.columns = new_columns

# Reset index to make user_id a column
aggregated_df.reset_index(inplace=True)

aggregated_df = aggregated_df.rename(columns={'post_md_like_safe_mean': 'post_md_like_mean',
                                              'post_md_like_safe_std': 'post_md_like_std',
                                              'post_md_retweet_safe_mean': 'post_md_retweet_mean',
                                              'post_md_retweet_safe_std': 'post_md_retweet_std',
                                              'post_md_reply_safe_mean': 'post_md_reply_mean',
                                              'post_md_reply_safe_std': 'post_md_reply_std',
                                              'post_md_quote_safe_mean': 'post_md_quote_mean',
                                              'post_md_quote_safe_std': 'post_md_quote_std',
                                              'post_text_length_safe_mean': 'post_text_length_mean',
                                              'post_text_length_safe_std': 'post_text_length_std',
                                              'post_sentiment_score_safe_std_sentiment': 'post_sentiment_score_std',
                                              'post_sentiment_numeric_safe_std_sentiment': 'post_sentiment_numeric_std'})

### Aggregate User and Text features

In [154]:
dataset_columns_aggregated = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
                    'username_numeric', 'username_special', 'username_length', 'username_se',
                    'screenname', 'screenname_uppercase', 'screenname_lowercase',
                    'screenname_numeric', 'screenname_special', 'screenname_length',
                    'screenname_se', 'screenname_emoji', 'screenname_hashtag',
                    'screenname_word', 'description', 'description_length',
                    'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                    'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                    'user_md_protected', 'post_md_like_mean', 'post_md_like_std',
                    'post_md_retweet_mean', 'post_md_retweet_std', 'post_md_reply_mean',
                    'post_md_reply_std', 'post_md_quote_mean', 'post_md_quote_std',
                    'post_text_length_mean', 'post_text_length_std',
                    'post_sentiment_score_mean', 'post_sentiment_score_std',
                    'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                    'post_sentiment_numeric_prop_positive',
                    'post_sentiment_numeric_prop_negative']

# Ensure both DataFrames have 'user_id' as the key for merging
combined_df = pd.merge(user_df, aggregated_df, on='user_id', how='left')

combined_df = combined_df[dataset_columns_aggregated]

   user_id username  username_uppercase  username_lowercase  username_numeric  \
0        1  user123                 0.0                 4.0               3.0   

   username_special  username_length  username_se   screenname  \
0               0.0              7.0     2.807355  user_screen   

   screenname_uppercase  ...  post_md_quote_mean  post_md_quote_std  \
0                   0.0  ...                 1.5                0.5   

   post_text_length_mean  post_text_length_std  post_sentiment_score_mean  \
0                   21.0                   1.0                   0.984934   

   post_sentiment_score_std  post_sentiment_numeric_mean  \
0                  0.003338                         -1.0   

   post_sentiment_numeric_std post_sentiment_numeric_prop_positive  \
0                         0.0                                  0.0   

   post_sentiment_numeric_prop_negative  
0                                   1.0  

[1 rows x 43 columns]


## Feature Engineering

### Unlabeled TikTok data feature engineering

In [None]:
tiktok_data = pd.read_excel('./drive/MyDrive/Datasets/2-Train_2/2020 TikTok Bots and Clickworker/CSV/accounts.xlsx')

tiktok_unlabeled = tiktok_data[tiktok_data['IsABot'].isna()]

print (tiktok_unlabeled)

                                         Id  IsAccountPrivate  IsVerified  \
1      9A60337F-64D3-468F-A20F-0017CD58C59A              True       False   
4      4026CA43-1F0D-42C7-AA7F-001E55ECB80A             False       False   
7      0CA73C78-D1DF-4508-86CC-0044299A2560             False       False   
8      CC93AC1E-C388-4FCC-8742-0046662A0B01             False       False   
9      763D9B3F-55C4-44C2-87FB-004BF1A97D40              True       False   
...                                     ...               ...         ...   
10114  F6582805-A196-436D-AF73-FFD82A8F1A6E             False       False   
10115  45A93F86-D044-4DC0-8127-FFD91AC7810C             False       False   
10116  B57EC2B8-F088-40FA-9D78-FFDDBB506C68             False       False   
10117  604FA15E-5515-4E4C-9A8D-FFEC1BE5A751             False       False   
10120  C6020ECB-91AF-4E84-8F5A-FFFAEFC433FB             False       False   

      HasProfilePicture  AnzahlFolgeIch  AnzahlFollower AnzahlLikes  \
1   

In [None]:
import pandas as pd

# Select the required TikTok features
tiktok_features = ['Id', 'VerhaeltnisFolgeIchProFollower', 'AnzahlPosts',
                   'NumberOfLikedVideos', 'IsVerified', 'IsAccountPrivate']

# Filter only rows whose 'Id' (user_id) contains letters
tiktok_data_with_letters = tiktok_unlabeled[tiktok_unlabeled['Id'].str.contains('[A-Za-z]', na=False)]

# Select the required columns
tiktok_data_selected = tiktok_data_with_letters[tiktok_features]

# Randomly sample 7,140 records
sampled_tiktok_data = tiktok_data_selected.sample(n=7100, random_state=42)

# Define the new feature names
features = ['user_id', 'user_md_follow_ratio', 'user_md_total_post',
            'user_md_total_like', 'user_md_verified', 'user_md_protected']

# Create a dictionary to map the TikTok columns to your desired feature names
rename_mapping = {
    'Id': 'user_id',
    'VerhaeltnisFolgeIchProFollower': 'user_md_follow_ratio',
    'AnzahlPosts': 'user_md_total_post',
    'NumberOfLikedVideos': 'user_md_total_like',
    'IsVerified': 'user_md_verified',
    'IsAccountPrivate': 'user_md_protected'
}

# Rename the columns
sampled_tiktok_data = sampled_tiktok_data.rename(columns=rename_mapping)

# Display the first few rows to verify
print(sampled_tiktok_data)

print (sampled_tiktok_data.dtypes)

# sampled_tiktok_data.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/tiktok_unlabeled_7140.csv', index=False)

                                   user_id  user_md_follow_ratio  \
3916  10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4             35.333333   
4898  5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC             52.250000   
9185  83C07A5E-0F5E-4519-A1FF-E87CEFC905D3             14.000000   
4459  CF33854C-298F-4C7F-AB77-6F7631B59FC8             17.500000   
8322  BB1A148A-380B-411C-B053-D3256C5BEF4C             31.727273   
...                                    ...                   ...   
187   B17C1427-0C93-4BB1-B0BD-0523B0161B79              6.222222   
4157  58E86FB3-27E8-4673-BC8B-673C02C8F9EF              2.947368   
4528  72BF16D0-2D7D-4659-8A17-712C18786C79             17.166667   
4908  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             35.090909   
8839  16DEF46E-65AD-4CB8-B144-DFE751386965             80.645161   

     user_md_total_post  user_md_total_like  user_md_verified  \
3916                 20                 0.0             False   
4898                  0                 0.0          

In [None]:
import pandas as pd

# Convert the 'user_md_total_post' column to numeric, forcing invalid values to NaN
sampled_tiktok_data['user_md_total_post'] = pd.to_numeric(sampled_tiktok_data['user_md_total_post'], errors='coerce')

# Explicitly cast to float (even if it could be integers)
sampled_tiktok_data['user_md_total_post'] = sampled_tiktok_data['user_md_total_post'].astype('float64')

# Confirm the conversion
print(sampled_tiktok_data.dtypes)

# Display the first few rows to verify
print(sampled_tiktok_data[['user_id', 'user_md_total_post']].head())


user_id                  object
user_md_follow_ratio    float64
user_md_total_post      float64
user_md_total_like      float64
user_md_verified           bool
user_md_protected          bool
dtype: object
                                   user_id  user_md_total_post
3916  10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4                20.0
4898  5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC                 0.0
9185  83C07A5E-0F5E-4519-A1FF-E87CEFC905D3                 8.0
4459  CF33854C-298F-4C7F-AB77-6F7631B59FC8                 2.0
8322  BB1A148A-380B-411C-B053-D3256C5BEF4C                 0.0


In [None]:
user_columns_unlabeled = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
                    'username_numeric', 'username_special', 'username_length', 'username_se',
                    'screenname', 'screenname_uppercase', 'screenname_lowercase',
                    'screenname_numeric', 'screenname_special', 'screenname_length',
                    'screenname_se', 'screenname_emoji', 'screenname_hashtag',
                    'screenname_word', 'description', 'description_length',
                    'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                    'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                    'user_md_protected']

post_columns_unlabeled = ['post_md_like_mean', 'post_md_like_std',
                    'post_md_retweet_mean', 'post_md_retweet_std', 'post_md_reply_mean',
                    'post_md_reply_std', 'post_md_quote_mean', 'post_md_quote_std',
                    'post_text_length_mean', 'post_text_length_std',
                    'post_sentiment_score_mean', 'post_sentiment_score_std',
                    'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                    'post_sentiment_numeric_prop_positive',
                    'post_sentiment_numeric_prop_negative']


# Function to calculate string entropy
def calculate_entropy(s):
    if pd.isna(s):  # Handle null values
        return pd.NA

    # Strip leading and trailing whitespace
    s = s.strip()

    # Check if the stripped string is empty after removing whitespace
    if pd.isna(s):  # If the string is empty, return pd.NA
        return pd.NA

    # Calculate frequency of each character
    freq = {}
    for char in s:
        freq[char] = freq.get(char, 0) + 1

    # Calculate probability of each character
    probabilities = [count / len(s) for count in freq.values()]

    # Calculate entropy
    entropy = -sum(p * np.log2(p) for p in probabilities if p > 0)
    return entropy

# Function to convert to boolean or mapped float value
def to_boolean(value):
    if pd.isna(value):  # Handle null values
        return float('nan')  # Return NaN for missing values
    if isinstance(value, bool):
        return 1.0 if value else 0.0  # Map True to 1.0, False to 2.0

    if isinstance(value, str):
        value = value.strip()

    if value in [1, "1", "yes", "True", True]:  # Check for truthy values
        return 1.0  # True -> 1.0
    if value in [0, "0", "no", "False", False]:  # Check for falsy values
        return 0.0  # False -> 0.0
    return float('nan')  # Return NaN for unexpected values

# User Metadata Features
sampled_tiktok_data['user_md_verified'] = sampled_tiktok_data['user_md_verified'] = sampled_tiktok_data['user_md_verified'].map(to_boolean)
sampled_tiktok_data['user_md_protected'] = sampled_tiktok_data['user_md_protected'] = sampled_tiktok_data['user_md_protected'].map(to_boolean)

for col in user_columns_unlabeled:
    if col not in sampled_tiktok_data.columns:
        sampled_tiktok_data[col] = np.nan  # Add missing columns and fill with NaN

sampled_tiktok_data = sampled_tiktok_data[user_columns_unlabeled]

print (sampled_tiktok_data)

print (sampled_tiktok_data.dtypes)

                                   user_id  username  username_uppercase  \
3916  10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN                 NaN   
4898  5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC       NaN                 NaN   
9185  83C07A5E-0F5E-4519-A1FF-E87CEFC905D3       NaN                 NaN   
4459  CF33854C-298F-4C7F-AB77-6F7631B59FC8       NaN                 NaN   
8322  BB1A148A-380B-411C-B053-D3256C5BEF4C       NaN                 NaN   
...                                    ...       ...                 ...   
187   B17C1427-0C93-4BB1-B0BD-0523B0161B79       NaN                 NaN   
4157  58E86FB3-27E8-4673-BC8B-673C02C8F9EF       NaN                 NaN   
4528  72BF16D0-2D7D-4659-8A17-712C18786C79       NaN                 NaN   
4908  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN                 NaN   
8839  16DEF46E-65AD-4CB8-B144-DFE751386965       NaN                 NaN   

      username_lowercase  username_numeric  username_special  username_length  \
3916  

In [None]:

# Load the TikTok posts data from the Excel file
tiktok_posts = pd.read_excel('./drive/MyDrive/Datasets/2-Train_2/2020 TikTok Bots and Clickworker/CSV/videos.xlsx')

# Specify the columns you want to extract from tiktok_posts
post_columns = ['AccountId', 'NumberOfLikes', 'NumberOfComments', 'VideoDescriptionLength']

# Filter to retain only the relevant columns
tiktok_posts_filtered = tiktok_posts[post_columns]

# Create an empty DataFrame to store the extracted data
extracted_post_data = pd.DataFrame()

# Iterate over each user_id in the sampled_tiktok_data
for user_id in sampled_tiktok_data['user_id']:
    # Filter the post data for the current user_id
    user_post_data = tiktok_posts_filtered[tiktok_posts_filtered['AccountId'] == user_id]

    # Limit to a maximum of 20 rows per user_id
    user_post_data_limited = user_post_data.head(20)

    # Append the limited data to the extracted_post_data DataFrame
    extracted_post_data = pd.concat([extracted_post_data, user_post_data_limited])

# Display the extracted post data for verification
print(extracted_post_data)

                                  AccountId  NumberOfLikes NumberOfComments  \
348    10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            3.0                0   
2480   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            0.0                0   
4292   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            2.0                0   
5117   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            2.0                0   
6828   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4            1.0                0   
...                                     ...            ...              ...   
49390  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            3.0                0   
50489  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            1.0                0   
61918  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            2.0                0   
62203  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            0.0                0   
68168  98E48D15-3AA7-412A-A7DD-7B9F979DD33A            1.0                0   

       VideoDescriptionLength  
348                

In [None]:
extracted_post_data = extracted_post_data.rename(columns={'AccountId': 'user_id',
                                                          'NumberOfLikes': 'post_md_like',
                                                          'NumberOfComments': 'post_md_reply',
                                                          'VideoDescriptionLength': 'post_text_length'})

print (extracted_post_data)

                                    user_id  post_md_like post_md_reply  \
348    10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           3.0             0   
2480   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           0.0             0   
4292   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           2.0             0   
5117   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           2.0             0   
6828   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4           1.0             0   
...                                     ...           ...           ...   
49390  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           3.0             0   
50489  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           1.0             0   
61918  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           2.0             0   
62203  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           0.0             0   
68168  98E48D15-3AA7-412A-A7DD-7B9F979DD33A           1.0             0   

       post_text_length  
348                 0.0  
2480                0.0  
4292                0

In [None]:
unlabeled_posts = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts.csv')

In [None]:
for col in unlabeled_posts.columns:
    if col not in extracted_post_data.columns:
        extracted_post_data[col] = np.nan  # Add missing columns and fill with NaN

extracted_post_data = extracted_post_data[unlabeled_posts.columns]

print (extracted_post_data)

                                    user_id  username  post_md_like  \
348    10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           3.0   
2480   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           0.0   
4292   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           2.0   
5117   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           2.0   
6828   10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4       NaN           1.0   
...                                     ...       ...           ...   
49390  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           3.0   
50489  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           1.0   
61918  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           2.0   
62203  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           0.0   
68168  98E48D15-3AA7-412A-A7DD-7B9F979DD33A       NaN           1.0   

       post_md_retweet post_md_reply  post_md_quote  post_text  \
348                NaN             0            NaN        NaN   
2480           

In [None]:
combined_unlabeled_posts = pd.concat([unlabeled_posts, extracted_post_data], ignore_index=True)

print (combined_unlabeled_posts)

combined_unlabeled_posts.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_UPDATED.csv', index=False)

                                     user_id        username  post_md_like  \
0                                 u234450632  REALISE_innOV8           0.0   
1                       u1465728402372059144   DAIRInstitute           0.0   
2                                u2822124414    jaredliangtw           0.0   
3                                u2315526308    nimmirastogi           0.0   
4                                 u330562292   ubergasmonkey           1.0   
...                                      ...             ...           ...   
147156  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           3.0   
147157  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   
147158  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           2.0   
147159  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           0.0   
147160  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   

        post_md_retweet post_md_reply  post_md_quote  \
0      

In [None]:
import pandas as pd

# Step 1: Remove last 40 rows from the first 7140 rows and first 40 rows from the last 7140 rows
# Remove the last 40 rows from the first subset (up to row 7139)
removed_first_subset = unlabeled_users.iloc[7100:7140]

# Remove the first 40 rows from the last subset (starting from the last 7140 rows)
removed_last_subset = unlabeled_users.iloc[-7140:-7100]

# Step 2: Combine the removed rows into a separate DataFrame
removed_accounts = pd.concat([removed_first_subset, removed_last_subset], ignore_index=True)

# Display the removed accounts
print("Removed Accounts:")
print(removed_accounts)

unlabeled_users_filtered = unlabeled_users.drop(removed_first_subset.index).drop(removed_last_subset.index)

print (unlabeled_users_filtered)

Removed Accounts:
                 user_id        username  username_uppercase  \
0                    NaN             NaN                 NaN   
1                    NaN             NaN                 NaN   
2                    NaN             NaN                 NaN   
3                    NaN             NaN                 NaN   
4                    NaN             NaN                 NaN   
..                   ...             ...                 ...   
75  u1484652566885580801        yciejxla                 0.0   
76  u1124604567663120385   NanotechEvent                 2.0   
77            u392930821  GiorgioCafiero                 2.0   
78            u460107942     CustomPCMag                 4.0   
79   u827046914285830144  emanuelewolves                 0.0   

    username_lowercase  username_numeric  username_special  username_length  \
0                  NaN               NaN               NaN              NaN   
1                  NaN               NaN               

In [None]:
# Step 3: Get the unique user IDs of the removed accounts
removed_user_ids = removed_accounts['user_id'].unique()

# Step 4: Filter the 'unlabeled_posts' to remove posts related to the removed accounts
filtered_unlabeled_posts = combined_unlabeled_posts[~combined_unlabeled_posts['user_id'].isin(removed_user_ids)]

# Display the filtered posts
print("Filtered Unlabeled Posts:")
print(filtered_unlabeled_posts)

Filtered Unlabeled Posts:
                                     user_id        username  post_md_like  \
0                                 u234450632  REALISE_innOV8           0.0   
1                       u1465728402372059144   DAIRInstitute           0.0   
2                                u2822124414    jaredliangtw           0.0   
3                                u2315526308    nimmirastogi           0.0   
4                                 u330562292   ubergasmonkey           1.0   
...                                      ...             ...           ...   
147156  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           3.0   
147157  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   
147158  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           2.0   
147159  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           0.0   
147160  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   

        post_md_retweet post_md_reply

In [None]:
# Combine the sampled TikTok data with the filtered unlabeled accounts
combined_unlabeled_data = pd.concat([sampled_tiktok_data, unlabeled_users_filtered], ignore_index=True)

# Display the combined data
print("Combined Data:")
print(combined_unlabeled_data)

# Optionally, save the combined data to a CSV file
# combined_data.to_csv('combined_unlabeled_accounts.csv', index=False)


Combined Data:
                                    user_id        username  \
0      10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4             NaN   
1      5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC             NaN   
2      83C07A5E-0F5E-4519-A1FF-E87CEFC905D3             NaN   
3      CF33854C-298F-4C7F-AB77-6F7631B59FC8             NaN   
4      BB1A148A-380B-411C-B053-D3256C5BEF4C             NaN   
...                                     ...             ...   
21295                   u958807016520192008   HackYFutureBE   
21296                  u1486053324554227715   Empirerising4   
21297                  u1145483306563244032  _sheenavasquez   
21298                  u1265061521597648897      Ieomessiok   
21299                           u1010593116      SharkawyMD   

       username_uppercase  username_lowercase  username_numeric  \
0                     NaN                 NaN               NaN   
1                     NaN                 NaN               NaN   
2                     NaN  

In [None]:
# Check if user_id in filtered_unlabeled_posts exists in combined_unlabeled_data
filtered_unlabeled_posts['user_exists'] = filtered_unlabeled_posts['user_id'].isin(combined_unlabeled_data['user_id'])

# Filter rows where user_id exists in combined_unlabeled_data
posts_with_existing_users = filtered_unlabeled_posts[filtered_unlabeled_posts['user_exists']]

# Optionally, filter rows where user_id doesn't exist in combined_unlabeled_data (for validation or further actions)
posts_without_existing_users = filtered_unlabeled_posts[~filtered_unlabeled_posts['user_exists']]

# Display the filtered posts with existing users
print("Posts without existing users:")
print(posts_without_existing_users)

# Optionally, save the result to CSV
# posts_with_existing_users.to_csv('posts_with_existing_users.csv', index=False)


Posts without existing users:
                           user_id username  post_md_like  post_md_retweet  \
57366  to reserve at least 40% of…      140           NaN              NaN   

      post_md_reply  post_md_quote post_text  post_text_length  user_exists  
57366           NaN            NaN       NaN               NaN        False  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_unlabeled_posts['user_exists'] = filtered_unlabeled_posts['user_id'].isin(combined_unlabeled_data['user_id'])


In [None]:
# Filter and keep only rows where user_id exists in combined_unlabeled_data
filtered_unlabeled_posts_existing = filtered_unlabeled_posts[filtered_unlabeled_posts['user_id'].isin(combined_unlabeled_data['user_id'])]

# Save the filtered posts without the missing users if needed
# filtered_unlabeled_posts_existing.to_csv('filtered_unlabeled_posts_existing.csv', index=False)

# Overwrite the original DataFrame if desired
filtered_unlabeled_posts = filtered_unlabeled_posts_existing

print (filtered_unlabeled_posts)

filtered_unlabeled_posts.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_UPDATED.csv')

combined_unlabeled_data.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_accounts_UPDATED.csv')

                                     user_id        username  post_md_like  \
0                                 u234450632  REALISE_innOV8           0.0   
1                       u1465728402372059144   DAIRInstitute           0.0   
2                                u2822124414    jaredliangtw           0.0   
3                                u2315526308    nimmirastogi           0.0   
4                                 u330562292   ubergasmonkey           1.0   
...                                      ...             ...           ...   
147156  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           3.0   
147157  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   
147158  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           2.0   
147159  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           0.0   
147160  98E48D15-3AA7-412A-A7DD-7B9F979DD33A             NaN           1.0   

        post_md_retweet post_md_reply  post_md_quote  \
0      

In [None]:
print (combined_unlabeled_data)

                                    user_id        username  \
0      10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4             NaN   
1      5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC             NaN   
2      83C07A5E-0F5E-4519-A1FF-E87CEFC905D3             NaN   
3      CF33854C-298F-4C7F-AB77-6F7631B59FC8             NaN   
4      BB1A148A-380B-411C-B053-D3256C5BEF4C             NaN   
...                                     ...             ...   
21295                   u958807016520192008   HackYFutureBE   
21296                  u1486053324554227715   Empirerising4   
21297                  u1145483306563244032  _sheenavasquez   
21298                  u1265061521597648897      Ieomessiok   
21299                           u1010593116      SharkawyMD   

       username_uppercase  username_lowercase  username_numeric  \
0                     NaN                 NaN               NaN   
1                     NaN                 NaN               NaN   
2                     NaN                 

In [None]:
unlabeled_post_sentiment = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_SENTIMENT.csv')

print (unlabeled_post_sentiment.dtypes)

user_id                    object
username                   object
post_md_like              float64
post_md_retweet           float64
post_md_reply             float64
post_md_quote             float64
post_text                  object
post_text_length          float64
user_exists                  bool
post_sentiment_score      float64
post_sentiment_numeric      int64
dtype: object


  unlabeled_post_sentiment = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_SENTIMENT.csv')


In [None]:
unlabeled_post_sentiment = unlabeled_post_sentiment.sort_values(by=['user_id'])

for index, row in unlabeled_post_sentiment.iterrows():
  if pd.isna(row['post_text']):
    unlabeled_post_sentiment.loc[index, ['post_sentiment_score', 'post_sentiment_numeric']] = [np.nan, np.nan]



unlabeled_post_sentiment.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_SENTIMENT.csv', index=False)


In [None]:
# unlabeled_post_sentiment = unlabeled_post_sentiment.drop(columns=['user_exists'])
unlabeled_post_sentiment.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_SENTIMENT.csv', index=False)

In [None]:
print (unlabeled_post_sentiment)

                                     user_id        username  post_md_like  \
145514  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN           2.0   
145516  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN           5.0   
145515  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN           2.0   
145519  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN           0.0   
145513  0001B2D9-59A7-464E-9427-704DAB4677A7             NaN          11.0   
...                                      ...             ...           ...   
71485                    u999492441513148416  timevalueofbtc           0.0   
71054                    u999492441513148416  timevalueofbtc           0.0   
48037                    u999492441513148416  timevalueofbtc           0.0   
62112                    u999492441513148416  timevalueofbtc          14.0   
56582                    u999492441513148416  timevalueofbtc          51.0   

        post_md_retweet  post_md_reply  post_md_quote  \
145514

### Aggregate Unlabeled Text Data

In [None]:
# Sample post data DataFrame for demonstration
# Assuming your post data is loaded into a DataFrame named 'train_data_text_updated'

def safe_mean(series):
    # Return NaN if the series is empty
    if series.empty or series.isnull().all():
        return np.nan

    # Replace negative values with 0
    valid_values = series.where(series >= 0, 0)

    return valid_values.mean()  # Return the mean of the modified values

# Function to safely compute standard deviation, returning 0 if only one valid value exists
def safe_std(series):
    # Return NaN if the series is empty
    if series.empty or series.isnull().all():
        return np.nan

    # Replace negative values with 0
    valid_values = series.where(series >= 0, 0)

    if len(valid_values) <= 1:  # If there's only one value or none
        return 0.0  # Return 0 for std deviation in such cases

    return valid_values.std(ddof=0)  # Population std deviation

def safe_std_sentiment(series):
    if series.empty or series.isnull().all():
        return np.nan

    if len(series) <= 1:  # If there's only one value or none
        return 0.0  # Return 0 for std deviation in such cases

    return series.std(ddof=0)  # Population std deviation



# Aggregation functions including proportion of positive and negative sentiment
aggregation_functions = {
    'post_md_like': [safe_mean, safe_std],
    'post_md_retweet': [safe_mean, safe_std],
    'post_md_reply': [safe_mean, safe_std],
    'post_md_quote': [safe_mean, safe_std],
    'post_text_length': [safe_mean, safe_std],
    'post_sentiment_score': ['mean', safe_std_sentiment],
    'post_sentiment_numeric': ['mean', safe_std_sentiment,
        ('prop_positive', lambda x: (x == 1).mean() if x.notna().any() else np.nan),
        ('prop_negative', lambda x: (x == -1).mean() if x.notna().any() else np.nan)
    ]
}

# Group by user_id and aggregate using the functions
aggregated_df = unlabeled_post_sentiment.groupby('user_id').agg(aggregation_functions)

# Custom column renaming to handle function names and aggregation names
new_columns = []
for col in aggregated_df.columns:
    feature_name = col[0]
    aggregation_name = col[1]

    if callable(aggregation_name):  # If the second element is a function, name it based on the function
        aggregation_name = aggregation_name.__name__

    new_columns.append(f"{feature_name}_{aggregation_name}")

# Apply the new column names
aggregated_df.columns = new_columns

# Reset index to make user_id a column
aggregated_df.reset_index(inplace=True)

aggregated_df = aggregated_df.rename(columns={'post_md_like_safe_mean': 'post_md_like_mean',
                                              'post_md_like_safe_std': 'post_md_like_std',
                                              'post_md_retweet_safe_mean': 'post_md_retweet_mean',
                                              'post_md_retweet_safe_std': 'post_md_retweet_std',
                                              'post_md_reply_safe_mean': 'post_md_reply_mean',
                                              'post_md_reply_safe_std': 'post_md_reply_std',
                                              'post_md_quote_safe_mean': 'post_md_quote_mean',
                                              'post_md_quote_safe_std': 'post_md_quote_std',
                                              'post_text_length_safe_mean': 'post_text_length_mean',
                                              'post_text_length_safe_std': 'post_text_length_std',
                                              'post_sentiment_score_safe_std_sentiment': 'post_sentiment_score_std',
                                              'post_sentiment_numeric_safe_std_sentiment': 'post_sentiment_numeric_std'})

# Display the aggregated DataFrame
print(aggregated_df)

# Save the aggregated DataFrame to CSV
aggregated_df.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_AGGREGATED.csv', index=False)

                                   user_id  post_md_like_mean  \
0     0001B2D9-59A7-464E-9427-704DAB4677A7           6.500000   
1     0006C595-1722-4D1C-9438-E1698690D71F          12.000000   
2     0008C48D-E99A-4E6A-AF0C-86E8CE896D0B          12.950000   
3     000A53DD-11AB-471F-9992-77B9A32CF0F9           0.650000   
4     00118AEC-44A5-4965-BFB5-7D8698F5F1CA          11.950000   
...                                    ...                ...   
6859                   u994843692660862976           0.300000   
6860                   u995369593744035840         562.800000   
6861                              u9963832         211.285714   
6862                            u997150783         100.000000   
6863                   u999492441513148416          11.200000   

      post_md_like_std  post_md_retweet_mean  post_md_retweet_std  \
0            16.363068                   NaN                  NaN   
1             1.000000                   NaN                  NaN   
2           

In [None]:
unlabeled_accounts = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_accounts_UPDATED.csv')
unlabeled_posts = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_posts_AGGREGATED.csv')

In [None]:
unlabeled_dataset_columns_aggregated = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
                    'username_numeric', 'username_special', 'username_length', 'username_se',
                    'screenname', 'screenname_uppercase', 'screenname_lowercase',
                    'screenname_numeric', 'screenname_special', 'screenname_length',
                    'screenname_se', 'screenname_emoji', 'screenname_hashtag',
                    'screenname_word', 'description', 'description_length',
                    'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                    'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                    'user_md_protected', 'post_md_like_mean', 'post_md_like_std',
                    'post_md_retweet_mean', 'post_md_retweet_std', 'post_md_reply_mean',
                    'post_md_reply_std', 'post_md_quote_mean', 'post_md_quote_std',
                    'post_text_length_mean', 'post_text_length_std',
                    'post_sentiment_score_mean', 'post_sentiment_score_std',
                    'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                    'post_sentiment_numeric_prop_positive',
                    'post_sentiment_numeric_prop_negative']

# Ensure both DataFrames have 'user_id' as the key for merging
combined_df = pd.merge(unlabeled_accounts, unlabeled_posts, on='user_id', how='left')

combined_df = combined_df[unlabeled_dataset_columns_aggregated]

# Display the combined DataFrame
print(combined_df)

combined_df.to_csv('./drive/MyDrive/Datasets/FINAL/Unlabeled/unlabeled_accounts_AGGREGATED.csv', index=False)

                                    user_id        username  \
0      10832E7F-2B0F-42CE-B4DC-611FFD1FB4F4             NaN   
1      5FE82B30-ED1E-4A13-8D5C-7B6CD61C21CC             NaN   
2      83C07A5E-0F5E-4519-A1FF-E87CEFC905D3             NaN   
3      CF33854C-298F-4C7F-AB77-6F7631B59FC8             NaN   
4      BB1A148A-380B-411C-B053-D3256C5BEF4C             NaN   
...                                     ...             ...   
21295                   u958807016520192008   HackYFutureBE   
21296                  u1486053324554227715   Empirerising4   
21297                  u1145483306563244032  _sheenavasquez   
21298                  u1265061521597648897      Ieomessiok   
21299                           u1010593116      SharkawyMD   

       username_uppercase  username_lowercase  username_numeric  \
0                     NaN                 NaN               NaN   
1                     NaN                 NaN               NaN   
2                     NaN                 

In [4]:
user_train_data_labeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_accounts_AGGREGATED.csv')

# print (user_train_data_labeled.columns)

# print (user_train_data_labeled.dtypes)

In [105]:
# user_train_data_labeled = pd.read_csv('./drive/MyDrive/Datasets/FINAL/Labeled/labeled_accounts_AGGREGATED.csv')

# print (train_data)

features = ['user_id', 'username', 'username_uppercase', 'username_lowercase',
            'username_numeric', 'username_special', 'username_length', 'username_se',
            'screenname', 'screenname_uppercase', 'screenname_lowercase',
            'screenname_numeric', 'screenname_special', 'screenname_length',
            'screenname_se', 'screenname_emoji', 'screenname_hashtag',
            'screenname_word', 'description', 'description_length',
            'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
            'user_md_total_post', 'user_md_total_like', 'user_md_verified',
            'user_md_protected', 'post_md_like_mean',
            'post_md_like_std', 'post_md_retweet_mean', 'post_md_retweet_std',
            'post_md_reply_mean', 'post_md_reply_std', 'post_md_quote_mean',
            'post_md_quote_std', 'post_text_length_mean', 'post_text_length_std',
            'post_sentiment_score_mean', 'post_sentiment_score_std',
            'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
            'post_sentiment_numeric_prop_positive',
            'post_sentiment_numeric_prop_negative']

### Initial training of model_1

In [106]:
# Assign the entire labeled dataset to X_train and y_train
X = user_train_data_labeled[features]  # All feature columns from the labeled dataset
y = user_train_data_labeled['label']    # Label column from the labeled dataset

# Split the dataset into training and testing sets (80% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Initialize an empty dictionary to store models and validation results
models_1 = {}

# Train separate models for each feature set
feature_sets = {
    'username': ['username_uppercase', 'username_lowercase', 'username_numeric',
                 'username_special', 'username_length', 'username_se'],  # Add all username features
    'screenname': ['screenname_uppercase', 'screenname_lowercase',
                   'screenname_numeric', 'screenname_special', 'screenname_length',
                   'screenname_se',],  # Add all screenname features
    'description': ['description_length'],  # Add all description features
    'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                   'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                   'user_md_protected'],  # Add user metadata features
    'post_metadata': ['post_md_like_mean', 'post_md_like_std', 'post_md_retweet_mean',
                      'post_md_retweet_std', 'post_md_reply_mean', 'post_md_reply_std',
                      'post_md_quote_mean', 'post_md_quote_std'],  # Add post metadata features
    'post_text': ['post_text_length_mean', 'post_text_length_std', 'post_sentiment_score_mean',
                  'post_sentiment_score_std', 'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                  'post_sentiment_numeric_prop_positive', 'post_sentiment_numeric_prop_negative']  # Add post text features (like BERT embeddings)
}

for feature_name, feature_columns in tqdm(feature_sets.items(), desc="Training Models", ncols=100):
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train[feature_columns], y_train)
    models_1[feature_name] = rf_model

Training Models: 100%|████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.27it/s]


In [None]:
import joblib

# Save the entire dictionary of models to a single file
filename = './drive/MyDrive/Datasets/Models/Supervised_Models_10-14.joblib'
joblib.dump(models_1, filename)

print("All models have been saved successfully as a whole.")


All models have been saved successfully as a whole.


In [None]:
models_2 = {}

feature_sets_2 = {
    'username': ['username_uppercase', 'username_lowercase', 'username_numeric',
                 'username_special', 'username_length', 'username_se'],  # Add all username features
    'screenname': ['screenname_uppercase', 'screenname_lowercase',
                   'screenname_numeric', 'screenname_special', 'screenname_length',
                   'screenname_se',],  # Add all screenname features
    'description': ['description_length'],  # Add all description features
    'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                   'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                   'user_md_protected']  # Add user metadata features
}

for feature_name, feature_columns in tqdm(feature_sets_2.items(), desc="Training Models", ncols=100):
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train[feature_columns], y_train)
    models_2[feature_name] = rf_model

Training Models: 100%|████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.85it/s]


In [None]:
print (models_1)

print (models_2)

{'username': RandomForestClassifier(random_state=42), 'screenname': RandomForestClassifier(random_state=42), 'description': RandomForestClassifier(random_state=42), 'user_metadata': RandomForestClassifier(random_state=42), 'post_metadata': RandomForestClassifier(random_state=42), 'post_text': RandomForestClassifier(random_state=42)}
{'username': RandomForestClassifier(random_state=42), 'screenname': RandomForestClassifier(random_state=42), 'description': RandomForestClassifier(random_state=42), 'user_metadata': RandomForestClassifier(random_state=42)}


### Validation with weighted voting

In [None]:
print (feature_sets)

{'username': ['username_uppercase', 'username_lowercase', 'username_numeric', 'username_special', 'username_length', 'username_se'], 'screenname': ['screenname_uppercase', 'screenname_lowercase', 'screenname_numeric', 'screenname_special', 'screenname_length', 'screenname_se'], 'description': ['description_length'], 'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio', 'user_md_total_post', 'user_md_total_like', 'user_md_verified', 'user_md_protected'], 'post_metadata': ['post_md_like_mean', 'post_md_like_std', 'post_md_retweet_mean', 'post_md_retweet_std', 'post_md_reply_mean', 'post_md_reply_std', 'post_md_quote_mean', 'post_md_quote_std'], 'post_text': ['post_text_length_mean', 'post_text_length_std', 'post_sentiment_score_mean', 'post_sentiment_score_std', 'post_sentiment_numeric_mean', 'post_sentiment_numeric_std', 'post_sentiment_numeric_prop_positive', 'post_sentiment_numeric_prop_negative']}


In [107]:
# Step 2: Validation Phase
# Initialize dictionaries to store weighted probabilities
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = .80

# Step 1: Generate predictions for the validation set (X_test) using individual models with weighted voting
for feature_name, model in models_1.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user) for X_test
    completeness = X_test[feature_columns].notnull().mean(axis=1)


    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict probabilities for X_test
    probas = model.predict_proba(X_test[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Step 2: Normalize the weighted probabilities
# Avoid division by zero in case no weights were assigned
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Step 3: Assign final predictions based on aggregated weighted probabilities
final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

# Step 4: Evaluation

# Evaluate the model's performance
accuracy = accuracy_score(y_test, final_predictions)
precision = precision_score(y_test, final_predictions, pos_label=True)
recall = recall_score(y_test, final_predictions, pos_label=True)
f1 = f1_score(y_test, final_predictions, pos_label=True)
mcc = matthews_corrcoef(y_test, final_predictions)

# Print evaluation results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'MCC: {mcc:.2f}')


Accuracy: 0.85
Precision: 0.83
Recall: 0.88
F1 Score: 0.85
MCC: 0.70


### Validation with weighted voting and Platt's scaling

In [43]:
from sklearn.calibration import CalibratedClassifierCV
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Step 2: Validation Phase
# Initialize dictionaries to store weighted probabilities
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

# Step 1: Apply Platt’s scaling (calibration) to each model before validation
calibrated_models = {}
for feature_name, model in models_1.items():
    # Apply Platt's scaling using CalibratedClassifierCV
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Assuming models are already trained
    calibrated_models[feature_name] = calibrated_model

# Step 2: Generate predictions for the validation set (X_test) using calibrated models with weighted voting
for feature_name, model in calibrated_models.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user) for X_test
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict calibrated probabilities for X_test
    probas = model.predict_proba(X_test[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Step 3: Normalize the weighted probabilities
# Avoid division by zero in case no weights were assigned
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Step 4: Assign final predictions based on aggregated weighted probabilities
final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

# Step 5: Evaluation

# Evaluate the model's performance
accuracy = accuracy_score(y_test, final_predictions)
precision = precision_score(y_test, final_predictions, pos_label=True)
recall = recall_score(y_test, final_predictions, pos_label=True)
f1 = f1_score(y_test, final_predictions, pos_label=True)
mcc = matthews_corrcoef(y_test, final_predictions)

# Print evaluation results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'MCC: {mcc:.2f}')


Accuracy: 0.85
Precision: 0.83
Recall: 0.89
F1 Score: 0.86
MCC: 0.71


# Supervised Training FINAL

### Import and prepare the dataset

In [8]:
user_train_data_labeled = pd.read_csv('../data/labeled_accounts_AGGREGATED.csv')

# OPTIONAL Print current training dataset
# print (user_train_data_labeled)

In [9]:
# Step 1: Split the dataset into training and validation sets (90% train, 10% validation)
train_data, val_data = train_test_split(user_train_data_labeled, test_size=0.2, random_state=42, stratify=user_train_data_labeled['label'])

# Step 2: Save the datasets
train_data.to_csv('../data/initial_train_data.csv', index=False)
val_data.to_csv('../data/val_data.csv', index=False)


In [10]:

# Step 3: Split the features and labels for each dataset
X_train = train_data[features]  # All feature columns from the training dataset
y_train = train_data['label']    # Label column from the training dataset

X_test = val_data[features]  # All feature columns from the validation dataset
y_test = val_data['label']    # Label column from the validation dataset

### Supervised Learning with Hyperparameters FINAL

In [15]:
# Define the feature subsets for each model
feature_sets = {
    'username': ['username_uppercase', 'username_lowercase', 'username_numeric',
                 'username_special', 'username_length', 'username_se'],  # Add all username features
    'screenname': ['screenname_uppercase', 'screenname_lowercase',
                   'screenname_numeric', 'screenname_special', 'screenname_length',
                   'screenname_se'],  # Add all screenname features
    'description': ['description_length'],  # Add all description features
    'user_metadata': ['user_md_follower', 'user_md_following', 'user_md_follow_ratio',
                      'user_md_total_post', 'user_md_total_like', 'user_md_verified',
                      'user_md_protected'],  # Add user metadata features
    'post_metadata': ['post_md_like_mean', 'post_md_like_std', 'post_md_retweet_mean',
                      'post_md_retweet_std', 'post_md_reply_mean', 'post_md_reply_std',
                      'post_md_quote_mean', 'post_md_quote_std'],  # Add post metadata features
    'post_text': ['post_text_length_mean', 'post_text_length_std', 'post_sentiment_score_mean',
                  'post_sentiment_score_std', 'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
                  'post_sentiment_numeric_prop_positive', 'post_sentiment_numeric_prop_negative']  # Add post text features
}

# Define hyperparameter grid for RandomForest
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20],  # Depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples to split a node
    'min_samples_leaf': [1, 2, 5],    # Minimum samples in a leaf node
    'max_features': ['sqrt', 'log2']  # Number of features to consider at each split
}

# Initialize an empty dictionary to store models with tuned hyperparameters
tuned_models = {}

# Train and tune separate models for each feature subset
for feature_name, feature_columns in tqdm(feature_sets.items(), desc="Training Models", ncols=100):
    print(f"Training model for {feature_name} feature set...")

    # Initialize the RandomForestClassifier
    rf_model = RandomForestClassifier(random_state=42)

    # Perform Grid Search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

    # Fit the grid search to the training data (for the specific feature set)
    grid_search.fit(X_train[feature_columns], y_train)

    # Store the best model with tuned hyperparameters
    tuned_models[feature_name] = grid_search.best_estimator_

    # Print the best hyperparameters for the feature set
    print(f"Best hyperparameters for {feature_name}: {grid_search.best_params_}")

Training Models:   0%|                                                        | 0/6 [00:00<?, ?it/s]

Training model for username feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best hyperparameters for username: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 200}
Training model for screenname feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best hyperparameters for screenname: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 200}
Training model for description feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best hyperparameters for description: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100}
Training model for user_metadata feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best hyperparameters for user_metadata: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Training model for post_metadata feature set...
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best hyperparameters for post_metadata: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Training model for post_text feature set...
Fitting 5 folds for each of 

  _data = np.array(data, dtype=dtype, copy=copy,


Best hyperparameters for post_text: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100}


### Save Models

In [16]:
# Save the entire dictionary of models to a single file
filename = '../models/Initial_Models_10-26.joblib'

# Rename models if needed
joblib.dump(tuned_models, filename)

print("All models have been saved successfully as a whole.")

All models have been saved successfully as a whole.


### Validation on the validation set with Weighted Voting and Platt's Scaling FINAL

In [23]:
models_1 = joblib.load('../models/Initial_Models_10-26.joblib')

In [3]:
# Initialize arrays to accumulate weighted probabilities
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

# Initialize array for calbirated models
calibrated_models = {}

# Apply Platt's scaling to each model using CalibratedClassifierCV
for feature_name, model in models_1.items():
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Assuming models are already trained
    calibrated_models[feature_name] = calibrated_model

# Generate predictions for each model using calibrated models
for feature_name, model in calibrated_models.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user) for X_test
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict calibrated probabilities for X_test
    probas = model.predict_proba(X_test[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Normalize the weighted probabilities
# Avoid division by zero in case no weights were assigned
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Assign final predictions based on aggregated weighted probabilities
final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, final_predictions)
precision = precision_score(y_test, final_predictions, pos_label=True)
recall = recall_score(y_test, final_predictions, pos_label=True)
f1 = f1_score(y_test, final_predictions, pos_label=True)
mcc = matthews_corrcoef(y_test, final_predictions)

# Print evaluation results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'MCC: {mcc:.2f}')

NameError: name 'X_test' is not defined

## Unsupervised Learning ITERATIVE

In [10]:
train_data_unlabeled = pd.read_csv('../data/unlabeled_accounts_AGGREGATED.csv')
train_data_labeled = pd.read_csv('../data/initial_train_data.csv')

### Iterative with min max confidence

In [216]:
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

completeness_threshold = 0.80
threshold = 0.65
pseudo_label_log = []
pseudo_label_consistency = {}

moderate_conf_min = 0.40
moderate_conf_max = 0.60

def track_pseudo_label_quality(pseudo_labeled_data, previous_labels):
    # Track metrics for quality assessment
    high_conf_data = pseudo_labeled_data[pseudo_labeled_data['confidence'] > threshold]
    avg_confidence = high_conf_data['confidence'].mean()
    label_counts = high_conf_data['predicted_label'].value_counts(normalize=True)
    
    # Consistency check
    if previous_labels is not None:
        aligned_labels = previous_labels.reindex(high_conf_data.index)
        consistency_rate = (aligned_labels == high_conf_data['predicted_label']).mean()
    else:
        consistency_rate = np.nan  # First iteration has no prior labels
    
    # Log metrics
    pseudo_label_log.append({
        'iteration': len(pseudo_label_log) + 1,
        'high_conf_count': len(high_conf_data),
        'avg_confidence': avg_confidence,
        'bot_ratio': label_counts.get(1, 0),
        'human_ratio': label_counts.get(0, 0),
        'consistency_rate': consistency_rate
    })
    
    # Update previous_labels for next iteration consistency check
    previous_labels = high_conf_data['predicted_label'].copy()
    return previous_labels

# Function to iterate through unsupervised learning until stopping condition is met
def iterative_self_training(train_data_unlabeled, train_data_labeled, models_1, feature_sets, X_train, y_train, X_test, y_test, dataset_columns, threshold, completeness_threshold):
    iteration = 0
    improvements = True
    previous_confidently_labeled = 0
    previous_labels = None

    # Step 2: Apply Platt's scaling (calibration) and generate pseudo-labels for the unlabeled dataset
    calibrated_models = {}
    for feature_name, model in models_1.items():
        # Apply Platt's scaling
        calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
        calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)
        models_1[feature_name] = calibrated_model

    while improvements:
        print(f"Iteration: {iteration + 1}")

        # Step 1: Initialize arrays for weighted probabilities
        bot_prob_sum = np.zeros(len(train_data_unlabeled))
        human_prob_sum = np.zeros(len(train_data_unlabeled))
        total_weights = np.zeros(len(train_data_unlabeled))
        pseudo_labeled_data = train_data_unlabeled.copy()

        # # Step 2: Apply Platt's scaling (calibration) and generate pseudo-labels for the unlabeled dataset
        # calibrated_models = {}
        # for feature_name, model in models_1.items():
        #     # Apply Platt's scaling
        #     calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
        #     calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)
        #     calibrated_models[feature_name] = calibrated_model

        for feature_name, model in models_1.items():
            feature_columns = feature_sets[feature_name]
            completeness = train_data_unlabeled[feature_columns].notnull().mean(axis=1)
            weights = np.where(completeness >= completeness_threshold, 1.0, completeness)
            probas = model.predict_proba(train_data_unlabeled[feature_columns])
            human_prob_sum += probas[:, 0] * weights
            bot_prob_sum += probas[:, 1] * weights
            total_weights += weights

        total_weights_safe = np.where(total_weights == 0, 1, total_weights)
        avg_human_prob = human_prob_sum / total_weights_safe
        avg_bot_prob = bot_prob_sum / total_weights_safe

        for i in range(len(avg_human_prob)):
            completeness = train_data_unlabeled[feature_sets[feature_name]].notnull().mean(axis=1)[i]
            smoothing_factor = 0.5  # Adjust to control impact of completeness (0.5 gives equal weight)
    
            if avg_human_prob[i] > avg_bot_prob[i]:
                pseudo_label = 0  # Human
                confidence = smoothing_factor * avg_human_prob[i] + (1 - smoothing_factor) * completeness
            else:
                pseudo_label = 1  # Bot
                confidence = smoothing_factor * avg_bot_prob[i] + (1 - smoothing_factor) * completeness

            pseudo_labeled_data.at[i, 'predicted_label'] = pseudo_label
            pseudo_labeled_data.at[i, 'confidence'] = confidence
        
        high_confidence_data = pseudo_labeled_data[pseudo_labeled_data['confidence'] > threshold]

        # Identify uncertainty-based samples (moderately confident samples)
        moderate_conf_data = pseudo_labeled_data[
            (pseudo_labeled_data['confidence'] >= moderate_conf_min) & 
            (pseudo_labeled_data['confidence'] <= moderate_conf_max)
        ]

        # Combine high-confidence and moderate-confidence samples
        combined_data = pd.concat([high_confidence_data, moderate_conf_data])

        # Use combined data for retraining
        confidently_labeled_count = len(combined_data)

        # confidently_labeled_count = len(high_confidence_data)

        previous_labels = track_pseudo_label_quality(pseudo_labeled_data, previous_labels)

        # Log summary of pseudo-label quality across iterations
        quality_df = pd.DataFrame(pseudo_label_log)
        print(quality_df)

        print(confidently_labeled_count)
        
        # Check improvement and decide if further self-training is beneficial
        # if pseudo_label_log[-1]['avg_confidence'] < threshold or pseudo_label_log[-1]['consistency_rate'] < threshold:
        #     print("Pseudo-label quality threshold reached. Stopping self-training.")
        #     break

        if confidently_labeled_count <= previous_confidently_labeled:
            improvements = False  # Stop if no more improvements
            break
        else:
            previous_confidently_labeled = confidently_labeled_count

        # Step 3: Augment the training data
        augmented_data = high_confidence_data.copy()
        augmented_data['label'] = high_confidence_data['predicted_label'].map(lambda x: True if x == 1 else False)
        augmented_data = augmented_data.drop(columns=['predicted_label', 'confidence'])
        augmented_data = augmented_data[dataset_columns]

        # Combine original labeled data with high-confidence pseudo-labeled data
        user_train_data_labeled_augmented = pd.concat([train_data_labeled, augmented_data], ignore_index=True)
        X_train_augmented = user_train_data_labeled_augmented.drop(columns=['label'])
        y_train_augmented = user_train_data_labeled_augmented['label']

        # Step 4: Retrain models with the augmented data
        for feature_name, model in models_1.items():
            feature_columns = feature_sets[feature_name]
            model.fit(X_train_augmented[feature_columns], y_train_augmented)

        # Step 5: Validate the performance on the validation set
        bot_prob_sum = np.zeros(len(X_test))
        human_prob_sum = np.zeros(len(X_test))
        total_weights = np.zeros(len(X_test))
        # calibrated_models = {}

        # for feature_name, model in models_1.items():
        #     calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
        #     calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)
        #     models_1[feature_name] = calibrated_model

        for feature_name, model in models_1.items():
            feature_columns = feature_sets[feature_name]
            completeness = X_test[feature_columns].notnull().mean(axis=1)
            weights = np.where(completeness >= completeness_threshold, 1.0, completeness)
            probas = model.predict_proba(X_test[feature_columns])
            human_prob_sum += probas[:, 0] * weights
            bot_prob_sum += probas[:, 1] * weights
            total_weights += weights

        total_weights_safe = np.where(total_weights == 0, 1, total_weights)
        avg_human_prob = human_prob_sum / total_weights_safe
        avg_bot_prob = bot_prob_sum / total_weights_safe
        final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

        accuracy = accuracy_score(y_test, final_predictions)
        precision = precision_score(y_test, final_predictions, pos_label=True)
        recall = recall_score(y_test, final_predictions, pos_label=True)
        f1 = f1_score(y_test, final_predictions, pos_label=True)
        mcc = matthews_corrcoef(y_test, final_predictions)

        print(f"Iteration {iteration + 1} Evaluation Metrics:")
        print(f'Accuracy: {accuracy:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')
        print(f'MCC: {mcc:.2f}')

        

        

        iteration += 1

        # Stop if all unlabeled data has been confidently labeled
        if confidently_labeled_count == len(train_data_unlabeled):
            print("All unlabeled data has been confidently labeled.")
            break

# Example usage:
iterative_self_training(train_data_unlabeled, train_data_labeled, models_1, feature_sets, X_train, y_train, X_test, y_test, dataset_columns, threshold, completeness_threshold)


Iteration: 1
   iteration  high_conf_count  avg_confidence  bot_ratio  human_ratio  \
0          1             2479        0.795285   0.496571     0.503429   

   consistency_rate  
0               NaN  
9123
Iteration 1 Evaluation Metrics:
Accuracy: 0.89
Precision: 0.86
Recall: 0.92
F1 Score: 0.89
MCC: 0.78
Iteration: 2
   iteration  high_conf_count  avg_confidence  bot_ratio  human_ratio  \
0          1             2479        0.795285   0.496571     0.503429   
1          2             2479        0.782964   0.476402     0.523598   

   consistency_rate  
0               NaN  
1          0.940299  
8627


### Iterative without min max confidence

In [224]:
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

completeness_threshold = 0.80
threshold = 0.65
pseudo_label_log = []
pseudo_label_consistency = {}

def track_pseudo_label_quality(pseudo_labeled_data, previous_labels):
    # Track metrics for quality assessment
    high_conf_data = pseudo_labeled_data[pseudo_labeled_data['confidence'] > threshold]
    avg_confidence = high_conf_data['confidence'].mean()
    label_counts = high_conf_data['predicted_label'].value_counts(normalize=True)
    
    # Consistency check
    if previous_labels is not None:
        aligned_labels = previous_labels.reindex(high_conf_data.index)
        consistency_rate = (aligned_labels == high_conf_data['predicted_label']).mean()
    else:
        consistency_rate = np.nan  # First iteration has no prior labels
    
    # Log metrics
    pseudo_label_log.append({
        'iteration': len(pseudo_label_log) + 1,
        'high_conf_count': len(high_conf_data),
        'avg_confidence': avg_confidence,
        'bot_ratio': label_counts.get(1, 0),
        'human_ratio': label_counts.get(0, 0),
        'consistency_rate': consistency_rate
    })
    
    # Update previous_labels for next iteration consistency check
    previous_labels = high_conf_data['predicted_label'].copy()
    return previous_labels

# Function to iterate through unsupervised learning until stopping condition is met
def iterative_self_training(train_data_unlabeled, train_data_labeled, models_1, feature_sets, X_train, y_train, X_test, y_test, dataset_columns, threshold, completeness_threshold):
    iteration = 0
    improvements = True
    previous_confidently_labeled = 0
    previous_labels = None

    # Step 2: Apply Platt's scaling (calibration) and generate pseudo-labels for the unlabeled dataset
    calibrated_models = {}
    for feature_name, model in models_1.items():
        # Apply Platt's scaling
        calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
        calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)
        models_1[feature_name] = calibrated_model

    while improvements:
        print(f"Iteration: {iteration + 1}")

        # Step 1: Initialize arrays for weighted probabilities
        bot_prob_sum = np.zeros(len(train_data_unlabeled))
        human_prob_sum = np.zeros(len(train_data_unlabeled))
        total_weights = np.zeros(len(train_data_unlabeled))
        pseudo_labeled_data = train_data_unlabeled.copy()

        # # Step 2: Apply Platt's scaling (calibration) and generate pseudo-labels for the unlabeled dataset
        # calibrated_models = {}
        # for feature_name, model in models_1.items():
        #     # Apply Platt's scaling
        #     calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
        #     calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)
        #     calibrated_models[feature_name] = calibrated_model

        for feature_name, model in models_1.items():
            feature_columns = feature_sets[feature_name]
            completeness = train_data_unlabeled[feature_columns].notnull().mean(axis=1)
            weights = np.where(completeness >= completeness_threshold, 1.0, completeness)
            probas = model.predict_proba(train_data_unlabeled[feature_columns])
            human_prob_sum += probas[:, 0] * weights
            bot_prob_sum += probas[:, 1] * weights
            total_weights += weights

        total_weights_safe = np.where(total_weights == 0, 1, total_weights)
        avg_human_prob = human_prob_sum / total_weights_safe
        avg_bot_prob = bot_prob_sum / total_weights_safe

        for i in range(len(avg_human_prob)):
            completeness = train_data_unlabeled[feature_sets[feature_name]].notnull().mean(axis=1)[i]
            smoothing_factor = 0.7  # Adjust to control impact of completeness (0.5 gives equal weight)
    
            if avg_human_prob[i] > avg_bot_prob[i]:
                pseudo_label = 0  # Human
                confidence = smoothing_factor * avg_human_prob[i] + (1 - smoothing_factor) * completeness
            else:
                pseudo_label = 1  # Bot
                confidence = smoothing_factor * avg_bot_prob[i] + (1 - smoothing_factor) * completeness

            pseudo_labeled_data.at[i, 'predicted_label'] = pseudo_label
            pseudo_labeled_data.at[i, 'confidence'] = confidence

        high_confidence_data = pseudo_labeled_data[pseudo_labeled_data['confidence'] > threshold]
        confidently_labeled_count = len(high_confidence_data)

        previous_labels = track_pseudo_label_quality(pseudo_labeled_data, previous_labels)
        

        # Log summary of pseudo-label quality across iterations
        quality_df = pd.DataFrame(pseudo_label_log)
        print(quality_df)
        
        # Check improvement and decide if further self-training is beneficial
        # if pseudo_label_log[-1]['avg_confidence'] < threshold or pseudo_label_log[-1]['consistency_rate'] < threshold:
        #     print("Pseudo-label quality threshold reached. Stopping self-training.")
        #     break

        if confidently_labeled_count <= previous_confidently_labeled:
            improvements = False  # Stop if no more improvements
            break
        else:
            previous_confidently_labeled = confidently_labeled_count

        # Step 3: Augment the training data
        augmented_data = high_confidence_data.copy()
        augmented_data['label'] = high_confidence_data['predicted_label'].map(lambda x: True if x == 1 else False)
        augmented_data = augmented_data.drop(columns=['predicted_label', 'confidence'])
        augmented_data = augmented_data[dataset_columns]

        # Combine original labeled data with high-confidence pseudo-labeled data
        user_train_data_labeled_augmented = pd.concat([train_data_labeled, augmented_data], ignore_index=True)
        X_train_augmented = user_train_data_labeled_augmented.drop(columns=['label'])
        y_train_augmented = user_train_data_labeled_augmented['label']

        # Step 4: Retrain models with the augmented data
        for feature_name, model in models_1.items():
            feature_columns = feature_sets[feature_name]
            model.fit(X_train_augmented[feature_columns], y_train_augmented)

        # Step 5: Validate the performance on the validation set
        bot_prob_sum = np.zeros(len(X_test))
        human_prob_sum = np.zeros(len(X_test))
        total_weights = np.zeros(len(X_test))
        calibrated_models = {}

        for feature_name, model in models_1.items():
            calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
            calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)
            models_1[feature_name] = calibrated_model

        for feature_name, model in models_1.items():
            feature_columns = feature_sets[feature_name]
            completeness = X_test[feature_columns].notnull().mean(axis=1)
            weights = np.where(completeness >= completeness_threshold, 1.0, completeness)
            probas = model.predict_proba(X_test[feature_columns])
            human_prob_sum += probas[:, 0] * weights
            bot_prob_sum += probas[:, 1] * weights
            total_weights += weights

        total_weights_safe = np.where(total_weights == 0, 1, total_weights)
        avg_human_prob = human_prob_sum / total_weights_safe
        avg_bot_prob = bot_prob_sum / total_weights_safe
        final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

        accuracy = accuracy_score(y_test, final_predictions)
        precision = precision_score(y_test, final_predictions, pos_label=True)
        recall = recall_score(y_test, final_predictions, pos_label=True)
        f1 = f1_score(y_test, final_predictions, pos_label=True)
        mcc = matthews_corrcoef(y_test, final_predictions)

        print(f"Iteration {iteration + 1} Evaluation Metrics:")
        print(f'Accuracy: {accuracy:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')
        print(f'MCC: {mcc:.2f}')

        print(confidently_labeled_count)

        

        iteration += 1

        # Stop if all unlabeled data has been confidently labeled
        if confidently_labeled_count == len(train_data_unlabeled):
            print("All unlabeled data has been confidently labeled.")
            break

# Example usage:
iterative_self_training(train_data_unlabeled, train_data_labeled, models_1, feature_sets, X_train, y_train, X_test, y_test, dataset_columns, threshold, completeness_threshold)


Iteration: 1
   iteration  high_conf_count  avg_confidence  bot_ratio  human_ratio  \
0          1             6205        0.703503   0.356648     0.643352   

   consistency_rate  
0               NaN  
Iteration 1 Evaluation Metrics:
Accuracy: 0.89
Precision: 0.85
Recall: 0.95
F1 Score: 0.90
MCC: 0.79
6205
Iteration: 2
   iteration  high_conf_count  avg_confidence  bot_ratio  human_ratio  \
0          1             6205        0.703503   0.356648     0.643352   
1          2             6476        0.705070   0.386658     0.613342   

   consistency_rate  
0               NaN  
1          0.937925  
Iteration 2 Evaluation Metrics:
Accuracy: 0.89
Precision: 0.85
Recall: 0.95
F1 Score: 0.90
MCC: 0.79
6476
Iteration: 3
   iteration  high_conf_count  avg_confidence  bot_ratio  human_ratio  \
0          1             6205        0.703503   0.356648     0.643352   
1          2             6476        0.705070   0.386658     0.613342   
2          3             6717        0.703847   0.419

### Iterations Output

Iteration: 1
   iteration  high_conf_count  avg_confidence  bot_ratio  human_ratio  \
0          1             6205        0.703503   0.356648     0.643352   

   consistency_rate  
0               NaN  
Iteration 1 Evaluation Metrics:
Accuracy: 0.89
Precision: 0.85
Recall: 0.95
F1 Score: 0.90
MCC: 0.79


Iteration: 15
    iteration  high_conf_count  avg_confidence  bot_ratio  human_ratio  \
0           1             6205        0.703503   0.356648     0.643352   
1           2             6476        0.705070   0.386658     0.613342   
2           3             6717        0.703847   0.419384     0.580616   
3           4             6833        0.702928   0.431582     0.568418   
4           5             6880        0.702487   0.440843     0.559157   
5           6             6923        0.702102   0.448794     0.551206   
6           7             6947        0.701898   0.453721     0.546279   
7           8             6985        0.701635   0.459556     0.540444   
8           9             7002        0.701556   0.460583     0.539417   
9          10             7015        0.701458   0.463150     0.536850   
10         11             7028        0.701352   0.464997     0.535003   
11         12             7031        0.701315   0.465368     0.534632   
12         13             7032        0.701270   0.466297     0.533703   
13         14             7035        0.701213   0.467520     0.532480   
14         15             7039        0.701170   0.468106     0.531894   

    consistency_rate  
0                NaN  
1           0.937925  
2           0.946851  
3           0.971901  
4           0.984448  
5           0.988155  
6           0.991939  
7           0.990122  
8           0.994716  
9           0.996009  
10          0.996301  
11          0.997724  
12          0.998009  
13          0.997726  
14          0.997443  
Iteration 15 Evaluation Metrics:
Accuracy: 0.88
Precision: 0.84
Recall: 0.95
F1 Score: 0.89
MCC: 0.78

# Model Import and Export

## Save

In [225]:
import joblib

# Save the entire dictionary of models to a single file
filename = '../models/Unsupervised_Models_11_04_02.joblib'
joblib.dump(models_1, filename)

print("All models have been saved successfully as a whole.")


All models have been saved successfully as a whole.


## Load

### Import Semi-Supervised Learning Model

In [232]:

filename = '../models/Unsupervised_Models_11_04_02.joblib'
models_1 = joblib.load(filename)

print("All models have been loaded successfully.")


All models have been loaded successfully.


### Import Initial Model

In [223]:
filename = '../models/Initial_Models_10-26.joblib'
models_1 = joblib.load(filename)

print("All models have been loaded successfully.")

All models have been loaded successfully.


## Test on Validation Set

### Validation with Platt's

In [233]:
# Initialize arrays to accumulate weighted probabilities
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

# Initialize array for calbirated models
calibrated_models = {}

# Apply Platt's scaling to each model using CalibratedClassifierCV
for feature_name, model in models_1.items():
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Assuming models are already trained
    calibrated_models[feature_name] = calibrated_model

# Generate predictions for each model using calibrated models
for feature_name, model in calibrated_models.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user) for X_test
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict calibrated probabilities for X_test
    probas = model.predict_proba(X_test[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Normalize the weighted probabilities
# Avoid division by zero in case no weights were assigned
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Assign final predictions based on aggregated weighted probabilities
final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, final_predictions)
precision = precision_score(y_test, final_predictions, pos_label=True)
recall = recall_score(y_test, final_predictions, pos_label=True)
f1 = f1_score(y_test, final_predictions, pos_label=True)
mcc = matthews_corrcoef(y_test, final_predictions)

# Print evaluation results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'MCC: {mcc:.2f}')

Accuracy: 0.88
Precision: 0.84
Recall: 0.95
F1 Score: 0.89
MCC: 0.77


### Validation without Platt's

In [228]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Initialize arrays to accumulate weighted probabilities
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

# Initialize array for calibrated models
calibrated_models = {}

# Apply Platt's scaling to each model using CalibratedClassifierCV
for feature_name, model in models_1.items():
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Assuming models are already trained
    calibrated_models[feature_name] = calibrated_model

# Generate predictions for each model using calibrated models
for feature_name, model in models_1.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (user) for X_test
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict calibrated probabilities for X_test
    probas = model.predict_proba(X_test[feature_columns])

    # Accumulate weighted probabilities for bot and human predictions
    human_prob_sum += probas[:, 0] * weights  # Human probabilities
    bot_prob_sum += probas[:, 1] * weights    # Bot probabilities

    # Accumulate total weights for normalization
    total_weights += weights

# Normalize the weighted probabilities
# Avoid division by zero in case no weights were assigned
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Assign final predictions based on aggregated weighted probabilities
final_predictions = np.where(avg_bot_prob > avg_human_prob, True, False)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, final_predictions)
precision = precision_score(y_test, final_predictions, pos_label=True)
recall = recall_score(y_test, final_predictions, pos_label=True)
f1 = f1_score(y_test, final_predictions, pos_label=True)
mcc = matthews_corrcoef(y_test, final_predictions)

# Analyze class distribution in predictions
bot_predictions_count = sum(final_predictions)
human_predictions_count = len(final_predictions) - bot_predictions_count
bot_prediction_ratio = bot_predictions_count / len(final_predictions)
human_prediction_ratio = human_predictions_count / len(final_predictions)

# Calculate average confidence for bot and human predictions
avg_bot_confidence = avg_bot_prob[final_predictions].mean() if bot_predictions_count > 0 else 0
avg_human_confidence = avg_human_prob[~final_predictions].mean() if human_predictions_count > 0 else 0

# Print evaluation results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'MCC: {mcc:.2f}')
print("\n--- Prediction Distribution ---")
print(f'Bot Predictions: {bot_predictions_count} ({bot_prediction_ratio:.2%})')
print(f'Human Predictions: {human_predictions_count} ({human_prediction_ratio:.2%})')
print("\n--- Average Confidence Scores ---")
print(f'Average Bot Confidence: {avg_bot_confidence:.2f}')
print(f'Average Human Confidence: {avg_human_confidence:.2f}')


Accuracy: 0.88
Precision: 0.84
Recall: 0.95
F1 Score: 0.89
MCC: 0.78

--- Prediction Distribution ---
Bot Predictions: 369 (56.94%)
Human Predictions: 279 (43.06%)

--- Average Confidence Scores ---
Average Bot Confidence: 0.83
Average Human Confidence: 0.85


# Final Code

In [251]:
filename = '../models/Unsupervised_Models_11_04_02.joblib'
models_1 = joblib.load(filename)

print("All models have been loaded successfully.")


All models have been loaded successfully.


In [252]:
train_data_labeled = pd.read_csv('../data/initial_train_data.csv')
val_data = pd.read_csv('../data/val_data.csv')

X_train = train_data_labeled.drop(columns=['label'])
y_train = train_data_labeled['label']

X_test = val_data.drop(columns=['label'])
y_test = val_data['label']

In [253]:
# Placeholder: Number of high-confidence accounts to trigger retraining
retrain_threshold = 10  # For example, retrain after 1000 high-confidence accounts
high_confidence_accounts = []  # Store high-confidence predictions

# Initialize dictionaries to store probabilities for each feature subset
probabilities = {feature_name: [] for feature_name in models_1.keys()}

# Initialize arrays to accumulate probabilities
bot_prob_sum = np.zeros(len(X_test))  # Replace 'test_data' with your testing data
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))  # To normalize the weighted sums

# Define completeness threshold for assigning full weights
completeness_threshold = 0.80

calibrated_models = {}
for feature_name, model in models_1.items():
    # Apply Platt's scaling using CalibratedClassifierCV
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_train[feature_sets[feature_name]], y_train)  # Fit the model with training data
    calibrated_models[feature_name] = calibrated_model

In [261]:
# Step 1: Process each model for its feature subset
probabilities = {feature_name: [] for feature_name in models_1.keys()}

for feature_name, model in calibrated_models.items():
    feature_columns = feature_sets[feature_name]

    # Calculate feature completeness per instance (account)
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Assign weights based on completeness
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Predict probabilities for the test data
    probas = model.predict_proba(X_test[feature_columns])

    # Store probabilities for each feature subset
    probabilities[feature_name] = probas

    # Accumulate weighted probabilities
    human_prob_sum += probas[:, 0] * weights  # Probability of being human
    bot_prob_sum += probas[:, 1] * weights    # Probability of being a bot

    if feature_name == 'post_text':
        post_text_human_prob = probas[:, 0] * weights
        post_text_bot_prob = probas[:, 1] * weights
    elif feature_name == 'post_metadata':
        post_text_human_prob = probas[:, 0] * weights
        post_text_bot_prob = probas[:, 1] * weights

    # Accumulate total weights for normalization
    total_weights += weights

In [262]:
# Step 2: Normalize probabilities to avoid division by zero
total_weights_safe = np.where(total_weights == 0, 1, total_weights)  # Handle zero weights
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Calculate average probabilities for post (weighted by completeness if needed)
post_human_prob = (probabilities['post_metadata'][:, 0] + probabilities['post_text'][:, 0]) / 2
post_bot_prob = (probabilities['post_metadata'][:, 1] + probabilities['post_text'][:, 1]) / 2

# Add combined post probabilities to the dictionary
probabilities['post'] = np.column_stack([post_human_prob, post_bot_prob])

# Optionally remove individual post_metadata and post_text from the final output
del probabilities['post_metadata']
del probabilities['post_text']

In [263]:
# Step 3: Output final classification probabilities and assign final labels
final_predictions = []        # Stores final predicted labels (0 for Human, 1 for Bot)
final_probabilities = []      # Stores probabilities of both classes [Human_prob, Bot_prob]
confidence_scores = []        # Stores penalized confidence scores

for i in range(len(avg_human_prob)):
    completeness = max(0.0, X_test[feature_sets[feature_name]].notnull().mean(axis=1)[i])

    # Calculate and store the probabilities for both classes
    human_prob = avg_human_prob[i]
    bot_prob = avg_bot_prob[i]

    # Append both class probabilities to the list
    final_probabilities.append([human_prob, bot_prob])

    # Final classification: 0 (Human) or 1 (Bot)
    if human_prob > bot_prob:
        final_predictions.append(False)  # Human
        confidence = human_prob * completeness  # Penalize confidence by completeness
    else:
        final_predictions.append(True)  # Bot
        confidence = bot_prob * completeness  # Penalize confidence by completeness

    # Store the confidence score
    confidence_scores.append(confidence)



In [193]:
# Example Output
for i in range(len(final_predictions)):
    print(f"Index: {i + 1}")  # Print the index
    print(f"Prediction: {'Human' if final_predictions[i] == False else 'Bot'}")
    print(f"Probabilities - Human: {final_probabilities[i][0]:.4f}, Bot: {final_probabilities[i][1]:.4f}")
    print(f"Confidence Score: {confidence_scores[i]:.4f}\n")

    # Output feature probabilities
    print(f"Feature Probabilities:\nUsername - Human: {probabilities['username'][i][0]:.4f}, Bot: {probabilities['username'][i][1]:.4f}")
    print(f"Screenname - Human: {probabilities['screenname'][i][0]:.4f}, Bot: {probabilities['screenname'][i][1]:.4f}")
    print(f"Description - Human: {probabilities['description'][i][0]:.4f}, Bot: {probabilities['description'][i][1]:.4f}")
    print(f"User Metadata - Human: {probabilities['user_metadata'][i][0]:.4f}, Bot: {probabilities['user_metadata'][i][1]:.4f}")
    print(f"Post - Human: {probabilities['post'][i][0]:.4f}, Bot: {probabilities['post'][i][1]:.4f}")

    print("-" * 50)

Index: 1
Prediction: Bot
Probabilities - Human: 0.4148, Bot: 0.5852
Confidence Score: 0.5852

Feature Probabilities:
Username - Human: 0.7623, Bot: 0.2377
Screenname - Human: 0.6449, Bot: 0.3551
Description - Human: 0.3655, Bot: 0.6345
User Metadata - Human: 0.0493, Bot: 0.9507
Post - Human: 0.3783, Bot: 0.6217
--------------------------------------------------
Index: 2
Prediction: Bot
Probabilities - Human: 0.0484, Bot: 0.9516
Confidence Score: 0.0000

Feature Probabilities:
Username - Human: 0.5028, Bot: 0.4972
Screenname - Human: 0.5191, Bot: 0.4809
Description - Human: 0.4822, Bot: 0.5178
User Metadata - Human: 0.0484, Bot: 0.9516
Post - Human: 0.5185, Bot: 0.4815
--------------------------------------------------
Index: 3
Prediction: Bot
Probabilities - Human: 0.4913, Bot: 0.5087
Confidence Score: 0.0000

Feature Probabilities:
Username - Human: 0.1655, Bot: 0.8345
Screenname - Human: 0.7167, Bot: 0.2833
Description - Human: 0.3169, Bot: 0.6831
User Metadata - Human: 0.9722, Bot: 

In [257]:
def augment_training_data(high_confidence_accounts, user_train_data_labeled, dataset_columns):
    # Updated to access 'predicted_label' from the nested 'account' dictionary
    high_confidence_df = pd.DataFrame(high_confidence_accounts).apply(lambda x: x['account'].to_dict(), axis=1, result_type='expand')

    # Drop confidence column and format the high-confidence data for training
    # Access 'predicted_label' from the correct level in the dictionary
    high_confidence_df['label'] = high_confidence_df.apply(lambda row: True if row.get('predicted_label', None) == 1 else False, axis=1)
    # Select desired columns, ensuring they exist in high_confidence_df
    high_confidence_df = high_confidence_df[[col for col in dataset_columns if col in high_confidence_df.columns]]

    # Augment the original labeled data with the new high-confidence data
    augmented_train_data = pd.concat([user_train_data_labeled, high_confidence_df], ignore_index=True)

    return augmented_train_data

def retrain_models(trained_models, augmented_train_data, feature_sets):
    X_train_augmented = augmented_train_data.drop(columns=['label'])  # Features
    y_train_augmented = augmented_train_data['label']  # Labels

    # Retrain each model using the augmented data
    for feature_name, model in trained_models.items():
        feature_columns = feature_sets[feature_name]
        model.fit(X_train_augmented[feature_columns], y_train_augmented)

In [264]:
high_confidence_accounts = []  # Store high-confidence predictions


# Step 4: Store accounts with high confidence
high_confidence_threshold = 0.65  # Set high-confidence threshold

for i, confidence in enumerate(confidence_scores):
    if confidence > high_confidence_threshold:
        high_confidence_accounts.append({
            'account': X_test.iloc[i],  # Store the account's features
            'predicted_label': final_predictions[i],  # The final classification
            'confidence': confidence  # The confidence score
        })

        print(f"Index: {i + 1}")  # Print the index
        print(f"Prediction: {'Human' if final_predictions[i] == 0 else 'Bot'}")
        print(f"Probabilities - Human: {final_probabilities[i][0]:.4f}, Bot: {final_probabilities[i][1]:.4f}")
        print(f"Confidence Score: {confidence_scores[i]:.4f}")
        print("-" * 50)

Index: 9
Prediction: Bot
Probabilities - Human: 0.2873, Bot: 0.7127
Confidence Score: 0.7127
--------------------------------------------------
Index: 15
Prediction: Bot
Probabilities - Human: 0.2978, Bot: 0.7022
Confidence Score: 0.7022
--------------------------------------------------
Index: 21
Prediction: Bot
Probabilities - Human: 0.2026, Bot: 0.7974
Confidence Score: 0.7974
--------------------------------------------------
Index: 31
Prediction: Bot
Probabilities - Human: 0.3414, Bot: 0.6586
Confidence Score: 0.6586
--------------------------------------------------
Index: 44
Prediction: Bot
Probabilities - Human: 0.2239, Bot: 0.7761
Confidence Score: 0.7761
--------------------------------------------------
Index: 45
Prediction: Bot
Probabilities - Human: 0.2403, Bot: 0.7597
Confidence Score: 0.7597
--------------------------------------------------
Index: 52
Prediction: Bot
Probabilities - Human: 0.2104, Bot: 0.7896
Confidence Score: 0.7896
-------------------------------------

In [265]:
print (len(high_confidence_accounts))

65


In [266]:
# Check if retraining should be triggered
if len(high_confidence_accounts) >= retrain_threshold:
    print ("Retraining triggered")

    # Step 5: Augment the training data with high-confidence pseudo-labeled data
    train_data_labeled = augment_training_data(high_confidence_accounts, train_data_labeled, dataset_columns)

    # Retrain the models with the augmented data
    retrain_models(models_1, train_data_labeled, feature_sets)

    # Clear high-confidence accounts after retraining
    high_confidence_accounts = []

Retraining triggered


# Notes

### Feature Engineering Review
  - Real-World Feature Shift
  - Imputation of Missing Values (?)

### NaN vs 0 Values
  - Include missingness indicators

###
  - Generalize feature engineering code
