In [2]:
# Standard Library Imports
import os
import json

# Data Manipulation and Processing
import pandas as pd
import numpy as np
import re
from pandas import json_normalize
from tqdm.notebook import tqdm

# Text Processing
import emoji
from transformers import RobertaTokenizer, RobertaForSequenceClassification, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation



# Machine Learning Imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
import joblib

# Statistical Utilities
from scipy.stats import mode

In [3]:
# Numeric feature columns grouped by account facet. The keys are later used to
# look up the columns each per-facet model is fed.
feature_sets_LDA = {
    'username': [
        'username_uppercase', 'username_lowercase', 'username_numeric',
        'username_special', 'username_length', 'username_se',
        'is_missing_username',
    ],
    'screenname': [
        'screenname_uppercase', 'screenname_lowercase', 'screenname_numeric',
        'screenname_special', 'screenname_length', 'screenname_se',
        'is_missing_screenname',
    ],
    'description': [
        'description_length',
        'topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4',
        'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9',
        'is_missing_description',
    ],
    'user_metadata': [
        'user_md_follower', 'user_md_following', 'user_md_follow_ratio',
        'user_md_total_post', 'user_md_total_like', 'user_md_verified',
        'is_missing_user_metadata',
    ],
    'post': [
        'post_md_like_mean', 'post_md_like_std',
        'post_md_retweet_mean', 'post_md_retweet_std',
        'post_md_reply_mean', 'post_md_reply_std',
        'is_missing_post_metadata',
        'post_text_length_mean', 'post_text_length_std',
        'post_sentiment_score_mean', 'post_sentiment_score_std',
        'post_sentiment_numeric_mean', 'post_sentiment_numeric_std',
        'post_sentiment_numeric_prop_positive',
        'post_sentiment_numeric_prop_neutral',
        'post_sentiment_numeric_prop_negative',
        'is_missing_post_text',
    ],
}

# Full ordered column list: the identifier/raw-text columns interleaved with
# the per-facet groups above. Three screenname columns (emoji, hashtag, word)
# appear here but are not part of the 'screenname' model group; they sit just
# before the screenname missing-flag, hence the [:-1] split.
features_LDA = [
    'user_id',
    'username', *feature_sets_LDA['username'],
    'screenname', *feature_sets_LDA['screenname'][:-1],
    'screenname_emoji', 'screenname_hashtag', 'screenname_word',
    'is_missing_screenname',
    'description', *feature_sets_LDA['description'],
    *feature_sets_LDA['user_metadata'],
    *feature_sets_LDA['post'],
]

# Column layout of a labeled dataset: every feature column followed by 'label'.
dataset_columns = [*features_LDA, 'label']

# Active configuration used by the rest of the notebook.
features = features_LDA
feature_sets = feature_sets_LDA

# Labeled splits (train / validation / test) and the unlabeled pool, as named
# by their file paths.
train_data_labeled = pd.read_parquet('../../data/final/labeled/combined/ROBERTA/train_labeled_LDA_missing_dropped.parquet')
val_data = pd.read_parquet('../../data/final/labeled/combined/ROBERTA/val_labeled_LDA_missing_dropped.parquet')
test_data = pd.read_parquet('../../data/final/testing/combined/combined_testing_accounts_ROBERTA_LDA_missing_dropped_2.parquet')
train_data_unlabeled = pd.read_parquet('../../data/final/unlabeled/combined/combined_unlabeled_accounts_ROBERTA_LDA_missing_dropped.parquet')

# Feature frame / target pairs for each labeled split.
X_train, y_train = train_data_labeled[features], train_data_labeled['label']
X_val, y_val = val_data[features], val_data['label']
X_test, y_test = test_data[features], test_data['label']

# Previously fitted models; later cells iterate this object with .items() and
# look each key up in feature_sets.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts produced by a trusted source, ideally with the same
# scikit-learn version that wrote them.
models_1 = joblib.load('../../models/Initial_Models_12_11_supervised_dropped_5_2.joblib')


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [144]:
import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Run-level settings and logs for the self-training cell.
# completeness_threshold: rows whose non-null share for a feature set reaches
#   this value get full weight when model probabilities are averaged.
# threshold: base confidence cut-off read by dynamic_thresholding.
completeness_threshold = threshold = 0.75
# One dict of quality metrics is appended per self-training iteration.
pseudo_label_log = list()
# Not referenced by the code visible in this cell.
pseudo_label_consistency = dict()

def track_pseudo_label_quality(pseudo_labeled_data, previous_labels):
    """Log quality metrics for one round of pseudo-labels.

    Rows with ``confidence > 0`` are summarised (count, mean confidence, share
    of label 1 as ``bot_ratio`` and label 0 as ``human_ratio``) and the summary
    is appended to the module-level ``pseudo_label_log``.

    Args:
        pseudo_labeled_data: DataFrame with ``confidence`` and
            ``predicted_label`` columns.
        previous_labels: Series of labels returned by the previous call, or
            ``None`` on the first call.

    Returns:
        ``(labels, consistency_rate)`` where ``labels`` is a copy of this
        round's ``predicted_label`` column (to pass to the next call) and
        ``consistency_rate`` is the fraction of this round's rows whose label
        equals the previous one. Rows absent from ``previous_labels`` become
        NaN after reindexing and therefore count as mismatches. The rate is
        NaN when ``previous_labels`` is ``None``.
    """
    retained = pseudo_labeled_data[pseudo_labeled_data['confidence'] > 0]
    current_labels = retained['predicted_label']
    class_shares = current_labels.value_counts(normalize=True)

    if previous_labels is None:
        # Nothing to compare against on the first round.
        consistency_rate = np.nan
    else:
        prior_on_current_rows = previous_labels.reindex(retained.index)
        consistency_rate = (prior_on_current_rows == current_labels).mean()

    summary = {
        'iteration': len(pseudo_label_log) + 1,
        'high_conf_count': len(retained),
        'avg_confidence': retained['confidence'].mean(),
        'bot_ratio': class_shares.get(1, 0),
        'human_ratio': class_shares.get(0, 0),
        'consistency_rate': consistency_rate,
    }
    pseudo_label_log.append(summary)

    # Hand back an independent copy so the caller can compare next round.
    return current_labels.copy(), consistency_rate

def dynamic_thresholding(bot_ratio, human_ratio, iteration, base_threshold=None,
                         min_threshold=0.6, max_threshold=0.9, target_ratio=0.5):
    """Derive per-class confidence thresholds that push labels toward balance.

    Both thresholds start at ``base_threshold`` and are moved apart by the same
    amount: the over-represented class gets a higher (stricter) threshold, the
    other class a lower (more permissive) one. The shift grows with
    ``iteration`` and shrinks as the gap between the two ratios widens
    (``exp(-|bot_ratio - human_ratio|)``).

    Args:
        bot_ratio: Share of pseudo-labels that are bots (label 1).
        human_ratio: Share of pseudo-labels that are humans (label 0).
        iteration: Zero-based self-training iteration; 0 yields no shift.
        base_threshold: Starting threshold for both classes. ``None`` (the
            default) uses the module-level ``threshold``.
        min_threshold: Lower clamp for the lowered threshold.
        max_threshold: Upper clamp for the raised threshold.
        target_ratio: ``bot_ratio`` above this value marks bots as the
            over-represented class.

    Returns:
        ``(bot_threshold, human_threshold)``.
    """
    if base_threshold is None:
        # Keep the historical behaviour of reading the notebook-level constant.
        base_threshold = threshold

    # Per-iteration step size: starts at 0.03 and shrinks to a floor of 0.01.
    adjustment_factor = max(0.01, 0.03 - iteration * 0.0005)
    ratio_difference = abs(bot_ratio - human_ratio)
    adjustment = adjustment_factor * iteration * np.exp(-ratio_difference)

    lowered = max(base_threshold - adjustment, min_threshold)
    raised = min(base_threshold + adjustment, max_threshold)

    if bot_ratio > target_ratio:
        # Too many bots: make bots harder and humans easier to accept.
        bot_threshold, human_threshold = raised, lowered
    else:
        # Balanced or too many humans: make humans harder, bots easier.
        bot_threshold, human_threshold = lowered, raised

    return bot_threshold, human_threshold


def class_balanced_sampling(data, bot_ratio, human_ratio, target_size, random_state=None):
    """Sample bots and humans from ``data`` in the requested proportion.

    The bot quota is ``target_size * bot_ratio / (bot_ratio + human_ratio)``
    (truncated to an int) and the human quota is the remainder. Each quota is
    capped at the number of rows available for that class, so the result can
    contain fewer than ``target_size`` rows.

    Args:
        data: DataFrame with a ``predicted_label`` column (1 = bot, 0 = human).
        bot_ratio: Relative weight of the bot class.
        human_ratio: Relative weight of the human class.
        target_size: Total number of rows to aim for.
        random_state: Optional seed / RandomState forwarded to
            ``DataFrame.sample`` for reproducible draws. ``None`` keeps the
            previous non-deterministic behaviour.

    Returns:
        DataFrame with the sampled bot rows followed by the sampled human rows.
    """
    bot_data = data[data['predicted_label'] == 1]
    human_data = data[data['predicted_label'] == 0]

    ratio_total = bot_ratio + human_ratio
    if ratio_total > 0:
        bot_count = int(target_size * bot_ratio / ratio_total)
    else:
        # Both ratios are zero (or NaN), e.g. the previous round produced no
        # labels. Dividing would raise, so fall back to an even split.
        bot_count = target_size // 2
    human_count = target_size - bot_count

    sampled_bots = bot_data.sample(
        n=min(bot_count, len(bot_data)), replace=False, random_state=random_state
    )
    sampled_humans = human_data.sample(
        n=min(human_count, len(human_data)), replace=False, random_state=random_state
    )

    return pd.concat([sampled_bots, sampled_humans])


def _weighted_ensemble_proba(models, feature_sets, frame, completeness_threshold):
    """Average the per-feature-set class probabilities for every row of ``frame``.

    Each model scores its own columns; its vote is weighted by the row's share
    of non-null values in those columns (weight 1.0 once that share reaches
    ``completeness_threshold``). Column 0 of ``predict_proba`` is treated as
    the human class and column 1 as the bot class.

    Returns:
        ``(avg_human_prob, avg_bot_prob)`` as NumPy arrays aligned with ``frame``.
    """
    n_rows = len(frame)
    human_prob_sum = np.zeros(n_rows)
    bot_prob_sum = np.zeros(n_rows)
    total_weights = np.zeros(n_rows)

    for feature_name, model in models.items():
        feature_columns = feature_sets[feature_name]
        completeness = frame[feature_columns].notnull().mean(axis=1)
        weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

        probas = model.predict_proba(frame[feature_columns])
        human_prob_sum += probas[:, 0] * weights
        bot_prob_sum += probas[:, 1] * weights
        total_weights += weights

    # Rows with zero total weight keep probability 0 instead of dividing by zero.
    total_weights_safe = np.where(total_weights == 0, 1, total_weights)
    return human_prob_sum / total_weights_safe, bot_prob_sum / total_weights_safe


def iterative_self_training(train_data_unlabeled, train_data_labeled, models_1, feature_sets, X_train, y_train, X_test, y_test, dataset_columns, threshold, completeness_threshold, max_iterations=10):
    """Iteratively pseudo-label unlabeled rows and refit the per-feature-set models.

    Each round:
      1. scores every unlabeled row with the weighted model ensemble,
      2. accepts a pseudo-label for not-yet-labeled rows whose blended
         confidence exceeds the class-specific threshold from
         ``dynamic_thresholding``,
      3. keeps rows whose stored confidence exceeds a global cut that starts
         at 0.6 and rises by 0.02 per round, class-balances them, and appends
         them to the labeled training data,
      4. refits every model on the augmented data and prints metrics on
         ``X_test`` / ``y_test``.

    The loop ends when no new row is labeled, when the consistency rate
    reported by ``track_pseudo_label_quality`` exceeds 0.99, when every
    unlabeled row is in the high-confidence set, or after ``max_iterations``
    rounds.

    Args:
        train_data_unlabeled: DataFrame of unlabeled rows. Mutated in place: an
            ``is_labeled`` boolean column is (re)created and updated.
        train_data_labeled: DataFrame of labeled rows including ``label``.
        models_1: Mapping of feature-set name to a fitted model exposing
            ``predict_proba`` and ``fit``. The models are refit in place.
        feature_sets: Mapping of feature-set name to its list of columns.
        X_train, y_train: Unused; kept for call compatibility.
        X_test, y_test: Frame / labels used for the per-round metric report.
        dataset_columns: Column order for pseudo-labeled rows before they are
            concatenated onto ``train_data_labeled``.
        threshold: Unused; kept for call compatibility. The per-class
            thresholds come from ``dynamic_thresholding``.
        completeness_threshold: Non-null share at which a feature set gets
            full weight in the ensemble average.
        max_iterations: Upper bound on the number of rounds; ``None`` disables
            the bound.

    Returns:
        None. Progress and metrics are printed; quality metrics accumulate in
        the module-level ``pseudo_label_log``.

    Raises:
        ValueError: If ``models_1`` is empty.
    """
    if not models_1:
        raise ValueError("models_1 must contain at least one model")

    iteration = 0
    previous_labels = None
    consistency_rate = np.nan
    previous_consistency_rate = np.nan

    # Snapshot taken before 'is_labeled' is added; it accumulates the
    # 'predicted_label' / 'confidence' columns across rounds.
    pseudo_labeled_data = train_data_unlabeled.copy()
    train_data_unlabeled['is_labeled'] = False

    # Positional bookkeeping (same row order as both frames above).
    n_unlabeled = len(train_data_unlabeled)
    is_labeled = np.zeros(n_unlabeled, dtype=bool)
    predicted_label = np.full(n_unlabeled, np.nan)
    stored_confidence = np.full(n_unlabeled, np.nan)

    initial_threshold = 0.6  # Global high-confidence cut for the first round
    threshold_step = 0.02    # The cut rises by this much every round

    # Completeness term blended into the confidence score. It is computed once
    # because the feature columns never change inside the loop.
    # NOTE(review): the original code read a loop variable left over from the
    # model loop, so only the LAST feature set's completeness is used here.
    # That behaviour is preserved; confirm whether completeness over all
    # feature sets was intended.
    last_feature_name = list(models_1)[-1]
    row_completeness = (
        train_data_unlabeled[feature_sets[last_feature_name]]
        .notnull()
        .mean(axis=1)
        .to_numpy()
    )

    while max_iterations is None or iteration < max_iterations:
        print(f"Iteration: {iteration + 1}")

        current_threshold = initial_threshold + threshold_step * iteration

        avg_human_prob, avg_bot_prob = _weighted_ensemble_proba(
            models_1, feature_sets, train_data_unlabeled, completeness_threshold
        )

        # Class shares among last round's high-confidence rows (neutral prior
        # for the first round).
        if iteration == 0:
            bot_ratio = 0.5
            human_ratio = 0.5
        else:
            label_counts = high_confidence_data['predicted_label'].value_counts(normalize=True)
            bot_ratio = label_counts.get(1, 0)
            human_ratio = label_counts.get(0, 0)
            if bot_ratio + human_ratio == 0:
                # Last round kept no rows; 0/0 would break the balanced
                # sampling below, so fall back to the neutral prior.
                bot_ratio = 0.5
                human_ratio = 0.5

        # Apply dynamic thresholding
        bot_threshold, human_threshold = dynamic_thresholding(bot_ratio, human_ratio, iteration)

        print (f"Bot ratio: {bot_ratio:.2f}, Human ratio: {human_ratio:.2f}")
        print (f"Bot threshold: {bot_threshold:.2f}, Human threshold: {human_threshold:.2f}")

        # Weight of the model probability versus completeness in the confidence
        # score, chosen from the previous round's consistency.
        if pd.isna(previous_consistency_rate):
            smoothing_factor = 0.5  # No history yet: neutral blend
        elif previous_consistency_rate > 0.8:
            smoothing_factor = 0.7  # Consistent labels: trust the models more
        elif previous_consistency_rate < 0.5:
            smoothing_factor = 0.3  # Inconsistent labels: trust the models less
        else:
            smoothing_factor = 0.5

        # Vectorised pseudo-labelling: 0 = human, 1 = bot (ties go to bot).
        is_human = avg_human_prob > avg_bot_prob
        pseudo_label = np.where(is_human, 0, 1)
        winning_prob = np.where(is_human, avg_human_prob, avg_bot_prob)
        confidence = smoothing_factor * winning_prob + (1 - smoothing_factor) * row_completeness

        # Scale confidence by how stable the labels were last round.
        if not pd.isna(consistency_rate):
            if consistency_rate > 0.8:
                confidence = confidence * 1.2  # Boost confidence by 20%
            elif consistency_rate < 0.5:
                confidence = confidence * 0.8  # Reduce confidence by 20%

        row_threshold = np.where(is_human, human_threshold, bot_threshold)

        # Only rows that have not been labeled in an earlier round are eligible.
        newly_labeled = ~is_labeled & (confidence > row_threshold)
        if not newly_labeled.any():
            print("No new data points were labeled in this iteration.")
            break

        predicted_label[newly_labeled] = pseudo_label[newly_labeled]
        stored_confidence[newly_labeled] = confidence[newly_labeled]
        is_labeled |= newly_labeled

        pseudo_labeled_data['predicted_label'] = predicted_label.copy()
        pseudo_labeled_data['confidence'] = stored_confidence.copy()
        train_data_unlabeled['is_labeled'] = is_labeled.copy()

        print (current_threshold)
        high_confidence_data = pseudo_labeled_data[pseudo_labeled_data['confidence'] > current_threshold]
        confidently_labeled_count = len(high_confidence_data)

        # Apply class-balanced sampling
        combined_data = class_balanced_sampling(
            high_confidence_data.copy(), bot_ratio, human_ratio, confidently_labeled_count
        )

        # Track pseudo label consistency
        previous_labels, consistency_rate = track_pseudo_label_quality(high_confidence_data, previous_labels)

        # Log summary of pseudo-label quality across iterations
        quality_df = pd.DataFrame(pseudo_label_log)
        print(quality_df)

        if consistency_rate > 0.99:  # NaN (first round) compares False here
            print("High pseudo-label consistency, stopping self-training.")
            break

        # Turn the sampled pseudo-labels into rows shaped like the labeled data.
        augmented_data = combined_data.copy()
        augmented_data['label'] = combined_data['predicted_label'] == 1
        augmented_data = augmented_data.drop(columns=['predicted_label', 'confidence'])
        augmented_data = augmented_data[dataset_columns]

        # Combine original labeled data with augmented data
        user_train_data_labeled_augmented = pd.concat([train_data_labeled, augmented_data], ignore_index=True)
        X_train_augmented = user_train_data_labeled_augmented.drop(columns=['label'])
        y_train_augmented = user_train_data_labeled_augmented['label']

        # Retrain models with augmented data (in place)
        for feature_name, model in models_1.items():
            model.fit(X_train_augmented[feature_sets[feature_name]], y_train_augmented)

        # Report metrics of the refit ensemble on the test set passed in.
        test_human_prob, test_bot_prob = _weighted_ensemble_proba(
            models_1, feature_sets, X_test, completeness_threshold
        )
        final_predictions = test_bot_prob > test_human_prob

        accuracy = accuracy_score(y_test, final_predictions)
        precision = precision_score(y_test, final_predictions, pos_label=True)
        recall = recall_score(y_test, final_predictions, pos_label=True)
        f1 = f1_score(y_test, final_predictions, pos_label=True)
        mcc = matthews_corrcoef(y_test, final_predictions)

        previous_consistency_rate = consistency_rate

        print(f"Iteration {iteration + 1} Evaluation Metrics:")
        print(f'Accuracy: {accuracy:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {f1:.2f}')
        print(f'MCC: {mcc:.2f}')

        iteration += 1

        # Stop if all unlabeled data has been confidently labeled
        if confidently_labeled_count == len(train_data_unlabeled):
            print("All unlabeled data has been confidently labeled.")
            break
    else:
        # Reached only when the loop condition fails, i.e. no break fired.
        print(f"Reached the maximum of {max_iterations} iterations.")


# Run self-training with the frames, models and settings defined above.
# Arguments are passed by name so the long parameter list cannot be mis-ordered.
iterative_self_training(
    train_data_unlabeled=train_data_unlabeled,
    train_data_labeled=train_data_labeled,
    models_1=models_1,
    feature_sets=feature_sets,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    dataset_columns=dataset_columns,
    threshold=threshold,
    completeness_threshold=completeness_threshold,
)


Iteration: 1
Bot ratio: 0.50, Human ratio: 0.50
Bot threshold: 0.75, Human threshold: 0.75
0.6
   iteration  high_conf_count  avg_confidence  bot_ratio  human_ratio  \
0          1              978        0.775874    0.55726      0.44274   

   consistency_rate  
0               NaN  
Iteration 1 Evaluation Metrics:
Accuracy: 0.61
Precision: 0.61
Recall: 0.63
F1 Score: 0.62
MCC: 0.23
Iteration: 2
Bot ratio: 0.56, Human ratio: 0.44
Bot threshold: 0.78, Human threshold: 0.72
0.62
   iteration  high_conf_count  avg_confidence  bot_ratio  human_ratio  \
0          1              978        0.775874   0.557260     0.442740   
1          2             1268        0.769916   0.455836     0.544164   

   consistency_rate  
0               NaN  
1          0.771293  
Iteration 2 Evaluation Metrics:
Accuracy: 0.62
Precision: 0.63
Recall: 0.60
F1 Score: 0.61
MCC: 0.24
Iteration: 3
Bot ratio: 0.46, Human ratio: 0.54
Bot threshold: 0.70, Human threshold: 0.80
0.64
   iteration  high_conf_count  avg