In [None]:
# Standard Library Imports
import os
import json

# Data Manipulation and Processing
import pandas as pd
import numpy as np
import re
from pandas import json_normalize
from tqdm.notebook import tqdm

# Text Processing
import emoji
from transformers import RobertaTokenizer, RobertaForSequenceClassification, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation



# Machine Learning Imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
import joblib

# Statistical Utilities
from scipy.stats import mode

In [None]:
# Locations of the persisted model dictionaries and the pre-computed feature
# tables, relative to the notebook's working directory.
PROPOSED_MODELS_PATH = '../Semi_supervised_models_12_13_8.joblib'
BASELINE_MODELS_PATH = '../Initial_Models_12_11_supervised_dropped_5_2.joblib'
TEST_PARQUET_PATH = '../data/final/testing/combined/combined_testing_accounts_ROBERTA_LDA_missing_dropped_2.parquet'
TRAIN_PARQUET_PATH = '../data/final/labeled/combined/ROBERTA/train_labeled_LDA_missing_dropped.parquet'
VAL_PARQUET_PATH = '../data/final/labeled/combined/ROBERTA/val_labeled_LDA_missing_dropped.parquet'

# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code. Only load artifacts that come from a trusted source.
model_proposed = joblib.load(PROPOSED_MODELS_PATH)
model_baseline = joblib.load(BASELINE_MODELS_PATH)

# Feature tables for the test, training and validation splits.
test_df = pd.read_parquet(TEST_PARQUET_PATH)
train_df = pd.read_parquet(TRAIN_PARQUET_PATH)
val_df = pd.read_parquet(VAL_PARQUET_PATH)

In [None]:
# Feature groups, keyed by group name. Each value is the list of columns that
# belongs to that group; column order is kept exactly as originally declared.
feature_sets_LDA = {
    # Character-composition statistics of the username.
    'username': [
        'username_uppercase',
        'username_lowercase',
        'username_numeric',
        'username_special',
        'username_length',
        'username_se',
        'is_missing_username',
    ],
    # Same statistics for the screenname. Note that the emoji / hashtag / word
    # columns are NOT part of this group (they only appear in features_LDA).
    'screenname': [
        'screenname_uppercase',
        'screenname_lowercase',
        'screenname_numeric',
        'screenname_special',
        'screenname_length',
        'screenname_se',
        'is_missing_screenname',
    ],
    # Description length plus the ten topic columns topic_0 .. topic_9.
    'description': (
        ['description_length']
        + [f'topic_{topic_idx}' for topic_idx in range(10)]
        + ['is_missing_description']
    ),
    # Account-level metadata.
    'user_metadata': [
        'user_md_follower',
        'user_md_following',
        'user_md_follow_ratio',
        'user_md_total_post',
        'user_md_total_like',
        'user_md_verified',
        'is_missing_user_metadata',
    ],
    # Post-level metadata followed by post-text statistics.
    'post': [
        'post_md_like_mean',
        'post_md_like_std',
        'post_md_retweet_mean',
        'post_md_retweet_std',
        'post_md_reply_mean',
        'post_md_reply_std',
        'is_missing_post_metadata',
        'post_text_length_mean',
        'post_text_length_std',
        'post_sentiment_score_mean',
        'post_sentiment_score_std',
        'post_sentiment_numeric_mean',
        'post_sentiment_numeric_std',
        'post_sentiment_numeric_prop_positive',
        'post_sentiment_numeric_prop_neutral',
        'post_sentiment_numeric_prop_negative',
        'is_missing_post_text',
    ],
}

# Full column selection: identifier and raw text columns interleaved with the
# groups above. List concatenation builds a fresh list, so features_LDA does
# not alias any of the per-group lists.
features_LDA = (
    ['user_id', 'username']
    + feature_sets_LDA['username']
    + ['screenname']
    # Every screenname group column except the trailing is_missing flag, which
    # is re-added after the three columns that exist only in the full list.
    + feature_sets_LDA['screenname'][:-1]
    + ['screenname_emoji', 'screenname_hashtag', 'screenname_word',
       'is_missing_screenname']
    + ['description']
    + feature_sets_LDA['description']
    + feature_sets_LDA['user_metadata']
    + feature_sets_LDA['post']
)

In [None]:
# Split every dataset into its feature matrix (the features_LDA columns) and
# its target vector (the 'label' column).
X_train, y_train = train_df[features_LDA], train_df['label']  # training split
X_test, y_test = test_df[features_LDA], test_df['label']      # test split
X_val, y_val = val_df[features_LDA], val_df['label']          # validation split

In [None]:
# --- Configuration -----------------------------------------------------------
# Dictionary of pre-fitted classifiers to evaluate, keyed by feature-group name
# (each key must also be a key of feature_sets_LDA). Point this at
# model_proposed to evaluate the other model dictionary instead.
models_to_evaluate = model_baseline

# A feature group whose fraction of non-null columns is at or above this value
# gets full weight (1.0) for that row; below it, the weight is the fraction.
completeness_threshold = 0.75


def _binary_metrics(y_true, y_pred):
    """Compute the evaluation metrics reported in this notebook.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels (boolean; ``True`` is the positive / bot class).

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1_score`` and ``mcc``.
        Precision, recall and F1 are computed with ``pos_label=True``.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, pos_label=True),
        "recall": recall_score(y_true, y_pred, pos_label=True),
        "f1_score": f1_score(y_true, y_pred, pos_label=True),
        "mcc": matthews_corrcoef(y_true, y_pred),
    }


# --- Calibration -------------------------------------------------------------
# Apply Platt scaling (method='sigmoid') to each already-fitted model.
# With cv='prefit' all data passed to fit() is used for calibration only, so it
# must be disjoint from the data the model was trained on. The held-out
# validation split is used here instead of the training split; calibrating on
# the training data would yield over-confident probabilities.
calibrated_models = {}
for feature_name, model in models_to_evaluate.items():
    calibrated_model = CalibratedClassifierCV(estimator=model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_val[feature_sets_LDA[feature_name]], y_val)
    calibrated_models[feature_name] = calibrated_model

# --- Weighted soft voting ----------------------------------------------------
# Per-row accumulators for the weighted class probabilities and for the sum of
# weights used to normalise them.
bot_prob_sum = np.zeros(len(X_test))
human_prob_sum = np.zeros(len(X_test))
total_weights = np.zeros(len(X_test))

# Metrics of every individual (calibrated) model, keyed by feature-group name.
individual_model_metrics = {}

for feature_name, calibrated_model in calibrated_models.items():
    feature_columns = feature_sets_LDA[feature_name]

    # Fraction of non-null columns of this feature group, per test row.
    completeness = X_test[feature_columns].notnull().mean(axis=1)

    # Full weight when the group is complete enough, otherwise down-weight the
    # model's vote by the completeness fraction. (The is_missing_* indicator
    # columns are not used for weighting.)
    weights = np.where(completeness >= completeness_threshold, 1.0, completeness)

    # Calibrated class probabilities. Column 0 is treated as "human" and
    # column 1 as "bot"; this assumes the classifier's classes_ are ordered
    # [False, True] -- TODO confirm against the fitted models.
    probas = calibrated_model.predict_proba(X_test[feature_columns])

    human_prob_sum += probas[:, 0] * weights
    bot_prob_sum += probas[:, 1] * weights
    total_weights += weights

    # Stand-alone prediction of this model: bot when P(bot) > P(human).
    individual_predictions = probas[:, 1] > probas[:, 0]
    individual_model_metrics[feature_name] = _binary_metrics(y_test, individual_predictions)

# Normalise the weighted sums. Rows whose total weight is zero are divided by 1
# instead, leaving both averages at 0 (and hence a "human" / False prediction).
total_weights_safe = np.where(total_weights == 0, 1, total_weights)
avg_human_prob = human_prob_sum / total_weights_safe
avg_bot_prob = bot_prob_sum / total_weights_safe

# Final ensemble prediction: bot when the weighted bot probability is larger.
final_predictions = avg_bot_prob > avg_human_prob

# --- Evaluation --------------------------------------------------------------
ensemble_metrics = _binary_metrics(y_test, final_predictions)
ensemble_accuracy = ensemble_metrics["accuracy"]
ensemble_precision = ensemble_metrics["precision"]
ensemble_recall = ensemble_metrics["recall"]
ensemble_f1 = ensemble_metrics["f1_score"]
ensemble_mcc = ensemble_metrics["mcc"]

# Print evaluation results for the ensemble
print("Ensemble Performance:")
print(f'Accuracy: {ensemble_accuracy:.2f}')
print(f'Precision: {ensemble_precision:.2f}')
print(f'Recall: {ensemble_recall:.2f}')
print(f'F1 Score: {ensemble_f1:.2f}')
print(f'MCC: {ensemble_mcc:.2f}')

# Print evaluation results for each individual model
print("\nIndividual Model Performance:")
for feature_name, metrics in individual_model_metrics.items():
    print(f"Performance for {feature_name} model:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.2f}")
