<a href="https://colab.research.google.com/github/Swaathi-iyer/Hidden-Influencer-Detection/blob/main/Hidden_Influencer_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository into the Colab working directory.
# NOTE(review): git places the checkout in ./Hidden-Influencer-Detection/, while the
# later cells look the data files up by bare file name (current working directory) —
# confirm the CSV/gpickle files are uploaded or copied there before continuing.
!git clone https://github.com/Swaathi-iyer/Hidden-Influencer-Detection.git

In [None]:
!pip install numpy==1.26.4 -q
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 -q
!pip install node2vec networkx pandas scikit-learn matplotlib seaborn tqdm -q

import torch

if torch.cuda.is_available():
    cuda_version = torch.version.cuda.replace('.', '')
    print(f"Detected CUDA Version: {cuda_version}")
    !pip install -q torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}+{cuda_version}.html
    !pip install torch-geometric -q
    print("PyTorch Geometric installed for GPU.")
else:
    print("CUDA not available. Installing CPU version.")
    !pip install -q torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}+cpu.html
    !pip install torch-geometric -q
    print("PyTorch Geometric installed for CPU.")

print("Graph Dependencies check complete.")

In [None]:
import os

# Data files that must exist in the current working directory before the
# notebook can continue.
REQUIRED_FILES = [
    'twitter_edges.csv',
    'twitter_node_features.csv',
    'twitter_graph.gpickle',
    'twitter_metadata.csv',
    'youtube_edges.csv',
    'youtube_node_features.csv',
    'youtube_graph.gpickle',
    'youtube_metadata.csv'
]

print("Checking for required files...")

# Collect every absent file so the user sees the complete list in one go.
missing_files = []
for required_name in REQUIRED_FILES:
    if not os.path.exists(required_name):
        missing_files.append(required_name)

if not missing_files:
    print(f"All {len(REQUIRED_FILES)} required files are present.")
    print("Ready to proceed to next cell.")
else:
    print("\nCRITICAL ERROR: The following required files are missing:")
    for missing_name in missing_files:
        print(f"   - {missing_name}")
    print("\nPlease upload these files to Colab before running this cell.")
    # Abort the cell so later cells are not run against incomplete data.
    raise FileNotFoundError("Missing required data files.")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data

import pandas as pd
import numpy as np
import networkx as nx
from node2vec import Node2Vec
import pickle
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, roc_auc_score,
                             confusion_matrix, precision_recall_curve,
                             average_precision_score)

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed both random number sources the notebook draws from.
for seed_fn in (torch.manual_seed, np.random.seed):
    seed_fn(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(f"Initializing Environment & Checking Device")
print(f"Using device: {device.type.upper()}")

if device.type != 'cuda':
    print(f"Note: CUDA not available. Training on CPU.")
else:
    # Report which GPU was picked up and how much memory it offers.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

In [None]:
import os
import pandas as pd
import json
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("LOADING DATASETS")
print("="*80)

def load_data_file(filename, file_type='csv'):
    """Load one of the notebook's input files.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    file_type : str
        'csv' reads the file into a DataFrame, forcing every known identifier
        column to stay textual. 'metadata' tries JSON first and falls back to
        the first row of a CSV.

    Returns
    -------
    pandas.DataFrame for 'csv'. For 'metadata': the parsed JSON value (its first
    element when the JSON is a list), or the first CSV record as a dict, or an
    empty dict when neither parser succeeds.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist.
    ValueError
        If ``file_type`` is not one of the supported kinds.
    """
    if not os.path.exists(filename):
        raise FileNotFoundError(f"CRITICAL ERROR: Required file '{filename}' not found.")

    if file_type == 'csv':
        # 'source'/'target' are included because the edge files may use those names
        # before they are renamed to *_user_id; without this, numeric-looking edge IDs
        # would be parsed as numbers and never match the string node IDs.
        id_columns = ('source_user_id', 'target_user_id', 'source', 'target',
                      'user_id', 'user', 'channel_name')
        return pd.read_csv(filename, dtype={column: object for column in id_columns})

    if file_type == 'metadata':
        try:
            with open(filename, 'r') as handle:
                data = json.load(handle)
            return data[0] if isinstance(data, list) else data
        except Exception as json_error:
            # Not usable as JSON: treat the file as a CSV and use its first row.
            try:
                return pd.read_csv(filename).to_dict('records')[0]
            except Exception as csv_error:
                # Metadata is best-effort, but say so instead of failing silently.
                print(f"WARNING: could not parse metadata file '{filename}' "
                      f"(JSON: {json_error}; CSV: {csv_error}). Using empty metadata.")
                return {}

    raise ValueError(f"Unsupported file_type '{file_type}'; expected 'csv' or 'metadata'.")

# ---- Twitter ----
print(f"\nLOADING TWITTER DATA...")
twitter_edges = load_data_file('twitter_edges.csv', 'csv')
twitter_nodes = load_data_file('twitter_node_features.csv', 'csv')

# Later cells expect the identifier column to be called "user_id".
if 'user' in twitter_nodes.columns:
    twitter_nodes = twitter_nodes.rename(columns={'user': 'user_id'})
elif 'user_id' not in twitter_nodes.columns:
    raise KeyError("Missing user identifier column in twitter_node_features.")

twitter_meta = load_data_file('twitter_metadata.csv', 'metadata')
print(f"Edges: {len(twitter_edges):,}")
print(f"Nodes: {len(twitter_nodes):,}")

# ---- YouTube ----
print(f"\nLOADING YOUTUBE DATA...")
youtube_edges = load_data_file('youtube_edges.csv', 'csv')
youtube_nodes = load_data_file('youtube_node_features.csv', 'csv')

# YouTube identifies accounts by channel name; map it onto "user_id" as well.
if 'channel_name' in youtube_nodes.columns:
    youtube_nodes = youtube_nodes.rename(columns={'channel_name': 'user_id'})
elif 'user_id' not in youtube_nodes.columns:
    raise KeyError("Missing user identifier column in youtube_node_features.")

youtube_meta = load_data_file('youtube_metadata.csv', 'metadata')
print(f"Edges: {len(youtube_edges):,}")
print(f"Nodes: {len(youtube_nodes):,}")

# Edge lists that use the short source/target names are aligned with the
# *_user_id naming; frames that already have source_user_id are left untouched.
edge_column_aliases = {'source': 'source_user_id', 'target': 'target_user_id'}
if 'source' in twitter_edges.columns and 'source_user_id' not in twitter_edges.columns:
    twitter_edges = twitter_edges.rename(columns=edge_column_aliases)
if 'source' in youtube_edges.columns and 'source_user_id' not in youtube_edges.columns:
    youtube_edges = youtube_edges.rename(columns=edge_column_aliases)

# Stack both platforms into single edge and node tables with a fresh 0..n-1 index.
all_edges = pd.concat([twitter_edges, youtube_edges], ignore_index=True)
all_nodes = pd.concat([twitter_nodes, youtube_nodes], ignore_index=True)

print(f"\nCOMBINED DATA SUMMARY:")
print(f"Total unique users (nodes): {all_nodes['user_id'].nunique():,}")
print(f"Total node features rows: {len(all_nodes):,}")
print(f"Total edges rows: {len(all_edges):,}")

In [None]:
print("\n" + "="*80)
print("DATA PREPROCESSING + LABEL GENERATION")
print("="*80)

np.random.seed(42)

n_users = len(all_nodes)

# Raw inputs of the synthetic features. Missing values count as zero so that a
# platform lacking one of these columns does not inject NaN into every derived
# feature (a NaN here would also make the integer cast of posts_count raise).
pagerank = all_nodes['pagerank'].fillna(0)
influence_score = all_nodes['influence_score'].fillna(0)
subscribers = all_nodes['subscribers'].fillna(0)

# Every noise term is drawn with size=n_users. Without `size`, NumPy returns ONE
# scalar, which would shift/scale all users identically and add no per-user
# variation (authenticity_score below already used per-row draws).
follower_noise = np.random.uniform(0.8, 1.2, size=n_users)
all_nodes['followers'] = (
    (subscribers + pagerank * 1000000 * follower_noise).clip(lower=100).astype(int)
)

post_noise = np.random.randint(-50, 50, size=n_users)
all_nodes['posts_count'] = (200 + pagerank * 100000 + post_noise).astype(int).clip(lower=1)

all_nodes['engagement_rate'] = (
    0.01 +
    pagerank * 50 +
    influence_score * 0.1 +
    np.random.uniform(0.001, 0.05, size=n_users)
).clip(0.001, 0.5)

all_nodes['mfim_score'] = (
    influence_score * 0.8 +
    pagerank * 50 +
    np.random.uniform(0.1, 0.3, size=n_users)
).clip(0.0, 1.0)

all_nodes['authenticity_score'] = np.random.uniform(0.8, 0.95, size=n_users)

# Columns that must never be used as model inputs. The label is listed explicitly:
# on a second run of this cell it already exists in all_nodes and would otherwise
# be picked up as a feature (and then be standardised below).
# NOTE(review): a second run still starts from already-standardised columns;
# re-run the loading cell first to rebuild all_nodes from the raw files.
non_feature_cols = {'user_id', 'platform', 'subscribers', 'total_views',
                    'influence_score', 'is_hidden_influencer'}
feature_cols = [col for col in all_nodes.columns if col not in non_feature_cols]

print(f"\nHandling missing values...")
all_nodes[feature_cols] = all_nodes[feature_cols].fillna(0)

print(f"\n" + "="*80)
print("GENERATING LABELS")
print("="*80)

# Thresholds are taken from the data's own distribution (quantiles) except for
# the fixed authenticity and post-count cut-offs.
engagement_threshold = all_nodes['engagement_rate'].quantile(0.50)
follower_threshold = all_nodes['followers'].quantile(0.75)
mfim_threshold = all_nodes['mfim_score'].quantile(0.40)
authenticity_threshold = 0.8
min_posts = 10

# A "hidden influencer" engages well and scores well on MFIM while keeping a
# comparatively small following.
all_nodes['is_hidden_influencer'] = (
    (all_nodes['engagement_rate'] >= engagement_threshold) &
    (all_nodes['followers'] <= follower_threshold) &
    (all_nodes['mfim_score'] >= mfim_threshold) &
    (all_nodes['authenticity_score'] > authenticity_threshold) &
    (all_nodes['posts_count'] > min_posts)
).astype(int)

hidden_count = all_nodes['is_hidden_influencer'].sum()
total_count = len(all_nodes)
hidden_percentage = (hidden_count / total_count) * 100

print(f"\nLABELS GENERATED")
print(f"Total users: {total_count:,}")
print(f"Hidden influencers: {hidden_count:,} ({hidden_percentage:.1f}%)")
print(f"Regular users: {total_count - hidden_count:,} ({100-hidden_percentage:.1f}%)")

labels_df = all_nodes[['user_id', 'is_hidden_influencer']]
labels_df.to_csv('labels_generated.csv', index=False)

print(f"\n" + "="*80)
print("NORMALIZING FEATURES")
print("="*80)

feature_cols_for_norm = list(feature_cols)

# Standardise every feature column in place (zero mean, unit variance).
scaler = StandardScaler()
all_nodes[feature_cols_for_norm] = scaler.fit_transform(all_nodes[feature_cols_for_norm])

# Positional lookups between user ids and row numbers of all_nodes.
all_nodes = all_nodes.reset_index(drop=True)
user_to_idx = {user_id: idx for idx, user_id in enumerate(all_nodes['user_id'])}
idx_to_user = {idx: user_id for user_id, idx in user_to_idx.items()}

print(f"\nPreprocessing complete")
print(f"Total users: {len(all_nodes):,}")
print(f"Feature dimension: {len(feature_cols_for_norm)}")

In [None]:
print("\n" + "="*80)
print("TRAINING NODE2VEC EMBEDDINGS")
print("="*80)

# Edge weights default to 1.0, both when the column is absent and when a row has
# no value (concatenating edge files with and without a "weight" column leaves
# NaN behind, which would otherwise reach the random-walk probabilities).
if 'weight' in all_edges.columns:
    edge_weights = all_edges['weight'].fillna(1.0)
else:
    edge_weights = pd.Series(1.0, index=all_edges.index)

# Only edges whose two endpoints both have a feature row are kept, so the graph
# can never contain a node outside all_nodes and no clean-up pass is needed.
G = nx.Graph()
for src, tgt, weight in zip(all_edges['source_user_id'],
                            all_edges['target_user_id'],
                            edge_weights):
    if src in user_to_idx and tgt in user_to_idx:
        G.add_edge(src, tgt, weight=weight)

if G.number_of_nodes() == 0:
    # Connectivity checks and Node2Vec both fail on an empty graph; stop here
    # with a message that points at the likely cause.
    raise ValueError(
        "No edge connects two known users. Check that the edge ID columns use the "
        "same identifiers (and dtype) as the node 'user_id' column."
    )

print(f"\nGraph statistics:")
print(f"Nodes: {G.number_of_nodes():,}")
print(f"Edges: {G.number_of_edges():,}")
print(f"Density: {nx.density(G):.6f}")

if nx.is_connected(G):
    print(f"Graph is connected")
else:
    components = list(nx.connected_components(G))
    print(f"Graph has {len(components)} components")
    if components:
        print(f"Largest component: {len(max(components, key=len)):,} nodes")

print(f"\nTraining Node2Vec...")

node2vec = Node2Vec(
    G,
    dimensions=128,
    walk_length=30,
    num_walks=200,
    workers=4,
    p=1,
    q=1,
    quiet=False
)

model_n2v = node2vec.fit(window=10, min_count=1, batch_words=4)

print(f"\nNode2Vec training complete")

print(f"\nExtracting embeddings...")
graph_embeddings = []
for user_id in all_nodes['user_id']:
    try:
        # node2vec stores node names as strings, so always look up by str; an
        # integer key would be interpreted by gensim as a row position instead.
        emb = model_n2v.wv[str(user_id)]
    except KeyError:
        # Users without any retained edge never appear in a walk: use a zero vector.
        emb = np.zeros(128)
    graph_embeddings.append(emb)

graph_embeddings = np.array(graph_embeddings)
print(f"Shape: {graph_embeddings.shape}")

np.save('node2vec_embeddings.npy', graph_embeddings)
model_n2v.save('node2vec_model.bin')
print(f"\nSaved embeddings and model")

In [None]:
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
import networkx as nx
import numpy as np

print("\n" + "="*80)
print("GENERATING STRUCTURAL FEATURES")
print("="*80)

feature_names = ['degree_centrality', 'clustering_coefficient', 'hits_authority']

print(f"\nCalculating 3 graph-structural views for all nodes...")

deg_cent = nx.degree_centrality(G)
clust_coeff = nx.clustering(G)

# Global importance score: HITS authority, then PageRank, then all zeros.
try:
    hubs, authorities = nx.hits(G, max_iter=100)
    hits_auth = authorities
except Exception as e:
    print(f"HITS failed: {e}. Falling back to PageRank for global score.")
    try:
        hits_auth = nx.pagerank(G)
    except Exception:
        print("PageRank also failed. Assigning 0s.")
        hits_auth = {node: 0.0 for node in all_nodes['user_id']}

# One (3, 1) column per user: a three-step "sequence" with a single value per
# step. Users that are not in the graph get 0.0 for every metric.
metric_lookups = (deg_cent, clust_coeff, hits_auth)
structural_features = {
    user_id: np.array([[lookup.get(user_id, 0.0)] for lookup in metric_lookups])
    for user_id in tqdm(all_nodes['user_id'], desc="Preparing structural sequences")
}

structural_array = np.array([structural_features[user_id] for user_id in all_nodes['user_id']])

# Flatten to (N*S, D), standardise, and restore the (N, S, D) layout.
# NOTE(review): with D == 1 this pools the three different metrics into one
# mean/std instead of scaling each metric on its own — confirm this is intended.
N, S, D = structural_array.shape
temp_array = structural_array.reshape(-1, D)
scaler_struct = StandardScaler()
temp_array_norm = scaler_struct.fit_transform(temp_array)
structural_array_norm = temp_array_norm.reshape(N, S, D)

print(f"\nStructural sequences prepared: {len(structural_features):,} users")
print(f"Sequence shape: {structural_array_norm.shape}")

# Aliases under the "temporal" names the model and dataset cells use.
# NOTE(review): temporal_sequences holds the un-normalised values while
# temporal_array holds the normalised ones; the dataset-preparation cell rebuilds
# temporal_array from temporal_sequences, so check which of the two is meant to
# be trained on.
temporal_sequences = structural_features
temporal_array = structural_array_norm
temporal_cols = feature_names

In [None]:
print("\n" + "="*80)
print("DEFINING MODEL ARCHITECTURES")
print("="*80)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch-first sequence.

    Parameters
    ----------
    d_model : int
        Size of the last dimension of the input. Both even and odd sizes are supported.
    dropout : float
        Dropout probability applied after the encoding is added.
    max_len : int
        Number of positions for which the table is pre-computed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))

        # Even columns take the sine, odd columns the cosine of the same frequencies.
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one odd column fewer than even columns, so
        # the frequency list is trimmed to fit (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)

        # A buffer moves with .to(device) and is saved in the state dict, but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` (batch, seq_len, d_model) plus the encoding of its first seq_len positions."""
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class TemporalTransformer(nn.Module):
    """Transformer encoder that condenses a batch-first sequence into one vector.

    The input is projected to ``d_model``, given positional encodings, passed
    through ``num_layers`` encoder layers, averaged over the sequence axis and
    projected once more.
    """

    def __init__(self, input_dim, d_model=128, nhead=8, num_layers=4,
                 dim_feedforward=512, dropout=0.1):
        super().__init__()

        self.input_projection = nn.Linear(input_dim, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)

        layer_options = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_options),
            num_layers=num_layers,
        )
        self.output_projection = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Encode ``x`` and return one ``d_model``-sized vector per batch element."""
        embedded = self.positional_encoding(self.input_projection(x))
        # Mean over dim 1 (the sequence axis, since batch_first=True).
        pooled = self.transformer_encoder(embedded).mean(dim=1)
        return self.output_projection(pooled)

class SpatialCNN(nn.Module):
    """1-D convolutional encoder over a flat feature vector.

    Parameters
    ----------
    input_dim : int
        Length of the input feature vector.
    hidden_channels : sequence of int
        Output channels of each convolution block.
    kernel_sizes : sequence of int
        Kernel size of each convolution block. Blocks are built pairwise with
        ``hidden_channels``; surplus entries of the longer sequence are ignored.
    output_dim : int
        Size of the returned embedding.

    Raises
    ------
    ValueError
        If ``input_dim`` is too small to survive the pooling of every block.
    """

    def __init__(self, input_dim, hidden_channels=[64, 128, 64],
                 kernel_sizes=[3, 5, 7], output_dim=128):
        super(SpatialCNN, self).__init__()

        self.convs = nn.ModuleList()
        in_channels = 1
        seq_len = input_dim

        for hidden_ch, kernel_size in zip(hidden_channels, kernel_sizes):
            padding = kernel_size // 2
            self.convs.append(nn.Sequential(
                nn.Conv1d(in_channels, hidden_ch, kernel_size, padding=padding),
                nn.BatchNorm1d(hidden_ch),
                nn.ReLU(),
                nn.MaxPool1d(2),
                nn.Dropout(0.3)
            ))
            in_channels = hidden_ch
            # Track the real length instead of assuming "halved per block": a
            # stride-1 convolution yields L + 2*padding - kernel + 1 (L for odd
            # kernels, L + 1 for even ones) and MaxPool1d(2) floors the half.
            seq_len = (seq_len + 2 * padding - kernel_size + 1) // 2
            if seq_len < 1:
                raise ValueError(
                    f"input_dim={input_dim} is too small for {len(self.convs)} "
                    "convolution/pooling blocks."
                )

        # Channels of the last block actually built times the remaining length.
        self.flatten_dim = in_channels * seq_len

        self.fc = nn.Sequential(
            nn.Linear(self.flatten_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, output_dim)
        )

    def forward(self, x):
        """Map ``x`` of shape (batch, input_dim) to (batch, output_dim)."""
        # Conv1d expects (batch, channels, length); the vector is one channel.
        x = x.unsqueeze(1)

        for conv in self.convs:
            x = conv(x)

        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

class HybridInfluencerDetector(nn.Module):
    """Two-branch classifier producing one probability per user.

    A transformer branch encodes the per-user sequence, a CNN branch encodes the
    concatenated graph embedding and static features, and an MLP ending in a
    sigmoid fuses both branches.
    """

    def __init__(self, graph_embed_dim=128, static_feature_dim=6,
                 temporal_feature_dim=1, d_model=128):
        super().__init__()

        self.temporal_transformer = TemporalTransformer(
            input_dim=temporal_feature_dim,
            d_model=d_model,
            nhead=8,
            num_layers=4
        )

        # The CNN sees the graph embedding and the static features side by side.
        self.spatial_cnn = SpatialCNN(
            input_dim=graph_embed_dim + static_feature_dim,
            hidden_channels=[64, 128, 64],
            output_dim=d_model
        )

        # Both branches emit d_model values, hence 2 * d_model fusion inputs.
        fusion_layers = [
            nn.Linear(d_model * 2, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.fusion = nn.Sequential(*fusion_layers)

    def forward(self, graph_embeddings, static_features, temporal_sequences):
        """Return a (batch, 1) tensor of sigmoid outputs."""
        sequence_repr = self.temporal_transformer(temporal_sequences)
        spatial_repr = self.spatial_cnn(
            torch.cat([graph_embeddings, static_features], dim=1)
        )
        return self.fusion(torch.cat([sequence_repr, spatial_repr], dim=1))

print("Model architectures defined")

In [None]:
print("\n" + "="*80)
print("PREPARING DATASETS")
print("="*80)

feature_cols_for_model = list(feature_cols_for_norm)

# Model inputs and targets as plain NumPy arrays, in all_nodes row order.
static_features = all_nodes[feature_cols_for_model].values
labels = all_nodes['is_hidden_influencer'].values

# NOTE(review): this rebuilds temporal_array from the temporal_sequences dict and
# thereby replaces whatever temporal_array held before this cell — confirm the
# dict contains the values that are meant to be trained on.
temporal_array = np.array([temporal_sequences[uid] for uid in all_nodes['user_id']])

print(f"\nData shapes:")
print(f"Graph embeddings: {graph_embeddings.shape}")
print(f"Static features: {static_features.shape}")
print(f"Temporal sequences: {temporal_array.shape}")
print(f"Labels: {labels.shape}")

# 70 / 15 / 15 stratified split over row positions: first hold out 30 %, then
# halve the hold-out into validation and test.
indices = np.arange(len(all_nodes))
train_idx, temp_idx = train_test_split(
    indices, test_size=0.3, stratify=labels, random_state=42
)
val_idx, test_idx = train_test_split(
    temp_idx, test_size=0.5, stratify=labels[temp_idx], random_state=42
)

print(f"\nData split:")
print(f"Train: {len(train_idx):,} ({len(train_idx)/len(all_nodes):.1%})")
print(f"Val:   {len(val_idx):,} ({len(val_idx)/len(all_nodes):.1%})")
print(f"Test:  {len(test_idx):,} ({len(test_idx)/len(all_nodes):.1%})")

class InfluencerDataset(Dataset):
    """Torch dataset bundling the three model inputs with their label.

    All four arrays are converted to float32 tensors once, up front; labels get
    a trailing axis so each item's label has shape (1,).
    """

    def __init__(self, graph_emb, static_feat, temporal_seq, labels):
        def to_float(values):
            return torch.tensor(np.asarray(values), dtype=torch.float32)

        self.graph_emb = to_float(graph_emb)
        self.static_feat = to_float(static_feat)
        self.temporal_seq = to_float(temporal_seq)
        self.labels = to_float(labels).unsqueeze(1)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        # Keys match what the training and evaluation loops read from each batch.
        sample = {}
        sample['graph_embeddings'] = self.graph_emb[idx]
        sample['static_features'] = self.static_feat[idx]
        sample['temporal_features'] = self.temporal_seq[idx]
        sample['label'] = self.labels[idx]
        return sample

def _dataset_for(row_idx):
    """Build an InfluencerDataset from the rows selected by ``row_idx``."""
    return InfluencerDataset(
        graph_embeddings[row_idx],
        static_features[row_idx],
        temporal_array[row_idx],
        labels[row_idx],
    )

train_dataset = _dataset_for(train_idx)
val_dataset = _dataset_for(val_idx)
test_dataset = _dataset_for(test_idx)

batch_size = 32
# Only the training data is shuffled; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"\nDataLoaders created:")
print(f"Batch size: {batch_size}")
print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

In [None]:
print("\n" + "="*80)
print("INITIALIZING MODEL")
print("="*80)

total_count = int(len(all_nodes))
total_hidden_count = int(all_nodes['is_hidden_influencer'].sum())

# Ratio of negatives to positives; 1.0 when there are no positives at all.
if total_hidden_count != 0:
    pos_weight_value = (total_count - total_hidden_count) / total_hidden_count
else:
    print("\nCRITICAL ERROR: Total hidden influencers is 0.")
    pos_weight_value = 1.0

model = HybridInfluencerDetector(
    graph_embed_dim=128,
    static_feature_dim=len(feature_cols_for_model),
    temporal_feature_dim=temporal_array.shape[2],
    d_model=128
)
model = model.to(device)

# Parameter counts, overall and restricted to those that receive gradients.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)

print(f"\nModel statistics:")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# NOTE(review): pos_weight_tensor is created but the loss below is an unweighted
# BCELoss, so the class ratio has no effect on training — confirm this is intended.
pos_weight_tensor = torch.tensor([pos_weight_value], dtype=torch.float).to(device)
criterion = nn.BCELoss(weight=None, reduction='mean')

# Halve the learning rate after 5 epochs without a lower monitored value.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', patience=5, factor=0.5
)

print(f"\nTraining setup complete")

def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Returns
    -------
    (avg_loss, auc_score)
        Sample-weighted mean loss over the dataset and the ROC AUC of the
        outputs collected during the pass (0.5 when the AUC cannot be computed).
    """
    model.train()
    running_loss = 0
    pred_batches = []
    label_batches = []
    batch_keys = ('graph_embeddings', 'static_features', 'temporal_features', 'label')

    for batch in tqdm(loader, desc="Training"):
        graph_emb, static_feat, temporal_feat, labels = (
            batch[key].to(device) for key in batch_keys
        )

        optimizer.zero_grad()
        output = model(graph_emb, static_feat, temporal_feat)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        # The criterion averages per batch; re-weight by batch size so the
        # epoch figure is a per-sample mean.
        running_loss += loss.item() * len(labels)

        pred_batches.append(output.detach().cpu().numpy())
        label_batches.append(labels.detach().cpu().numpy())

    avg_loss = running_loss / len(loader.dataset)

    try:
        all_labels = np.concatenate(label_batches)
        all_preds = np.concatenate(pred_batches)
        auc_score = roc_auc_score(all_labels, all_preds)
    except ValueError:
        # roc_auc_score raises ValueError when the AUC is undefined.
        auc_score = 0.5

    return avg_loss, auc_score

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without updating it.

    Returns
    -------
    (avg_loss, auc_score, avg_precision)
        Sample-weighted mean loss, ROC AUC and average precision. When the
        metrics cannot be computed the AUC is 0.5 and the average precision 0.0.
    """
    model.eval()
    running_loss = 0
    pred_batches = []
    label_batches = []
    batch_keys = ('graph_embeddings', 'static_features', 'temporal_features', 'label')

    for batch in tqdm(loader, desc="Validating"):
        graph_emb, static_feat, temporal_feat, labels = (
            batch[key].to(device) for key in batch_keys
        )

        output = model(graph_emb, static_feat, temporal_feat)
        batch_loss = criterion(output, labels)

        # Undo the per-batch averaging so the final figure is a per-sample mean.
        running_loss += batch_loss.item() * len(labels)

        pred_batches.append(output.detach().cpu().numpy())
        label_batches.append(labels.detach().cpu().numpy())

    avg_loss = running_loss / len(loader.dataset)

    try:
        all_labels = np.concatenate(label_batches)
        all_preds = np.concatenate(pred_batches)
        auc_score = roc_auc_score(all_labels, all_preds)
        avg_precision = average_precision_score(all_labels, all_preds)
    except ValueError:
        # Either metric failing resets both to their fallback values.
        auc_score = 0.5
        avg_precision = 0.0

    return avg_loss, auc_score, avg_precision

epochs = 50
best_val_auc = 0
patience_counter = 0
early_stop_patience = 10  # epochs without a better validation AUC before stopping
print(f"\nStarting training for {epochs} epochs...")

history = {key: [] for key in ('train_loss', 'train_auc', 'val_loss', 'val_auc', 'val_ap')}

for epoch in range(1, epochs + 1):
    train_loss, train_auc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_auc, val_ap = evaluate(model, val_loader, criterion, device)

    # The learning-rate schedule follows the validation loss ...
    scheduler.step(val_loss)

    print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Train AUC: {train_auc:.4f} | "
          f"Val Loss: {val_loss:.4f} | Val AUC: {val_auc:.4f} | Val AP: {val_ap:.4f}")

    epoch_metrics = (train_loss, train_auc, val_loss, val_auc, val_ap)
    for key, value in zip(history, epoch_metrics):
        history[key].append(value)

    # ... while checkpointing and early stopping follow the validation AUC.
    if val_auc <= best_val_auc:
        patience_counter += 1
    else:
        best_val_auc = val_auc
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
        print("Model saved")

    if patience_counter >= early_stop_patience:
        print("Early stopping triggered")
        break

print("\nTraining complete")

In [None]:
import pandas as pd
import numpy as np
import torch
import networkx as nx
import os
from tqdm.notebook import tqdm
from gensim.models import Word2Vec

print("="*80)
print("HYBRID MODEL TESTING ON NEW DATA")
print("="*80)

# NOTE(review): these are the same file names the training cells load, so this
# is not out-of-sample data unless the files are replaced — confirm.
TEST_FILE_NODES = 'twitter_node_features.csv'
TEST_FILE_EDGES = 'twitter_edges.csv'

# Static input columns, in the order they are fed to the model.
STATIC_FEATURE_NAMES = [
    'followers',
    'posts_count',
    'engagement_rate',
    'mfim_score',
    'authenticity_score',
    'pagerank'
]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): the training cell sizes the model with len(feature_cols_for_model);
# loading the checkpoint below only succeeds if that equals the 6 names above.
best_model = HybridInfluencerDetector(
    graph_embed_dim=128,
    static_feature_dim=len(STATIC_FEATURE_NAMES),
    temporal_feature_dim=1,
    d_model=128
)
best_model = best_model.to(device)

# Restore the checkpoint written during training and switch to inference mode.
checkpoint_state = torch.load('best_model.pt', map_location=device)
best_model.load_state_dict(checkpoint_state)
best_model.eval()

# Reload the embedding model saved by the Node2Vec cell.
model_n2v = Word2Vec.load('node2vec_model.bin')

print("\n" + "="*80)
print("LOADING AND PREPROCESSING NEW DATA")
print("="*80)

if not os.path.exists(TEST_FILE_NODES) or not os.path.exists(TEST_FILE_EDGES):
    raise FileNotFoundError("Required test files not found")

# Read every identifier column as text, exactly like the training loader does.
# The embedding model is keyed by strings; an integer id would be treated by
# gensim as a row position and silently return some other node's vector.
id_dtypes = {column: object for column in (
    'user_id', 'user', 'channel_name',
    'source_user_id', 'target_user_id', 'source', 'target',
)}
new_node_features = pd.read_csv(TEST_FILE_NODES, dtype=id_dtypes)
new_edge_list = pd.read_csv(TEST_FILE_EDGES, dtype=id_dtypes)

# -------------------------------
# FIX NODE ID COLUMN
# -------------------------------
if 'user_id' not in new_node_features.columns:
    if 'user' in new_node_features.columns:
        new_node_features = new_node_features.rename(columns={'user': 'user_id'})
    elif 'channel_name' in new_node_features.columns:
        new_node_features = new_node_features.rename(columns={'channel_name': 'user_id'})
    else:
        raise KeyError("No valid node ID column found")

# Keep one row per user (first occurrence). The id list and the feature matrix
# are both derived from this frame, so they always have the same length and order.
new_node_features = (
    new_node_features.drop_duplicates(subset='user_id').reset_index(drop=True)
)
node_ids_new = new_node_features['user_id'].to_numpy()
new_data = new_node_features.set_index('user_id')

# -------------------------------
# FIX EDGE COLUMN NAMES
# -------------------------------
if 'source_user_id' not in new_edge_list.columns:
    if 'source' in new_edge_list.columns and 'target' in new_edge_list.columns:
        new_edge_list = new_edge_list.rename(
            columns={
                'source': 'source_user_id',
                'target': 'target_user_id'
            }
        )
    else:
        raise KeyError("No valid edge columns found")

# -------------------------------
# FILL MISSING STATIC FEATURES
# -------------------------------
for col in STATIC_FEATURE_NAMES:
    if col not in new_data.columns:
        new_data[col] = 0.0

# Missing values become 0, as in training; a single NaN would otherwise turn
# that user's prediction into NaN.
# NOTE(review): training standardises its features with a fitted StandardScaler,
# whereas these values are passed through unscaled — confirm how the two are
# meant to line up.
X_static_new = new_data[STATIC_FEATURE_NAMES].fillna(0.0).to_numpy(dtype=float)

# -------------------------------
# NODE2VEC EMBEDDINGS
# -------------------------------
graph_embeddings_new = []
for user_id in tqdm(node_ids_new):
    try:
        emb = model_n2v.wv[str(user_id)]
    except KeyError:
        # Unknown to the embedding model: fall back to a zero vector.
        emb = np.zeros(128)
    graph_embeddings_new.append(emb)

graph_embeddings_new = np.array(graph_embeddings_new)

# -------------------------------
# BUILD GRAPH FOR STRUCTURAL FEATURES
# -------------------------------
G_new = nx.Graph()
G_new.add_edges_from(
    zip(new_edge_list['source_user_id'], new_edge_list['target_user_id'])
)

deg_cent = nx.degree_centrality(G_new)
clust_coeff = nx.clustering(G_new)

# HITS authority as the global score, PageRank when HITS fails.
try:
    _, authorities = nx.hits(G_new, max_iter=100)
    hits_auth = authorities
except Exception:
    hits_auth = nx.pagerank(G_new)

# One three-step sequence per user; users absent from the graph get zeros.
metric_lookups = (deg_cent, clust_coeff, hits_auth)
temporal_array_new = np.array([
    [[lookup.get(user_id, 0.0)] for lookup in metric_lookups]
    for user_id in node_ids_new
])

print("\n" + "="*80)
print("RUNNING INFERENCE")
print("="*80)

graph_emb_tensor = torch.FloatTensor(graph_embeddings_new).to(device)
static_feat_tensor = torch.FloatTensor(X_static_new).to(device)
temporal_feat_tensor = torch.FloatTensor(temporal_array_new).to(device)

# Single forward pass over all users, with gradient tracking disabled.
with torch.no_grad():
    model_output = best_model(graph_emb_tensor, static_feat_tensor, temporal_feat_tensor)
    predictions_prob = model_output.cpu().numpy().flatten()

# Outputs of 0.5 and above count as positive predictions.
predictions = (predictions_prob >= 0.5).astype(int)

print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

results_df = pd.DataFrame({
    'user_id': node_ids_new,
    'predicted_label': predictions,
    'prediction_probability': predictions_prob
})

# Attach the original feature columns to each prediction row before writing.
results_df = results_df.merge(new_node_features, on='user_id', how='left')
results_df.to_csv('predictions_out_of_sample.csv', index=False)

print("Inference complete")
print(f"Total Nodes: {len(node_ids_new)}")
print(f"Predicted Influencers: {int(predictions.sum())}")
print(f"Prediction Rate: {predictions.mean():.2%}")
