In [1]:
import ast
import numpy as np
import pandas as pd
import networkx as nx
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_auc_score,
                             precision_recall_curve)
from sklearn.preprocessing import StandardScaler

from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GINConv

In [2]:
def create_message_graph(file_path):
    df = pd.read_csv(file_path)
    df['group_list'] = df['group_list'].apply(ast.literal_eval)
    
    group_dict = {}

    for _, row in df.iterrows():
        user_id = row['id']
        for group in row['group_list']:
            for group_id, message_count in group.items():
                group_dict.setdefault(group_id, []).append((user_id, message_count))
    
    G = nx.DiGraph()

    for _, row in df.iterrows():
        node_id = row['id']
        attributes = row.drop('group_list').to_dict()
        G.add_node(node_id, **attributes)

    return G

In [3]:
def extract_node_features(G):
    features = []
    for node in G.nodes():
        node_data = G.nodes[node]
        vector = [
            node_data.get('groups'),
            node_data.get('number_of_messages'),
            node_data.get('texts'),
            node_data.get('text_ratio'),
            node_data.get('midia'),
            node_data.get('midia_ratio'), 
            node_data.get('virals'),
            node_data.get('repeated_messages'),
            node_data.get('strenght'),
            node_data.get('viral_strenght')
        ]
        features.append(vector)
    return features

In [4]:
csv_file_path = 'users_selected_features.csv'
graph = create_message_graph(csv_file_path)

print(f'Number of nodes: {graph.number_of_nodes()}')
print(f'Number of edges: {graph.number_of_edges()}')

Number of nodes: 5364
Number of edges: 0


In [5]:
pyg_data = from_networkx(graph)
node_features = extract_node_features(graph)
scaler = StandardScaler()

X_normalized = scaler.fit_transform(node_features)
X = torch.tensor(X_normalized, dtype=torch.float)

labels = torch.tensor([graph.nodes[node]['disinformer'] for node in graph.nodes()],
                      dtype=torch.long)

data = Data(x=X, edge_index=pyg_data.edge_index, y=labels)
train_indices, test_indices = train_test_split(range(data.num_nodes),
                                                test_size=0.3,
                                                random_state=42)

data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.train_mask[train_indices] = True

data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.test_mask[test_indices] = True

In [6]:
class GINNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super(GINNet, self).__init__()

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        self.conv = GINConv(self.mlp)
        self.dropout = nn.Dropout(p=dropout)
        self.lin = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)

        x = self.lin(x)

        return F.log_softmax(x, dim=1)

input_dim = X.size(1)
hidden_dim = 32 
output_dim = 2 
dropout_rate = 0.3

model = GINNet(input_dim, hidden_dim, output_dim, dropout=dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-6)

In [7]:
class_counts = torch.bincount(data.y)
total_samples = len(data.y)
class_weights = total_samples / (len(class_counts) * class_counts.float())

criterion = torch.nn.NLLLoss(weight=class_weights.to(data.y.device))

In [8]:
def train():
    model.train()
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test():
    model.eval()
    with torch.no_grad():
        output = model(data)
        predictions = output.argmax(dim=1)
        correct = predictions[data.test_mask].eq(data.y[data.test_mask]).sum().item()
        accuracy = correct / data.test_mask.sum().item()
    return accuracy

In [9]:
for epoch in range(2000):
    loss = train()

    if epoch % 100 == 0:
        accuracy = test()
        print(f'\nEpoch {epoch}, Loss: {loss:.10f}')
        print(f'Accuracy: {accuracy:.5f}')

    if loss <= 0.005 and test() >= 0.998:
        print(f'\nStop: Epoch {epoch}, Loss: {loss:.10f}')
        print(f'Accuracy: {test():.5f}')
        break


Epoch 0, Loss: 0.6832715869
Accuracy: 0.68571

Epoch 100, Loss: 0.0145142609
Accuracy: 0.99441

Epoch 200, Loss: 0.0078008100
Accuracy: 0.99689

Epoch 300, Loss: 0.0039995462
Accuracy: 0.99752

Stop: Epoch 380, Loss: 0.0027193627
Accuracy: 0.99814


In [10]:
def find_best_threshold(y_true, positive_probs):
    precision, recall, thresholds = precision_recall_curve(y_true, positive_probs)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)

    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx]
    
    return best_threshold

In [11]:
def evaluate_model(y_true, y_pred_prob, nodes, mask=None, dataset_name="Dataset"):
    if mask is not None:
        if isinstance(mask, torch.Tensor):
            mask = mask.cpu().numpy()
        y_true = y_true[mask]
        y_pred_prob = y_pred_prob[mask]
        nodes = [node for idx, node in enumerate(nodes) if mask[idx]]

    best_threshold = find_best_threshold(y_true, y_pred_prob)
    y_pred = (y_pred_prob >= best_threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_pred_prob)

    conf_matrix = confusion_matrix(y_true, y_pred)
    fp = conf_matrix[0][1] if conf_matrix.shape[0] > 1 else 0
    fn = conf_matrix[1][0] if conf_matrix.shape[0] > 1 else 0

    misinformers = [node for node, label in zip(nodes, y_pred) if label == 1]
    total_misinformers = sum(y_true)
    true_positives = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    percentage_identified = (true_positives / total_misinformers * 100) if total_misinformers > 0 else 0.0

    print(f"\nEvaluation on {dataset_name}:")
    print(f"  Accuracy: {accuracy:.6f}")
    print(f"  Precision: {precision:.6f}")
    print(f"  Recall: {recall:.6f}")
    print(f"  F1 Score: {f1:.6f}")
    print(f"  AUC: {auc:.6f}")
    print(f"  False Positives: {fp}")
    print(f"  False Negatives: {fn}")
    print(f"  Number of predicted misinformers ({dataset_name.lower()}): {len(misinformers)}")
    print(f"  Percentage of misinformers identified ({dataset_name.lower()}): {percentage_identified:.5f}%")

In [12]:
model.eval()
with torch.no_grad():
    output = model(data)
    y_prob = torch.exp(output)[:, 1].cpu().numpy()
    y_true = data.y.cpu().numpy()
    node_list = list(graph.nodes())

evaluate_model(
    y_true=y_true,
    y_pred_prob=y_prob,
    nodes=node_list,
    mask=data.test_mask,
    dataset_name="Test Set"
)

evaluate_model(
    y_true=y_true,
    y_pred_prob=y_prob,
    nodes=node_list,
    mask=None,
    dataset_name="Complete Set"
)


Evaluation on Test Set:
  Accuracy: 0.999379
  Precision: 1.000000
  Recall: 0.976190
  F1 Score: 0.987952
  AUC: 0.998132
  False Positives: 0
  False Negatives: 1
  Number of predicted misinformers (test set): 41
  Percentage of misinformers identified (test set): 97.61905%

Evaluation on Complete Set:
  Accuracy: 0.999627
  Precision: 0.992424
  Recall: 0.992424
  F1 Score: 0.992424
  AUC: 0.999440
  False Positives: 1
  False Negatives: 1
  Number of predicted misinformers (complete set): 132
  Percentage of misinformers identified (complete set): 99.24242%
