In [None]:
try:

    import torch_geometric
except ImportError:
    !pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
# dagnn_recommender_comparison.py
# Сравнение 5 моделей: Popularity, Random Forest, GCN, DAGNN, GAT на графе вызовов сервисов
import json
import torch
import torch.nn.functional as F
import torch.nn as nn
import networkx as nx
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
from torch_geometric.nn import APPNP
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, ndcg_score

# ==================== DAGNN (встроенный) ====================
class DAGNN(nn.Module):
    def __init__(self, in_channels: int, K: int):
        super().__init__()
        self.propagation = APPNP(K=K, alpha=0)
        self.att = nn.Parameter(torch.Tensor(K + 1))
        self.reset_parameters()

    def reset_parameters(self):
        self.propagation.reset_parameters()
        nn.init.zeros_(self.att)

    def forward(self, x, edge_index):
        xs = [x]
        edge_weight = torch.ones(edge_index.size(1), dtype=torch.float32, device=edge_index.device)
        for _ in range(self.propagation.K):
            x = self.propagation.propagate(edge_index, x=x, edge_weight=edge_weight)
            xs.append(x)
        out = torch.stack(xs, dim=-1)
        out = (out * self.att.view(1, 1, -1)).sum(dim=-1)
        return out

# ==================== DAGNN Recommender ====================
class DAGNNRecommender(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, K=10):
        super().__init__()
        self.lin1 = nn.Linear(in_channels, hidden_channels)
        self.dagnn = DAGNN(hidden_channels, K)
        self.lin2 = nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.lin1(x))
        x = self.dagnn(x, edge_index)
        x = self.lin2(x)
        return x

# ==================== GAT ====================
class GATRecommender(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=2):
        super().__init__()
        self.gat1 = GATConv(in_channels, hidden_channels, heads=heads)
        self.gat2 = GATConv(hidden_channels * heads, out_channels, heads=1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.elu(self.gat1(x, edge_index))
        x = self.gat2(x, edge_index)
        return x


# ==================== GCN ====================
class GCNRecommender(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

# ==================== Утилита ====================
# Функция для оценки модели с nDCG
def evaluate_model_with_ndcg(preds, true_labels, proba_preds=None, name="Model", label_binarizer=None):
    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average='macro')
    precision = precision_score(true_labels, preds, average='macro')
    recall = recall_score(true_labels, preds, average='macro')

    # if proba_preds is not None and label_binarizer is not None:
    #     true_binarized = label_binarizer.transform(true_labels)
    #     ndcg = ndcg_score(true_binarized, proba_preds)
    # else:
    #     ndcg = None
    print(f"\n📊 {name} Metrics")
    print(f"Accuracy:  {acc:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    # if ndcg is not None:
    #     print(f"nDCG:      {ndcg:.4f}")
    # else:
    #     print("nDCG:      Not available")
    if proba_preds is not None:
        try:
            n_classes = proba_preds.shape[1]
            lb = LabelBinarizer()
            lb.fit(range(n_classes))
            true_bin = lb.transform(true_labels)

            # Если true_bin одномерный, превратим его в one-hot вручную
            if true_bin.ndim == 1:
                true_bin = np.eye(n_classes)[true_labels]

            ndcg = ndcg_score(true_bin, proba_preds)
            print(f"nDCG:      {ndcg:.4f}")
        except Exception as e:
            print(f"nDCG:      ❌ Error calculating nDCG: {e}")
    else:
        print("nDCG:      Not available (no probabilities)")

# ==================== Основной код ====================
# Загрузка графа из JSON
with open("compositionsDAG.json", "r", encoding="utf-8") as f:
    data = json.load(f)

dag = nx.DiGraph()
id_to_mid = {}

for composition in data:
    for node in composition["nodes"]:
        if "mid" in node:
            id_to_mid[str(node["id"])] = f"service_{node['mid']}"
        else:
            id_to_mid[str(node["id"])] = f"table_{node['id']}"

    for link in composition["links"]:
        source = str(link["source"])
        target = str(link["target"])
        src_node = id_to_mid[source]
        tgt_node = id_to_mid[target]
        dag.add_node(src_node, type='service' if src_node.startswith("service") else 'table')
        dag.add_node(tgt_node, type='service' if tgt_node.startswith("service") else 'table')
        dag.add_edge(src_node, tgt_node)

# Построение путей для тренировочной выборки
# Получаем все возможные пути в DAG длиной от 2 до N (контексты)
paths = []

for start_node in dag.nodes:
    if dag.out_degree(start_node) > 0:
        for path in nx.dfs_edges(dag, source=start_node):
            full_path = [path[0], path[1]]
            while dag.out_degree(full_path[-1]) > 0:
                next_nodes = list(dag.successors(full_path[-1]))
                if not next_nodes:
                    break
                full_path.append(next_nodes[0])  # можно улучшить: рассматривать все ветки
            if len(full_path) > 1:
                paths.append(full_path)

# Строим выборку: (контекст) -> (следующий mid)
X_raw = []
y_raw = []

for path in paths:
    for i in range(1, len(path) - 1):
        context = tuple(path[:i])  # контекст: предыдущие шаги
        next_step = path[i]
        if next_step.startswith("service"):
            X_raw.append(context)
            y_raw.append(next_step)

# Векторизация
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(X_raw)
le = LabelEncoder()
y = le.fit_transform(y_raw)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Подготовка бинализатора для nDCG (используем тестовую выборку)
lb = LabelBinarizer()
lb.fit(y_test)

# ==================== Model 1: Random Forest ====================
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_proba = rf.predict_proba(X_test)
# evaluate_model(rf_preds, y_test, name="Random Forest")
evaluate_model_with_ndcg(rf_preds, y_test, proba_preds=rf_proba, name="Random Forest", label_binarizer=lb)


# ==================== Model 2: Popularity ====================
counter = Counter(y_raw)
top_label = counter.most_common(1)[0][0]
pop_preds = [le.transform([top_label])[0]] * len(y_test)
# Симулируем вероятности: 1 для самого популярного класса, 0 для остальных
pop_proba = np.zeros_like(rf_proba)
top_label_index = le.transform([top_label])[0]
pop_proba[:, top_label_index] = 1
evaluate_model_with_ndcg(pop_preds, y_test, proba_preds=pop_proba, name="Popularity-Based", label_binarizer=lb)

# ==================== PyG подготовка ====================
node_list = list(dag.nodes)
node_encoder = LabelEncoder()
node_ids = node_encoder.fit_transform(node_list)
node_map = {node: idx for node, idx in zip(node_list, node_ids)}

edge_index = torch.tensor([[node_map[u], node_map[v]] for u, v in dag.edges], dtype=torch.long).t()
features = [[1, 0] if dag.nodes[n]['type'] == 'service' else [0, 1] for n in node_list]
x = torch.tensor(features, dtype=torch.float)
data_pyg = Data(x=x, edge_index=edge_index)

contexts = torch.tensor([node_map[context[-1]] for context in X_raw], dtype=torch.long)
targets = torch.tensor([node_map[y] for y in y_raw], dtype=torch.long)

lb = LabelBinarizer()
lb.fit(targets.numpy())  # gcn_true = targets.numpy()

# ==================== Model 3: GCN ====================
gcn = GCNRecommender(in_channels=2, hidden_channels=16, out_channels=len(node_list))
opt = torch.optim.Adam(gcn.parameters(), lr=0.01)

for epoch in range(100):
    # gcn.train()
    # opt.zero_grad()
    # out = gcn(data_pyg)[contexts]
    # loss = F.cross_entropy(out, targets)
    # loss.backward()
    # opt.step()

    gcn.train()
    opt.zero_grad()
    out = gcn(data_pyg)[contexts]
    loss = torch.nn.functional.cross_entropy(out, targets)
    loss.backward()
    opt.step()


gcn.eval()
with torch.no_grad():
    gcn_output = gcn(data_pyg)[contexts]
    gcn_preds = gcn_output.argmax(dim=1).numpy()
    gcn_proba = torch.nn.functional.softmax(gcn_output, dim=1).numpy()
    gcn_true = targets.numpy()

    evaluate_model_with_ndcg(gcn_preds, gcn_true, proba_preds=gcn_proba, name="GCN", label_binarizer=lb)

# ==================== Model 4: DAGNN ====================
dagnn = DAGNNRecommender(in_channels=2, hidden_channels=16, out_channels=len(node_list))
opt = torch.optim.Adam(dagnn.parameters(), lr=0.01)

for epoch in range(100):
    dagnn.train()
    opt.zero_grad()
    out = dagnn(data_pyg.x, data_pyg.edge_index)[contexts]
    loss = F.cross_entropy(out, targets)
    loss.backward()
    opt.step()

dagnn.eval()
with torch.no_grad():
    dagnn_output = dagnn(data_pyg.x, data_pyg.edge_index)[contexts]
    dagnn_preds = dagnn_output.argmax(dim=1).numpy()
    dagnn_proba = torch.nn.functional.softmax(dagnn_output, dim=1).numpy()
    evaluate_model_with_ndcg(dagnn_preds, gcn_true, proba_preds=dagnn_proba, name="DAGNN", label_binarizer=lb)

# ==================== Model 5: GAT ====================
gat = GATRecommender(in_channels=2, hidden_channels=16, out_channels=len(node_list), heads=2)
opt = torch.optim.Adam(gat.parameters(), lr=0.01)

for epoch in range(100):
    gat.train()
    opt.zero_grad()
    out = gat(data_pyg)[contexts]
    loss = F.cross_entropy(out, targets)
    loss.backward()
    opt.step()

gat.eval()
with torch.no_grad():
    gat_output = gat(data_pyg)[contexts]
    gat_preds = gat_output.argmax(dim=1).numpy()
    gat_proba = torch.nn.functional.softmax(gat_output, dim=1).numpy()
    evaluate_model_with_ndcg(gat_preds, gcn_true, proba_preds=gat_proba, name="GAT", label_binarizer=lb)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



📊 Random Forest Metrics
Accuracy:  0.4545
F1-score:  0.3500
Precision: 0.3125
Recall:    0.5000
nDCG:      0.7590

📊 Popularity-Based Metrics
Accuracy:  0.2424
F1-score:  0.0976
Precision: 0.0606
Recall:    0.2500
nDCG:      0.6368


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



📊 GCN Metrics
Accuracy:  0.4953
F1-score:  0.3750
Precision: 0.3333
Recall:    0.5000
nDCG:      0.7807


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



📊 DAGNN Metrics
Accuracy:  0.4206
F1-score:  0.3209
Precision: 0.3258
Recall:    0.4231
nDCG:      0.7381

📊 GAT Metrics
Accuracy:  0.3271
F1-score:  0.2248
Precision: 0.3182
Recall:    0.3269
nDCG:      0.6849


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
