In [24]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from log_to_df import logs_to_dataframe
from sklearn.model_selection import train_test_split

In [20]:
# Загружаем данные
df = pd.read_csv("../Test Cases/TestCase 1/anomalies_problems.csv",sep=';')  # должно быть две колонки: "Аномалия" и "Проблема"

In [22]:
# Формируем положительные пары
positive_pairs = [InputExample(texts=[row["Аномалия"], row["Проблема"]], label=1.0) for _, row in df.iterrows()]

# Формируем отрицательные пары (аномалия + случайная проблема)
negative_pairs = []
for _, row in df.iterrows():
    problem = np.random.choice(df["Проблема"].values)
    negative_pairs.append(InputExample(texts=[row["Аномалия"], problem], label=0.0))

In [23]:
# Объединяем
train_examples = positive_pairs + negative_pairs
train_data, val_data = train_test_split(train_examples, test_size=0.1, random_state=42)

# Загружаем модель
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Настраиваем обучение
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

# Обучаем
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    show_progress_bar=True
)

# Сохраняем модель
model.save("anomaly_problem_matcher")



Step,Training Loss


In [25]:
VC3 = logs_to_dataframe('../Validation_Cases/ValidationCase_3')
VC3.head(10)

Unnamed: 0,datetime,level,source,text,filename,line_number
0,2025-10-06T00:24:39,WARNING,network,Performance degradation detected,app_server1_log.txt,1480
1,2025-10-06T08:59:32,ERROR,os,Host overheating alert,app_server1_log.txt,32373
2,2025-10-06T08:36:41,WARNING,os,Overheating alert in host,app_server2_log.txt,31002
3,2025-10-06T09:16:28,WARNING,os,Overheating alert in host,app_server2_log.txt,33389
4,2025-10-06T08:51:26,WARNING,network,Slow virtual network performance,backup_server_log.txt,31887
5,2025-10-06T02:25:21,WARNING,network,User-facing VM network issue,firewall_log.txt,8722
6,2025-10-06T11:41:12,ERROR,network,Virtual network misconfiguration,switch1_log.txt,42073


In [32]:
anomaly = []
problems = []
for index, row in VC3.iterrows():
    if row.level == 'WARNING':
        anomaly.append(row.text)
    elif row.level == 'ERROR':
        problems.append(row.text)

print(anomaly)
print(problems)

['Performance degradation detected', 'Overheating alert in host', 'Overheating alert in host', 'Slow virtual network performance', 'User-facing VM network issue']
['Host overheating alert', 'Virtual network misconfiguration']


In [33]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer("anomaly_problem_matcher")

# Считаем сходство
emb_anomaly = model.encode(anomaly[0], convert_to_tensor=True)
emb_problems = model.encode(problems, convert_to_tensor=True)
cosine_scores = util.cos_sim(emb_anomaly, emb_problems)[0]

# Находим наиболее похожую проблему
best_idx = np.argmax(cosine_scores)
print(f"Наиболее вероятная проблема: {problems[best_idx]}")
print(f"Сходство: {cosine_scores[best_idx].item():.4f}")


Наиболее вероятная проблема: Host overheating alert
Сходство: 0.3144


  return conv.wrap(result, to_scalar=False)
