In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [6]:
# Load the incident event log (replace this block with your own event-log export).
df = pd.read_csv("incident.csv")
df = df.drop(columns=["number"])

# Missing values become empty strings so they contribute nothing to the joined text.
df = df.fillna('')

# Concatenate every column of a row into one space-separated string.
# Cast to str first: ' '.join raises TypeError on any non-string cell
# (e.g. a numeric column), which fillna('') does not convert.
df['combined_text'] = df.astype(str).agg(' '.join, axis=1)

In [7]:
# Turn the combined text into numeric TF-IDF features (vocabulary capped at 1000 terms).
vectorizer = TfidfVectorizer(max_features=1000)
corpus = df["combined_text"]
X = vectorizer.fit_transform(corpus)

In [44]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split repeatable.
X_train, X_test = train_test_split(
    X,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Custom PyTorch Dataset over a 2-D feature matrix
class MyDataset(Dataset):
    """Dataset that serves the rows of a feature matrix as float32 tensors.

    Args:
        X: 2-D feature matrix. Either an object exposing ``toarray()`` (such as
            a SciPy sparse matrix) or an already-dense array-like (NumPy array,
            nested list). The whole matrix is converted to one dense tensor
            up front.
    """

    def __init__(self, X):
        # Sparse matrices must be densified before torch.tensor can read them;
        # dense inputs are passed through unchanged.
        dense = X.toarray() if hasattr(X, "toarray") else X
        self.X = torch.tensor(dense, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the feature row at position ``idx``."""
        return self.X[idx]

# Wrap the training split and serve it in shuffled mini-batches of 2 rows.
# NOTE(review): a batch size of 2 means very many optimizer steps per epoch on a
# large log — confirm this is intentional.
train_dataset = MyDataset(X_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

In [4]:
# Autoencoder definition
class Autoencoder(nn.Module):
    """Fully connected autoencoder: input -> hidden -> latent -> hidden -> input.

    Args:
        input_dim: Number of input features. When omitted, falls back to
            ``X.shape[1]`` of the notebook-level feature matrix ``X`` (the
            original behaviour); ``X`` must then exist when the model is built.
        hidden_dim: Width of the intermediate layers (default 64).
        latent_dim: Width of the bottleneck layer (default 32).
    """

    def __init__(self, input_dim=None, hidden_dim=64, latent_dim=32):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the model from the global X.
            input_dim = X.shape[1]
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            # Sigmoid bounds every reconstructed feature to (0, 1).
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Encode then decode ``x``; the last dimension of the output equals ``input_dim``."""
        return self.decoder(self.encoder(x))

In [45]:
# Seed torch's global RNG so weight initialisation and DataLoader shuffling are
# repeatable across "Restart & Run All" (same seed value as the train/test split).
torch.manual_seed(42)

# Model, loss function and optimizer
model = Autoencoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the autoencoder to reconstruct its own input
num_epochs = 10
model.train()  # explicit: make sure the module is in training mode
for epoch in range(num_epochs):
    print("Epoch", epoch + 1, "/", num_epochs)
    for inputs in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # Reconstruction loss: the target is the input itself.
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()

Epoch 1 / 10
Epoch 2 / 10
Epoch 3 / 10
Epoch 4 / 10
Epoch 5 / 10
Epoch 6 / 10
Epoch 7 / 10
Epoch 8 / 10
Epoch 9 / 10
Epoch 10 / 10


In [46]:
# Persist only the learned parameters (state_dict), not the module object itself.
trained_weights = model.state_dict()
torch.save(trained_weights, "anomaly_detection_model.pt")

In [8]:
# Rebuild the architecture, then restore the saved weights into it.
loaded_model = Autoencoder()
saved_state = torch.load('anomaly_detection_model.pt')
loaded_model.load_state_dict(saved_state)
# eval() switches to inference mode and returns the module, so keeping it as the
# last expression also displays the layer summary.
loaded_model.eval()

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=93, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=93, bias=True)
    (3): Sigmoid()
  )
)

In [17]:
# Per-row reconstruction error over the full feature matrix X, with autograd disabled.
with torch.no_grad():
    test_inputs = torch.tensor(X.toarray(), dtype=torch.float32)
    test_outputs = loaded_model(test_inputs)
    elementwise_error = nn.functional.mse_loss(test_outputs, test_inputs, reduction='none')
    # Average over the feature axis -> one error value per row.
    mse_loss = elementwise_error.mean(dim=1)

# Rows whose mean squared error exceeds the threshold are labelled 1, all others 0.
threshold = 0.005
anomaly_labels = torch.gt(mse_loss, threshold).int().tolist()

In [18]:
# Attach the anomaly flags to the original columns (helper text column dropped).
df_new = df.drop(columns=["combined_text"])
# anomaly_labels is positional (one entry per row of df), while concat(axis=1)
# aligns on index labels. Give the Series df's own index so rows stay matched
# even when that index is not the default 0..n-1; a length mismatch then raises
# instead of silently producing NaNs.
anomaly_column = pd.Series(anomaly_labels, index=df.index, name='anomaly')
df_new = pd.concat([df_new, anomaly_column], axis=1)
df_new.head(10)

Unnamed: 0,sys_created_on,sys_created_by,opened_at,resolved_at,reopened_time,activity_due,closed_at,closed_by,due_date,sla_due,contact_type,category,urgency,short_description,priority,state,escalation,anomaly
0,2020-01-01 02:22:21,UserA,2020-03-26 01:40:07,2020-06-06 13:34:48,,,,UserC,2020-05-17 22:15:02,,walk-in,Help,1 High,This is a description,2 - High,In Progress,Moderate,0
1,2020-02-28 11:44:42,UserA,2020-03-09 05:35:03,,,2020-04-13 09:16:21,,UserA,,,phone,Software,1 High,This is a description,3 - Moderate,In Progress,New,0
2,2020-01-12 18:57:25,UserG,2020-03-27 04:05:08,,,2020-03-25 00:06:29,,UserB,,2020-06-05 07:38:10,phone,Hardware,2 Medium,This is a description,4 - Low,Normal,,0
3,2020-01-08 06:31:25,UserB,2020-02-24 16:16:52,,,,,UserB,,,phone,Software,3 Low,This is a description,4 - Low,On Hold,Overdue,0
4,2020-02-23 03:59:31,UserA,2020-02-18 21:54:20,,,2020-03-26 09:29:55,,UserA,,,email,Network,2 Medium,This is a description,4 - Low,On Hold,Normal,0
5,2020-02-23 09:51:05,UserC,2020-03-14 18:53:33,2020-06-11 08:58:36,,2020-03-27 23:57:19,,UserA,2020-04-15 05:43:36,,email,Help,2 Medium,This is a description,2 - High,Normal,,0
6,2020-02-22 17:19:40,UserA,2020-03-16 00:02:51,,,,,UserB,,,email,Help,2 Medium,This is a description,3 - Moderate,In Progress,New,0
7,2020-02-11 10:17:29,UserA,2020-03-23 07:14:50,,,,,UserA,,2020-05-28 17:19:31,email,Hardware,3 Low,This is a description,3 - Moderate,In Progress,New,0
8,2020-02-15 18:32:18,UserB,2020-02-04 09:11:22,,,,2020-09-22 23:23:42,UserC,,,email,Software,2 Medium,This is a description,3 - Moderate,On Hold,Normal,0
9,2020-02-05 02:03:09,UserE,2020-03-13 22:33:06,,,2020-04-17 13:56:34,,UserB,,,walk-in,Smartphone,2 Medium,This is a description,2 - High,In Progress,Normal,0


In [19]:
# Keep only the rows that were not flagged as anomalous.
is_normal = df_new['anomaly'].eq(0)
df_filtered = df_new.loc[is_normal]
df_filtered

Unnamed: 0,sys_created_on,sys_created_by,opened_at,resolved_at,reopened_time,activity_due,closed_at,closed_by,due_date,sla_due,contact_type,category,urgency,short_description,priority,state,escalation,anomaly
0,2020-01-01 02:22:21,UserA,2020-03-26 01:40:07,2020-06-06 13:34:48,,,,UserC,2020-05-17 22:15:02,,walk-in,Help,1 High,This is a description,2 - High,In Progress,Moderate,0
1,2020-02-28 11:44:42,UserA,2020-03-09 05:35:03,,,2020-04-13 09:16:21,,UserA,,,phone,Software,1 High,This is a description,3 - Moderate,In Progress,New,0
2,2020-01-12 18:57:25,UserG,2020-03-27 04:05:08,,,2020-03-25 00:06:29,,UserB,,2020-06-05 07:38:10,phone,Hardware,2 Medium,This is a description,4 - Low,Normal,,0
3,2020-01-08 06:31:25,UserB,2020-02-24 16:16:52,,,,,UserB,,,phone,Software,3 Low,This is a description,4 - Low,On Hold,Overdue,0
4,2020-02-23 03:59:31,UserA,2020-02-18 21:54:20,,,2020-03-26 09:29:55,,UserA,,,email,Network,2 Medium,This is a description,4 - Low,On Hold,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2020-02-08 12:12:33,UserF,2020-02-19 03:13:07,,,2020-04-08 08:03:22,,UserD,2020-05-23 05:10:17,,email,Software,1 High,This is a description,5 - Planning,,,0
49996,2020-01-22 05:56:29,UserA,2020-03-14 21:06:57,,,,,UserA,2020-05-13 03:43:58,,email,Smartphone,2 Medium,This is a description,3 - Moderate,,High,0
49997,2020-02-09 22:35:22,UserB,2020-02-10 10:59:58,,,2020-03-04 02:09:01,,UserB,,,phone,Hardware,3 Low,This is a description,5 - Planning,Normal,Normal,0
49998,2020-02-21 12:26:55,UserE,2020-03-25 16:30:28,,,,,UserB,,,walk-in,Network,2 Medium,This is a description,5 - Planning,Closed,Overdue,0


In [69]:
# Write the non-anomalous rows to disk (pandas' default also writes the index column).
df_filtered.to_csv(path_or_buf="incident_filtered.csv")