### autoreload classes in case you change something in the files

In [None]:
%load_ext autoreload
%autoreload 2

# Imports

In [None]:
from sklearn.metrics import f1_score,recall_score,precision_score,confusion_matrix,accuracy_score

In [None]:
import pandas as pd
import numpy as np
from tqdm import trange
import pickle
import json
import sys
import time
from sklearn.model_selection import train_test_split

### Local imports

In [None]:
sys.path.append("classes")
from SLI_loss1 import LossCompute
from model import *
from networks import *
from tokenizer import *
from data_loader import *
from prototype import get_prototypes
from SLImodel import SLIModel

### Load data

In [None]:
# Load the log dataset and drop the "Unnamed: 0" column.
df = pd.read_csv("./github_logs_.csv").drop(columns=["Unnamed: 0"])

# Blank out the literal placeholder tokens "<*>", "[STR]" and "[NUM]".
# The patterns are raw strings (so "\*" / "\[" are not invalid string escapes)
# and regex=True is passed explicitly: pandas >= 2.0 defaults
# Series.str.replace to literal matching, under which backslash-escaped
# patterns would never match anything.
for placeholder_pattern in (r"<\*>", r"\[STR\]", r"\[NUM\]"):
    df['log_message'] = df['log_message'].str.replace(placeholder_pattern, " ", regex=True)


In [None]:
# df = pd.read_csv("./filtered_log_df.csv")
# Snapshot of the raw message and level columns as NumPy arrays.
# Both names are assigned again later in the notebook, after preprocessing.
load, labels = df['log_message'].values, df['log_level'].values
df.tail()

In [None]:
# Remove every character that is not an ASCII letter or a space, then collapse
# runs of whitespace into single spaces.
import re  # imported here because no visible cell imports it explicitly

regex = re.compile('[^a-zA-Z ]')
df['log_message'] = df['log_message'].apply(
    lambda x: ' '.join(regex.sub('', x).strip().split())
)

In [None]:
# df.loc[df['log_level']=='trace'] = 'debug'     
# df.loc[df['log_level']=='critical'] = 'error'
# df.loc[df['log_level']=='exception'] = 'debug'
# df.loc[df['log_level']=='fatal'] = 'error'
# df.loc[df['log_level']=='warn'] = 'debug'


In [None]:
# Merge synonymous log levels into canonical names.
# Only the 'log_level' column is rewritten: assigning through a row-only mask
# (df.loc[mask] = 'debug') overwrites EVERY column of the matching rows,
# including 'log_message', with the level string.
level_aliases = {
    'trace': 'debug',
    'critical': 'error',
    'exception': 'error',
    'fatal': 'error',
    'warn': 'warning',
}
df['log_level'] = df['log_level'].replace(level_aliases)


In [None]:
# Discard the debug / log / warning rows and renumber the remaining rows
# from 0 (the old index is materialised as "index" and then dropped).
excluded_levels = ['debug', 'log', 'warning']
df = (
    df[~df['log_level'].isin(excluded_levels)]
    .reset_index()
    .drop(columns="index")
)

In [None]:
np.unique(df.log_level)

In [None]:
def fcn(x):
    """Map a log level to the binary label used downstream.

    Returns "anomaly" when ``x`` equals "warning" or "error", and "normal"
    for every other value.
    """
    anomalous_levels = ("warning", "error")
    return "anomaly" if x in anomalous_levels else "normal"
# Replace every log level by its binary anomaly/normal label.
df.log_level = df.log_level.apply(fcn)
# df[df.log_level=='warning'].log_level = 'normal'
# df[df.log_level=='info'].log_level = 'normal'
# df[df.log_level=='error'].log_level = 'anomaly'
# normal class: every remaining level other than warning/error (e.g. info)
# anomaly class: warning, error (critical, exception and fatal were renamed to error above)

In [None]:
def _lowercase_tokens(text):
    # Lower-case each whitespace-separated token and re-join with single spaces.
    return " ".join([token.lower() for token in text.split()])


# Export the de-duplicated anomalous messages as a space-separated two-column file.
df1 = df.loc[:, ["log_level", "log_message"]]
df1.columns = ["t", "Content"]
is_anomaly = df1.t == "anomaly"
df1 = df1[is_anomaly].drop_duplicates()
df1.Content = df1.Content.apply(_lowercase_tokens)
df1.to_csv("anomalies_github.csv", sep=" ", index=False)

In [None]:
# {'debug': 0, 'error': 1, 'info': 2, 'log': 3, 'warning': 4}

In [None]:
# get only unique log messages
# df = df.drop_duplicates(subset=['log_message']).reset_index().drop(columns=['index'])

# Message texts and their (now binary) labels as NumPy arrays.
load, labels = df['log_message'].values, df['log_level'].values

### Class count

In [None]:
# Number of non-null log messages per class label, indexed by label.
class_count = df.groupby("log_level")["log_message"].count()
class_count

### Tokenize data

In [None]:
# Assign consecutive integer ids to the class labels, following the order of
# class_count's index.
label_mapper = {label: idx for idx, label in enumerate(class_count.index)}

In [None]:
label_mapper

In [None]:
tokenizer = LogTokenizer()
# tokenizer = LogTokenizer("tokenizer_SLI.json")

# Pull the messages out once as a plain array and index it by position.
# Looking up df['log_message'][i] inside the loop rebuilds the column Series
# on every iteration and is a label lookup, so it is only correct while the
# DataFrame index is exactly 0..n-1.
messages = df['log_message'].values
tokenized = [
    np.array(tokenizer.tokenize(messages[i]))
    for i in trange(len(messages))
]

# Integer class id for every row, in the same order as `tokenized`.
labels_tokenized = [label_mapper[label] for label in labels]
# labels_tokenized = pickle.load(open("tokenizer/tokenizer.pickle",'rb'))
# tokenizer = pickle.load(open("tokenizer_SLI.json",'rb'))

In [None]:
import os

# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs("tokenizer", exist_ok=True)

# Persist the fitted tokenizer and the integer-encoded labels.
with open("tokenizer/tokenizer_256.pickle", 'wb') as file:
    pickle.dump(tokenizer, file, pickle.HIGHEST_PROTOCOL)

with open("tokenizer/labels_tokenized_256.pickle", 'wb') as file:
    pickle.dump(labels_tokenized, file, pickle.HIGHEST_PROTOCOL)

# Prepare data

In [None]:
# 80/20 train/test split. No random_state is passed, so the partition is not
# fixed by this cell.
# NOTE(review): np.array(tokenized) assumes NumPy can pack the per-message
# token arrays into one array — confirm for variable-length sequences.
tokenized_array = np.array(tokenized)
label_array = np.array(labels_tokenized)
load_train, load_test, labels_train, labels_test = train_test_split(
    tokenized_array, label_array, train_size=0.8
)

# 

In [None]:
# Batch size and padding length handed to the project's loader factory.
pad_len = 50
batch_size = 2048

loaders = create_data_loaders(
    load_train, labels_train, load_test, labels_test, pad_len, batch_size
)
train_dataloader, test_dataloader = loaders

In [None]:
torch.cuda.empty_cache()

In [None]:
# Hyper-parameters read by the model-construction cell further down.
# Source vocabulary size is taken from the fitted tokenizer.
src_vocab = tokenizer.n_words
# NOTE(review): the model-construction cell passes the literal tgt_vocab=2
# rather than this value — confirm which one is intended.
tgt_vocab = 256
n_layers=2
in_features=256
out_features=256
num_heads=2
dropout=0.05
max_len=50

# CrossEntropy loss

In [None]:
def calculate_weights(x, i):
    """Return the inverse-frequency weight of the class at position ``i``.

    Computes ``sum(x) / (len(x) * x[i])``.

    Args:
        x: Per-class counts (list, NumPy array or pandas Series).
        i: Zero-based position of the class within ``x``.

    Returns:
        The weight as a NumPy scalar.
    """
    # Index positionally on a plain array: ``series[i]`` on a label-indexed
    # pandas Series is a label lookup (the integer-position fallback is
    # deprecated) and fails outright on an integer-labelled index.
    counts = np.asarray(x)
    return counts.sum() / (len(counts) * counts[i])

In [None]:
# Per-class weights, rescaled so that the largest weight equals 1.
raw_weights = [calculate_weights(class_count, idx) for idx in range(len(class_count))]
weights = np.asarray(raw_weights) / max(raw_weights)
class_weights = torch.FloatTensor(weights).cuda()

# Weighted cross-entropy criterion, placed on the GPU.
cross_entropoy_loss = nn.CrossEntropyLoss(weight=class_weights).cuda()

In [None]:
# Persist the normalised class weights. pickle.dump returns None, so its
# result is not bound to a name.
with open("./weights_class3.pickle", "wb") as file:
    pickle.dump(weights, file)

# Hyperspherical Prototype 

In [None]:
# Configuration dictionary handed to get_prototypes (imported from the local
# `prototype` module).
# NOTE(review): "classes" is 5 here, while the labels were reduced to
# anomaly/normal and the model cell uses tgt_vocab=2 — confirm.
conf = {
    "classes": 5,
    "dims": 16,
    "learning_rate": 0.01,
    "momentum": 0.9,
    "epochs": 1000,
    "seed": 300,

}
# The NumPy result of get_prototypes is converted to a float32 tensor.
prototypes = torch.from_numpy(get_prototypes(conf)).float()

# Cosine-similarity module; run_train / run_test take their prototype-based
# branch when an nn.CosineSimilarity instance is passed as f_loss.
cos_sim = nn.CosineSimilarity(eps=1e-9).cuda()

In [None]:
prototypes.shape

# Optimizers

In [None]:
def run_train(dataloader, model, optimizer, f_loss, epoch,polars=None):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Sized iterable yielding ``(load, y)`` tensor batches.
        model: Module invoked as ``model.forward(load, None)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        f_loss: Loss module. An ``nn.CosineSimilarity`` instance selects the
            objective ``sum((1 - f_loss(out, y)) ** 2)``; any other module is
            called as ``f_loss(out, y)`` with ``y`` cast to long.
        epoch: Epoch number, used only in the progress output.
        polars: Optional tensor indexed by label; when given, every label in
            ``y`` is replaced by the corresponding row of ``polars``.

    Returns:
        A detached 0-dim tensor holding the sum of the batch losses divided
        by ``len(dataloader)``; a zero tensor when the loader yields nothing.
    """
    model.train()

    # Run on whatever device the model lives on; a model without parameters
    # falls back to CUDA, which is what the hard-coded .cuda() calls did.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    total_loss = 0
    n_batches = 0
    last_step = 0
    loss = None
    for i, batch in enumerate(dataloader):
        load, y = batch
        if polars is not None:
            y = polars[y.numpy()].to(device)

        out = model.forward(load.to(device).long(), None)

        if isinstance(f_loss, nn.CosineSimilarity):
            loss = (1 - f_loss(out, y)).pow(2).sum()
        else:
            loss = f_loss(out, y.to(device).long())

        # Clear gradients before backward so stale gradients left over from
        # earlier code can never leak into the first update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a detached copy: summing graph-attached losses keeps the
        # autograd history of every batch reachable from the running total.
        total_loss += loss.detach()
        n_batches += 1
        last_step = i

        if i % 5 == 0:
            print("Epoch %d Train Step: %d / %d Loss: %f" %
                  (epoch, i, len(dataloader), loss.item()), end='\r')

    if n_batches == 0:
        # Nothing was trained; avoid unbound names and a division by zero.
        return torch.zeros(())

    print("Epoch %d Train Step: %d / %d Loss: %f" %
          (epoch, last_step, len(dataloader), loss.item()), end='\r')
    return total_loss / len(dataloader)



def run_test(dataloader, model, optimizer, f_loss, epoch, polars=None):
    """Predict a class index for every sample yielded by ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(load, y)`` batches; ``y`` is ignored.
        model: Module invoked as ``model.forward(load, None)``.
        optimizer: Unused; kept for signature compatibility.
        f_loss: When this is an ``nn.CosineSimilarity`` instance, the
            prediction is the index of the row of ``polars`` with the largest
            dot product against the L2-normalised output; otherwise it is
            the argmax over the output's second axis.
        epoch: Unused; kept for signature compatibility.
        polars: Prototype tensor, required for the cosine-similarity path.

    Returns:
        Flat list with one predicted class index per sample, in loader order.

    Raises:
        ValueError: If ``f_loss`` is an ``nn.CosineSimilarity`` and ``polars``
            is None.
    """
    model.eval()

    # Run on whatever device the model lives on; a model without parameters
    # falls back to CUDA, which is what the hard-coded .cuda() calls did.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    use_prototypes = isinstance(f_loss, nn.CosineSimilarity)
    polars_t = None
    if use_prototypes:
        if polars is None:
            raise ValueError("polars is required when f_loss is nn.CosineSimilarity")
        # Loop-invariant: transpose and move the prototypes once, not per batch.
        polars_t = polars.t().to(device)

    preds = []
    with torch.no_grad():
        for load, _ in dataloader:
            out = model.forward(load.to(device).long(), None)
            if use_prototypes:
                scores = torch.mm(F.normalize(out, p=2, dim=1), polars_t)
                batch_pred = scores.max(1)[1].detach().cpu().numpy()
            else:
                batch_pred = np.argmax(out.detach().cpu().numpy(), axis=1)
            preds += list(batch_pred)
    return preds



import time
def run_optimizer(model, train_dataloader,labels_test,optimizer,n_epochs,f_loss,polars,class_weights):
    """Train for ``n_epochs`` epochs, evaluating after each one.

    Each epoch calls ``run_train`` on ``train_dataloader`` and ``run_test`` on
    the module-level ``test_dataloader`` (it is not a parameter), prints
    accuracy and weighted F1 / recall / precision, and stores the confusion
    matrix.

    Args:
        model: Model being trained.
        train_dataloader: Loader passed to ``run_train``.
        labels_test: Ground-truth labels aligned with the test predictions.
        optimizer: Optimizer passed through to ``run_train`` / ``run_test``.
        n_epochs: Number of epochs to run.
        f_loss: Loss module passed through to ``run_train`` / ``run_test``.
        polars: Optional prototypes passed through to both helpers.
        class_weights: Unused; kept for signature compatibility.

    Returns:
        Tuple ``(model, preds, conf_matrix, train_time, test_time)``:
        ``preds`` holds the last epoch's predictions (empty when no epoch
        ran); the other three are per-epoch lists. Confusion matrices follow
        the scikit-learn convention (rows = true labels, columns =
        predictions).
    """
    conf_matrix = []
    train_time = []
    test_time = []
    preds = []  # defined up front so n_epochs == 0 still returns cleanly
    for epoch in range(1, 1 + n_epochs):
        print("Epoch", epoch)

        # `time` is the module here, so the clock is read via time.time().
        start_train_time = time.time()
        loss = run_train(train_dataloader, model, optimizer, f_loss, epoch, polars)
        train_time.append(time.time() - start_train_time)
        print("Epoch %d Train Loss: %f" % (epoch, loss), " " * 30)

        start_test_time = time.time()
        preds = run_test(test_dataloader, model, optimizer, f_loss, epoch, polars)
        test_time.append(time.time() - start_test_time)

        # scikit-learn metrics take (y_true, y_pred). Swapping them exchanges
        # precision and recall, changes the supports used for weighted
        # averaging, and transposes the confusion matrix.
        print(f"Accuracy:{round(accuracy_score(labels_test, preds),2)}")
        print(f"f1_score:{round(f1_score(labels_test, preds, average='weighted'),2)}")
        print(f"recall_score:{round(recall_score(labels_test, preds, average='weighted'),2)}")
        print(f"precision_score:{round(precision_score(labels_test, preds, average='weighted'),2)}")
        conf_matrix.append(confusion_matrix(labels_test, preds))
    return model, preds, conf_matrix, train_time, test_time

In [None]:
# `optimizers` is only created in the optimizer-setup cell further down (it
# needs the model to exist), so looking it up here would raise NameError on a
# top-to-bottom run. The optimizer is selected in that later cell instead.
n_epochs = 20
loss_f = cos_sim

In [None]:
# Build the network from the hyper-parameters defined above (two output
# classes), select GPU 0 and move the model onto it.
model_kwargs = dict(
    src_vocab=src_vocab,
    tgt_vocab=2,
    n_layers=n_layers,
    in_features=in_features,
    out_features=out_features,
    num_heads=num_heads,
    dropout=dropout,
    max_len=max_len,
)
model = SLIModel(**model_kwargs).get_model()
torch.cuda.set_device(0)
model.cuda()


In [None]:

# Optimisation hyper-parameters shared by both candidate optimizers.
learning_rate = 0.0001
decay = 0.001
betas = (0.9, 0.999)
momentum = 0.9

# Both optimizers are built over the same model parameters; only the one
# selected below is used for training.
sgd_opt = torch.optim.SGD(
    model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=decay
)
adam_opt = torch.optim.Adam(
    model.parameters(), lr=learning_rate, betas=betas, weight_decay=decay
)
optimizers = {"adam": adam_opt, "sgd": sgd_opt}

# Settings for the training run.
optimizer = optimizers['adam']
n_epochs = 50
loss_f = cross_entropoy_loss

In [None]:
# Train with the weighted cross-entropy loss; polars=None disables the
# prototype branch inside run_train / run_test.
training_result = run_optimizer(
    model,
    train_dataloader,
    labels_test,
    optimizer,
    n_epochs,
    cross_entropoy_loss,
    polars=None,
    class_weights=weights,
)
model, preds, conf_matrix, train_time, test_time = training_result

In [None]:
import pickle

model_weights_path = "./model_params_2classes_prototypes.pickle"
tokenizer_path = "./tokenizer_dict_model_params_model_params_2classes_prototypes.json"


def _pickle_to(path, obj, protocol=None):
    # Serialise obj to path; protocol=None selects pickle's default protocol.
    with open(path, 'wb') as file:
        pickle.dump(obj, file, protocol)


# The two generator.proj entries are removed from the dictionary that gets
# saved (a missing key is ignored).
state_dict = model.state_dict()
for head_key in ('generator.proj.bias', 'generator.proj.weight'):
    state_dict.pop(head_key, None)

_pickle_to(model_weights_path, state_dict, pickle.HIGHEST_PROTOCOL)

# Vocabulary mapping of the tokenizer, stored as JSON.
with open(tokenizer_path, 'w') as file:
    json.dump(tokenizer.word2index, file)

# Per-epoch wall-clock timings collected by run_optimizer.
_pickle_to("train_time256.pickle", train_time)
_pickle_to("test_time256.pickle", test_time)