# Recreate the original sessions and add the predictions the model gave

In [1]:
import pandas as pd

### Importing predictions
#### Remember: we made a prediction for every WORD

In [2]:
# Load the prediction table; per the heading above, it holds one row per WORD.
predictions = pd.read_csv("./predictions.csv")
print(f"We have {len(predictions)} predictions")
predictions.head(2)

We have 17231851 predictions


Unnamed: 0,Predictions,Session_ids,Logits,new_predictions
0,1,0,6.916717,1
1,1,0,6.848216,1


In [3]:
# Number of distinct session ids covered by the word-level predictions.
n_sessions = predictions["Session_ids"].nunique()
print(f"We analysed {n_sessions} sessions")

We analysed 160562 sessions


### Import labels

In [4]:
# Read one label per line; the position of a label in the file is its numeric id.
with open("../Dataset/Training/Supervised/labels.txt", "r") as labels_file:
    labels = [line.strip() for line in labels_file]
id2label = dict(enumerate(labels))
# Add "No Prediction" as an extra class.
# NOTE(review): id 10 is hard-coded; this presumably assumes labels.txt holds
# exactly 10 labels (ids 0-9), otherwise an existing label is overwritten -- confirm.
id2label[10] = "No Prediction"
# Reverse lookup: label name -> numeric id.
label2id = {label: label_id for label_id, label in id2label.items()}

### Convert predictions_id to labels

In [5]:
from tqdm import tqdm

# Register `progress_apply` on pandas objects so long row-wise maps show a progress bar.
tqdm.pandas()

# Translate each id in `new_predictions` into its label name (KeyError on an unknown id).
new_prediction_ids = predictions["new_predictions"]
predictions["Predicted_classes"] = new_prediction_ids.progress_apply(
    lambda label_id: id2label[label_id]
)
predictions.head(2)

100%|██████████| 17231851/17231851 [00:10<00:00, 1695679.17it/s]


Unnamed: 0,Predictions,Session_ids,Logits,new_predictions,Predicted_classes
0,1,0,6.916717,1,Discovery
1,1,0,6.848216,1,Discovery


In [6]:
# Same id -> label translation, this time for the `Predictions` column.
raw_prediction_ids = predictions["Predictions"]
predictions["Models_predictions"] = raw_prediction_ids.progress_apply(
    lambda label_id: id2label[label_id]
)
predictions.head(2)

100%|██████████| 17231851/17231851 [00:10<00:00, 1662437.37it/s]


Unnamed: 0,Predictions,Session_ids,Logits,new_predictions,Predicted_classes,Models_predictions
0,1,0,6.916717,1,Discovery,Discovery
1,1,0,6.848216,1,Discovery,Discovery


### Normalize logits

In [7]:
# Scaling constants saved next to the trained model; only the first row is used.
df_min_max = pd.read_csv("../Training/Trained_Model/min_max_scaler.csv")
min_training_logits, max_training_logits = (
    df_min_max[column].iloc[0]
    for column in ("min_training_logits", "max_training_logits")
)

# Shift every logit by the training minimum and divide by the training maximum.
# NOTE(review): conventional min-max scaling divides by (max - min); here the
# divisor is max alone. Confirm this matches how the scaler values were produced
# at training time before changing it.
predictions["Normalized_logits"] = predictions["Logits"].progress_apply(
    lambda raw_logit: (raw_logit - min_training_logits) / max_training_logits
)
predictions.head(2)

100%|██████████| 17231851/17231851 [00:14<00:00, 1192078.50it/s]


Unnamed: 0,Predictions,Session_ids,Logits,new_predictions,Predicted_classes,Models_predictions,Normalized_logits
0,1,0,6.916717,1,Discovery,Discovery,0.923465
1,1,0,6.848216,1,Discovery,Discovery,0.913597


### Importing Corpus

In [8]:
# Load the original sessions (one row per session).
corpus = pd.read_csv("./original_sessions.csv")
print(f"Corpus contains {len(corpus)} sessions")
corpus.head(2)

Corpus contains 160562 sessions


Unnamed: 0,session_id,full_session,source_ips,first_timestamp,date
0,0,enable ; system ; shell ; sh ; cat /proc/mount...,213.6.160.50,2021-03-06 00:01:13,2021-03-06
1,1,enable ; system ; shell ; sh ; cat /proc/mount...,213.94.56.23,2021-03-06 00:02:55,2021-03-06


### Reconstruct the sessions from the prediction

In [9]:
# Gather the word-level `Predicted_classes` of each session into one list per session.
classes_by_session = predictions.groupby("Session_ids")["Predicted_classes"]
sessions_predictions = classes_by_session.progress_apply(list).reset_index()
sessions_predictions.head(2)

100%|██████████| 160562/160562 [00:03<00:00, 42591.45it/s]


Unnamed: 0,Session_ids,Predicted_classes
0,0,"[Discovery, Discovery, Discovery, Discovery, D..."
1,1,"[Discovery, Discovery, Discovery, Discovery, D..."


#### Also save the model's predictions

In [10]:
# Gather the word-level `Models_predictions` of each session into one list per session.
model_labels_by_session = predictions.groupby("Session_ids")["Models_predictions"]
sessions_model_predictions = model_labels_by_session.progress_apply(list).reset_index()
sessions_model_predictions.head(2)

100%|██████████| 160562/160562 [00:03<00:00, 40511.50it/s]


Unnamed: 0,Session_ids,Models_predictions
0,0,"[Discovery, Discovery, Discovery, Discovery, D..."
1,1,"[Discovery, Discovery, Discovery, Discovery, D..."


### Also save how confident the model is in those predictions

In [11]:
# Gather the word-level `Normalized_logits` of each session into one list per session.
logits_by_session = predictions.groupby("Session_ids")["Normalized_logits"]
sessions_logits = logits_by_session.progress_apply(list).reset_index()
sessions_logits.head(2)

100%|██████████| 160562/160562 [00:04<00:00, 37391.75it/s]


Unnamed: 0,Session_ids,Normalized_logits
0,0,"[0.9234652890281074, 0.9135971909033552, 0.923..."
1,1,"[0.9552223095097276, 0.9466695868669596, 0.955..."


### Merge the 3 datasets

In [12]:
# Join the three per-session tables on the session id, then rename the key so it
# matches the `session_id` column of the original corpus.
sessions_info = (
    sessions_predictions
    .merge(sessions_logits, on="Session_ids")
    .merge(sessions_model_predictions, on="Session_ids")
    .rename(columns={"Session_ids": "session_id"})
)
sessions_info.head(2)

Unnamed: 0,session_id,Predicted_classes,Normalized_logits,Models_predictions
0,0,"[Discovery, Discovery, Discovery, Discovery, D...","[0.9234652890281074, 0.9135971909033552, 0.923...","[Discovery, Discovery, Discovery, Discovery, D..."
1,1,"[Discovery, Discovery, Discovery, Discovery, D...","[0.9552223095097276, 0.9466695868669596, 0.955...","[Discovery, Discovery, Discovery, Discovery, D..."


### Finally, merge with the original corpus

In [13]:
# Attach the per-session prediction lists to the original sessions.
# `merge` defaults to an inner join, so only ids present in both tables are kept.
merged_corpus = pd.merge(corpus, sessions_info, on="session_id")
merged_corpus.head(2)

Unnamed: 0,session_id,full_session,source_ips,first_timestamp,date,Predicted_classes,Normalized_logits,Models_predictions
0,0,enable ; system ; shell ; sh ; cat /proc/mount...,213.6.160.50,2021-03-06 00:01:13,2021-03-06,"[Discovery, Discovery, Discovery, Discovery, D...","[0.9234652890281074, 0.9135971909033552, 0.923...","[Discovery, Discovery, Discovery, Discovery, D..."
1,1,enable ; system ; shell ; sh ; cat /proc/mount...,213.94.56.23,2021-03-06 00:02:55,2021-03-06,"[Discovery, Discovery, Discovery, Discovery, D...","[0.9552223095097276, 0.9466695868669596, 0.955...","[Discovery, Discovery, Discovery, Discovery, D..."


#### Adjust sessions to eliminate extra spaces

In [14]:
def _collapse_spaces(session):
    """Split on single spaces, strip each piece, drop empty pieces and re-join with one space."""
    pieces = (piece.strip() for piece in session.split(" "))
    return " ".join(piece for piece in pieces if piece != "")


merged_corpus["full_session"] = merged_corpus["full_session"].progress_apply(_collapse_spaces)
merged_corpus.head(2)

100%|██████████| 160562/160562 [00:02<00:00, 72993.33it/s]


Unnamed: 0,session_id,full_session,source_ips,first_timestamp,date,Predicted_classes,Normalized_logits,Models_predictions
0,0,enable ; system ; shell ; sh ; cat /proc/mount...,213.6.160.50,2021-03-06 00:01:13,2021-03-06,"[Discovery, Discovery, Discovery, Discovery, D...","[0.9234652890281074, 0.9135971909033552, 0.923...","[Discovery, Discovery, Discovery, Discovery, D..."
1,1,enable ; system ; shell ; sh ; cat /proc/mount...,213.94.56.23,2021-03-06 00:02:55,2021-03-06,"[Discovery, Discovery, Discovery, Discovery, D...","[0.9552223095097276, 0.9466695868669596, 0.955...","[Discovery, Discovery, Discovery, Discovery, D..."


#### Delete those sessions which got truncated even with the mitigation policy
##### We find them by comparing the number of words with the number of labels (there should be a 1-to-1 match)

In [15]:
# Words per session: the cleaned session text split on single spaces.
n_words = merged_corpus["full_session"].progress_apply(lambda session: len(session.split(" ")))
# Labels per session: length of the per-word label list.
n_labels = merged_corpus["Predicted_classes"].progress_apply(len)

# A session is flagged when its word count and label count disagree.
# One boolean mask replaces the two `Series.compare` calls: it yields the same
# index of mismatching rows, keeps the word count an integer (compare() returns
# floats, which printed as "4380.0"), and still gives a scalar 0 when nothing
# mismatches instead of breaking the formatted print below.
is_mismatched = n_words != n_labels
indexes_to_remove = n_words.index[is_mismatched]
non_labeled_words = int((n_words - n_labels)[is_mismatched].sum())

print(f"Adopting the mitigation approach, we still lose {indexes_to_remove.shape[0]} sessions ({indexes_to_remove.shape[0]/n_words.shape[0] *100:.2f} % of total) ", end = "")
print(f"and {non_labeled_words} words ({non_labeled_words/n_words.sum()*100:.2f} % of total)")

100%|██████████| 160562/160562 [00:00<00:00, 299702.74it/s]
100%|██████████| 160562/160562 [00:00<00:00, 1175148.43it/s]

Adopting the mitigation approach, we still lose 86 sessions (0.05 % of total) and 4380.0 words (0.03 % of toral)





In [16]:
# Remove the flagged sessions by index label and report the row counts around the drop.
rows_before = merged_corpus.shape[0]
print(f"Shape before dropping: {rows_before}")
final_df = merged_corpus.drop(index=indexes_to_remove)
rows_removed = indexes_to_remove.shape[0]
print(f"Shape after dropping: {final_df.shape[0]} (removed {rows_removed} rows)")

Shape before dropping: 160562
Shape after dropping: 160476 (removed 86 rows)


#### Obtain average logit per session + min logit per session

In [17]:
import numpy as np

# Summarise each session's list of normalized logits by its mean and its minimum.
session_logits = final_df["Normalized_logits"]
final_df["avg_confidence_per_session"] = session_logits.progress_apply(
    lambda values: np.mean(values)
)
final_df["min_confidence_per_session"] = session_logits.progress_apply(
    lambda values: np.min(values)
)
# Preview without the long per-word list column (the column itself is kept in final_df).
final_df.drop(columns=["Normalized_logits"]).head(2)

100%|██████████| 160476/160476 [00:02<00:00, 66078.07it/s]
100%|██████████| 160476/160476 [00:01<00:00, 84011.28it/s]


Unnamed: 0,session_id,full_session,source_ips,first_timestamp,date,Predicted_classes,Models_predictions,avg_confidence_per_session,min_confidence_per_session
0,0,enable ; system ; shell ; sh ; cat /proc/mount...,213.6.160.50,2021-03-06 00:01:13,2021-03-06,"[Discovery, Discovery, Discovery, Discovery, D...","[Discovery, Discovery, Discovery, Discovery, D...",0.793385,0.25324
1,1,enable ; system ; shell ; sh ; cat /proc/mount...,213.94.56.23,2021-03-06 00:02:55,2021-03-06,"[Discovery, Discovery, Discovery, Discovery, D...","[Discovery, Discovery, Discovery, Discovery, D...",0.818248,0.175923


### Keep only rows with unique sessions (in case duplicates still remain)

In [18]:
# Parse timestamps and order rows chronologically, so that de-duplication on the
# session text keeps the earliest occurrence (drop_duplicates keeps the first row).
final_df["first_timestamp"] = pd.to_datetime(final_df["first_timestamp"])
final_df = final_df.sort_values("first_timestamp")
print(f"Before removing duplicates: {final_df.shape[0]}")
final_df = final_df.drop_duplicates(subset="full_session", keep="first")
print(f"After removing duplicates: {final_df.shape[0]}")

Before removing duplicates: 160476
After removing duplicates: 160475


### Export corpus with predictions
#### First, convert predictions to strings

In [19]:
# Flatten each per-word label list into a single " -- "-separated string for export.
for label_column in ("Predicted_classes", "Models_predictions"):
    final_df[label_column] = final_df[label_column].progress_apply(
        lambda word_labels: " -- ".join(word_labels)
    )
final_df.head(2)

100%|██████████| 160475/160475 [00:00<00:00, 414088.98it/s]
100%|██████████| 160475/160475 [00:00<00:00, 415694.78it/s]


Unnamed: 0,session_id,full_session,source_ips,first_timestamp,date,Predicted_classes,Normalized_logits,Models_predictions,avg_confidence_per_session,min_confidence_per_session
0,0,enable ; system ; shell ; sh ; cat /proc/mount...,213.6.160.50,2021-03-06 00:01:13,2021-03-06,Discovery -- Discovery -- Discovery -- Discove...,"[0.9234652890281074, 0.9135971909033552, 0.923...",Discovery -- Discovery -- Discovery -- Discove...,0.793385,0.25324
1,1,enable ; system ; shell ; sh ; cat /proc/mount...,213.94.56.23,2021-03-06 00:02:55,2021-03-06,Discovery -- Discovery -- Discovery -- Discove...,"[0.9552223095097276, 0.9466695868669596, 0.955...",Discovery -- Discovery -- Discovery -- Discove...,0.818248,0.175923


#### Export

In [20]:
import csv

In [21]:
PATH_FILE = "./corpus_with_predictions.csv"

# The export used to request a "sensor" column unconditionally, but the frames
# displayed in this notebook expose "source_ips" and no "sensor", so the column
# selection raised a KeyError. Keep "sensor" when the corpus provides it and
# otherwise fall back to the column that is actually present.
# NOTE(review): confirm which of the two columns downstream consumers expect.
origin_column = "sensor" if "sensor" in final_df.columns else "source_ips"
export_columns = [
    "session_id",
    "full_session",
    origin_column,
    "first_timestamp",
    "Predicted_classes",
    "Models_predictions",
]

# Quote every field so that separators inside session text cannot break rows.
final_df[export_columns].to_csv(PATH_FILE, index=False, quoting=csv.QUOTE_ALL)

In [22]:
# Sanity check: reading the exported file back must give one row per exported session.
exported_rows = len(pd.read_csv(PATH_FILE))
assert exported_rows == len(final_df), "Error: wrong exporting"