# This notebook aims to show the usefulness of the model in reducing the expert's daily analysis from thousands of unique sessions to tens of unique intents.
## Idea here is to provide the security expert a tool to easily focus on "real" novelties
### Furthermore, we also want to analyse the relationship between new sessions (according to our labels) and new labels per day 

### Read libraries

In [None]:
import pandas as pd
from tqdm import tqdm
import json
import numpy as np
# Register tqdm with pandas so that `progress_apply` becomes available on pandas objects.
tqdm.pandas()

### Import dataset

In [None]:
# Load the corpus annotated with the model predictions.
corpus_path = "../Inference/corpus_with_predictions.csv"
predicted_corpus = pd.read_csv(corpus_path)
predicted_corpus.head(2)

#### How many unique sessions?

In [None]:
# Distinct raw sessions in the whole corpus.
n_unique_sessions = predicted_corpus["full_session"].nunique()
print(f"Number of unique sessions: {n_unique_sessions}")

#### How many unique predictions?

In [None]:
# Distinct predicted label sequences in the whole corpus.
n_unique_predictions = predicted_corpus["Models_predictions"].nunique()
print(f"Number of unique predictions: {n_unique_predictions}")

##### MINOR: Remember to cast "first_timestamp" string to datetime

In [None]:
# Parse the timestamp column once so that later cells can sort on it and derive calendar dates.
predicted_corpus["first_timestamp"] = pd.to_datetime(predicted_corpus.first_timestamp)

#### Make sure we are handling only unique sessions

In [None]:
print(f"Before dropping duplicates: {predicted_corpus.shape[0]}")
# Order by time first, so the row kept for every session text is its first occurrence in that ordering.
predicted_corpus = (
    predicted_corpus
    .sort_values(by="first_timestamp", ascending=True)
    .drop_duplicates(["full_session"])
)
print(f"After dropping duplicates: {predicted_corpus.shape[0]}")
predicted_corpus.head(2)

#### How long did the collection last?

In [None]:
# First and last timestamp present in the corpus.
session_times = predicted_corpus["first_timestamp"]
start, stop = session_times.min(), session_times.max()
print(f"Collection started in {start} and lasted untill {stop}")

#### Create "date" feature to aggregate daily stats

In [None]:
# Calendar day of every session, used as the key for all the daily aggregations below.
# The vectorised `.dt.date` accessor yields the same `datetime.date` objects as calling
# `.date()` row by row, without a Python-level lambda (whose parameter also shadowed the
# name `datetime`). It requires `first_timestamp` to already be a datetime column.
predicted_corpus["date"] = predicted_corpus["first_timestamp"].dt.date
predicted_corpus.head(2)

#### Which distribution of unique sessions/day? Group daily stats

In [None]:
# Number of sessions per calendar day, as a one-column frame indexed by date.
unique_session_per_date = (
    predicted_corpus.groupby("date")["full_session"]
    .count()
    .rename("unique_sessions_per_day")
    .to_frame()
    .sort_index()
)
# Re-index on the full calendar range: days without any row (honeypot off) become missing values.
idx = pd.date_range(predicted_corpus.date.min(), predicted_corpus.date.max())
unique_session_per_date = unique_session_per_date.reindex(idx, fill_value=None)
unique_session_per_date.head(2)

#### Now, find the number of unique predictions/day according to the model
##### The idea behind unique predictions is that we remove duplicates PER DAY

In [None]:
# Number of distinct predicted label sequences per calendar day, as a one-column frame indexed by date.
unique_labels_per_date = (
    predicted_corpus.groupby("date")["Models_predictions"]
    .nunique()
    .rename("unique_labels_per_day")
    .to_frame()
    .sort_index()
)
# Re-index on the full calendar range: days without any row (honeypot off) become missing values.
idx = pd.date_range(predicted_corpus.date.min(), predicted_corpus.date.max())
unique_labels_per_date = unique_labels_per_date.reindex(idx, fill_value=None)
unique_labels_per_date.head(2)

#### Plot the two trends

In [None]:
import matplotlib.pyplot as plt

fontsize = 15
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# One panel per daily series: (axes, days, counts, colour, label, extra line options).
panels = [
    (ax1, unique_session_per_date.index, unique_session_per_date.unique_sessions_per_day,
     "royalblue", "|Unique sessions|", {}),
    (ax2, unique_labels_per_date.index, unique_labels_per_date.unique_labels_per_day,
     "firebrick", "|Unique labels|", {"alpha": .7}),
]
for axis, days, counts, colour, name, extra in panels:
    axis.plot(days, counts, linewidth=2, color=colour, label=name, **extra)
    axis.set_ylabel(name, fontsize=fontsize + 3)
    axis.set_xlabel('Date', fontsize=fontsize + 3)
    axis.yaxis.set_tick_params(labelsize=fontsize)
    axis.xaxis.set_tick_params(labelsize=fontsize, rotation=60)
    axis.grid(linewidth=.5)

plt.tight_layout()
plt.show()

**COMMENT**: The plot above shows that we indeed perform an aggregation, moving from ~ 500 unique sessions per day to ~ 30 unique labels per day

### Now, what is the relationship between #unique sessions per day and #new sessions per day? And what is the one between #unique labels and #new labels?

#### For each label, getting the date of first appearance

In [None]:
# For every predicted label sequence keep the row with the earliest date; that date is its first appearance.
labels_first_appearances = (
    predicted_corpus[["date", "Models_predictions"]]
    .sort_values(by="date")
    .drop_duplicates(["Models_predictions"])
    .rename(columns={"date": "first_appearance"})
)
labels_first_appearances.head(2)

#### Now, obtain new sessions per day

In [None]:
print(f"Before: {predicted_corpus.shape[0]}")
# Attach to every session the day its label sequence was first seen, then keep only the
# sessions recorded on that very day (i.e. sessions carrying a label that is new that day).
joined_corpus = (
    predicted_corpus
    .merge(labels_first_appearances, on="Models_predictions")
    .loc[lambda merged: merged.date == merged.first_appearance]
)
print(f"After: {joined_corpus.shape[0]}")
joined_corpus.head(2)

In [None]:
# Number of sessions per day whose label sequence appears for the first time on that day.
new_sessions_per_date = (
    joined_corpus.groupby("date")["full_session"]
    .count()
    .rename("new_sessions_per_day")
    .to_frame()
    .sort_index()
)
# Re-index on the full calendar range: days without any row (honeypot off) become missing values.
idx = pd.date_range(predicted_corpus.date.min(), predicted_corpus.date.max())
new_sessions_per_date = new_sessions_per_date.reindex(idx, fill_value=None)
new_sessions_per_date.head(2)

#### Also, obtain the number of new labels per day

In [None]:
# Number of label sequences per day that appear for the first time on that day.
new_labels_per_date = (
    joined_corpus.groupby("date")["Models_predictions"]
    .nunique()
    .rename("new_labels_per_day")
    .to_frame()
    .sort_index()
)
# Re-index on the full calendar range: days without any row (honeypot off) become missing values.
idx = pd.date_range(predicted_corpus.date.min(), predicted_corpus.date.max())
new_labels_per_date = new_labels_per_date.reindex(idx, fill_value=None)
new_labels_per_date.head(2)

#### Only New Predictions per day

In [None]:
from datetime import datetime

# Day of the server update (day/month/year), kept both as a datetime (for plotting) and as a plain date.
server_update_datetime = datetime.strptime("08/11/2019", '%d/%m/%Y')
server_update_date = server_update_datetime.date()

print(f"Server update occurred in {server_update_date}")

In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

fontsize = 17
fig, ax1 = plt.subplots(1, figsize=(6, 5))

# Daily number of unique sessions, with the server update marked by a dashed vertical line.
ax1.plot(
    unique_session_per_date.index,
    unique_session_per_date.unique_sessions_per_day,
    linewidth=2, color="royalblue", label="|Unique sessions|",
)
ax1.vlines(
    server_update_datetime, 0, 6500,
    label="Server update", linewidth=1.5, linestyle="dashed", color="firebrick",
)

ax1.set_xlabel('Date', fontsize=fontsize + 2)
ax1.set_ylabel('|Unique sessions|', fontsize=fontsize + 2)
ax1.xaxis.set_tick_params(labelsize=fontsize, rotation=30)
ax1.yaxis.set_tick_params(labelsize=fontsize)
ax1.grid(linewidth=.5)

# The legend lists only the server-update marker, drawn through a proxy line.
server_update = Line2D([0], [0], color='firebrick', linestyle="dashed", label="Server update")
handles = [server_update]
ax1.legend(handles=handles, fontsize=fontsize, loc="upper left", framealpha=0.3)

plt.tight_layout()
plt.savefig("./Inference_results/unique_sessions_in_time.pdf")

In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

fontsize = 17
fig, ax1 = plt.subplots(1, figsize=(6, 5))

# Daily number of label sequences never seen before, with the server update as a dashed vertical line.
ax1.plot(
    new_labels_per_date.index,
    new_labels_per_date.new_labels_per_day,
    linewidth=2, color="forestgreen",
)
ax1.vlines(
    server_update_datetime, 0, 50,
    label="Server update", linewidth=1.5, linestyle="dashed", color="firebrick",
)

ax1.set_xlabel("Date", fontsize=fontsize + 2)
ax1.set_ylabel('|New tactical fingerprints|', fontsize=fontsize + 2)
ax1.xaxis.set_tick_params(labelsize=fontsize, rotation=30)
ax1.yaxis.set_tick_params(labelsize=fontsize)
# Fixed 0-50 vertical range with a tick every 10 units.
ax1.set_yticks(np.arange(0, 51, 10))
ax1.set_ylim(0, 50)
ax1.grid()

# The legend lists only the server-update marker, drawn through a proxy line.
server_update = Line2D([0], [0], color='firebrick', linestyle="dashed", label="Server update")
handles = [server_update]
ax1.legend(handles=handles, fontsize=fontsize, loc="upper left", framealpha=0.3)

plt.tight_layout()
plt.savefig("./Inference_results/novelties_in_time.pdf")

#### Other version

In [None]:
import matplotlib.pyplot as plt

fontsize = 20
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: unique sessions per day.
ax1.plot(
    unique_session_per_date.index,
    unique_session_per_date.unique_sessions_per_day,
    linewidth=2, color="royalblue", label="|Unique sessions|",
)
ax1.set_xlabel('Date', fontsize=fontsize + 3)
ax1.set_ylabel('|Unique sessions|', fontsize=fontsize + 3)
ax1.grid(linewidth=.5)

# Right panel: label sequences seen for the first time on each day.
ax2.plot(
    new_labels_per_date.index,
    new_labels_per_date.new_labels_per_day,
    linewidth=2, color="darkred",
)
ax2.set_xlabel('Date', fontsize=fontsize + 3)
ax2.set_ylabel('|New Labels|', fontsize=fontsize + 3)
ax2.grid()

# Same tick styling on both panels.
for axis in (ax1, ax2):
    axis.yaxis.set_tick_params(labelsize=fontsize)
    axis.xaxis.set_tick_params(labelsize=fontsize, rotation=60)

plt.tight_layout(pad=5.0)
plt.show()

#### Isolate peak

In [None]:
from datetime import date

# Label sequences whose first appearance falls on the selected day.
desired_date = date(2019, 11, 12)
appeared_on_desired_date = labels_first_appearances.first_appearance == desired_date
novelties = labels_first_appearances[appeared_on_desired_date]
print(f"Selected {novelties.shape[0]} sequences of predictions")
novelties.head(2)

##### How many associated sessions?

In [None]:
# Number of sessions in `joined_corpus` carrying each label sequence, attached to the selected novelties.
sessions_per_label = (
    joined_corpus.groupby("Models_predictions")["full_session"]
    .count()
    .reset_index(name="associated_sessions")
)
novelties = novelties.merge(sessions_per_label, on="Models_predictions")
novelties.head(2)

##### How different/"distant" are those novelties in terms of edit distances (weighted Levenshtein distance)?

###### Create OneHotEncoded versions of inputs so that we can compute word-level Levenshtein

In [None]:
# One label per line in the labels file; a label's id is its zero-based line position.
with open("../Dataset/Training/Supervised/labels.txt", "r") as labels_file:
    labels = [line.strip() for line in labels_file]
id2label = dict(enumerate(labels))
label2id = {label: idx_label for idx_label, label in enumerate(labels)}

In [None]:
# Encode every label sequence as a string with exactly ONE character per label, so that the
# character-level Levenshtein distance computed on these strings counts edited labels ("words").
# chr(ord("0") + id) gives the plain digits "0".."9" for ids below ten (the same output as
# str(id)) and keeps ids >= 10 on a single character, where str(id) would emit two digits
# and make e.g. label 12 indistinguishable from label 1 followed by label 2.
novelties["oneHotEncoded"] = novelties["Models_predictions"].apply(
    lambda prediction: "".join(chr(ord("0") + label2id[el]) for el in prediction.split(" -- "))
)
# Most populated novelties first.
novelties = novelties.sort_values(by="associated_sessions", ascending=False)
novelties.head(10)

### How "distant" are the attacks of interest from each other?

In [None]:
from Levenshtein import distance as lev
from itertools import permutations
import seaborn as sns

peak_sequences = list(novelties.Models_predictions.values)
peak_encodings = list(novelties.oneHotEncoded.values)
positions = list(np.arange(novelties.shape[0]))

# Lookups between a label sequence and its row position in `novelties`.
id2sequence = dict(zip(positions, peak_sequences))
sequence2id = dict(zip(peak_sequences, positions))

# The two iterators walk the same ordered pairs: once as raw sequences, once as encoded strings.
perms_values = permutations(peak_encodings, 2)
perms_keys = permutations(peak_sequences, 2)

# Edit distance of every ordered pair, divided by the number of labels of the longer sequence.
distances = []
for (origin_seq, destination_seq), (origin_enc, destination_enc) in zip(perms_keys, perms_values):
    longest = max(len(origin_seq.split(" -- ")), len(destination_seq.split(" -- ")))
    distances.append({
        "origin": sequence2id[origin_seq],
        "destination": sequence2id[destination_seq],
        "distance": lev(origin_enc, destination_enc) / longest,
    })

df_tmp = pd.DataFrame(distances)

# Square origin x destination table; the diagonal has no pair and is filled with 0.
table = pd.pivot_table(df_tmp, values='distance', index=['origin'], columns=["destination"], aggfunc=np.sum).fillna(0)

# Hide the diagonal and the upper triangle of the heatmap.
mask = np.zeros_like(table.to_numpy(), dtype="bool")
mask[np.triu_indices_from(mask)] = True

fontsize = 15
fig, axs = plt.subplots(figsize=(10, 6))

axs = sns.heatmap(table.to_numpy(), mask=mask, linewidth=0.2, cmap="jet")

# Tick labels of the colorbar use the base font size.
cbar = axs.collections[0].colorbar
cbar.ax.tick_params(labelsize=fontsize)

# The last axes of the figure is styled as well: label size and label position.
axs.figure.axes[-1].yaxis.label.set_size(fontsize + 5)
axs.figure.axes[-1].yaxis.set_label_coords(3, .5)
axs.set_xlabel("Destination", fontsize=fontsize + 5)
axs.set_ylabel("Origin", fontsize=fontsize + 5)
plt.tight_layout()


In [None]:
# Print, for a hand-picked set of rows of `novelties`, the label sequence and one example session.
for it in [0, 4, 5, 8, 12, 14, 29, 31]:
    prediction = novelties.iloc[it].Models_predictions
    first_match = joined_corpus[joined_corpus.Models_predictions == prediction].iloc[0]
    print(it)
    print(first_match.Models_predictions)
    print()
    print(first_match.full_session)
    print("\n")

#### Focus on one

In [None]:
# The family under study is the first row of `novelties`.
chosen_family = novelties["Models_predictions"].iloc[0]
print(f"Focusing on:\n{chosen_family}")

##### Let's try plotting the novelty of the predictions vs days passed from closest neighbor

###### For each label, save first date in which we've seen that label

In [None]:
# Earliest day on which each predicted label sequence was observed.
# Taking the minimum date of every group (instead of the first row of the group) gives the
# same answer on a time-sorted corpus, but no longer depends on `predicted_corpus` having
# been sorted by an earlier cell.
# The integer index produced by `reset_index` is kept through the sort: later cells use it
# as the node id of each label sequence.
first_appearance_per_prediction = (
    predicted_corpus.groupby("Models_predictions")["date"]
    .min()
    .reset_index()
    .sort_values(by="date", ascending=True)
)
first_appearance_per_prediction.head(2)

##### Compute distances between families 

###### Create OneHotEncoded versions of inputs so that we can compute word-level Levenshtein

In [None]:
# Encode every label sequence as a string with exactly ONE character per label, so that the
# character-level Levenshtein distance computed on these strings counts edited labels ("words").
# chr(ord("0") + id) gives the plain digits "0".."9" for ids below ten (the same output as
# str(id)) and keeps ids >= 10 on a single character, where str(id) would emit two digits.
first_appearance_per_prediction["OneHotEncoded_representation"] = first_appearance_per_prediction["Models_predictions"].apply(
    lambda prediction: "".join(chr(ord("0") + label2id[el]) for el in prediction.split(" -- "))
)

###### Compute distances

In [None]:
from tqdm.contrib import tzip
from itertools import permutations
from Levenshtein import distance as lev
import numpy as np

sequences = list(first_appearance_per_prediction.Models_predictions.values)
encodings = list(first_appearance_per_prediction.OneHotEncoded_representation.values)
n_sequences = len(sequences)

# Edit distance for every ordered (origin, destination) pair of distinct label sequences.
# Index pairs are generated lazily, in the same order as permutations over the sequences
# themselves, so the quadratic number of pairs is never materialised as extra lists.
# `total` is given explicitly because a generator has no length for the progress bar.
distances = []
for origin_pos, destination_pos in tqdm(permutations(range(n_sequences), 2), total=n_sequences * (n_sequences - 1)):
    distances.append({
        "origin": sequences[origin_pos],
        "destination": sequences[destination_pos],
        # Raw edit distance: not normalised by the sequence length in this analysis.
        "distance": lev(encodings[origin_pos], encodings[destination_pos]),
    })

In [None]:
# Long-format table: one row per ordered (origin, destination) pair with its distance.
distances_df = pd.DataFrame(data=distances)
distances_df.head(2)

###### Create OD matrix

In [None]:
# Origin x destination matrix of edit distances.
# The cell values must come from the `distance` column: later cells read a row of this
# matrix and sort it to find the closest families. Every ordered pair occurs once in
# `distances_df`, so "sum" simply carries that single value over; the diagonal
# (a sequence paired with itself) has no row and stays missing.
OD_matrix = distances_df.pivot_table(values='distance', index="origin", columns='destination', aggfunc="sum")
print(f"Created OD matrix of size {OD_matrix.shape[0]}x{OD_matrix.shape[1]}")

##### Find index of chosen family + date of birth

In [None]:
# Index label of the chosen family in `first_appearance_per_prediction`, and the day it first appeared.
is_chosen_family = first_appearance_per_prediction.Models_predictions == chosen_family
prev_next_origins = first_appearance_per_prediction[is_chosen_family].index[0]
origin_representation = first_appearance_per_prediction.loc[prev_next_origins, "date"]
origin_representation

In [None]:
%%time
to_beginning_of_dataset = False
beginnin_of_dataset = predicted_corpus.date.min()
hop = 0
top_neigh = 1
selected_date = desired_date

weighted_edges = []
prev_next_origins = [first_appearance_per_prediction[first_appearance_per_prediction.Models_predictions == chosen_family].index[0]]
while not to_beginning_of_dataset:
    print(f"\nHop: {hop + 1} -->", end = "\t")
    print(f"At this level, {len(prev_next_origins)} origins...", end = " ")
    next_origins = []
    for origin_id in prev_next_origins:
        origin_prediction = first_appearance_per_prediction.loc[origin_id]["Models_predictions"]
        origin_date = first_appearance_per_prediction.loc[origin_id]["date"]
        # From OD matrix, find possible destinations given origin
        all_destinations = OD_matrix.loc[origin_prediction]
        families_before_selected = first_appearance_per_prediction[first_appearance_per_prediction.date < origin_date].Models_predictions
        possible_destinations = all_destinations[families_before_selected]
        sorted_possible_destinations = possible_destinations.reset_index().sort_values(by = origin_prediction)
        for it in range(top_neigh):
            destination = sorted_possible_destinations.iloc[it]["destination"]
            distance_destination = sorted_possible_destinations.iloc[it][origin_prediction]
            id_destination = first_appearance_per_prediction[first_appearance_per_prediction.Models_predictions == destination].index[0]
            date_destination = first_appearance_per_prediction.loc[id_destination].date
            if date_destination == beginnin_of_dataset: #Stopping condition
                to_beginning_of_dataset = True
            weighted_edges.append((int(origin_id), int(id_destination), distance_destination, len(origin_prediction.split(" -- ")), origin_date))
            next_origins.append(id_destination)
    prev_next_origins = [el for el in next_origins]
    print(f"And {len(prev_next_origins)} destinations!")
    hop += 1

#### Edges

In [None]:
edge_columns = ["Origin", "Destination", "edit_distance", "|words_origin|", "day_of_novelty"]
edges_df = pd.DataFrame(weighted_edges, columns=edge_columns)
# Direction-agnostic key of an edge: its two endpoint ids as strings, sorted and joined.
edges_df["ordered_OD"] = edges_df.apply(
    lambda edge: " - ".join(sorted(str(edge[endpoint]) for endpoint in ("Origin", "Destination"))),
    axis=1,
)
n_distinct_edges = edges_df["ordered_OD"].nunique()
print(f"Final graph contains {n_distinct_edges} edges")
edges_df[edge_columns].head(10)

In [None]:
import matplotlib.pyplot as plt
from datetime import timedelta

fig, (ax) = plt.subplots(1, figsize=(8,6))
fontsize = 15
# Chronological order, so the connecting line runs left to right.
edges_df = edges_df.sort_values(by = "day_of_novelty")

# Colorbar: a stand-alone mappable spanning the range of sequence lengths, with the same
# palette as the scatter hue.
norm = plt.Normalize(edges_df['|words_origin|'].min(), edges_df['|words_origin|'].max())
sm = plt.cm.ScalarMappable(cmap="flare", norm=norm)
sm.set_array([])

# Edit distance of every edge against the day of its origin, coloured by origin length.
ax.plot(edges_df.day_of_novelty, edges_df.edit_distance, linewidth = .5, color = "navy")
sns.scatterplot(data=edges_df, x="day_of_novelty", y ="edit_distance", hue = "|words_origin|", palette = "flare", marker='D', ax = ax)
ax.collections[0].set_sizes([200])                 # enlarge the scatter markers
ax.set_ylabel('|Words edited|', fontsize = fontsize + 3)
ax.yaxis.set_tick_params(labelsize=fontsize)
ax.set_xlabel('Date', fontsize = fontsize + 3)
ax.xaxis.set_tick_params(labelsize=fontsize, rotation = 60)
ax.grid(linewidth = .5)

# Replace the hue legend with a colorbar. `sm` is not attached to any axes, so the axes to
# draw next to is passed explicitly: recent Matplotlib releases raise an error instead of
# guessing it.
ax.get_legend().remove()
cbar = ax.figure.colorbar(sm, ax=ax)
cbar.ax.get_yaxis().labelpad = 20
cbar.set_label('|words per sequence|', rotation=270, fontsize=fontsize + 2)

# Label every point with the id of its origin, slightly offset to the right and below.
for origin_id, day, edited in zip(edges_df["Origin"], edges_df["day_of_novelty"], edges_df["edit_distance"]):
    t = ax.annotate(origin_id, (day + timedelta(days=5), edited - 2), fontsize = fontsize)
    t.set_bbox(dict(facecolor='red', alpha=0.2, edgecolor='salmon'))
plt.tight_layout()
plt.show()

### Examples:

In [None]:
def remove_repetitions(sequence_intents):
    """Collapse consecutive repetitions of an intent into ``"<intent> x <count>"`` items.

    ``sequence_intents`` is a string of intents separated by ``" -- "``. The result uses
    the same separator and has one item per run of identical consecutive intents, in
    order of appearance, e.g. ``"A -- A -- B"`` becomes ``"A x 2 -- B x 1"``.
    """
    runs = []  # [intent, run length] pairs, in order of appearance
    for intent in sequence_intents.split(" -- "):
        if runs and runs[-1][0] == intent:
            # Same intent as the previous one: extend the current run.
            runs[-1][1] += 1
        else:
            runs.append([intent, 1])
    return " -- ".join(f"{intent} x {length}" for intent, length in runs)

In [None]:
# Summary of one node: first appearance, run-length compressed intents and one example session.
node_id = 227
node = first_appearance_per_prediction.loc[node_id]
print(f"Node {node_id}:\nFirst appearance:\t{node.date}\nCorresponding intent:\t{remove_repetitions(node.Models_predictions)}")
example_session = predicted_corpus[predicted_corpus.Models_predictions == node.Models_predictions].full_session.iloc[0]
print(f"\nExample: {example_session}")

In [None]:
# Summary of one node: first appearance, run-length compressed intents and one example session.
node_id = 101
node = first_appearance_per_prediction.loc[node_id]
print(f"Node {node_id}:\nFirst appearance:\t{node.date}\nCorresponding intent:\t{remove_repetitions(node.Models_predictions)}")
example_session = predicted_corpus[predicted_corpus.Models_predictions == node.Models_predictions].full_session.iloc[0]
print(f"\nExample: {example_session}")

In [None]:
# Summary of one node: first appearance, run-length compressed intents and one example session.
node_id = 192
node = first_appearance_per_prediction.loc[node_id]
print(f"Node {node_id}:\nFirst appearance:\t{node.date}\nCorresponding intent:\t{remove_repetitions(node.Models_predictions)}")
example_session = predicted_corpus[predicted_corpus.Models_predictions == node.Models_predictions].full_session.iloc[0]
print(f"\nExample: {example_session}")

In [None]:
# Summary of one node: first appearance, run-length compressed intents and one example session.
node_id = 1591
node = first_appearance_per_prediction.loc[node_id]
print(f"Node {node_id}:\nFirst appearance:\t{node.date}\nCorresponding intent:\t{remove_repetitions(node.Models_predictions)}")
example_session = predicted_corpus[predicted_corpus.Models_predictions == node.Models_predictions].full_session.iloc[0]
print(f"\nExample: {example_session}")

In [None]:
# Summary of one node: first appearance, run-length compressed intents and one example session.
node_id = 1384
node = first_appearance_per_prediction.loc[node_id]
print(f"Node {node_id}:\nFirst appearance:\t{node.date}\nCorresponding intent:\t{remove_repetitions(node.Models_predictions)}")
example_session = predicted_corpus[predicted_corpus.Models_predictions == node.Models_predictions].full_session.iloc[0]
print(f"\nExample: {example_session}")

In [None]:
# Summary of one node: first appearance, run-length compressed intents and one example session.
node_id = 76
node = first_appearance_per_prediction.loc[node_id]
print(f"Node {node_id}:\nFirst appearance:\t{node.date}\nCorresponding intent:\t{remove_repetitions(node.Models_predictions)}")
example_session = predicted_corpus[predicted_corpus.Models_predictions == node.Models_predictions].full_session.iloc[0]
print(f"\nExample: {example_session}")