# This notebook prepares some examples to:
- Create a table to visualize most frequent words vs their predictions
- Prove the importance of Attention ('echo' and 'rm' examples)

### Import dataset

In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
import pandas as pd 
# Load the corpus together with the model output. The cells below read the columns
# full_session, Models_predictions, Predicted_classes and first_timestamp from it.
predicted_corpus = pd.read_csv(f"../Inference/corpus_with_predictions.csv")
print(f"Corpus contains {predicted_corpus.shape[0]} unique sessions and {predicted_corpus.Models_predictions.nunique()} unique sequences of predictions")
# Last expression of the cell: shows a two-row preview.
predicted_corpus.head(2)

#### Filter "/system scheduler" sessions --> not bash

In [None]:
# Drop every session whose text contains the literal substring "/system scheduler".
print(f"Before filtering '/system scheduler' sessions: {predicted_corpus.shape[0]}")
# regex=False: the pattern is a plain substring, no regular expression is needed.
# .copy(): later cells add columns to this frame; working on an explicit copy avoids
# pandas' SettingWithCopyWarning on a boolean-filtered slice.
predicted_corpus = predicted_corpus[~predicted_corpus.full_session.str.contains("/system scheduler", regex = False)].copy()
print(f"After filtering '/system scheduler' sessions: {predicted_corpus.shape[0]}")

#### Create date attribute

In [None]:
# Parse the timestamp column and derive a calendar-day column from it.
predicted_corpus["first_timestamp"] = pd.to_datetime(predicted_corpus["first_timestamp"])
# Vectorised accessor instead of a per-row lambda (whose argument shadowed the name `datetime`).
predicted_corpus["date"] = predicted_corpus["first_timestamp"].dt.date
predicted_corpus.head(2)

#### Plot distribution of intents over time

In [None]:
# Build a long table with one row per (word, predicted label, date).
# .copy(): new columns are added below, so work on an independent frame rather than
# on a column slice of predicted_corpus (avoids SettingWithCopyWarning).
sessions_date_predictions = predicted_corpus[["full_session", "Models_predictions", "date"]].copy()
# Words are separated by a single space, predictions by " -- ".
sessions_date_predictions["splitted_session"] = sessions_date_predictions["full_session"].apply(lambda session: session.split(" "))
sessions_date_predictions["splitted_prediction"] = sessions_date_predictions["Models_predictions"].apply(lambda predictions: predictions.split(" -- "))
# Multi-column explode pairs the i-th word with the i-th prediction.
# NOTE(review): pandas requires both lists of a row to have the same length - confirm the
# corpus guarantees one prediction per word.
exploded_df = sessions_date_predictions[["splitted_session", "splitted_prediction", "date"]].explode(["splitted_session", "splitted_prediction"])
print(f"Exploded dataset contains {exploded_df.shape[0]} rows and {exploded_df.shape[1]} columns")
exploded_df.head(2)

##### Groupby date

In [None]:
# Count the exploded rows (one per word) for every (date, predicted label) pair.
occurrences_x_day = exploded_df.groupby(["date", "splitted_prediction"]).size().reset_index(name = "daily_occurrences")
occurrences_x_day.head(5)

##### Calculate CDF

In [None]:
# Per-label empirical CDF over time: running count of a label divided by its overall count.
per_prediction = occurrences_x_day.groupby("splitted_prediction")["daily_occurrences"]
# Running total within each label, following the current row order.
# NOTE(review): assumes rows are ordered by date within each label - confirm upstream ordering.
occurrences_x_day["cumulative_occurrences"] = per_prediction.cumsum()
# transform("sum") broadcasts each label's grand total back onto its rows (replaces the self-merge).
occurrences_x_day["tot_occurrences"] = per_prediction.transform("sum")
# Column-wise division instead of a row-by-row apply.
occurrences_x_day["cdf"] = occurrences_x_day["cumulative_occurrences"] / occurrences_x_day["tot_occurrences"]
occurrences_x_day.head(2)

#### Create colors

In [None]:
import seaborn as sns
# One row per predicted label, most frequent label first.
bars = occurrences_x_day.drop_duplicates("splitted_prediction").sort_values(by = "tot_occurrences", ascending = False)
ordered_predictions = bars.splitted_prediction.unique()
# One colour of the "bright" palette per label.
palette = sns.color_palette("bright", bars.splitted_prediction.nunique())
# label -> colour as returned by seaborn (used by the plots)
role2color = dict(zip(ordered_predictions, palette))
# label -> hex colour string (used by the HTML exports)
hex_role2color = dict(zip(ordered_predictions, palette.as_hex()))

#### How many words per class?

In [None]:
# Share of the corpus' words assigned to each predicted label.
# .copy(): a column is added below, so detach from `bars` (avoids SettingWithCopyWarning).
prediction_per_class = bars[["splitted_prediction", "tot_occurrences"]].copy()
# Compute the grand total once instead of once per row inside a lambda.
total_words = prediction_per_class["tot_occurrences"].sum()
# Percentage rounded to 3 decimals.
prediction_per_class["%_over_corpus"] = (prediction_per_class["tot_occurrences"] / total_words * 100).round(3)
prediction_per_class

#### How many words in general?

In [None]:
# Total number of labelled words: the sum of the per-label totals.
prediction_per_class.tot_occurrences.sum()

#### Given Discovery, what's next?

##### Obtain set of predictions

In [None]:
def remove_repetitions(fingreprint):
    """Collapse runs of identical consecutive labels into a single label.

    `fingreprint` is a string of labels joined by " -- "; the result uses the
    same separator, e.g. "A -- A -- B -- A" -> "A -- B -- A".
    """
    collapsed = []
    for label in fingreprint.split(" -- "):
        # keep a label only when it differs from the one kept just before it
        if not collapsed or collapsed[-1] != label:
            collapsed.append(str(label))
    return " -- ".join(collapsed)

In [None]:
# Store, for every session, its prediction sequence as processed by remove_repetitions.
# NOTE: a later cell of this notebook defines another function with the same name that
# returns "<label> x <count>" items; re-running this cell after that one changes set_tactics.
predicted_corpus["set_tactics"] = predicted_corpus["Models_predictions"].progress_apply(lambda predictions_list: remove_repetitions(predictions_list))
predicted_corpus.head(2)

##### Now, for each session, we want to create the origin/destination matrix
###### Each session will get a |classes| x |classes| matrix (which we'll then convert to a flat tensor)

In [None]:
# Sessions whose Predicted_classes string contains "Discovery"; the cell displays how many there are.
sessions_with_discovery = predicted_corpus[predicted_corpus.Predicted_classes.str.contains("Discovery")]
sessions_with_discovery.shape[0]

In [None]:
%%time
# Accumulators filled by the loop at the end of this cell:
# destination label -> number of transitions out of "Discovery" ("Stop" is pre-seeded with 0)
destinations_discovery = {"Stop":0}
# total number of transitions out of "Discovery"
change_transition = 0
def count_destinations_from_discovery(predictions, destinations_discovery):
    """Tally which label follows each "Discovery" in one sequence.

    `predictions` is a " -- "-joined label sequence. `destinations_discovery`
    is updated in place: for every "Discovery" that has a successor, the
    successor's counter is incremented (created at 0 if missing). A sequence
    that is exactly "Discovery" increments the "Stop" counter instead.

    Returns the number of "Discovery" items that have a successor.
    """
    # NOTE(review): "Stop" is only counted when the whole sequence is "Discovery";
    # a longer sequence that merely ends with "Discovery" is not counted - confirm intended.
    if predictions.strip() == "Discovery":
        destinations_discovery["Stop"] += 1
    tactics = predictions.split(" -- ")
    transitions = 0
    # walk over consecutive (origin, destination) pairs
    for origin, destination in zip(tactics, tactics[1:]):
        if origin != "Discovery":
            continue
        transitions += 1
        destinations_discovery[destination] = destinations_discovery.get(destination, 0) + 1
    return transitions
# Accumulate over every session: the function updates destinations_discovery in place and
# returns the per-session number of transitions out of "Discovery", which is summed here.
for set_tactics in predicted_corpus["set_tactics"]:
    change_transition += count_destinations_from_discovery(set_tactics, destinations_discovery)
# Display both results as the cell output.
change_transition, destinations_discovery

#### Fill missing dates with NaN

In [None]:
def plot_cdfs(cdf, dates):
    """Align one label's CDF on a common set of dates (no plotting happens here).

    `cdf` is a frame with a "date" column; `dates` are the dates to align on.
    The frame is indexed by date, re-indexed on `dates` with forward fill (a
    day without new occurrences keeps the last known value), then converted to
    daily frequency, which leaves NaN rows for days absent from `dates`.
    The input frame is not modified.
    """
    by_day = cdf.set_index("date")
    by_day.index = pd.DatetimeIndex(by_day.index)
    # every label must share the same index: carry the last valid value forward
    aligned = by_day.reindex(dates, method = "ffill")
    return aligned.asfreq("D")

##### Plot

In [None]:
# Distinct days present in occurrences_x_day; used as the common index for the CDF curves.
dates = occurrences_x_day.date.unique()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Two-panel figure: words per predicted label (left) and per-label CDF over time (right).
fig, axs = plt.subplots(1,2, figsize =(12, 3))
fontsize = 15
#Axis 1
# Horizontal bars coloured per label; the x axis is logarithmic.
sns.barplot(data=bars, x="tot_occurrences", y ="splitted_prediction", hue = "splitted_prediction", dodge = False, palette = role2color, ax = axs[0])
axs[0].legend_.remove()
axs[0].set_xlabel('|Words per prediction|', fontsize = fontsize)
axs[0].set_ylabel("")
axs[0].set_xscale("log")
axs[0].xaxis.set_tick_params(labelsize=fontsize)
axs[0].yaxis.set_tick_params(labelsize=fontsize)
axs[0].grid()
#Axis 2

# One curve per label, aligned on the shared `dates` index by plot_cdfs.
for role in occurrences_x_day.splitted_prediction.unique():
    cdf = occurrences_x_day[occurrences_x_day.splitted_prediction == role][["date", "cdf"]]
    cdf = plot_cdfs(cdf, dates)
    axs[1].plot(cdf.index, cdf.cdf, color = role2color[role], linewidth = 3)
    
#sns.lineplot(data=occurrences_x_day, x="date", y ="cdf", hue = "splitted_prediction", palette = role2color, ax = axs[1], linewidth = 3)

axs[1].set_xlabel('Date', fontsize = fontsize)
axs[1].set_ylabel('ECDF of prediction', fontsize = fontsize)
axs[1].yaxis.set_tick_params(labelsize=fontsize)
axs[1].xaxis.set_tick_params(labelsize=fontsize, rotation = 30)
axs[1].grid()
plt.tight_layout()
# NOTE(review): `dataset` is not assigned in any cell visible in this notebook; it must be
# defined before this line runs, otherwise the f-string raises NameError - confirm.
plt.savefig(f"./Inference_results/1_Stats_per_prediction/{dataset}_predictions_stats.pdf")

#### Now, I want to know the predictions assigned per words
##### We will study:
- Given a prediction, which word is most often assigned to it?
- Given a word (among the most frequent ones), which predictions does it receive?

#### Keep only 1st session per sequence of intents

In [None]:
# Sort by date and keep the first row for each distinct Models_predictions string.
unique_corpus = predicted_corpus.sort_values(by = "date").drop_duplicates(["Models_predictions"])
print(f"Selected {unique_corpus.shape[0]} rows")
unique_corpus.head(2)

##### Explode dataset, so that each row contains a word and a prediction

In [None]:
# Split sessions on single spaces and predictions on " -- ", then pair them row by row.
unique_corpus["splitted_session"] = unique_corpus["full_session"].apply(lambda session: session.split(" "))
unique_corpus["splitted_prediction"] = unique_corpus["Models_predictions"].apply(lambda predictions: predictions.split(" -- "))
# NOTE: this overwrites the `exploded_df` built earlier from the full corpus.
exploded_df = unique_corpus[["splitted_session", "splitted_prediction", "date"]].explode(["splitted_session", "splitted_prediction"])
print(f"Exploded dataset contains {exploded_df.shape[0]} rows and {exploded_df.shape[1]} columns")
exploded_df.head(2)

##### Count how many unique tuples ("word", "prediction")

In [None]:
# Number of rows for every distinct (word, prediction) pair.
grouped_df = exploded_df.groupby(["splitted_session", "splitted_prediction"]).size().reset_index(name = "occurrences_tuple")
print(f"The dataset contains {grouped_df.shape[0]} unique tuples")
grouped_df.head(2)

In [None]:
# Report how many distinct words and distinct predicted labels appear among the tuples.
print(f"Particularly, it contains {grouped_df.splitted_session.nunique()} unique words and {grouped_df.splitted_prediction.nunique()} unique predictions")

In [None]:
import numpy as np
# For each value k of occurrences_tuple, how many (word, prediction) tuples occur exactly k times; ordered by k.
cdf_tuples_occurrences = grouped_df.value_counts("occurrences_tuple").sort_index()
# Running sum divided by the total: empirical CDF over k (the index still holds k).
cdf_tuples_occurrences = np.cumsum(cdf_tuples_occurrences)/np.sum(cdf_tuples_occurrences)

In [None]:
import matplotlib.pyplot as plt 

# ECDF of how often each (word, prediction) tuple occurs, on a logarithmic x axis.
fig, axs = plt.subplots(1, figsize=(4,3))
fontsize = 18
# Plot straight from the Series: its index holds the occurrence counts and its values the ECDF.
# The previous `reset_index()[0]` lookup relied on the value column being called 0, which is no
# longer the case since pandas 2.0 (value_counts names its result "count") and raised KeyError.
axs.plot(cdf_tuples_occurrences.index, cdf_tuples_occurrences.values, linewidth = 2)
axs.set_xlabel('|occurrences| (Word, Prediction)', fontsize = fontsize + 3)
axs.set_xscale("log")
axs.set_ylabel('ECDF', fontsize = fontsize)
axs.set_xticks([1, 10, 100, 1_000, 10_000])
axs.yaxis.set_tick_params(labelsize=fontsize)
axs.xaxis.set_tick_params(labelsize=fontsize)
axs.grid()
plt.tight_layout()
plt.show()

##### Now group by word and collect:
- list of assigned predictions
- occurrences of that word

In [None]:
# Per word: the list of labels it received and the number of non-null "date" entries
# (renamed to word_occurrences).
groupby_word = exploded_df.groupby("splitted_session").agg({"splitted_prediction":list, "date":"count"}).rename({"splitted_prediction":"assigned_predictions", "date":"word_occurrences"}, axis = 1)
print(f"Dataset contains {groupby_word.shape[0]} unique words")
groupby_word.head(2)

###### Trick to easily count labels occurrences

In [None]:
# Turn each list of labels into one " __ "-separated string, so it can be fed to a text vectorizer.
groupby_word["assigned_predictions"] = groupby_word["assigned_predictions"].apply(lambda list_predictions: " __ ".join(list_predictions))
groupby_word.head(2)

###### Doing that, each row will correspond to a word and each column to a prediction

In [None]:
def custom_tokenizer(session):
    """Split a " __ "-separated string into its items, trimming surrounding whitespace."""
    return list(map(str.strip, session.split(" __ ")))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count, for every word (one row of groupby_word), how many times each label was assigned to it.
# lowercase=False keeps the label names as they are; tokens come from custom_tokenizer.
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, lowercase = False)
X = vectorizer.fit_transform(groupby_word.assigned_predictions).toarray()
names = vectorizer.get_feature_names_out()
# One column per label, one row per word (same row order as groupby_word).
df_count_vectorizer = pd.DataFrame(X, columns=names)
print(f"Dataframe has shape {df_count_vectorizer.shape[0]} x {df_count_vectorizer.shape[1]}")
df_count_vectorizer.head(2)

##### Concat two datasets

In [None]:
# Put the per-label counts next to each word. Both frames get a fresh 0..n-1 index first,
# so the column-wise concat aligns them row by row.
concat_df = pd.concat([groupby_word.reset_index(), df_count_vectorizer.reset_index(drop = True)], axis = 1)
# Most frequent words first.
concat_df.sort_values(by = "word_occurrences", ascending = False, inplace = True)
concat_df.head(2)

##### Visualize top-10 words
##### Keep only words with alphas 

In [None]:
import re
# Select the most frequent "real" words: at least one ASCII letter and no "-" (i.e. not a flag).
to_plot_df = concat_df.copy()
to_plot_df["is_alpha"] = to_plot_df["splitted_session"].apply(lambda word: re.search('[a-zA-Z]', word) is not None)
to_plot_df["is_flag"] = to_plot_df["splitted_session"].apply(lambda word: "-" in word)

# concat_df is sorted by word_occurrences (descending), so the first 20 rows are the 20 most frequent.
top = to_plot_df[to_plot_df.is_alpha & ~to_plot_df.is_flag].iloc[:20]
# NOTE(review): columns[-9:-2] takes the 7 columns right before is_alpha/is_flag; this presumably
# matches the number of label columns - confirm if the label set changes.
# .copy(): later cells write into `top`, so detach it from to_plot_df (avoids SettingWithCopyWarning).
top = top[["splitted_session"] + list(top.columns[-9:-2].values)].copy()
top.head(2)

In [None]:
from scipy.stats import entropy
# Row-normalise: every column after the first (the word) is divided by the row sum of those
# columns, so each word's label shares add up to 1.
top[top.columns[1:]] = top.apply(lambda row: row[1:] / np.sum(row[1:]), axis = 1)
# Base-2 entropy of each word's label distribution (again over every column after the first).
top["entropy"] = top.apply(lambda row: entropy(row[1:].astype(float), base=2), axis = 1)
# Lowest entropy first.
top.sort_values(by = "entropy", inplace = True)
top.head(2)

In [None]:
# Heatmap: label columns of `top` as rows, words as columns.
fig, axs = plt.subplots(1, figsize =(17, 6))
fontsize = 10
#First matrix
# top.columns[1:-1] drops the first column (the word) and the last one.
# NOTE(review): this assumes the last column is "entropy", added by the previous cell - confirm.
data_values = top.T.loc[top.columns[1:-1]].astype("float")
# Each heatmap column (one word) is rescaled to sum to 1.
# NOTE(review): presumably a no-op if the rows of `top` were already normalised upstream - confirm.
normed_data_values=data_values.apply(lambda column: column / column.sum(), axis=0)

im = sns.heatmap(normed_data_values, linewidth = 0.1, cmap="jet", ax = axs, annot = True, annot_kws={"fontsize":fontsize}, fmt='.2f', cbar_kws={"orientation": "horizontal", "location":"top"})

# Tick labels: words on the x axis, label names on the y axis.
axs.set_xticklabels(top.T.loc["splitted_session"], fontsize = fontsize + 2, rotation = 90)
axs.set_yticklabels(top.columns[1:-1], fontsize = fontsize + 2, rotation = 0)
cbar = axs.collections[0].colorbar
cbar.ax.tick_params(labelsize=fontsize)

plt.tight_layout()
plt.savefig(f"./Inference_results/word_vs_prediction.pdf")

## STUDY ON ECHO - Go simple here!

In [None]:
# Keep the sessions whose text contains the literal substring " echo".
print(f"Before filtering: {predicted_corpus.shape[0]} (and {predicted_corpus.Models_predictions.nunique()} families)")
# NOTE: this is a substring test with a leading space, so a session that starts with "echo" is
# not selected, while words that merely begin with "echo" are.
# .copy(): a column is added to df_echo in a later cell; an explicit copy avoids
# SettingWithCopyWarning on the filtered slice. regex=False: plain substring match.
df_echo = predicted_corpus[predicted_corpus.full_session.str.contains(" echo", regex = False)].copy()
print(f"Sessions containing 'echo': {df_echo.shape[0]} (and {df_echo.Models_predictions.nunique()} families)")
df_echo.head(2)

#### Now keep track of the predictions we associated to 'echo'

In [None]:
def track_echo(session, predictions):
    """Return the labels predicted for every word that is exactly "echo".

    `session` is split on single spaces and `predictions` on " -- "; the two
    are paired positionally (extra items of the longer one are ignored). The
    selected labels are joined with " -- "; the result is "" when no word
    equals "echo".
    """
    paired = zip(session.split(" "), predictions.split(" -- "))
    return " -- ".join(label for token, label in paired if token == "echo")

In [None]:
from tqdm import tqdm
tqdm.pandas()

# For every session, the " -- "-joined labels of its "echo" words (empty string if none match exactly).
df_echo["echo_roles"] = df_echo.progress_apply(lambda row: track_echo(row.full_session, row.Models_predictions), axis = 1)
df_echo.head(2)

### How many sessions we can associate to each echo's use?

In [None]:
# Number of sessions for each distinct echo_roles string.
sessions_associated_to_use = df_echo.groupby("echo_roles").full_session.count().reset_index(name = "associated_sessions")
print(f"There are {sessions_associated_to_use.shape[0]} different uses of the command 'echo'")

### Count how many "families" we associated per use

In [None]:
# A "family" is a distinct Models_predictions string: count families per echo_roles value.
families_associated_to_use = df_echo.drop_duplicates("Models_predictions").groupby("echo_roles").Models_predictions.count().reset_index(name = "associated_families")
# Sessions and families side by side for every echo_roles value.
sessions_and_families_per_use = sessions_associated_to_use.merge(families_associated_to_use, on = "echo_roles")
# Show the 10 uses with most sessions, without truncating long cell contents.
with pd.option_context('display.max_colwidth', None):
    display(sessions_and_families_per_use.sort_values(by = "associated_sessions", ascending = False).head(10))

#### Show some examples of such usages

##### Idea is to select a family associated to role if:
- Family is numerous enough 
- Family is "dissimilar" to the ones observed before

##### Create OneHotEncoded versions of inputs
###### so that we can compute a word-level Levenshtein distance and print "different" families

In [None]:
# Read the label names, one per line, and build both lookup directions.
with open("../Dataset/Training/Supervised/labels.txt", "r") as f:
    labels = [line.strip() for line in f]
# position in the file -> label name
id2label = dict(enumerate(labels))
# label name -> position in the file (for a repeated name the last position wins)
label2id = {label: idx for idx, label in enumerate(labels)}

In [None]:
# Two-row preview of the echo sessions.
df_echo.head(2)

##### Function to color items 

In [None]:
def color_sessions(session, not_confident_predictions, models_predictions, chosen_word):
    """Render a session as HTML, colouring every word by its predicted label.

    Parameters:
        session: the session text; words are separated by single spaces.
        not_confident_predictions: " -- "-joined labels; only its length matters
            here, because the three sequences are zipped together.
        models_predictions: " -- "-joined labels used to pick each word's colour
            from the module-level `hex_role2color` mapping.
        chosen_word: words equal to this string are additionally bold and underlined.

    Returns:
        The coloured `<span>` elements joined by single spaces.

    Raises:
        KeyError: if a label of `models_predictions` is missing from `hex_role2color`.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    words = session.split(" ")
    not_confident_predictions = not_confident_predictions.split(" -- ")
    models_predictions = models_predictions.split(" -- ")
    new_words = []
    # zip stops at the shortest of the three sequences
    for word, _, model_prediction in zip(words, not_confident_predictions, models_predictions):
        color = hex_role2color[model_prediction]
        # Words are raw shell text: escape them so that characters such as "<", ">" and "&"
        # are displayed literally instead of being parsed as markup in the exported HTML.
        new_word = f'<span style="color:{color};"> {escape(word)}</span>'
        if word == chosen_word:
            new_word = f"<u><b>{new_word}</b></u>"
        new_words.append(new_word)
    return " ".join(new_words)

In [None]:
def remove_repetitions(sequence_intents):
    """Run-length encode a " -- "-joined label sequence.

    Every run of identical consecutive labels becomes "<label> x <run length>",
    e.g. "A -- A -- B" -> "A x 2 -- B x 1".

    NOTE: this reuses the name of a function defined in an earlier cell of this
    notebook, which returns the collapsed labels without counters.
    """
    runs = []  # [label, run length] pairs, in order of appearance
    for label in sequence_intents.split(" -- "):
        if runs and runs[-1][0] == label:
            runs[-1][1] += 1
        else:
            runs.append([label, 1])
    return " -- ".join(f"{label} x {count}" for label, count in runs)

def color_roles(models_predictions):
    """Render a label sequence as coloured "<label> x <count>" HTML spans.

    The sequence is first passed through `remove_repetitions`; each resulting
    item is wrapped in a `<span>` whose colour is looked up in the module-level
    `hex_role2color` mapping using the part before " x ".
    """
    colored = []
    for entry in remove_repetitions(models_predictions).split(" -- "):
        label = entry.split(" x ")[0]
        colored.append(f'<span style="color:{hex_role2color[label]};"> {entry}</span>')
    return " ".join(colored)

##### Sort previous dataframe according to number of associated sessions

In [None]:
# Sort in place, most sessions first: the cells below pick uses by position with .iloc.
sessions_and_families_per_use.sort_values(by = "associated_sessions", ascending = False, inplace = True)

##### Create DF to store the results

In [None]:
# Plain list (despite the df_ prefix) of (coloured roles, coloured session) tuples filled by the next cells.
df_echo_results = []

In [None]:
from Levenshtein import distance as lev

# Collect example sessions for the most frequent use of 'echo' (position 0 after sorting).
selected_role = sessions_and_families_per_use.echo_roles.iloc[0]
print(f"For echo roles: {selected_role}")
examples = df_echo[df_echo.echo_roles == selected_role]
# Families (distinct prediction sequences) of this use, most numerous first.
families_numerosities = examples.groupby("Models_predictions").full_session.count().reset_index(name = "occurrences").sort_values(by = "occurrences", ascending = False)
# The empty string acts as a first reference: a family closer than 5 edits to "" is never shown.
prev_families = [""]
it = 0 # cursor over families_numerosities
i = 0  # number of families accepted so far
while i < 3 and it != families_numerosities.shape[0]: # for the top 3 families
    chosen_family = families_numerosities.iloc[it].Models_predictions
    nc_prediction = df_echo[df_echo.Models_predictions == chosen_family].Predicted_classes.iloc[0]
    occurrences = families_numerosities.iloc[it].occurrences
    # Encode the family as the concatenation of its label ids, so that an edit distance can be computed.
    # NOTE(review): the distance is character-level; it equals a word-level distance only if
    # every label id has a single digit - confirm against labels.txt.
    embedded_family = "".join([str(label2id[word]) for word in chosen_family.split(" -- ")])
    flag = True
    for prev_family in prev_families: # Check the examples you showed before. Accept new examples if "dissimilar" enough
        lev_distance = lev(prev_family, embedded_family)
        if lev_distance < 5: #At least 5 words must be different
            flag = False
    if flag:
        examples_for_family = examples[examples.Models_predictions == chosen_family]["full_session"]
        # Show at most 2 sessions per family. Iterate over the values directly: the previous
        # `for it in range(...)` overwrote the cursor `it` of the enclosing while loop, so
        # families were skipped or visited again.
        for full_session in examples_for_family.iloc[:2]:
            colored_sessions = color_sessions(full_session, nc_prediction, chosen_family, "echo")
            colored_roles = color_roles(chosen_family)
            df_echo_results.append((colored_roles, colored_sessions))
        prev_families.append(embedded_family)
        i += 1
    it += 1

In [None]:
from Levenshtein import distance as lev

# Collect example sessions for the use of 'echo' at position 3 of the sorted table.
selected_role = sessions_and_families_per_use.echo_roles.iloc[3]
print(f"For echo roles: {selected_role}")
examples = df_echo[df_echo.echo_roles == selected_role]
# Families (distinct prediction sequences) of this use, most numerous first.
families_numerosities = examples.groupby("Models_predictions").full_session.count().reset_index(name = "occurrences").sort_values(by = "occurrences", ascending = False)
# The empty string acts as a first reference: a family closer than 5 edits to "" is never shown.
prev_families = [""]
it = 0 # cursor over families_numerosities
i = 0  # number of families accepted so far
while i < 3 and it != families_numerosities.shape[0]: # for the top 3 families
    chosen_family = families_numerosities.iloc[it].Models_predictions
    nc_prediction = df_echo[df_echo.Models_predictions == chosen_family].Predicted_classes.iloc[0]
    occurrences = families_numerosities.iloc[it].occurrences
    # Encode the family as the concatenation of its label ids, so that an edit distance can be computed.
    # NOTE(review): the distance is character-level; it equals a word-level distance only if
    # every label id has a single digit - confirm against labels.txt.
    embedded_family = "".join([str(label2id[word]) for word in chosen_family.split(" -- ")])
    flag = True
    for prev_family in prev_families: # Check the examples you showed before. Accept new examples if "dissimilar" enough
        lev_distance = lev(prev_family, embedded_family)
        if lev_distance < 5: #At least 5 words must be different
            flag = False
    if flag:
        examples_for_family = examples[examples.Models_predictions == chosen_family]["full_session"]
        # Show at most 2 sessions per family. Iterate over the values directly: the previous
        # `for it in range(...)` overwrote the cursor `it` of the enclosing while loop, so
        # families were skipped or visited again.
        for full_session in examples_for_family.iloc[:2]:
            colored_sessions = color_sessions(full_session, nc_prediction, chosen_family,"echo")
            colored_roles = color_roles(chosen_family)
            df_echo_results.append((colored_roles, colored_sessions))
        prev_families.append(embedded_family)
        i += 1
    it += 1

In [None]:
from Levenshtein import distance as lev

# Collect example sessions for the use of 'echo' at position 2 of the sorted table.
selected_role = sessions_and_families_per_use.echo_roles.iloc[2]
print(f"For echo roles: {selected_role}")
examples = df_echo[df_echo.echo_roles == selected_role]
# Families (distinct prediction sequences) of this use, most numerous first.
families_numerosities = examples.groupby("Models_predictions").full_session.count().reset_index(name = "occurrences").sort_values(by = "occurrences", ascending = False)
# The empty string acts as a first reference: a family closer than 5 edits to "" is never shown.
prev_families = [""]
it = 0 # cursor over families_numerosities
i = 0  # number of families accepted so far
while i < 3 and it != families_numerosities.shape[0]: # for the top 3 families
    chosen_family = families_numerosities.iloc[it].Models_predictions
    nc_prediction = df_echo[df_echo.Models_predictions == chosen_family].Predicted_classes.iloc[0]
    occurrences = families_numerosities.iloc[it].occurrences
    # Encode the family as the concatenation of its label ids, so that an edit distance can be computed.
    # NOTE(review): the distance is character-level; it equals a word-level distance only if
    # every label id has a single digit - confirm against labels.txt.
    embedded_family = "".join([str(label2id[word]) for word in chosen_family.split(" -- ")])
    flag = True
    for prev_family in prev_families: # Check the examples you showed before. Accept new examples if "dissimilar" enough
        lev_distance = lev(prev_family, embedded_family)
        if lev_distance < 5: #At least 5 words must be different
            flag = False
    if flag:
        examples_for_family = examples[examples.Models_predictions == chosen_family]["full_session"]
        # Show at most 2 sessions per family. Iterate over the values directly: the previous
        # `for it in range(...)` overwrote the cursor `it` of the enclosing while loop, so
        # families were skipped or visited again.
        for full_session in examples_for_family.iloc[:2]:
            colored_sessions = color_sessions(full_session, nc_prediction, chosen_family, "echo")
            colored_roles = color_roles(chosen_family)
            df_echo_results.append((colored_roles, colored_sessions))
        prev_families.append(embedded_family)
        i += 1
    it += 1

#### Export

In [None]:
# Two-column table built from the collected (coloured roles, coloured session) tuples.
df_to_export = pd.DataFrame(df_echo_results, columns = ["Sequence of intents", "Session"])

In [None]:
# escape=False writes the cell contents verbatim, which the <span> markup in the cells relies on.
# NOTE(review): `dataset` is not assigned in any cell visible in this notebook; it must be
# defined before this line runs, otherwise the f-string raises NameError - confirm.
df_to_export.to_html(f"./Inference_results/1_Stats_per_prediction/{dataset}_echo_study.html", escape = False)

## STUDY ON RM - Bit harder, to show that the problem is non-trivial

In [None]:
# Keep the sessions whose text contains the literal substring " rm".
print(f"Before filtering: {predicted_corpus.shape[0]} (and {predicted_corpus.Models_predictions.nunique()} families)")
# NOTE: this is a substring test with a leading space, so a session that starts with "rm" is
# not selected, while words that merely begin with "rm" are.
# .copy(): a column is added to df_rm in a later cell; an explicit copy avoids
# SettingWithCopyWarning on the filtered slice. regex=False: plain substring match.
df_rm = predicted_corpus[predicted_corpus.full_session.str.contains(" rm", regex = False)].copy()
print(f"Sessions containing 'rm': {df_rm.shape[0]} (and {df_rm.Models_predictions.nunique()} families)")
df_rm.head(2)

#### Now keep track of the predictions we associated to 'rm'

In [None]:
def track_rm(session, predictions):
    """Return the labels predicted for every word that is exactly "rm".

    `session` is split on single spaces and `predictions` on " -- "; the two
    are paired positionally (extra items of the longer one are ignored). The
    selected labels are joined with " -- "; the result is "" when no word
    equals "rm".
    """
    paired = zip(session.split(" "), predictions.split(" -- "))
    return " -- ".join(label for token, label in paired if token == "rm")

In [None]:
from tqdm import tqdm
tqdm.pandas()

# For every session, the " -- "-joined labels of its "rm" words (empty string if none match exactly).
df_rm["rm_roles"] = df_rm.progress_apply(lambda row: track_rm(row.full_session, row.Models_predictions), axis = 1)
df_rm.head(2)

### How many sessions can we associate with each use of rm?

In [None]:
# Number of sessions for each distinct rm_roles string (overwrites the variable of the echo study).
sessions_associated_to_use = df_rm.groupby("rm_roles").full_session.count().reset_index(name = "associated_sessions")
print(f"There are {sessions_associated_to_use.shape[0]} different uses of the command 'rm'")

### Count how many "families" we associated per use

In [None]:
# A "family" is a distinct Models_predictions string: count families per rm_roles value.
families_associated_to_use = df_rm.drop_duplicates("Models_predictions").groupby("rm_roles").Models_predictions.count().reset_index(name = "associated_families")
# Sessions and families side by side for every rm_roles value.
sessions_and_families_per_use = sessions_associated_to_use.merge(families_associated_to_use, on = "rm_roles")
# Show the 10 uses with most sessions, without truncating long cell contents.
with pd.option_context('display.max_colwidth', None):
    display(sessions_and_families_per_use.sort_values(by = "associated_sessions", ascending = False).head(10))

#### Show some examples of such usages

##### Idea is to select a family associated to role if:
- Family is numerous enough 
- Family is "dissimilar" to the ones observed before

In [None]:
# Two-row preview of the rm sessions.
df_rm.head(2)

##### Sort previous dataframe according to number of associated sessions

In [None]:
# Sort in place, most sessions first: the cells below pick uses by position with .iloc.
sessions_and_families_per_use.sort_values(by = "associated_sessions", ascending = False, inplace = True)

##### Create DF to store the results

In [None]:
# Plain list (despite the df_ prefix) of (coloured roles, coloured session) tuples filled by the next cells.
df_rm_results = []

In [None]:
from Levenshtein import distance as lev

# Collect example sessions for the most frequent use of 'rm' (position 0 after sorting).
selected_role = sessions_and_families_per_use.rm_roles.iloc[0]
# Message fixed: this cell studies 'rm', the text previously said "echo roles".
print(f"For rm roles: {selected_role}")
examples = df_rm[df_rm.rm_roles == selected_role]
# Families (distinct prediction sequences) of this use, most numerous first.
families_numerosities = examples.groupby("Models_predictions").full_session.count().reset_index(name = "occurrences").sort_values(by = "occurrences", ascending = False)
# The empty string acts as a first reference: a family closer than 5 edits to "" is never shown.
prev_families = [""]
it = 0 # cursor over families_numerosities
i = 0  # number of families accepted so far
while i < 3 and it != families_numerosities.shape[0]: # for the top 3 families
    chosen_family = families_numerosities.iloc[it].Models_predictions
    nc_prediction = df_rm[df_rm.Models_predictions == chosen_family].Predicted_classes.iloc[0]
    occurrences = families_numerosities.iloc[it].occurrences
    # Encode the family as the concatenation of its label ids, so that an edit distance can be computed.
    # NOTE(review): the distance is character-level; it equals a word-level distance only if
    # every label id has a single digit - confirm against labels.txt.
    embedded_family = "".join([str(label2id[word]) for word in chosen_family.split(" -- ")])
    flag = True
    for prev_family in prev_families: # Check the examples you showed before. Accept new examples if "dissimilar" enough
        lev_distance = lev(prev_family, embedded_family)
        if lev_distance < 5: #At least 5 words must be different
            flag = False
    if flag:
        examples_for_family = examples[examples.Models_predictions == chosen_family]["full_session"]
        # Show at most 2 sessions per family. Iterate over the values directly: the previous
        # `for it in range(...)` overwrote the cursor `it` of the enclosing while loop, so
        # families were skipped or visited again.
        for full_session in examples_for_family.iloc[:2]:
            colored_sessions = color_sessions(full_session, nc_prediction, chosen_family, "rm")
            colored_roles = color_roles(chosen_family)
            df_rm_results.append((colored_roles, colored_sessions))
        prev_families.append(embedded_family)
        i += 1
    it += 1

In [None]:
from Levenshtein import distance as lev

# Collect example sessions for the use of 'rm' at position 2 of the sorted table.
selected_role = sessions_and_families_per_use.rm_roles.iloc[2]
# Message fixed: this cell studies 'rm', the text previously said "echo roles".
print(f"For rm roles: {selected_role}")
examples = df_rm[df_rm.rm_roles == selected_role]
# Families (distinct prediction sequences) of this use, most numerous first.
families_numerosities = examples.groupby("Models_predictions").full_session.count().reset_index(name = "occurrences").sort_values(by = "occurrences", ascending = False)
# The empty string acts as a first reference: a family closer than 5 edits to "" is never shown.
prev_families = [""]
it = 0 # cursor over families_numerosities
i = 0  # number of families accepted so far
while i < 3 and it != families_numerosities.shape[0]: # for the top 3 families
    chosen_family = families_numerosities.iloc[it].Models_predictions
    nc_prediction = df_rm[df_rm.Models_predictions == chosen_family].Predicted_classes.iloc[0]
    occurrences = families_numerosities.iloc[it].occurrences
    # Encode the family as the concatenation of its label ids, so that an edit distance can be computed.
    # NOTE(review): the distance is character-level; it equals a word-level distance only if
    # every label id has a single digit - confirm against labels.txt.
    embedded_family = "".join([str(label2id[word]) for word in chosen_family.split(" -- ")])
    flag = True
    for prev_family in prev_families: # Check the examples you showed before. Accept new examples if "dissimilar" enough
        lev_distance = lev(prev_family, embedded_family)
        if lev_distance < 5: #At least 5 words must be different
            flag = False
    if flag:
        examples_for_family = examples[examples.Models_predictions == chosen_family]["full_session"]
        # Show at most 2 sessions per family. Iterate over the values directly: the previous
        # `for it in range(...)` overwrote the cursor `it` of the enclosing while loop, so
        # families were skipped or visited again.
        for full_session in examples_for_family.iloc[:2]:
            colored_sessions = color_sessions(full_session, nc_prediction, chosen_family, "rm")
            colored_roles = color_roles(chosen_family)
            df_rm_results.append((colored_roles, colored_sessions))
        prev_families.append(embedded_family)
        i += 1
    it += 1

#### Export

In [None]:
# Two-column table built from the collected (coloured roles, coloured session) tuples.
df_to_export = pd.DataFrame(df_rm_results, columns = ["Sequence of intents", "Session"])

In [None]:
# escape=False writes the cell contents verbatim, which the <span> markup in the cells relies on.
# NOTE(review): unlike the echo export, this file name has no `{dataset}_` prefix - confirm intended.
df_to_export.to_html(f"./Inference_results/1_Stats_per_prediction/rm_study.html", escape = False)

## Now, focus on 1 intent (e.g., "Execution")

In [None]:
# (word, prediction) tuples whose prediction is exactly "Execution".
execution_df = grouped_df[grouped_df["splitted_prediction"] == "Execution"]
print(f"Selected {execution_df.shape[0]} unique tuples (word, 'Execution')")

##### Visualize some examples:

In [None]:
# The 20 words most often labelled "Execution".
execution_df.sort_values(by = "occurrences_tuple", ascending = False).head(20)

#### Now some examples in which those words were used:

In [None]:
# Sessions that contain the literal text ".i".
# regex=False: with the default regex=True the "." matches any character, so the filter
# selected every session containing an "i" preceded by any character.
# NOTE(review): presumably the literal ".i" was intended - confirm.
example_execution_df = unique_corpus[unique_corpus["full_session"].str.contains(".i", regex = False)]
print(f"Selected {example_execution_df.shape[0]} sessions ({example_execution_df.shape[0] / unique_corpus.shape[0] * 100:.2f} % of total)")
# Show the first 4 matching sessions without truncating them.
with pd.option_context('display.max_colwidth', None):
    display(example_execution_df[["full_session"]].head(4))