In [None]:
import json
import random

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
# Convenience helper so that `some_dir.ls()` lists a directory's children.
# `iterdir()` yields entries in arbitrary (filesystem-dependent) order, so the
# listing is sorted to keep anything that iterates over it reproducible.
Path.ls = lambda x: sorted(x.iterdir())

In [None]:
## Combine all JSON files into one DataFrame
# Directory holding the parsed JSON exports, made absolute so later cells do
# not depend on the relative form of the path.
data_path = Path("../data/generated_raw_data/parsed/")
data_path = data_path.resolve()
# Show what is in there as a quick sanity check.
data_path.ls()

In [None]:
# Pool the records of every parsed JSON file into one flat list.
#  - Only regular files are read, so stray sub-directories (for example
#    `.ipynb_checkpoints`) do not crash the load.
#  - Files are visited in sorted order: the later `drop_duplicates(keep="last")`
#    depends on load order, and directory iteration order is arbitrary.
#  - The encoding is pinned to UTF-8 instead of the platform default.
all_sentences = []
for file in sorted(p for p in data_path.ls() if p.is_file()):
    with file.open("r", encoding="utf-8") as f:
        content = json.load(f)
        all_sentences.extend(content)

In [None]:
# Normalise keywords: trim surrounding whitespace, lowercase, and drop any
# keyword that ends up empty.
for sentence in all_sentences:
    sentence["keywords"] = [
        kw
        for kw in (raw.strip().lower() for raw in sentence["keywords"])
        if kw
    ]
# Peek at the first record to confirm the clean-up.
all_sentences[0]

In [None]:
# Harmonise the schema: records that keep their sentence under "text" get it
# moved to the "sentence" key instead.
for sentence in all_sentences:
    if "text" not in sentence:
        continue
    sentence["sentence"] = sentence.pop("text")

In [None]:
# Build the frame and keep a single row per distinct sentence (the last
# occurrence wins), then renumber the index.
df = (
    pd.DataFrame(all_sentences)
    .drop_duplicates(subset="sentence", keep="last")
    .reset_index(drop=True)
)

In [None]:
# Inspect the de-duplicated frame (one row per distinct `sentence`).
df

In [None]:
# Keywords that double as intent labels: a record whose keywords contain exactly
# one of these gets that word as its intent. Variants of the same intent
# (e.g. "cancel" / "cancellation") are deliberately listed separately.
intent_words_list = [
    "report stolen",
    "dues",
    "cancel", "cancellation",
    "block", "unblock",
    "activate", "activation",
    "transfer",
    "replacement", "replace",
]

In [None]:
# Split the records into those whose keywords name exactly one known intent
# (tagged automatically) and those that need manual intervention (no match, or
# several competing matches).
data_list = df.to_dict(orient="records")
intent_words = set(intent_words_list)  # built once instead of once per record
intent_data_list = []
unknown_intent = []
for entry in data_list:
    intersect = intent_words.intersection(entry["keywords"])
    if len(intersect) == 1:
        intent = str(intersect.pop())
        entry["intent"] = intent
        # `to_dict` does not copy cell values, so this list is the very object
        # stored in `df` (and in `all_sentences`). Removing the intent in place
        # would alter `df` and make a re-run of this cell classify the record
        # as unknown; work on a copy instead.
        remaining_keywords = list(entry["keywords"])
        remaining_keywords.remove(intent)
        entry["keywords"] = remaining_keywords
        intent_data_list.append(entry)
    else:
        unknown_intent.append(entry)

In [None]:
# Sanity check: (needs manual tagging, auto-tagged, total records).
tuple(len(bucket) for bucket in (unknown_intent, intent_data_list, data_list))

I manually added intents to the messages that did not match exactly one intent from the list above. Any garbage rows from that extraction were removed, so the row counts of the exported and the re-imported CSV files are expected to differ.

In [None]:
# export: write the records without a unique intent to CSV for manual tagging
(
    pd.DataFrame(unknown_intent)
    .to_csv("../data/unknown_intent_for_manual_tagging.csv", sep=",", index=False)
)

In [None]:
# import: read the manually tagged rows back in as a list of record dicts.
# NOTE(review): list-valued columns such as `keywords` come back from CSV as
# plain strings, not lists - confirm nothing downstream relies on the list form.
added_intent = (
    pd.read_csv("../data/TAGGED_unknown_intent_for_manual_tagging.csv")
    .to_dict(orient="records")
)

In [None]:
# Append the manually tagged records to the automatically tagged ones (in place).
intent_data_list += added_intent

In [None]:
# One row per distinct sentence; when a sentence occurs more than once, the
# last record in the list is the one that is kept.
intents_df = (
    pd.DataFrame(intent_data_list)
    .drop_duplicates(subset="sentence", keep="last")
    .reset_index(drop=True)
)

In [None]:
# Number of rows left after merging and de-duplicating.
intents_df.shape[0]

In [None]:
# Persist the merged, de-duplicated intents for an independent review pass.
intents_df.to_csv(path_or_buf="../data/mark_for_review.csv", index=False)

This is the list we mark for review by someone other than me. Some of its intents were tagged manually and some come from GPT-3. The labels are somewhat messy, with overlapping and inconsistent intents (e.g. `cancel` and `cancellation`), as we would also expect from human input in production.

In [None]:
# Disabled: optional JSON export of the de-duplicated sentence frame `df`.
# df.to_json("../data/generated_preprocessed.json", orient="records", indent=2)