In [None]:
import pandas as pd

# Load the usage instructions and discard rows that have no instruction text.
usage_instruction_df = pd.read_csv(
    'resources/product_usage_instruction.csv'
).dropna(subset=['ProductUsageInstruction'])
# Bare expression: displays the frame when run as a notebook cell.
usage_instruction_df


In [None]:
import spacy
import re
from spacy.tokens import Token
from spacy import displacy
# The model loaded below must be downloaded once from a terminal before running this cell:
#   python -m spacy download en_core_web_md
# See https://spacy.io/usage/spacy-101 for details on spaCy models.
nlp = spacy.load('en_core_web_md')
# Part-of-speech tags rejected by filter_pos.
_IGNORED_POS = ("AUX", "DET", "PRON", "SCONJ", "PUNCT", "SYM", "X", "SPACE")


def map_to_lemmatized(token):
    """Return the lower-cased lemma of *token*."""
    return token.lemma_.lower()


def filter_pos(token):
    """Return True unless the token's POS tag is one of the ignored tags."""
    return token.pos_ not in _IGNORED_POS


def filter_noun(token):
    """Return False for "product"/"warning" lemmas and "Warning(s)" texts."""
    return (
        token.lemma_ not in ("product", "warning")
        and token.text not in ("Warning", "Warnings")
    )


def filter_stop_words(token):
    """Return True when *token* is not flagged as a stop word."""
    return not token.is_stop

def map_list_tokens_to_sublists_tokens(list: list[Token]) -> list[list[Token]]:
    """Split a token sequence into sublists at conjunction-like boundaries.

    A token is a boundary when it is tagged ``CCONJ``, or tagged ``PART`` with
    a text other than "to", AND the token right after it is tagged ``VERB`` or
    has the dependency label ``xcomp`` or ``acomp``. The boundary token itself
    is dropped, except for "not", which becomes the first token of the next
    sublist so the negation stays attached to what it negates.

    Empty groups are never emitted: a boundary at the very start of the input
    (e.g. "not use ...") or directly after another boundary does not produce
    an empty sublist.

    Args:
        list: Tokens to split. Only ``pos_``, ``dep_`` and ``text`` are read.
            The name shadows the builtin and is kept for backward
            compatibility with existing callers.

    Returns:
        The non-empty sublists in input order; ``[]`` for empty input.
    """
    tokens = list  # local alias so the body does not read like the builtin
    last_index = len(tokens) - 1
    sublists = []
    current = []
    for idx, token in enumerate(tokens):
        if idx == last_index:
            # Nothing follows the final token, so it can never be a boundary.
            current.append(token)
            break
        following = tokens[idx + 1]
        is_connector = token.pos_ == "CCONJ" or (
            token.pos_ == "PART" and token.text != "to"
        )
        starts_new_group = following.pos_ == "VERB" or following.dep_ in ("xcomp", "acomp")
        if is_connector and starts_new_group:
            if current:
                sublists.append(current)
            # Keep the negation with the group it applies to.
            current = [token] if token.text == "not" else []
        else:
            current.append(token)
    if current:
        sublists.append(current)
    return sublists

def map_to_one_string(list: list[str]) -> str:
    """Concatenate the given strings, separated by single spaces."""
    return ' '.join(list)

def filter_simple_structure_sublists(sublists: list[list[Token]], max_verbs: int = 1) -> list[list[Token]]:
    """Keep only the sublists whose grammar matches a known simple pattern.

    A sublist is kept when the space-joined dependency labels of its tokens
    match one of the dependency patterns, or the space-joined POS tags match
    one of the POS patterns, AND it holds at most *max_verbs* tokens tagged
    ``VERB``. Every rejected sublist is reported on stdout together with its
    verb count and both tag strings.

    Args:
        sublists: Token groups to inspect. Only ``dep_`` and ``pos_`` of each
            token are read.
        max_verbs: Largest number of ``VERB`` tokens a kept sublist may have.

    Returns:
        The accepted sublists, in their original order.
    """
    # e.g. "not <main verb>", "direct use", "better hot"
    dep_patterns = [
        re.compile(raw) for raw in ("^neg ROOT.*", "^amod ROOT$", "^advmod ROOT$")
    ]
    # e.g. "directly to use", "served best chilled", "not use ..."
    pos_patterns = [
        re.compile(raw)
        for raw in ("^VERB.*", "^ADV VERB.*", "^ADV PART VERB.*", "^PART VERB.*")
    ]

    filtered_sublists = []
    for sublist in sublists:
        pos_tags = [token.pos_ for token in sublist]
        dep_str = ' '.join(token.dep_ for token in sublist).strip()  # "ROOT advmod"
        pos_str = ' '.join(pos_tags).strip()  # "VERB ADV"
        # Count the tags themselves. Counting a literal pattern string in the
        # result of re.findall always yields 0 and would disable the limit.
        verb_count = pos_tags.count("VERB")

        matches_pattern = any(regex.match(dep_str) for regex in dep_patterns) or any(
            regex.match(pos_str) for regex in pos_patterns
        )
        if matches_pattern and verb_count <= max_verbs:
            filtered_sublists.append(sublist)
        else:
            print("COUNT VERB: ", verb_count, '\n')
            print("Not match: ", dep_str, '\n')
            print("Not match: ", pos_str, '\n')
    return filtered_sublists


# References for the part-of-speech tags used by the token filters:
# https://spacy.io/api/annotation#pos-tagging
# https://stackoverflow.com/a/40288324

# The stemmer and its wrapper do not depend on the row being processed, so
# they are created once here instead of on every loop iteration.
import nltk
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()


def map_token_to_stemmed(token_text: str) -> str:
    """Return the Porter stem of *token_text*."""
    return porter_stemmer.stem(token_text)


for _, usage_instruction_df_row in usage_instruction_df.iterrows():
    usage_instruction = usage_instruction_df_row['ProductUsageInstruction']
    # Remove punctuation, then a leading number (presumably list numbering).
    # The whole number is removed, so "10 Shake" becomes "Shake" rather than
    # "0 Shake".
    usage_instruction = re.sub(r'[^\w\s]', '', usage_instruction)
    usage_instruction = re.sub(r'^\s*\d+', '', usage_instruction)
    usage_instruction = usage_instruction.strip()
    print(f"Original usage instruction: {usage_instruction}", '\n')

    # First parse: tag the cleaned text, then drop the tokens rejected by
    # filter_noun and filter_pos.
    doc = nlp(usage_instruction)
    tokens_with_pos = list(doc)
    token_filtered_noun = list(filter(filter_noun, tokens_with_pos))
    tokens_without_pos = list(filter(filter_pos, token_filtered_noun))
    print("Token without pos:", tokens_without_pos, '\n')

    # Deliberately NOT lemmatized at this point: the surface forms are needed
    # to retain the grammatical structure for the second parse.
    tokens_lemm = list(tokens_without_pos)
    tokens = [token.text.lower() for token in tokens_lemm]
    print("Final token(not lemm): ",tokens, '\n')

    # Second parse: re-tag the reduced text and split it into sublists.
    doc = nlp(' '.join(tokens))
    token_list = list(doc)
    sublists = map_list_tokens_to_sublists_tokens(token_list)
    print("Sublists: ", sublists, '\n')

    # Stop words are deliberately kept: removing them loses the sentence
    # structure, and words such as "not" and "to" carry meaning.
    sublists = filter_simple_structure_sublists(sublists)
    print("Sublists after removing simple structure: ", sublists, '\n')

    # Lemmatize, then stem, each surviving sublist. The results are only
    # printed for inspection; they are not stored.
    for sublist in sublists:
        print("Sublist before lemmed: ", sublist, '\n')
        sublist = list(map(map_to_lemmatized, sublist))
        print("Sublist before stemmed and after lemmed: ", sublist, '\n')
        sublist = list(map(map_token_to_stemmed, sublist))
        print("Sublist after stemmed: ",sublist, '\n')

    print("=========================================================================================================================")
    # For visual inspection of a parse: displacy.render(doc, style="dep")



