In [1]:
import pandas as pd

# Load the raw usage instructions and keep only the rows that actually have
# an instruction text; the last line displays the resulting frame.
usage_instruction_df = pd.read_csv('resources/product_usage_instruction.csv').dropna(
    subset=['ProductUsageInstruction']
)
usage_instruction_df


Unnamed: 0.1,Unnamed: 0,ProductUsageInstruction
0,0,Drink directly or after chilling
1,1,Can be used instantly.
2,2,1. Chill
3,3,Best served chilled
5,5,Best Served Chilled
6,6,Use directly
7,7,"Use direct, do not boil"
8,8,- Shake well before use
9,9,- Wet the body.
10,10,The product should be eaten immediately after ...


In [4]:
import spacy
import re
from spacy.tokens import Token
from spacy import displacy
# The model loaded below must be downloaded once from a terminal before running this cell:
#     python -m spacy download en_core_web_md
# (the model name in the download command must match the one passed to spacy.load)
# See https://spacy.io/usage/spacy-101
nlp = spacy.load('en_core_web_md')
# Coarse POS tags that are dropped before the instruction is re-parsed.
_EXCLUDED_POS_TAGS = frozenset({"AUX", "DET", "PRON", "SCONJ", "PUNCT", "SYM", "X", "SPACE"})
# Lemmas / surface forms that are dropped regardless of their POS tag.
_EXCLUDED_LEMMAS = frozenset({"product", "warning"})
_EXCLUDED_TEXTS = frozenset({"Warning", "Warnings"})


def filter_pos(token):
    """Return True when the token's `pos_` is not one of the excluded tags."""
    return token.pos_ not in _EXCLUDED_POS_TAGS


def filter_some_nouns(token):
    """Return True unless the token is "product"/"warning" (by lemma) or "Warning"/"Warnings" (by text)."""
    return not (token.lemma_ in _EXCLUDED_LEMMAS or token.text in _EXCLUDED_TEXTS)


def filter_stop_words(token):
    """Return True when the token is not flagged as a stop word (`is_stop`)."""
    return not token.is_stop

def map_list_tokens_to_sublists_tokens(list: "list[Token]") -> "list[list[Token]]":
    """Split a token sequence into clause-like sublists.

    A split happens at a coordinating conjunction (`CCONJ`) or a particle
    (`PART`, except "to") when the *next* token is a verb or has the
    dependency label `xcomp`/`acomp`. The splitting token itself is dropped,
    except for "not", which becomes the first token of the new sublist so the
    negation stays attached to the clause it negates.

    Args:
        list: tokens exposing `text`, `pos_` and `dep_`. The parameter name
            shadows the builtin inside this function; it is kept so existing
            callers are unaffected.

    Returns:
        The non-empty sublists in order. An empty input yields `[]`.
    """
    tokens = list
    sublists = []
    current = []
    last_idx = len(tokens) - 1
    for idx, token in enumerate(tokens):
        if idx == last_idx:
            # The final token can never split (there is no following token).
            current.append(token)
            break
        next_token = tokens[idx + 1]
        is_splitter = token.pos_ == "CCONJ" or (token.pos_ == "PART" and token.text != "to")
        next_opens_clause = next_token.pos_ == "VERB" or next_token.dep_ in ("xcomp", "acomp")
        if is_splitter and next_opens_clause:
            # Only flush real clauses: a splitter at the very start (e.g. a
            # leading "and") would otherwise produce an empty sublist.
            if current:
                sublists.append(current)
            current = [token] if token.text == "not" else []
        else:
            current.append(token)
    if current:
        sublists.append(current)
    return sublists

def map_to_one_string(list: list[str]) -> str:
    """Concatenate the given strings into one string, separated by single spaces."""
    separator = ' '
    return separator.join(list)

# Dependency-label sequences to keep, e.g. "not <main verb>", "direct use", "better hot".
_DEP_PATTERNS_TO_KEEP = tuple(
    re.compile(pattern)
    for pattern in ("^neg ROOT.*", "^amod ROOT$", "^advmod ROOT$", "^ROOT")
)
# POS-tag sequences to keep, e.g. "directly to use", "served best chilled", "not use ...".
_POS_PATTERNS_TO_KEEP = tuple(
    re.compile(pattern)
    for pattern in ("^VERB.*", "^ADV VERB.*", "^ADV PART VERB.*", "^PART VERB.*")
)


def filter_simple_structure_sublists(sublists: "list[list[Token]]") -> "list[list[Token]]":
    """Keep only the sublists whose grammatical shape matches a known simple pattern.

    Each sublist is rendered as a space-separated string of dependency labels
    (e.g. "ROOT advmod") and of POS tags (e.g. "VERB ADV"). A sublist is kept
    when either string matches one of the patterns above; every rejected
    sublist is reported on stdout together with its verb count.

    Args:
        sublists: groups of tokens exposing `dep_` and `pos_`.

    Returns:
        The kept sublists, in their original order (empty sublists never match).
    """
    filtered_sublists = []
    for sublist in sublists:
        dep_str = ' '.join(token.dep_ for token in sublist).strip()  # e.g. "ROOT advmod"
        pos_str = ' '.join(token.pos_ for token in sublist).strip()  # e.g. "VERB ADV"

        # TODO: excluding sublists with several verbs is still not implemented.
        # The previous guard counted the regex *pattern text* inside the list of
        # matches, which is always 0, so it never rejected anything; it has been
        # removed instead of silently changing which phrases are kept.
        if any(regex.match(dep_str) for regex in _DEP_PATTERNS_TO_KEEP) \
                or any(regex.match(pos_str) for regex in _POS_PATTERNS_TO_KEEP):
            filtered_sublists.append(sublist)
        else:
            # Report the real number of VERB tags (the old expression always printed 0).
            print("COUNT VERB: ", pos_str.split().count("VERB"), '\n')
            print("Not match: ", dep_str, '\n')
            print("Not match: ", pos_str, '\n')
    return filtered_sublists


# check this link to understand pos tags: 
# https://spacy.io/api/annotation#pos-tagging
# https://stackoverflow.com/a/40288324
# filter_auxiliary_verb = lambda token: token.pos_ != "AUX" and token.pos_ != "PART" and token.pos_ != "DET" and token.pos_ != "PRON" 
# and token.pos_ != "CCONJ" and token.pos_ != "SCONJ" and token.pos_ != "PUNCT" and token.pos_ != "SYM" and token.pos_ != "X" and token.pos_ != "SPACE"
# For every usage instruction: clean the text, drop uninformative tokens,
# re-parse the remaining words, split them into clause-like sublists and keep
# only the sublists with a simple grammatical structure.
# `result` ends up with one list of sublists per instruction row.
result = []
for raw_instruction in usage_instruction_df['ProductUsageInstruction']:
    # Remove punctuation first, then a leading enumeration number ("1.", "10.", ...).
    # `\d+` strips the whole number; `\d?` only removed its first digit, so
    # "10 Shake" used to become "0 Shake".
    usage_instruction = re.sub(r'[^\w\s]', '', raw_instruction)
    usage_instruction = re.sub(r'^\s*\d+', '', usage_instruction).strip()
    print(f"Original usage instruction: {usage_instruction}", '\n')

    # First parse: drop the excluded nouns and the excluded POS tags.
    tokens_filtered_by_pos = [
        token for token in nlp(usage_instruction)
        if filter_some_nouns(token) and filter_pos(token)
    ]
    print("Token without pos:", tokens_filtered_by_pos, '\n')

    # Keep the surface forms (no lemmatization here) so the grammatical
    # structure is still visible to the second parse.
    tokens_lowered = [token.text.lower() for token in tokens_filtered_by_pos]
    print("Final token(not lemm): ", tokens_lowered, '\n')

    # Second parse on the reduced, lower-cased text; its tags drive the splitting.
    # Stop words are intentionally not removed: doing so loses the sentence structure.
    token_list = [token for token in nlp(' '.join(tokens_lowered))]
    sublists = map_list_tokens_to_sublists_tokens(token_list)
    print("Sublists: ", sublists, '\n')

    sublists = filter_simple_structure_sublists(sublists)
    print("Sublists after removing simple structure: ", sublists, '\n')

    # One separator per instruction (previously one per kept sublist, and none
    # at all when every sublist had been filtered out).
    print("=========================================================================================================================")
    result.append(sublists)

Original usage instruction: Drink directly or after chilling 

Token without pos: [Drink, directly, or, after, chilling] 

Final token(not lemm):  ['drink', 'directly', 'or', 'after', 'chilling'] 

Sublists:  [[drink, directly, or, after, chilling]] 

Sublists after removing simple structure:  [[drink, directly, or, after, chilling]] 

Original usage instruction: Can be used instantly 

Token without pos: [used, instantly] 

Final token(not lemm):  ['used', 'instantly'] 

Sublists:  [[used, instantly]] 

Sublists after removing simple structure:  [[used, instantly]] 

Original usage instruction: Chill 

Token without pos: [Chill] 

Final token(not lemm):  ['chill'] 

Sublists:  [[chill]] 

Sublists after removing simple structure:  [[chill]] 

Original usage instruction: Best served chilled 

Token without pos: [Best, served, chilled] 

Final token(not lemm):  ['best', 'served', 'chilled'] 

Sublists:  [[best, served, chilled]] 

Sublists after removing simple structure:  [[best, serve

In [5]:
# Lemmatize and stem the final result so it is ready to be transformed into usage instruction type names
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download('wordnet') only once
def get_wordnet_pos_from_spacy_pos(spacy_pos: str) -> "str | None":
    """Translate a spaCy coarse POS tag into the matching WordNet POS constant.

    Args:
        spacy_pos: a spaCy `pos_` value such as 'ADJ', 'VERB', 'NOUN' or 'ADV'.

    Returns:
        The corresponding `wordnet` constant, or None for any other tag so
        callers can test the result with a simple `is None` check.
    """
    spacy_to_wordnet = {
        'ADJ': wordnet.ADJ,  # was wordnet.ADV, which treated adjectives as adverbs
        'VERB': wordnet.VERB,
        'NOUN': wordnet.NOUN,
        'ADV': wordnet.ADV,
    }
    return spacy_to_wordnet.get(spacy_pos)
    
# use nltk lemmatizer instead of spacy lemmatizer since nltk lemmatizer can specify from the pos tag
def map_to_lemmatized_nltk(token: Token) -> str:
    """Lemmatize a token's text with NLTK's WordNet lemmatizer.

    The token's spaCy POS tag is translated to a WordNet POS and passed to the
    lemmatizer; when there is no WordNet equivalent the text is returned as is.
    The resolved WordNet POS is printed for inspection.
    """
    wordnet_pos = get_wordnet_pos_from_spacy_pos(token.pos_)
    print("Wordnet pos: ", wordnet_pos, '\n')
    if wordnet_pos is None:
        return token.text
    return WordNetLemmatizer().lemmatize(token.text, wordnet_pos)

def map_token_to_stemmed_nltk(token_text: str) -> str:
    """Return the Porter stem of the given word, using a fresh `PorterStemmer`."""
    return PorterStemmer().stem(token_text)

def map_to_lemmatized_spacy(token):
    """Return the token's spaCy lemma in lower case."""
    return token.lemma_.lower()


# Stop words are deliberately NOT removed here: words such as "not" and "to"
# carry meaning for the instruction.

# For every instruction, lemmatize each token and then stem the lemma.
# `result2` mirrors the nesting of `result`, holding strings instead of tokens.
result2 = []
for instruction_sublists in result:
    normalized_sublists = [
        [map_token_to_stemmed_nltk(map_to_lemmatized_spacy(token)) for token in sublist]
        for sublist in instruction_sublists
    ]
    print("Sublists after lemmatized and stemmed: ", normalized_sublists, '\n')
    print("=========================================================================================================================")
    result2.append(normalized_sublists)

# displacy.render(doc, style="dep")
# displacy.render(doc, style="ent")
# print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
# print(token.vector_norm)

Sublists after lemmatized and stemmed:  [['drink', 'directli', 'or', 'after', 'chill']] 

Sublists after lemmatized and stemmed:  [['use', 'instantli']] 

Sublists after lemmatized and stemmed:  [['chill']] 

Sublists after lemmatized and stemmed:  [['well', 'serv', 'chill']] 

Sublists after lemmatized and stemmed:  [['well', 'serv', 'chill']] 

Sublists after lemmatized and stemmed:  [['use', 'directli']] 

Sublists after lemmatized and stemmed:  [['use', 'direct'], ['not', 'boil']] 

Sublists after lemmatized and stemmed:  [['shake', 'well', 'befor', 'use']] 

Sublists after lemmatized and stemmed:  [['wet', 'bodi']] 

Sublists after lemmatized and stemmed:  [['eat', 'immedi', 'after', 'open', 'packag']] 

Sublists after lemmatized and stemmed:  [['use', 'dairectli'], ['not', 'use', 'after', 'expiri', 'date']] 

Sublists after lemmatized and stemmed:  [['use', 'as', 'spice', 'for', 'favorit', 'dish']] 

Sublists after lemmatized and stemmed:  [['use', 'directli', 'and', 'suitabl', '

In [6]:
# Count how often each normalized instruction phrase occurs across all products.
phrase_counts = {}
for sublists in result2:
    for sublist in sublists:
        sublist_str = map_to_one_string(sublist)
        phrase_counts[sublist_str] = phrase_counts.get(sublist_str, 0) + 1

# Most frequent phrases first; sorted() is stable, so ties keep first-seen order.
# NOTE: the name `dict` shadows the builtin, but the following cell reads it,
# so it is kept. Do not call `dict(...)` as a constructor after this cell has run.
dict = {k: v for k, v in sorted(phrase_counts.items(), key=lambda item: item[1], reverse=True)}

# Reduce the dict by removing the LATTER of two similar keys (keys are sorted by
# frequency, so the latter one is the rarer one), based on vector similarity.
SIMILARITY_THRESHOLD = 0.89
keys_list = list(dict.keys())
# Parse every key exactly once with the already-loaded `nlp` pipeline instead of
# re-parsing both keys for each of the O(n^2) pairs (and reloading the model).
key_docs = [nlp(key) for key in keys_list]

keys_to_remove_set = set()
for i, doc_one in enumerate(key_docs):
    if not doc_one.vector_norm:
        # No token of this key has a vector (common for stemmed forms). spaCy
        # reports a similarity of 0.0 for such docs and emits a warning, so the
        # pair can never exceed the threshold - skip it quietly.
        continue
    for j in range(i + 1, len(key_docs)):
        doc_two = key_docs[j]
        if not doc_two.vector_norm:
            continue
        if doc_two.similarity(doc_one) > SIMILARITY_THRESHOLD:
            keys_to_remove_set.add(keys_list[j])

print("Keys to remove because similarity: ", keys_to_remove_set, '\n')
dict = {key: count for key, count in dict.items() if key not in keys_to_remove_set}
print("Dict after removing similar key: ", dict, '\n')

# Remove keys with count == 1: they appear for a single product only and are
# too specific to serve as a usage instruction type.
keys_to_remove_set = {key for key, count_value in dict.items() if count_value == 1}
print("Keys to remove because of count == 1: ", keys_to_remove_set, '\n')
dict = {key: count for key, count in dict.items() if key not in keys_to_remove_set}

print("Final dict: ", dict, '\n')

  similarity = doc_two.similarity(doc_one)


Keys to remove because similarity:  {'use direct', 'use instantli', 'use dairectli', 'direct use', 'not use after expir date', 'use directli without heat treatment contain nut milk soya seasam seed and hazelnut'} 

Dict after removing similar key:  {'not use after expiri date': 4, 'use directli': 3, 'well serv chill': 2, 'drink directli or after chill': 1, 'chill': 1, 'not boil': 1, 'shake well befor use': 1, 'wet bodi': 1, 'eat immedi after open packag': 1, 'use as spice for favorit dish': 1, 'use directli and suitabl for cook': 1, 'make sauc for mani dish such as pizza pasta french fri': 1, 'use food season to creat delici tast': 1, 'wash befor process': 1, 'great for dip tortilla chip into cool flavour complement crunchi textur and season flavour of crisp perfectli': 1, 'mix with freshli boil pasta': 1, 'drink directli more delici drink cold': 1, 'consum immedi after open': 1, 'well use within 3 day right after open box store in refriger after open box': 1, 'gener eat': 1, 'serv wit

In [7]:
import pandas as pd

# Ids 0 and 1 are reserved for the two catch-all types; every remaining key of
# `dict` (in its current order) receives the next free id, starting at 2.
usage_instruction_type_dict = {
    0: "no usage instruction",
    1: "too special usage instruction with steps",
}
usage_instruction_type_dict.update(enumerate(dict.keys(), start=2))

print("Usage instruction type dict: ", usage_instruction_type_dict, '\n')

# Persist the id -> name table, one row per usage instruction type.
usage_instruction_type_df = pd.DataFrame.from_dict(
    usage_instruction_type_dict,
    orient='index',
    columns=['UsageInstructionTypeName'],
)
usage_instruction_type_df.to_csv('resources/reduced_usage_instruction_type.csv', index_label='UsageInstructionTypeId')


Usage instruction type dict:  {0: 'no usage instruction', 1: 'too special usage instruction with steps', 2: 'not use after expiri date', 3: 'use directli', 4: 'well serv chill'} 

