In [18]:
import dask.dataframe as dd
import pandas as pd

import dask.bag as db
from tqdm import tqdm
from IPython.display import display

# disable chained assignments
pd.options.mode.chained_assignment = None 


Read UNSPC file

In [2]:
UNSPC_FILE_PATH = 'UNGM_UNSPSC.xlsx'

# read excel file
df_unspc = pd.read_excel(UNSPC_FILE_PATH, engine='openpyxl')

# remove all puctuations from title_lemma columns
df_unspc['title_lemma'] = df_unspc['title_lemma'].str.replace('[^\w\s]','')

df_unspc.head()

  df_unspc['title_lemma'] = df_unspc['title_lemma'].str.replace('[^\w\s]','')


Unnamed: 0,Key,Parent key,Code,Title,Top Key,Top Title,title_lemma
0,100,0,A,"Raw Materials, Chemicals, Paper, Fuel",100.0,"Raw Materials, Chemicals, Paper, Fuel",raw material chemical paper fuel
1,101,0,B,Industrial Equipment & Tools,101.0,Industrial Equipment & Tools,industrial equipment tool
2,102,0,C,Components & Supplies,102.0,Components & Supplies,component supply
3,103,0,D,"Construction, Transportation & Facility Equipm...",103.0,"Construction, Transportation & Facility Equipm...",construction transportation facility equipme...
4,104,0,E,"Medical, Laboratory & Test Equipment & Supplie...",104.0,"Medical, Laboratory & Test Equipment & Supplie...",medical laboratory test equipment supply p...


Read n-grams file

In [3]:
# load text file in dask bag
bag = db.read_text('word_frequencies.txt', blocksize=25e6, encoding='utf-8')

# split each row of dask bag item on comma using map_partitions
bag = bag.map_partitions(lambda x: [item.split(',') for item in x])

# convert to dataframe
df_ngrams = bag.to_dataframe(columns=['word', 'count'])
# convert count to int
df_ngrams['count'] = df_ngrams['count'].astype(int)

# filter out words with count more than 100
df_ngrams = df_ngrams[df_ngrams['count'] > 100]

df_ngrams.head()

Unnamed: 0,word,count
0,opn,200863
1,supply,171382
2,material,151960
3,date,146756
4,supply date,144924


In [25]:
def jaccard_similarity(s1, s2):
    # convert strings to sets of characters
    set1 = set(s1)
    set2 = set(s2)
    # find the union and intersection of the sets
    union = set1.union(set2)
    intersection = set1.intersection(set2)
    # calculate and return the Jaccard distance
    return (len(intersection) / len(union)) * len(s1)

result = []
total = len(df_ngrams)
# loop through n gram words and find the longest match
for index, row in tqdm(df_ngrams.iterrows(), total=total):

    # search_items = "|".join([f"\b{item}\b" for item in row['word'].split(' ')])
    # Create a regular expression pattern
    # pattern = '\\b(' + '|'.join(row['word'].split(' ')) + ')\\b'

    search_words = row['word'].split(' ')
    # Create a regular expression pattern
    pattern = '\\b(' + '|'.join(search_words) + ')\\b'

    # Use the regular expression pattern to filter the dataframe
    df_filtered = df_unspc[df_unspc['title_lemma'].str.extract(pattern, expand=False).notnull()]

    if len(df_filtered) == 0:
        continue

    # calculate score for each row
    df_filtered.loc[:, 'score'] = df_filtered['title_lemma'].apply(lambda x: jaccard_similarity(x, row['word']))
    df_filtered = df_filtered.sort_values(by='score', ascending=False)
    # select first row of dataframe
    df_filtered = df_filtered.iloc[0]

    title_lemma = df_filtered['title_lemma']
    title = df_filtered['Title']
    top_title = df_filtered['Top Title']
    score = round(df_filtered['score'], 2)
    if score > 30:
        result.append((row['word'], title_lemma, title, top_title, score))
print(result[:5])

 56%|█████▌    | 13561/24273 [03:17<02:36, 68.25it/s]

In [None]:
# create a dataframe from the result
df_result = pd.DataFrame(result, columns=['ngram', 'title_lemma', 'title', 'top_title', 'score'])
# sort the dataframe by score
df_result = df_result.sort_values(by='score', ascending=False)
df_result.to_excel('unspc_ngrams.xlsx', index=False)