## Fuzzy/Negative keywords generation for Aerosol

### Import Packages

In [1]:
import modin.pandas as pd
import numpy as np
import datetime
import json
import os
from google.cloud import bigquery
import gcsfs
import mapply
import gc
from tqdm import tqdm
from rank_bm25 import BM25Okapi
import itertools
import pickle
client = bigquery.Client()

from gensim import corpora
from smart_open import smart_open
import dask.dataframe as dd

from gensim.models import TfidfModel, OkapiBM25Model

from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile

from gensim.models import fasttext
from gensim.models.fasttext import FastText

import dask.dataframe as dd
from thefuzz import fuzz

mapply.init(
    n_workers=-1,
    chunk_size=10000,
    max_chunks_per_worker=0,
    progressbar=True
)



#### Initial Data Selection: 

In [2]:
#Query to featch trusted source data
query = """
SELECT DISTINCT(keywords) FROM `wmt-dca-catalog-dq-dev.SC_Final_Tables.PROD_FULL_TEXT`
, UNNEST(WORDS_CLEAN) as keywords
"""
all_words = client.query(query).to_dataframe()
all_words.shape

(6063694, 1)

In [3]:
all_words

Unnamed: 0,keywords
0,low
1,surprise
2,lifestyle
3,psychological
4,downside
...,...
6063689,essentialspijamas
6063690,partey
6063691,utlisation
6063692,sloset


#### Fuzzy Keyword generation using fuzz function from the 'thefuzz' library

In [4]:
all_words['confidence']= all_words['keywords'].mapply(lambda x: fuzz.ratio('aerosol',x))



  0%|          | 0/606 [00:00<?, ?it/s]

In [5]:
aerosol_keywords = all_words[(all_words['confidence']>85)].sort_values('confidence',ascending=False).reset_index(drop=True)
aerosol_keywords.head()

Unnamed: 0,keywords,confidence
0,aerosol,100
1,aerosoil,93
2,xaerosol,93
3,ae-rosol,93
4,aerosols,93


In [6]:
all_words['confidence']= all_words['keywords'].mapply(lambda x: fuzz.ratio('spray',x))



  0%|          | 0/606 [00:00<?, ?it/s]

In [7]:
spray_keywords = all_words[(all_words['confidence']>85)].sort_values('confidence',ascending=False).reset_index(drop=True)
spray_keywords.head()

Unnamed: 0,keywords,confidence
0,spray,100
1,sprayi,91
2,sprayr,91
3,sprayl,91
4,sprary,91


#### Fuzzy Keyword generation using fasttext model

#### Please refere "**MODEL_GENERATION.ipynb**" notebook for steps taken to create fasttext20_final model

In [8]:
#copying all model files
!gsutil cp gs://phase2_scoreclean/pod1/word_counts/models/_fasttext20_final* models/

Copying gs://phase2_scoreclean/pod1/word_counts/models/_fasttext20_final.model...
Copying gs://phase2_scoreclean/pod1/word_counts/models/_fasttext20_final.model.syn1neg.npy...
Copying gs://phase2_scoreclean/pod1/word_counts/models/_fasttext20_final.model.wv.vectors_ngrams.npy...
Copying gs://phase2_scoreclean/pod1/word_counts/models/_fasttext20_final.model.wv.vectors_vocab.npy...
/ [4 files][  1.1 GiB/  1.1 GiB]   78.0 MiB/s                                   
Operation completed over 4 objects/1.1 GiB.                                      


In [9]:
ft_model = FastText.load('models/_fasttext20_final.model')

In [10]:
wv = ft_model.wv

In [11]:
data = wv.most_similar("aerosol",topn=100)
data

[('eliminators-aerosol', 0.8189114332199097),
 ('spraymax', 0.8096947073936462),
 ('aerosol-free', 0.8065834045410156),
 ('spraypaint', 0.8015031814575195),
 ('spray', 0.7996445894241333),
 ('nonaerosol', 0.7992598414421082),
 ('sprayway', 0.7987311482429504),
 ('sprayon', 0.7920376658439636),
 ('sprayable', 0.7849791646003723),
 ('spraygun', 0.7821300029754639),
 ('aerosolcan', 0.7762606739997864),
 ('sprayco', 0.7736736536026001),
 ('non-aerosol', 0.7724792957305908),
 ('spraytm', 0.769294798374176),
 ('degreaser', 0.7564659118652344),
 ('meguiars', 0.7555786967277527),
 ('paint', 0.7553110718727112),
 ('spraybrand', 0.7547832727432251),
 ('vht', 0.7538639307022095),
 ('touch-up', 0.7521988749504089),
 ('rosol', 0.7521270513534546),
 ('sprays', 0.7517364025115967),
 ('krylon', 0.7502234578132629),
 ('degreasers', 0.7464777827262878),
 ('diluter', 0.7449178099632263),
 ('aerosol-cloud', 0.741051197052002),
 ('aerosolized', 0.7404928803443909),
 ('spray-on', 0.7394258379936218),
 ('cle

In [12]:
fuzzy_fasttext_keywords = pd.DataFrame(data, columns=['keywords','confidence'])
fuzzy_fasttext_keywords.head()


    import ray
    ray.init()

2023-12-14 09:30:10,749	INFO worker.py:1642 -- Started a local Ray instance.


Unnamed: 0,keywords,confidence
0,eliminators-aerosol,0.818911
1,spraymax,0.809695
2,aerosol-free,0.806583
3,spraypaint,0.801503
4,spray,0.799645


In [13]:
#combing keywords from fuzz function and fasttext model
fuzzy_keywords = pd.concat([aerosol_keywords, spray_keywords,fuzzy_fasttext_keywords], ignore_index=True)
fuzzy_keywords = fuzzy_keywords.drop_duplicates()
fuzzy_keywords



Unnamed: 0,keywords,confidence
0,aerosol,100.000000
1,aerosoil,93.000000
2,xaerosol,93.000000
3,ae-rosol,93.000000
4,aerosols,93.000000
...,...,...
161,roll-on,0.698722
162,solvent,0.698610
163,pump-spray,0.698048
164,tintable,0.697616


#### Negative Keyword generation using fuzz function from the 'thefuzz' library

In [14]:
all_words['confidence']= all_words['keywords'].mapply(lambda x: fuzz.ratio('non-aerosol',x))



  0%|          | 0/606 [00:00<?, ?it/s]

In [15]:
non_aerosol_keywords = all_words[(all_words['confidence']>85)].sort_values('confidence',ascending=False).reset_index(drop=True)
non_aerosol_keywords

Unnamed: 0,keywords,confidence
0,non-aerosol,100
1,non-aerosols,96
2,nonaerosol,95
3,non-aersol,95
4,no-aerosol,95
5,non-aerosal,91
6,ion-aerosol,91
7,oz-aerosol,86
8,nonareosol,86
9,non-aresol,86


In [16]:
all_words['confidence']= all_words['keywords'].mapply(lambda x: fuzz.ratio('not-aerosol',x))



  0%|          | 0/606 [00:00<?, ?it/s]

In [17]:
not_aerosol_keywords = all_words[(all_words['confidence']>85)].sort_values('confidence',ascending=False).reset_index(drop=True)
not_aerosol_keywords

Unnamed: 0,keywords,confidence
0,no-aerosol,95
1,non-aerosol,91
2,non-aerosols,87
3,nonaerosol,86
4,non-aersol,86
5,oz-aerosol,86


#### Negative Keyword generation using fasttext model

In [18]:
negative_data = wv.most_similar("aerosol-free",topn=100)
negative_data

[('phenol-free', 0.8531052470207214),
 ('eliminators-aerosol', 0.8354067206382751),
 ('non-aerosol', 0.8326234221458435),
 ('perfume-free', 0.8310071229934692),
 ('aerosol-cloud', 0.8238242864608765),
 ('resorcinol-free', 0.8208845257759094),
 ('petrolatum-free', 0.8206416964530945),
 ('neutralizer', 0.8097235560417175),
 ('paraben-free', 0.8074968457221985),
 ('aerosol', 0.8065834641456604),
 ('fragrance-free', 0.8049142956733704),
 ('ethanol-free', 0.8030544519424438),
 ('silicone-free', 0.8013126254081726),
 ('rinse-free', 0.798409104347229),
 ('non-oily', 0.7956057190895081),
 ('oil-free', 0.7915005087852478),
 ('surfactant-free', 0.7898951172828674),
 ('petroleum-free', 0.7886817455291748),
 ('fume-free', 0.7880845069885254),
 ('aluminum-free', 0.7846335768699646),
 ('acetone-free', 0.783689558506012),
 ('odor-neutralizer', 0.7836825847625732),
 ('ozone-free', 0.7834336161613464),
 ('phosphate-free', 0.7833195924758911),
 ('lanolin-free', 0.7827317714691162),
 ('peroxide-free', 0.

In [19]:
negative_fasttext_keywords = pd.DataFrame(data, columns=['keywords','confidence'])
negative_fasttext_keywords.head()



Unnamed: 0,keywords,confidence
0,eliminators-aerosol,0.818911
1,spraymax,0.809695
2,aerosol-free,0.806583
3,spraypaint,0.801503
4,spray,0.799645


In [20]:
#combing keywords from fuzz function and fasttext model
negative_keywords = pd.concat([non_aerosol_keywords, not_aerosol_keywords,negative_fasttext_keywords], ignore_index=True)
negative_keywords = negative_keywords.drop_duplicates()
negative_keywords



Unnamed: 0,keywords,confidence
0,non-aerosol,100.000000
1,non-aerosols,96.000000
2,nonaerosol,95.000000
3,non-aersol,95.000000
4,no-aerosol,95.000000
...,...,...
111,roll-on,0.698722
112,solvent,0.698610
113,pump-spray,0.698048
114,tintable,0.697616


In [21]:
#renaming the column nmaes
fuzzy_keywords.columns =['Fuzzy_keywords','Fuzzy_keywords_confidence']
negative_keywords.columns = ['Negative_Keywords', 'Negative_Keywords_confidence']

In [22]:
keyword_set = pd.concat([fuzzy_keywords,negative_keywords], axis=1)
keyword_set = keyword_set[['Fuzzy_keywords','Negative_Keywords']]
keyword_set

Unnamed: 0,Fuzzy_keywords,Negative_Keywords
0,aerosol,non-aerosol
1,aerosoil,non-aerosols
2,xaerosol,nonaerosol
3,ae-rosol,non-aersol
4,aerosols,no-aerosol
...,...,...
161,roll-on,
162,solvent,
163,pump-spray,
164,tintable,


#### Define a file names and save it in a csv file in the GCS bucket

In [23]:
#keyword_set.to_csv("gs://phase2_scoreclean/pod2/aerosol/aerosol_keywords.csv",index=False)

In [2]:
# library version
# Package                                  Version
# ---------------------------------------- ---------------
# absl-py                                  1.4.0
# aiohttp                                  3.8.5
# aiohttp-cors                             0.7.0
# aiorwlock                                1.3.0
# aiosignal                                1.3.1
# ansiwrap                                 0.8.4
# anyio                                    3.7.1
# apache-beam                              2.46.0
# argon2-cffi                              23.1.0
# argon2-cffi-bindings                     21.2.0
# array-record                             0.4.1
# arrow                                    1.2.3
# asttokens                                2.4.0
# astunparse                               1.6.3
# async-lru                                2.0.4
# async-timeout                            4.0.3
# attrs                                    23.1.0
# Babel                                    2.12.1
# backcall                                 0.2.0
# backoff                                  2.2.1
# backports.functools-lru-cache            1.6.5
# beatrix-jupyterlab                       2023.814.150030
# beautifulsoup4                           4.12.2
# bleach                                   6.0.0
# blessed                                  1.20.0
# boltons                                  23.0.0
# Brotli                                   1.1.0
# cached-property                          1.5.2
# cachetools                               4.2.4
# certifi                                  2023.7.22
# cffi                                     1.15.1
# charset-normalizer                       3.2.0
# click                                    8.1.7
# cloud-tpu-client                         0.10
# cloud-tpu-profiler                       2.4.0
# cloudpickle                              2.2.1
# colorama                                 0.4.6
# colorful                                 0.5.5
# comm                                     0.1.4
# conda                                    23.7.4
# conda-package-handling                   2.2.0
# conda_package_streaming                  0.9.0
# contourpy                                1.1.1
# crcmod                                   1.7
# cryptography                             41.0.4
# cycler                                   0.11.0
# Cython                                   3.0.2
# dacite                                   1.8.1
# db-dtypes                                1.1.1
# debugpy                                  1.8.0
# decorator                                5.1.1
# defusedxml                               0.7.1
# Deprecated                               1.2.14
# dill                                     0.3.7
# distlib                                  0.3.7
# dm-tree                                  0.1.8
# docker                                   6.1.3
# docopt                                   0.6.2
# docstring-parser                         0.15
# entrypoints                              0.4
# etils                                    1.5.0
# exceptiongroup                           1.1.3
# executing                                1.2.0
# explainable-ai-sdk                       1.3.3
# Farama-Notifications                     0.0.4
# fastapi                                  0.103.1
# fastavro                                 1.8.3
# fasteners                                0.19
# fastjsonschema                           2.18.0
# filelock                                 3.12.4
# flatbuffers                              23.5.26
# fonttools                                4.42.1
# fqdn                                     1.5.1
# frozenlist                               1.4.0
# fsspec                                   2023.9.2
# gast                                     0.4.0
# gcsfs                                    2023.9.2
# gitdb                                    4.0.10
# GitPython                                3.1.37
# google-api-core                          2.11.1
# google-api-python-client                 1.8.0
# google-apitools                          0.5.31
# google-auth                              2.23.0
# google-auth-httplib2                     0.1.1
# google-auth-oauthlib                     1.1.0
# google-cloud-aiplatform                  1.33.1
# google-cloud-artifact-registry           1.8.3
# google-cloud-bigquery                    3.11.4
# google-cloud-bigquery-storage            2.16.2
# google-cloud-bigtable                    1.7.3
# google-cloud-core                        2.3.3
# google-cloud-datastore                   1.15.5
# google-cloud-dlp                         3.12.3
# google-cloud-language                    1.3.2
# google-cloud-monitoring                  2.15.1
# google-cloud-pubsub                      2.18.4
# google-cloud-pubsublite                  1.8.3
# google-cloud-recommendations-ai          0.7.1
# google-cloud-resource-manager            1.10.4
# google-cloud-spanner                     3.40.1
# google-cloud-storage                     2.11.0
# google-cloud-videointelligence           1.16.3
# google-cloud-vision                      3.4.4
# google-crc32c                            1.5.0
# google-pasta                             0.2.0
# google-resumable-media                   2.6.0
# googleapis-common-protos                 1.60.0
# gpustat                                  1.0.0
# greenlet                                 2.0.2
# grpc-google-iam-v1                       0.12.6
# grpcio                                   1.48.0
# grpcio-status                            1.48.0
# gviz-api                                 1.10.0
# gymnasium                                0.28.1
# h11                                      0.14.0
# h5py                                     3.9.0
# hdfs                                     2.7.2
# htmlmin                                  0.1.12
# httplib2                                 0.21.0
# idna                                     3.4
# ImageHash                                4.3.1
# imageio                                  2.31.4
# importlib-metadata                       6.8.0
# importlib-resources                      6.1.0
# ipykernel                                6.25.2
# ipython                                  8.15.0
# ipython-genutils                         0.2.0
# ipython-sql                              0.5.0
# ipywidgets                               8.1.1
# isoduration                              20.11.0
# jaraco.classes                           3.3.0
# jax-jumpy                                1.0.0
# jedi                                     0.19.0
# jeepney                                  0.8.0
# Jinja2                                   3.1.2
# joblib                                   1.3.2
# json5                                    0.9.14
# jsonpatch                                1.33
# jsonpointer                              2.4
# jsonschema                               4.19.1
# jsonschema-specifications                2023.7.1
# jupyter_client                           7.4.9
# jupyter_core                             5.3.1
# jupyter-events                           0.7.0
# jupyter-http-over-ws                     0.0.8
# jupyter-lsp                              2.2.0
# jupyter-server                           1.24.0
# jupyter-server-mathjax                   0.2.6
# jupyter_server_proxy                     4.1.0
# jupyter_server_terminals                 0.4.4
# jupyterlab                               3.4.8
# jupyterlab_git                           0.43.0
# jupyterlab-pygments                      0.2.2
# jupyterlab_server                        2.25.0
# jupyterlab-widgets                       3.0.9
# jupytext                                 1.15.2
# keras                                    2.11.0
# keras-core                               0.1.7
# keras-tuner                              1.4.1
# keyring                                  24.2.0
# keyrings.google-artifactregistry-auth    1.1.2
# kfp                                      2.3.0
# kfp-pipeline-spec                        0.2.2
# kfp-server-api                           2.0.1
# kiwisolver                               1.4.5
# kt-legacy                                1.0.5
# kubernetes                               26.1.0
# lazy_loader                              0.3
# libclang                                 16.0.6
# llvmlite                                 0.41.0
# lz4                                      4.3.2
# mapply                                   0.1.21
# Markdown                                 3.4.4
# markdown-it-py                           3.0.0
# MarkupSafe                               2.0.1
# matplotlib                               3.8.0
# matplotlib-inline                        0.1.6
# mdit-py-plugins                          0.4.0
# mdurl                                    0.1.2
# mistune                                  3.0.1
# more-itertools                           10.1.0
# msgpack                                  1.0.6
# multidict                                6.0.4
# multimethod                              1.10
# multiprocess                             0.70.15
# namex                                    0.0.7
# nb-conda                                 2.2.1
# nb-conda-kernels                         2.3.1
# nbclassic                                1.0.0
# nbclient                                 0.8.0
# nbconvert                                7.8.0
# nbdime                                   3.2.0
# nbformat                                 5.9.2
# nest-asyncio                             1.5.6
# networkx                                 3.1
# notebook                                 6.5.6
# notebook-executor                        0.2
# notebook_shim                            0.2.3
# numba                                    0.58.0
# numpy                                    1.23.5
# nvidia-ml-py                             11.495.46
# oauth2client                             4.1.3
# oauthlib                                 3.2.2
# objsize                                  0.6.1
# opencensus                               0.11.3
# opencensus-context                       0.1.3
# opentelemetry-api                        1.20.0
# opentelemetry-exporter-otlp              1.20.0
# opentelemetry-exporter-otlp-proto-common 1.20.0
# opentelemetry-exporter-otlp-proto-grpc   1.20.0
# opentelemetry-exporter-otlp-proto-http   1.20.0
# opentelemetry-proto                      1.20.0
# opentelemetry-sdk                        1.20.0
# opentelemetry-semantic-conventions       0.41b0
# opt-einsum                               3.3.0
# orjson                                   3.9.7
# overrides                                6.5.0
# packaging                                23.1
# pandas                                   2.0.3
# pandas-gbq                               0.19.2
# pandas-profiling                         3.6.6
# pandocfilters                            1.5.0
# papermill                                2.4.0
# parso                                    0.8.3
# pathos                                   0.3.1
# patsy                                    0.5.3
# pexpect                                  4.8.0
# phik                                     0.12.3
# pickleshare                              0.7.5
# Pillow                                   10.0.1
# pip                                      23.2.1
# pkgutil_resolve_name                     1.3.10
# platformdirs                             3.10.0
# plotly                                   5.17.0
# pluggy                                   1.3.0
# pox                                      0.3.3
# ppft                                     1.7.6.7
# prettytable                              3.9.0
# prometheus-client                        0.17.1
# promise                                  2.3
# prompt-toolkit                           3.0.39
# proto-plus                               1.22.3
# protobuf                                 3.19.6
# psutil                                   5.9.3
# ptyprocess                               0.7.0
# pure-eval                                0.2.2
# py-spy                                   0.3.14
# pyarrow                                  9.0.0
# pyasn1                                   0.5.0
# pyasn1-modules                           0.3.0
# pycosat                                  0.6.4
# pycparser                                2.21
# pydantic                                 1.10.12
# pydata-google-auth                       1.8.2
# pydot                                    1.4.2
# Pygments                                 2.16.1
# PyJWT                                    2.8.0
# pymongo                                  3.13.0
# pyOpenSSL                                23.2.0
# pyparsing                                3.1.1
# PySocks                                  1.7.1
# python-dateutil                          2.8.2
# python-json-logger                       2.0.7
# pytz                                     2023.3.post1
# pyu2f                                    0.1.5
# PyWavelets                               1.4.1
# PyYAML                                   6.0.1
# pyzmq                                    24.0.1
# ray                                      2.7.0
# ray-cpp                                  2.7.0
# referencing                              0.30.2
# regex                                    2023.8.8
# requests                                 2.31.0
# requests-oauthlib                        1.3.1
# requests-toolbelt                        0.10.1
# retrying                                 1.3.3
# rfc3339-validator                        0.1.4
# rfc3986-validator                        0.1.1
# rich                                     13.5.3
# rpds-py                                  0.10.3
# rsa                                      4.9
# ruamel.yaml                              0.17.32
# ruamel.yaml.clib                         0.2.7
# scikit-image                             0.21.0
# scikit-learn                             1.3.1
# scipy                                    1.11.2
# seaborn                                  0.12.2
# SecretStorage                            3.3.3
# Send2Trash                               1.8.2
# setuptools                               68.2.2
# Shapely                                  1.8.5.post1
# simpervisor                              1.0.0
# six                                      1.16.0
# smart-open                               6.4.0
# smmap                                    5.0.1
# sniffio                                  1.3.0
# soupsieve                                2.5
# SQLAlchemy                               2.0.21
# sqlparse                                 0.4.4
# stack-data                               0.6.2
# starlette                                0.27.0
# statsmodels                              0.14.0
# tabulate                                 0.9.0
# tangled-up-in-unicode                    0.2.0
# tenacity                                 8.2.3
# tensorboard                              2.11.2
# tensorboard-data-server                  0.6.1
# tensorboard-plugin-profile               2.13.1
# tensorboard-plugin-wit                   1.8.1
# tensorboardX                             2.6
# tensorflow                               2.11.0
# tensorflow-cloud                         0.1.16
# tensorflow-datasets                      4.9.0
# tensorflow-estimator                     2.11.0
# tensorflow-hub                           0.14.0
# tensorflow-io                            0.29.0
# tensorflow-io-gcs-filesystem             0.29.0
# tensorflow-metadata                      0.14.0
# tensorflow-probability                   0.21.0
# tensorflow-serving-api                   2.11.0
# tensorflow-transform                     0.14.0
# termcolor                                2.3.0
# terminado                                0.17.1
# textwrap3                                0.9.2
# threadpoolctl                            3.2.0
# tifffile                                 2023.9.18
# tinycss2                                 1.2.1
# toml                                     0.10.2
# tomli                                    2.0.1
# toolz                                    0.12.0
# tornado                                  6.3.3
# tqdm                                     4.66.1
# traitlets                                5.10.0
# typeguard                                2.13.3
# typer                                    0.9.0
# typing_extensions                        4.5.0
# typing-utils                             0.1.0
# tzdata                                   2023.3
# uri-template                             1.3.0
# uritemplate                              3.0.1
# urllib3                                  1.26.16
# uvicorn                                  0.23.2
# virtualenv                               20.21.0
# visions                                  0.7.5
# watchfiles                               0.20.0
# wcwidth                                  0.2.6
# webcolors                                1.13
# webencodings                             0.5.1
# websocket-client                         1.6.3
# Werkzeug                                 2.1.2
# wheel                                    0.41.2
# widgetsnbextension                       4.0.9
# witwidget                                1.8.1
# wordcloud                                1.9.2
# wrapt                                    1.15.0
# yarl                                     1.9.2
# ydata-profiling                          4.5.1
# zipp                                     3.17.0
# zstandard                                0.21.0