### Import Packages

In [1]:
import logging
# Configure root logging at INFO level with "<timestamp> : <LEVEL> : <message>" records.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import datetime
import gc
import itertools
import json
import os
import pickle
import time

import gcsfs
import mapply
import modin.pandas as mpd
import numpy as np
import pandas as pd
from google.cloud import bigquery
from rank_bm25 import BM25Okapi
from tqdm import tqdm
# Shared BigQuery client for every query in this notebook; no project or credentials
# are passed explicitly, so the library's defaults apply.
client = bigquery.Client()

from gensim import corpora
from smart_open import smart_open
import dask.dataframe as dd

from gensim.models import TfidfModel, OkapiBM25Model

from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile

from gensim.models import fasttext
from gensim.models.fasttext import FastText

from gensim.models.doc2vec import Doc2Vec

2023-10-31 20:42:19,052 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-10-31 20:42:19,053 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2023-10-31 20:42:19,054 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2023-10-31T20:42:19.054629', 'gensim': '4.3.2', 'python': '3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]', 'platform': 'Linux-5.10.0-26-cloud-amd64-x86_64-with-glibc2.31', 'event': 'created'}


In [3]:
# Initialise mapply once for the notebook with the settings below.
mapply.init(n_workers=-1, chunk_size=10000, max_chunks_per_worker=0, progressbar=True)

### Tables used: 
<ol>
    <li> `wmt-dca-catalog-dq-dev.SC_Final_Tables.IIS_CORE` - core item table (see the table directory) </li>
</ol>

### Attributes Summary:
#### Target Attribute 
- ELECTRONIC_WASTE_IND
#### Standard Attributes
- WPID
- GTIN
- PROD_TYPE_NM
- PROD_NM

In [4]:
# Select the product id (WPID) and product type for every row of the core item table.
query = """
SELECT WPID,PROD_TYPE_NM FROM wmt-dca-catalog-dq-dev.SC_Final_Tables.IIS_CORE
"""
# Run the query and materialise the whole result set as a DataFrame.
all_items = client.query(query).to_dataframe()

### Analysis: 

##### Take a 20% sample within each product type. Books and T-Shirts are then split out and down-sampled by a further 50%, and the three parts (everything else, books, T-shirts) are concatenated back together.

In [5]:
# Stratified sample: draw 20% of the rows inside each PROD_TYPE_NM group, then
# discard the original row labels.
# random_state pins the draw so that a kernel restart reproduces the same sample
# (the original call was unseeded and produced a different sample on every run).
items_sample = (
    all_items
    .groupby('PROD_TYPE_NM')
    .sample(frac=.2, random_state=42)
    .reset_index(drop=True)
)

In [19]:
# Every sampled row whose product type is neither 'Books' nor 'T-Shirts'.
items_sample_most = items_sample.loc[~items_sample['PROD_TYPE_NM'].isin(['Books', 'T-Shirts'])]

In [21]:
# Only the sampled rows whose product type is 'Books'.
items_sample_books = items_sample.loc[items_sample['PROD_TYPE_NM'].isin(['Books'])]

In [22]:
# Only the sampled rows whose product type is 'T-Shirts'.
items_sample_ts = items_sample.loc[items_sample['PROD_TYPE_NM'].isin(['T-Shirts'])]

In [23]:
# Keep a random half of the Books rows and a random half of the T-Shirts rows.
# random_state pins both draws so the result is reproducible across kernel restarts
# (the original calls were unseeded).
# NOTE: both names are rebound in place, so re-running this cell without re-running
# the two filter cells before it halves the frames again.
items_sample_books = items_sample_books.sample(frac=.5, random_state=42).reset_index(drop=True)
items_sample_ts = items_sample_ts.sample(frac=.5, random_state=42).reset_index(drop=True)

In [24]:
# Stack the three parts back into one frame (row-wise).
# ignore_index=True gives the result a fresh 0..n-1 index; without it the books and
# T-shirt parts (both re-indexed from 0) would share row labels with each other and
# with the first part.
# NOTE: this rebinds `items_sample`, replacing the 20% sample it was derived from.
items_sample = pd.concat(
    [items_sample_most, items_sample_books, items_sample_ts],
    axis=0,
    ignore_index=True,
)


    import ray
    ray.init()

2023-10-27 19:44:42,624	INFO worker.py:1642 -- Started a local Ray instance.
2023-10-27 19:44:44,638 : INFO : Using sequential splitting in '.from_pandas()' because of some of the conditions are False: enough_elements=True; all_numeric_types=False; async_mode_on=False
2023-10-27 19:44:46,836 : INFO : Using sequential splitting in '.from_pandas()' because of some of the conditions are False: enough_elements=False; all_numeric_types=False; async_mode_on=False
2023-10-27 19:44:47,369 : INFO : Using sequential splitting in '.from_pandas()' because of some of the conditions are False: enough_elements=False; all_numeric_types=False; async_mode_on=False


In [25]:
# Sanity check: (rows, columns) of the combined sample, shown as the cell output.
items_sample.shape

(4185745, 2)

##### Create a table storing the above sample.

In [26]:
# Upload the sample to BigQuery. if_exists='replace' overwrites the destination
# table if it already exists, so re-running this cell replaces the stored sample.
items_sample.to_gbq(project_id = "wmt-dca-catalog-dq-dev", destination_table = "wmt-dca-catalog-dq-dev.SC_Final_Tables.ALL_ITEMS_RAND_SAMPLE20", if_exists='replace')

Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.
4185745 out of 4185745 rows loaded.s]2023-10-27 19:45:23,373 : INFO : 
100%|██████████| 1/1 [00:00<00:00, 932.07it/s]
2023-10-27 19:45:23,375 : INFO : Using sequential splitting in '.from_pandas()' because of some of the conditions are False: enough_elements=True; all_numeric_types=False; async_mode_on=False


##### Create parquet files from the table created above and import data from the files

In [None]:
# Export the stored sample, joined with each product's cleaned text, to Parquet
# shards on GCS. The LEFT JOIN keeps sampled items that have no row in
# PROD_FULL_TEXT; for those rows WORDS_CLEAN_LINE is NULL.
# (Plain string: the original f-prefix had no placeholders to interpolate.)
QUERY = """
        EXPORT DATA
          OPTIONS (
            uri = 'gs://phase2_scoreclean/pod1/word_counts/ALL_ITEMS_SAMPLE20/*.parquet',
            format = 'parquet')
        AS (SELECT A.WPID,PROD_TYPE_NM,WORDS_CLEAN_LINE FROM `wmt-dca-catalog-dq-dev.SC_Final_Tables.ALL_ITEMS_RAND_SAMPLE20` AS A
        LEFT JOIN 
        (SELECT WPID,WORDS_CLEAN_LINE FROM wmt-dca-catalog-dq-dev.SC_Final_Tables.PROD_FULL_TEXT) AS B
        ON A.WPID=B.WPID)
        """
# The statements below were indented in the original cell, which is an
# IndentationError at top level; they now sit at column 0.
job = client.query(QUERY)
# Poll every 3 seconds until BigQuery reports the job as finished.
while not job.done():
    print('waiting for bigquery')
    time.sleep(3)
# job.errors is printed rather than raised, so a failed export does not stop the notebook.
print('errors:', job.errors)

In [None]:
# Read every exported Parquet shard through dask and immediately materialise the
# result in memory via .compute().
item_sample = dd.read_parquet(
    'gs://phase2_scoreclean/pod1/word_counts/ALL_ITEMS_SAMPLE20/*.parquet'
).compute()

In [12]:
# Replace the index with a fresh 0..n-1 range and discard the old one (drop=True),
# so the label lookup `[0]` in the next cell addresses the first row.
item_sample = item_sample.reset_index(drop=True)

In [14]:
# Eyeball one cleaned text value (the row labelled 0) to check the export.
item_sample['WORDS_CLEAN_LINE'][0]

'official creality ender max neo printer upgraded with cr touch auto leveling dual z-axis all-metal bowden extruder color knob screen large print size in cr touch auto bed leveling the cr touch intelligent leveling system can automatically compensate the printing height at different points of the printing platform the technology effectively improves the leveling accuracy mm and printing success rate power loss recovery and filament sensor the creality printers features the function of detecting filament runout or breakage power loss and resume printing after recovery by accurately recording the printing data at the time of power outage filament runout or breakage it helps to avoid the waste of filaments and time caused by accidents high-precision dual z-axis higher printing precision with z-axis dual-screw z-axis dual-motor design works smoother and more synchronously to lower the possibility of lines and ridges on the sides of your print thus improving the printing quality simple quic

##### Write the cleaned text (`WORDS_CLEAN_LINE`) of each product to a .txt file, one product per line

In [None]:
# Pull the cleaned-text column out as a plain Python list, in row order.
item_sample_list=item_sample['WORDS_CLEAN_LINE'].to_list()

In [None]:
# Write the corpus as plain text, one document per line.
# The encoding is set explicitly to UTF-8 so the file does not depend on the
# platform's default encoding and matches the encoding='utf-8' that this
# notebook's reading cells pass to open().
# NOTE(review): `item + "\n"` raises TypeError if an entry is None (possible when the
# LEFT JOIN in the export found no text for a product) — confirm the column has no nulls.
with open('item_sample_20_final.txt', 'w', encoding='utf-8') as fp:
    for item in tqdm(item_sample_list):
        # write each item on a new line
        fp.write(item + "\n")
    print('Done')

100%|██████████| 4185745/4185745 [00:11<00:00, 378963.83it/s]

Done





In [None]:
# Upload the corpus file written by the previous cell. The original command named
# 'item_sample_20.txt', which no cell creates; the file written above (and the one the
# recorded output shows being copied) is 'item_sample_20_final.txt'.
!gsutil cp item_sample_20_final.txt gs://phase2_scoreclean/pod1/word_counts/

Copying file://item_sample_20_final.txt [Content-Type=text/plain]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1 files][  4.7 GiB/  4.7 GiB]   89.2 MiB/s                                   
Operation completed over 1 objects/4.7 GiB.                                      


##### Create a dictionary of words from the text file saved above

In [25]:
# Build the gensim Dictionary by streaming the corpus file, one whitespace-tokenised
# document per line. The with-block closes the file handle once the dictionary is
# built; the original passed a bare open() that was never closed.
# NOTE(review): no visible cell writes 'item_sample.txt' — the cell above writes
# 'item_sample_20_final.txt'. Confirm which file is intended here.
with open('item_sample.txt', encoding='utf-8') as corpus_fh:
    dictionary = corpora.Dictionary(line.split() for line in tqdm(corpus_fh))

2507496it [08:21, 4997.78it/s]


In [31]:
# Load a previously saved dictionary from disk. This rebinds `dictionary`, replacing
# whatever the build cell above produced.
dictionary = corpora.Dictionary.load('models/item_sample_20_final_dictionary')

2023-10-31 01:54:23,589 : INFO : loading Dictionary object from models/item_sample_20_final_dictionary
2023-10-31 01:54:24,862 : INFO : Dictionary lifecycle event {'fname': 'models/item_sample_20_final_dictionary', 'datetime': '2023-10-31T01:54:24.862800', 'gensim': '4.3.2', 'python': '3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]', 'platform': 'Linux-5.10.0-26-cloud-amd64-x86_64-with-glibc2.31', 'event': 'loaded'}


In [32]:
# Construct the BM25 weighting model from the dictionary alone (no corpus is passed).
document_model = OkapiBM25Model(dictionary=dictionary)

In [25]:
# Load a previously saved BM25 model. This rebinds `document_model`, replacing the one
# constructed from the dictionary in the cell above.
document_model = OkapiBM25Model.load('models/bm25.model')

2023-10-31 01:52:14,425 : INFO : loading OkapiBM25Model object from models/bm25.model
2023-10-31 01:52:14,781 : INFO : OkapiBM25Model lifecycle event {'fname': 'models/bm25.model', 'datetime': '2023-10-31T01:52:14.781005', 'gensim': '4.3.2', 'python': '3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]', 'platform': 'Linux-5.10.0-26-cloud-amd64-x86_64-with-glibc2.31', 'event': 'loaded'}


##### Create a corpus of words from the txt file created above

In [33]:
class MyCorpus:
    """Streaming bag-of-words corpus backed by a plain-text file.

    Each line of the file is treated as one document whose tokens are separated
    by whitespace. Every call to ``iter()`` re-opens the file and reads it from
    the start, so the corpus can be traversed more than once without loading it
    into memory.

    Relies on the notebook-level names ``dictionary`` (used for ``doc2bow``) and
    ``tqdm`` being defined before iteration starts.

    Args:
        path: Text file to stream. Defaults to the file name the class
            originally hard-coded, so ``MyCorpus()`` behaves as before.
    """

    def __init__(self, path='item_sample.txt'):
        self.path = path

    def __iter__(self):
        # The with-block closes the handle when iteration finishes or the
        # generator is discarded; the original bare open() leaked one handle per pass.
        with open(self.path, encoding='utf-8') as corpus_fh:
            for line in tqdm(corpus_fh):
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.split())

In [34]:
# Instantiate the streaming corpus; nothing is read until it is iterated.
corpus=MyCorpus()

In [35]:
# Apply the BM25 model to the bag-of-words corpus via gensim's model[corpus] indexing.
bm25_corpus = document_model[corpus]

##### Create an index which is used to generate the 'elec_score'

In [36]:
# File path obtained from gensim's test helper; passed to Similarity in the next cell
# as its first argument.
index_tmpfile = get_tmpfile("index")

In [39]:
# Build the similarity index over the BM25-weighted corpus; num_features is the
# dictionary size.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so in the
# saved run `index` was not fully built by this cell — confirm before relying on the
# scoring cells below.
index = Similarity(index_tmpfile, bm25_corpus, num_features=len(dictionary),chunksize=256,shardsize=50000)

169983it [01:51, 2026.57it/s]2023-10-31 02:01:04,620 : INFO : PROGRESS: fresh_shard size=20000
173972it [01:53, 1534.43it/s]


KeyboardInterrupt: 

In [51]:
# Convert the query text to a bag-of-words vector using the corpus dictionary.
# NOTE(review): the triple-quoted string holds only a newline, so .split() returns []
# and the resulting query is empty — the actual query text appears to be missing.
# NOTE: this rebinds `query`, which earlier in the notebook held a SQL string.
query = dictionary.doc2bow('''
'''.split())

In [53]:
# Score the query against the index and store the result as the 'elec_score' column.
# NOTE(review): `books_sample` is not defined in any visible cell, and its rows must
# line up one-to-one with the lines of the indexed text file — confirm where it comes from.
books_sample['elec_score'] = index[query]

##### Storing the 'elec_score' in a csv file

In [60]:
# Keep only rows with a non-zero score, order them from highest to lowest score, and
# write them to 'elec_score.csv' without the index column.
books_sample[books_sample['elec_score']!=0].sort_values('elec_score',ascending=False).to_csv('elec_score.csv',index=False)

##### Train and save the FastText model on the text file created above

In [None]:
# Configure (but do not yet train) the FastText model; vocabulary building and
# training happen in the following cells.
ft_model = FastText(
    sg=1, # use skip-gram: usually gives better results
    vector_size=100, # embedding dimension (default)
    window=10, # window size: 10 tokens before and 10 tokens after to get wider context
    min_count=5, # only consider tokens with at least n occurrences in the corpus
    negative=15, # negative sampling: number of noise words drawn per example (bigger than default)
    min_n=2, # min character n-gram
    max_n=5, # max character n-gram
    workers=30 # number of worker threads used for training
)

In [None]:
# Scan the one-document-per-line text file to build the model's vocabulary.
ft_model.build_vocab(corpus_file='item_sample_20_final.txt')

In [None]:
# Train for 6 epochs on the same text file, passing the document and word counts
# stored on the model (presumably populated by build_vocab in the previous cell).
ft_model.train(
    corpus_file='item_sample_20_final.txt',
    epochs=6,
    total_examples=ft_model.corpus_count, 
    total_words=ft_model.corpus_total_words)

In [None]:
# Persist the trained FastText model to the current working directory.
ft_model.save('_fasttext20_final.model') #save

In [None]:
# Configure (but do not yet train) the Doc2Vec model; vocabulary building and
# training happen in the following cells.
dv_model = Doc2Vec(
    vector_size=100, # embedding dimension (default)
    window=10, # window size: 10 tokens before and 10 tokens after to get wider context
    min_count=5, # only consider tokens with at least n occurrences in the corpus
    negative=15, # negative sampling: number of noise words drawn per example (bigger than default)
    workers=25 # number of worker threads used for training
)

In [None]:
# Scan the one-document-per-line text file to build the model's vocabulary.
dv_model.build_vocab(corpus_file='item_sample_20_final.txt')

In [None]:
# Train for 10 epochs on the same text file, passing the document and word counts
# stored on the model (presumably populated by build_vocab in the previous cell).
dv_model.train(
    corpus_file='item_sample_20_final.txt',
    epochs=10,
    total_examples=dv_model.corpus_count, 
    total_words=dv_model.corpus_total_words)

In [None]:
# Persist the trained Doc2Vec model to the current working directory.
dv_model.save('_doc2vec20_final.model')

In [42]:
# Download the saved Doc2Vec model and its companion .npy array files from GCS into
# the local models/ directory.
# NOTE(review): no visible cell uploads '_doc2vec20_final.model' to this bucket path —
# confirm how the files got there.
!gsutil cp gs://phase2_scoreclean/pod1/word_counts/models/_doc2vec20_final* models/

Copying gs://phase2_scoreclean/pod1/word_counts/models/_doc2vec20_final.model...
Copying gs://phase2_scoreclean/pod1/word_counts/models/_doc2vec20_final.model.dv.vectors.npy...
Copying gs://phase2_scoreclean/pod1/word_counts/models/_doc2vec20_final.model.syn1neg.npy...
Copying gs://phase2_scoreclean/pod1/word_counts/models/_doc2vec20_final.model.wv.vectors.npy...
- [4 files][  2.0 GiB/  2.0 GiB]  102.0 MiB/s                                   
Operation completed over 4 objects/2.0 GiB.                                      
