## Loading Necessary Libraries

In [1]:
# general
import re
import gc
import os
import csv
import time
import pickle
from tqdm import tqdm

# data handling
import numpy as np
import pandas as pd

# HTML parsing
from bs4 import BeautifulSoup

# plotting
import matplotlib.pyplot as plt

# chunker
import torch
from torch import cuda
import flair
from flair.data import Sentence
from flair.models import SequenceTagger

## Load Data

In [2]:
# 2023 Data -- read only the DESCRIPTION column of the training split
data = pd.read_csv(
  '../data/2023/train-v3.csv',
  usecols=['DESCRIPTION'],
)
# data = pd.read_csv('../data/2023/test-v2.csv')

# 2021 Data
# data = pd.read_csv('../data/2021/train-v2.csv')
# data = pd.read_csv('../data/2021/test-v2.csv')

In [3]:
# Replace missing values with empty strings so every cell holds a value.
data = data.fillna(value='')

In [4]:
# Keep only the rows whose DESCRIPTION is not the empty string. The original
# row labels are kept as they are (the index is not reset).
data = data.loc[data['DESCRIPTION'].ne('')]

In [5]:
# Display the filtered frame (the notebook shows a truncated preview).
data

Unnamed: 0,DESCRIPTION
2,"Specifications : Color : Red , Material : Alum..."
3,AISHAH Women ' s Lycra Cotton Ankel Leggings ....
5,HINS Brings you the most Elegant Looking Pot w...
7,Aluminum Foil Stickers-good kitchen helper for...
9,"Transform your home , workplace or hotel room ..."
...,...
2249688,Welcome to the wonderfully Wicked World of Aut...
2249689,This extra long Tall t-Shirt will be your favo...
2249694,[ Brand ] : XVIEONR [ Product name ] : Fashion...
2249695,Wall Clocks Are Very Attractive In Looks And E...


In [6]:
# Row labels of the descriptions that survived the empty-string filter.
data_idxs = data.index.tolist()

## Chunking

Since we have a lot of text data, capturing the entire graph between all the tokens would be difficult (time-consuming). So let's chunk the text into sections to capture the local information.

In [7]:
# Run flair on the GPU when PyTorch can see one; otherwise fall back to the CPU
# so the notebook still runs (slowly) on a machine without CUDA.
flair.device = torch.device('cuda' if cuda.is_available() else 'cpu')

In [8]:
# Show the device flair is currently configured to use.
flair.device

device(type='cuda')

In [9]:
# Load the pretrained flair chunk tagger "flair/chunk-english-fast".
# Its load log lists phrase-chunk tags (NP, VP, PP, ADVP, ...) in a
# B-/I-/E-/S- scheme.
tagger = SequenceTagger.load("flair/chunk-english-fast")

Downloading pytorch_model.bin:   0%|          | 0.00/72.9M [00:00<?, ?B/s]

2023-05-27 09:05:25,344 SequenceTagger predicts: Dictionary with 47 tags: O, S-NP, B-NP, E-NP, I-NP, S-VP, B-VP, E-VP, I-VP, S-PP, B-PP, E-PP, I-PP, S-ADVP, B-ADVP, E-ADVP, I-ADVP, S-SBAR, B-SBAR, E-SBAR, I-SBAR, S-ADJP, B-ADJP, E-ADJP, I-ADJP, S-PRT, B-PRT, E-PRT, I-PRT, S-CONJP, B-CONJP, E-CONJP, I-CONJP, S-INTJ, B-INTJ, E-INTJ, I-INTJ, S-LST, B-LST, E-LST, I-LST, S-UCP, B-UCP, E-UCP, I-UCP, <START>, <STOP>


In [10]:
def optimize_gpu():
  """
  Release memory that is no longer in use, on the host and on the GPU.

  Steps, in order:
    1. Run Python's garbage collector so unreachable objects (including
       tensors caught in reference cycles) are freed first.
    2. If CUDA is available, hand PyTorch's now-unused cached blocks back
       to the driver and reset the peak-memory statistics.

  Collecting garbage *before* emptying the cache matters: blocks freed by
  the collector would otherwise just sit in PyTorch's cache until the next
  call. On a machine without CUDA only step 1 runs.

  Returns:
    None
  """
  # Garbage Collection
  gc.collect()

  # Nothing GPU-related to do (and the CUDA calls may fail) without a device.
  if not torch.cuda.is_available():
    return

  torch.cuda.empty_cache()

  # Single replacement for the deprecated reset_max_memory_allocated() and
  # reset_max_memory_cached(), which both forward to this call.
  torch.cuda.reset_peak_memory_stats()

In [11]:
# Create the output directory for the per-batch pickle files.
# exist_ok=True keeps the cell re-runnable (e.g. when resuming after a crash)
# instead of raising FileExistsError.
os.makedirs('description-chunked', exist_ok=True)

In [13]:
# Number of complete 10,000-row batches in `data` (floor division); any
# remainder rows form one additional, smaller batch.
data.shape[0] // 10000

109

In [14]:
PRODUCT_AT_ONCE = 10000  # descriptions tagged and pickled per batch
LAST_CRASHED = 109       # 0-based index of the first batch to (re)process
CHUNK_DIR = '/kaggle/working/description-chunked'

# The pickles are written here; make sure the directory exists up front.
os.makedirs(CHUNK_DIR, exist_ok=True)

n_products = data.shape[0]

# Ceiling division. The previous `n // size + 1` produced an extra, empty
# batch whenever n was an exact multiple of the batch size, and `max()` on
# the empty sentence list then raised ValueError.
n_batches = (n_products + PRODUCT_AT_ONCE - 1) // PRODUCT_AT_ONCE

for i in tqdm(range(LAST_CRASHED, n_batches)):
  # Batch i is stored as "<i+1>.pkl" (file names are 1-based). Deriving the
  # number from i keeps it in sync with LAST_CRASHED automatically.
  file_no = i + 1

  start = i * PRODUCT_AT_ONCE
  stop = min(start + PRODUCT_AT_ONCE, n_products)

  # make product sentences: one list of flair Sentences per description,
  # splitting each description on the literal separator ' . '
  sentences = [[Sentence(sent) for sent in data.iloc[k, 0].split(' . ')]
               for k in range(start, stop)]

  # number of sentences per product, and the running offsets that mark
  # where each product starts/ends in the flattened list below
  len_sentences = [len(bp) for bp in sentences]
  cut_offs = np.cumsum([0] + len_sentences)

  # unrolling sentences into one flat list so they can be tagged together
  sentences = [bp for bps in sentences for bp in bps]

  # Longest sentence of the batch, measured in space-separated tokens.
  max_toks = max(len(sent.text.split(' ')) for sent in sentences)

  # Use smaller mini-batches when the batch contains very long sentences
  # (presumably to keep memory use bounded -- confirm the thresholds).
  if max_toks >= 2000:
    bs = 100
  elif max_toks >= 1000:
    bs = 200
  else:
    bs = 250

  # predict chunk tags (annotates the Sentence objects in place)
  tagger.predict(sentences, verbose=True, mini_batch_size=bs)

  # Regroup the flat sentence list per product and keep, for every
  # sentence, the list of (span text, chunk tag) pairs.
  processed_descs = [
    [
      [(entity.text, entity.tag) for entity in sentence.get_spans('np')]
      for sentence in sentences[cut_offs[p]: cut_offs[p+1]]
    ]
    for p in range(len(cut_offs)-1)
  ]

  with open(f'{CHUNK_DIR}/{file_no}.pkl', 'wb') as f:
    pickle.dump(processed_descs, f)

  # Drop the references to this batch before cleaning up, so the garbage
  # collector can actually reclaim it instead of keeping it alive until the
  # next iteration rebinds the names.
  del sentences, processed_descs

  optimize_gpu()

  0%|          | 0/1 [00:00<?, ?it/s]
Batch inference:   0%|          | 0/51 [00:00<?, ?it/s][A
Batch inference:   2%|▏         | 1/51 [00:05<04:48,  5.76s/it][A
Batch inference:   4%|▍         | 2/51 [00:07<02:50,  3.49s/it][A
Batch inference:   6%|▌         | 3/51 [00:09<02:00,  2.51s/it][A
Batch inference:   8%|▊         | 4/51 [00:09<01:29,  1.91s/it][A
Batch inference:  10%|▉         | 5/51 [00:10<01:10,  1.53s/it][A
Batch inference:  12%|█▏        | 6/51 [00:11<00:57,  1.29s/it][A
Batch inference:  14%|█▎        | 7/51 [00:13<00:57,  1.31s/it][A
Batch inference:  16%|█▌        | 8/51 [00:13<00:48,  1.12s/it][A
Batch inference:  18%|█▊        | 9/51 [00:14<00:42,  1.01s/it][A
Batch inference:  20%|█▉        | 10/51 [00:15<00:36,  1.11it/s][A
Batch inference:  22%|██▏       | 11/51 [00:15<00:33,  1.21it/s][A
Batch inference:  24%|██▎       | 12/51 [00:16<00:29,  1.31it/s][A
Batch inference:  25%|██▌       | 13/51 [00:17<00:26,  1.41it/s][A
Batch inference:  27%|██▋   

In [16]:
# Load the 110 per-batch pickle files (named 1.pkl .. 110.pkl) into a list with
# one entry per batch. Each file is opened in a `with` block so its handle is
# closed right after loading, rather than left for the garbage collector.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# files produced by this pipeline.
chunked_descs = []
for k in tqdm(range(110)):
  with open(f'../data/2023/temp-descs/{k+1}.pkl', 'rb') as f:
    chunked_descs.append(pickle.load(f))

100%|█████████████████████████████████████████| 110/110 [00:42<00:00,  2.60it/s]


In [17]:
# Flatten the list of per-batch lists into a single list of elements.
# NOTE: this rebinds `chunked_descs`, so running the cell a second time would
# flatten one level further; re-run the loading cell first.
flattened = []
for chunk_desc in tqdm(chunked_descs):
  flattened.extend(chunk_desc)
chunked_descs = flattened

100%|█████████████████████████████████████████| 110/110 [00:05<00:00, 20.71it/s]


In [21]:
# Persist the flattened chunk list as a single pickle file.
with open('../data/2023/temp-descs/chunked-train-descs-2023.pkl', 'wb') as out_file:
  pickle.dump(chunked_descs, out_file)