## Loading Necessary Libraries

In [21]:
# general
import re
import gc
import os
import csv
import time
import pickle
from tqdm import tqdm

# data handling
import numpy as np
import pandas as pd

# HTML parsing
from bs4 import BeautifulSoup

# plotting
import matplotlib.pyplot as plt

# chunker
import torch
from torch import cuda
import flair
from flair.data import Sentence
from flair.models import SequenceTagger

## Load Data

In [3]:
# Input for this run: the 2023 training file from the Kaggle dataset.
TRAIN_BPS_CSV = '/kaggle/input/amazon-train-2023-bullet-points/train-2023-bps.csv'
data = pd.read_csv(TRAIN_BPS_CSV)

# Other inputs that were used with this notebook (swap in to re-run on them):
#   2023 test : pd.read_csv('../data/2023/test-v2.csv')
#   2021 train: pd.read_csv('../data/2021/train-v2.csv')
#   2021 test : pd.read_csv('../data/2021/test-v2.csv')

In [18]:
# Load only the raw BULLET_POINTS column of the 2023 training CSV.
# NOTE(review): this rebinds `data`. The cell that prints
# `data.BULLET_POINTS` needs this frame, while the chunking loop reads
# `data.iloc[k, 0]` and calls `.split(' . ')` on it. Running the notebook
# top to bottom therefore feeds this raw column to the chunker. Consider a
# separate name for the raw frame.
data = pd.read_csv('../data/2023/train.csv', usecols=['BULLET_POINTS'])

In [4]:
# data = data.fillna('')

In [5]:
# data = data.to_numpy().tolist()

In [6]:
# data = pd.DataFrame([' . '.join([str(bp) for bp in bps if str(bp) != '']) for bps in data])

In [7]:
# data

In [8]:
# data = data[~data.iloc[:,0].isin([''])]

In [9]:
# Display the loaded frame (the notebook renders a truncated preview).
data

Unnamed: 0,0
0,LUXURIOUS and APPEALING : Beautiful custom-mad...
1,Harry Potter Hedwig Pyjamas ( 6 to 16 years ) ...
2,"Loud Dual Tone Trumpet Horn , Compatible With ..."
3,Made By 95 % cotton and 5 % Lycra which gives ...
4,"Simple and elegant , great for displaying indo..."
...,...
1412317,Easy to install ; . Round shape 14LED spot wor...
1412318,Material : Polyester
1412319,❤ [ Inspiration ] Inspired by the Untitled Goo...
1412320,"Dial size : 12 inches in diameter . Big , clea..."


## Chunking

Since we have a lot of text data, capturing the entire graph between all the tokens would be too time-consuming. Instead, we chunk the text into phrases so that we capture the local information.

In [10]:
# Run flair on the GPU when one is present; otherwise fall back to the CPU
# so the notebook still executes on a machine without CUDA.
flair.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
# Confirm which device flair will use.
flair.device

device(type='cuda')

In [22]:
# Pretrained flair chunking model used to tag phrase spans.
CHUNKER_MODEL = "flair/chunk-english-fast"
tagger = SequenceTagger.load(CHUNKER_MODEL)

2023-05-27 19:29:54,915 SequenceTagger predicts: Dictionary with 47 tags: O, S-NP, B-NP, E-NP, I-NP, S-VP, B-VP, E-VP, I-VP, S-PP, B-PP, E-PP, I-PP, S-ADVP, B-ADVP, E-ADVP, I-ADVP, S-SBAR, B-SBAR, E-SBAR, I-SBAR, S-ADJP, B-ADJP, E-ADJP, I-ADJP, S-PRT, B-PRT, E-PRT, I-PRT, S-CONJP, B-CONJP, E-CONJP, I-CONJP, S-INTJ, B-INTJ, E-INTJ, I-INTJ, S-LST, B-LST, E-LST, I-LST, S-UCP, B-UCP, E-UCP, I-UCP, <START>, <STOP>


In [23]:
# Display the tagger's module structure.
tagger

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=2048, out_features=2048, bias=True)
  (rnn): LSTM(2048, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=47, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)

In [13]:
def optimize_gpu():
  """
  Release cached GPU memory between batches.

  Garbage collection runs first so that tensors which are no longer
  referenced are actually freed. PyTorch is then asked to release its
  cached blocks, and the peak-memory counters are reset. The CUDA calls
  are skipped when CUDA is unavailable, so the helper is safe to call on
  a CPU-only machine.

  Returns:
    None
  """
  # Collect first: empty_cache() can only release blocks that no live
  # tensor is using any more.
  gc.collect()

  if not torch.cuda.is_available():
    return

  torch.cuda.empty_cache()

  # reset_max_memory_allocated() / reset_max_memory_cached() are deprecated
  # and simply forward to reset_peak_memory_stats(); call it directly.
  torch.cuda.reset_peak_memory_stats()

In [14]:
# Create the output folder for the per-batch pickles. exist_ok=True lets the
# cell be re-run (e.g. when resuming after a crash) without raising
# FileExistsError.
os.makedirs('bps-chunked', exist_ok=True)

In [15]:
# Number of complete 10,000-row batches in `data`; any remaining rows form
# one further, partial batch.
data.shape[0] // 10000

141

In [16]:
# Tag every bullet point with phrase chunks, one batch of products at a time.
# Each batch is written to its own pickle so that a crashed run can resume.
PRODUCT_AT_ONCE = 10000   # products tagged (and pickled) per batch
LAST_CRASHED = 141        # zero-based batch index to resume from (0 = fresh run)
OUTPUT_DIR = 'bps-chunked'

# Use the same relative folder that the notebook creates beforehand, so the
# loop does not depend on an absolute '/kaggle/working' path.
# NOTE(review): presumably the working directory is /kaggle/working on
# Kaggle, which makes this the same location as before - confirm.
os.makedirs(OUTPUT_DIR, exist_ok=True)

n_products = data.shape[0]
# Ceiling division: covers a final partial batch but never schedules an
# empty one when n_products is an exact multiple of PRODUCT_AT_ONCE.
n_batches = -(-n_products // PRODUCT_AT_ONCE)

for i in tqdm(range(LAST_CRASHED, n_batches)):
  start = i * PRODUCT_AT_ONCE
  stop = min(start + PRODUCT_AT_ONCE, n_products)

  # One positional slice per batch instead of one .iloc lookup per row.
  # NOTE(review): assumes every cell in the first column is a string
  # (missing values removed upstream) - `.split` fails otherwise.
  batch_texts = data.iloc[start:stop, 0].tolist()

  # One list of flair Sentences per product; bullet points are separated
  # by ' . ' in the input text.
  sentences = [[Sentence(sent) for sent in text.split(' . ')]
               for text in batch_texts]

  # Number of sentences per product, turned into slice boundaries so the
  # flat list can be regrouped by product after tagging.
  len_sentences = [len(bp) for bp in sentences]
  cut_offs = np.cumsum([0] + len_sentences)

  # Flatten so the tagger can batch across products.
  sentences = [bp for bps in sentences for bp in bps]

  # Predict chunk tags in place on the Sentence objects.
  tagger.predict(sentences, verbose=True, mini_batch_size=250)

  # Regroup as product -> sentence -> [(chunk text, chunk tag), ...].
  processed_bps = [
    [
      [(entity.text, entity.tag) for entity in sentence.get_spans('np')]
      for sentence in sentences[cut_offs[p]: cut_offs[p+1]]
    ]
    for p in range(len(cut_offs)-1)
  ]

  # Files are numbered from 1, so batch i is stored as (i+1).pkl.
  file_no = i + 1
  with open(os.path.join(OUTPUT_DIR, f'{file_no}.pkl'), 'wb') as f:
    pickle.dump(processed_bps, f)

  optimize_gpu()

  0%|          | 0/1 [00:00<?, ?it/s]




Batch inference:   0%|          | 0/64 [00:00<?, ?it/s][A
Batch inference:   2%|▏         | 1/64 [00:02<02:24,  2.30s/it][A
Batch inference:   3%|▎         | 2/64 [00:03<01:33,  1.51s/it][A
Batch inference:   5%|▍         | 3/64 [00:04<01:13,  1.21s/it][A
Batch inference:   6%|▋         | 4/64 [00:04<01:02,  1.04s/it][A
Batch inference:   8%|▊         | 5/64 [00:05<00:54,  1.08it/s][A
Batch inference:   9%|▉         | 6/64 [00:06<00:59,  1.03s/it][A
Batch inference:  11%|█         | 7/64 [00:07<00:51,  1.11it/s][A
Batch inference:  12%|█▎        | 8/64 [00:08<00:45,  1.22it/s][A
Batch inference:  14%|█▍        | 9/64 [00:08<00:41,  1.32it/s][A
Batch inference:  16%|█▌        | 10/64 [00:09<00:37,  1.43it/s][A
Batch inference:  17%|█▋        | 11/64 [00:09<00:34,  1.52it/s][A
Batch inference:  19%|█▉        | 12/64 [00:10<00:34,  1.52it/s][A
Batch inference:  20%|██        | 13/64 [00:11<00:31,  1.62it/s][A
Batch inference:  22%|██▏       | 14/64 [00:12<00:38,  1.30it/s]

In [2]:
# List the files present in the local temp-bps folder.
os.listdir('../data/2023/temp-bps')

['49.pkl',
 '61.pkl',
 '75.pkl',
 '74.pkl',
 '60.pkl',
 '48.pkl',
 '76.pkl',
 '62.pkl',
 '89.pkl',
 '88.pkl',
 '63.pkl',
 '77.pkl',
 'lala.zip',
 '73.pkl',
 '67.pkl',
 '9.pkl',
 '98.pkl',
 '.DS_Store',
 '99.pkl',
 '8.pkl',
 '66.pkl',
 '72.pkl',
 '64.pkl',
 '70.pkl',
 '58.pkl',
 '59.pkl',
 '71.pkl',
 '65.pkl',
 '16.pkl',
 '103.pkl',
 '117.pkl',
 '116.pkl',
 '102.pkl',
 '17.pkl',
 '29.pkl',
 '15.pkl',
 '114.pkl',
 '100.pkl',
 '128.pkl',
 '129.pkl',
 '101.pkl',
 '115.pkl',
 '14.pkl',
 '28.pkl',
 '10.pkl',
 '38.pkl',
 '139.pkl',
 '111.pkl',
 '105.pkl',
 '104.pkl',
 '110.pkl',
 '138.pkl',
 '39.pkl',
 '11.pkl',
 '13.pkl',
 '106.pkl',
 '112.pkl',
 '113.pkl',
 '107.pkl',
 '12.pkl',
 '23.pkl',
 '37.pkl',
 '122.pkl',
 '136.pkl',
 '137.pkl',
 '123.pkl',
 '36.pkl',
 '22.pkl',
 '34.pkl',
 '20.pkl',
 '135.pkl',
 '121.pkl',
 '109.pkl',
 '108.pkl',
 '120.pkl',
 '134.pkl',
 '21.pkl',
 '35.pkl',
 '31.pkl',
 '25.pkl',
 '19.pkl',
 '118.pkl',
 '130.pkl',
 '124.pkl',
 'results1.zip',
 '125.pkl',
 '131.pkl',

In [3]:
# Batch files are numbered 1..142 (141 full batches plus one partial batch).
N_CHUNK_FILES = 141 + 1


def _load_pickle(path):
  """Load one pickle file and close its handle as soon as it has been read."""
  # NOTE(review): pickle.load can execute arbitrary code; only use it on
  # files produced by this pipeline.
  with open(path, 'rb') as f:
    return pickle.load(f)


# One entry per batch file, in file-number order.
chunked_bps = [
  _load_pickle(f'../data/2023/temp-bps/{k+1}.pkl')
  for k in tqdm(range(N_CHUNK_FILES))
]

100%|█████████████████████████████████████████| 142/142 [00:35<00:00,  3.97it/s]


In [4]:
# Concatenate the per-file lists into a single list with one entry per product.
# Running this cell a second time would flatten one level further.
all_products = []
for file_products in tqdm(chunked_bps):
  all_products.extend(file_products)
chunked_bps = all_products

100%|███████████████████████████████████████| 142/142 [00:00<00:00, 1075.14it/s]


In [5]:
# Average number of entries (tagged sentences) per product.
sentences_per_product = [len(product) for product in chunked_bps]
np.mean(sentences_per_product)

6.851608202661999

In [6]:
# Number of entries in the flattened list.
len(chunked_bps)

1412322

In [7]:
# Persist the merged list as a single pickle so later sessions can load it
# directly instead of re-reading every batch file.
with open('../data/2023/temp-bps/chunked-train-bps-2023.pkl', 'wb') as out_file:
  pickle.dump(chunked_bps, out_file)

In [3]:
# Reload the merged chunk list; the context manager closes the file handle
# instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code; only use it on files
# produced by this pipeline.
with open('../data/2023/temp-bps/chunked-train-bps-2023.pkl', 'rb') as f:
  chunked_bps = pickle.load(f)

In [19]:
# Print the first 100 raw bullet-point strings, separated by blank lines.
# Slice the column before iterating so the whole column is not converted
# to a Python list just to show 100 rows.
for sent in data.BULLET_POINTS.iloc[:100]:
  print(sent)
  print()

[LUXURIOUS & APPEALING: Beautiful custom-made curtains to decorate any home or office | Includes inbuilt tieback to hold the curtain | Completely finished and ready to hang on walls & windows,MATERIAL: Luxurious & versatile fabric with a natural finish | High colour fastness | State-of-the-art digital printing ensures colour consistency and prevents any fading | Eyelets; Cotton Canvas; Width 4.5feet (54inch) | Multicolour | PACKAGE: 2 Room Curtains Eyelets | SIZE: Height 5 feet (60 inch); SET OF 2 PCS,BLACKOUT CURTAIN: 100% opaque & heavy premium cotton canvas fabric | Tight knitted, long life & durable fabric | Printing only on front side with a plain colour back side,MADE TO PERFECTION: Large eyelets at the top to put hanging hooks | Perfectly tailored seams for durability | Refined stitching with a matching thread color,QUALITY ASSURED: Gentle wash with similar colors in cold water | Avoid direct sunlight to prevent fading | Dispatched after MULTIPLE QUALITY CHECKS]

[Harry Potter H

In [11]:
# Print the tagged chunks of the first 100 products: one line per sentence,
# written as "text (TAG)" pairs, with two blank lines between products.
for product_idx in range(100):
  product_sentences = chunked_bps[product_idx]
  for sentence_chunks in product_sentences:
    labelled = [chunk[0] + f' ({chunk[1]})' for chunk in sentence_chunks]
    print(' '.join(labelled))
    print()
  print()
  print()

LUXURIOUS and APPEALING (NP) Beautiful custom-made curtains (NP) to decorate (VP) any home or office (NP) Includes (VP) inbuilt tieback (NP) to hold (VP) the curtain (NP) Completely finished (VP) ready to hang (VP) on (PP) walls and windows (NP)

MATERIAL (NP) Luxurious and versatile fabric (NP) with (PP) a natural finish (NP) High colour fastness (NP) State-of-the-art digital printing (NP) ensures (VP) colour consistency (NP) prevents (VP) any fading (NP) Eyelets (NP) Cotton Canvas (NP) Width 4.5feet (NP) 54inch (NP) Multicolour (NP) PACKAGE (NP) 2 Room Curtains Eyelets (NP) SIZE (NP) Height (NP) 5 feet (NP) 60 inch (NP) SET (NP) OF (PP) 2 pieces (NP)

BLACKOUT CURTAIN (NP) 100 % (NP) opaque (ADJP) heavy premium cotton canvas fabric (NP) Tight knitted (NP) long life (NP) durable fabric (NP) Printing (VP) only (ADVP) on (PP) front side (NP) with (PP) a plain colour back side (NP)

MADE (VP) TO (PP) PERFECTION (NP) Large eyelets (NP) at (PP) the top (NP) to put (VP) hanging hooks (NP) P

In [26]:
# Unique lower-cased texts of every chunk whose tag is not 'NP'.
# A set comprehension avoids building a full intermediate list of all
# matching chunks before de-duplicating.
prod_relationships = {
  chunk[0].lower()
  for bps in chunked_bps
  for bp in bps
  for chunk in bp
  if chunk[1] != 'NP'
}

In [28]:
# Number of distinct non-NP chunk texts.
len(prod_relationships)

309167