### Imports

In [1]:
 import nltk
 nltk.download('stopwords')  
 nltk.download('punkt')
 !pip install num2words
 !pip install transformers

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting num2words
  Downloading num2words-0.5.10-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 3.7 MB/s 
Installing collected packages: num2words
Successfully installed num2words-0.5.10
Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.7 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl 

In [2]:
import pandas as pd
import csv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import numpy as np
import math
import json

import spacy
from gensim.summarization.bm25 import BM25
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, QuestionAnsweringPipeline
import concurrent.futures
import itertools
import operator
import re

## Preprocessor

In [4]:
class PreProcessor:

    def __init__(self, data):
        self.data = data;

    def execute(self):
        self.convert_lower_case()
        self.remove_punctuation() #remove comma seperately
        self.remove_apostrophe()
        self.remove_stop_words()
        self.convert_numbers()
        self.stemming()
        self.remove_punctuation()
        self.convert_numbers()
        self.stemming() #needed again as we need to stem the words
        self.remove_punctuation() #needed again as num2word is giving few hypens and commas fourty-one
        self.remove_stop_words() #needed again as num2word is giving stop words 101 - one hundred and one        
        return self.data

    def convert_lower_case(self):
        self.data = np.char.lower(self.data)

    def remove_stop_words(self):
        stop_words = stopwords.words('english')
        words = word_tokenize(str(self.data))
        new_text = ""
        for w in words:
            if w not in stop_words and len(w) > 1:
                new_text = new_text + " " + w
        self.data = new_text
    
    def remove_punctuation(self):
        symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
        for i in range(len(symbols)):
            data = np.char.replace(self.data, symbols[i], ' ')
            data = np.char.replace(data, "  ", " ")
        data = np.char.replace(data, ',', '')
        self.data = data

    def remove_apostrophe(self):
        self.data = np.char.replace(self.data, "'", "")

    def stemming(self):
        stemmer= PorterStemmer()
        
        tokens = word_tokenize(str(self.data))
        new_text = ""
        for w in tokens:
            new_text = new_text + " " + stemmer.stem(w)
        self.data = new_text

    def convert_numbers(self):
        tokens = word_tokenize(str(self.data))
        new_text = ""
        for w in tokens:
            try:
                w = num2words(int(w))
            except:
                a = 0
            new_text = new_text + " " + w
        new_text = np.char.replace(new_text, "-", " ")
        self.data = new_text

## Compressor

In [5]:
class Compressor:

    def __init__(self, df):
        self.df = df;
        self.col = ['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND', 'BROWSE_NODE_ID']
        self.data_dict = []

    def validRow(self, row):
        c = row.shape[0]
        for i in range(c):
            if type(row[i]) is float:
                row[i] = ''
        return row

    def compress(self):
        print(f'[INFO] Compress init')
        self.df.fillna('', inplace=True) # Handling null values

        data = {}
        l = self.df.shape[0]
        x = int(l/100);
        for i in range(l):
            if i%x == 0:
                print('.' , end='')
            bni = self.df.loc[i]['BROWSE_NODE_ID']
            data[bni] = ["", "", "", "", bni]
        print()
        for i in range(self.df.shape[0]):
            if i%x == 0:
                print('|' , end='')
            bni = self.df.loc[i]['BROWSE_NODE_ID']
            row = self.validRow(self.df.loc[i])

            data[bni][0] = data[bni][0] + ' \n ' + row['TITLE']
            data[bni][1] = data[bni][1] + ' \n ' + row['DESCRIPTION']
            data[bni][2] = data[bni][2] + ' \n ' + row['BULLET_POINTS']
            data[bni][3] = data[bni][3] + ' \n ' + row['BRAND']
        print()
        self.create_new_df(data)

    # precompress will preprocess text while compressing it
    def precompress(self):
        print(f'[INFO] Compress init')
        self.df.fillna('', inplace=True) # Handling null values

        data = {}
        l = self.df.shape[0]
        x = int(l/100);
        for i in range(l):
            if i%x == 0:
                print('.' , end='')
            bni = self.df.loc[i]['BROWSE_NODE_ID']
            data[bni] = ["", "", "", "", bni]
        print()
        for i in range(self.df.shape[0]):
            if i%x == 0:
                print('|' , end='')
            bni = self.df.loc[i]['BROWSE_NODE_ID']
            row = self.validRow(self.df.loc[i])

            data[bni][0] = data[bni][0] + ' \n ' + PreProcessor(row['TITLE']).execute()
            data[bni][1] = data[bni][1] + ' \n ' + PreProcessor(row['DESCRIPTION']).execute()
            data[bni][2] = data[bni][2] + ' \n ' + PreProcessor(row['BULLET_POINTS']).execute()
            data[bni][3] = data[bni][3] + ' \n ' + PreProcessor(row['BRAND']).execute()
        print()
        self.create_new_df(data)

    def create_new_df(self, data):
        print(f'[INFO] Creating new dataframe')
        for d in data:
            new_row = {self.col[0] : data[d][0], self.col[1] : data[d][1], self.col[2] : data[d][2], self.col[3] : data[d][3], self.col[4] : data[d][4]}
            self.data_dict.append(new_row)
        print(f'[INFO] data compressed')

    def save_csv(self, filename):
        with open(filename, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.col)
            writer.writeheader()
            writer.writerows(self.data_dict)

    def get_csv(self):
        return pd.DataFrame.from_dict(self.data_dict)

## Nodes Extractor

In [6]:
SPACY_MODEL = os.environ.get('SPACY_MODEL', 'en_core_web_sm')
nlp = spacy.load(SPACY_MODEL, disable=['ner', 'parser', 'textcat'])

In [7]:
class NodeExtractor:

    def __init__(self, nlp):
        self.tokenize = lambda text: [token.lemma_ for token in nlp(text)]
        self.bm25 = None
        self.passages = None
        self.directory = None # {1: 1, 2:1, 3:1, 4:2 ...... } key = passageindex & value = column index

    def preprocess(self, df):
        print(f'[INFO] compiling data')
        directory = {}
        passages = []
        l = df.shape[0]
        x = int(l/100)
        for i in range(100):
            print('.', end='')
        print()
        for i in range(l):
            if i%x == 0:
                print('|', end='')
            txt = f'{df.loc[i]["TITLE"]} \n {df.loc[i]["DESCRIPTION"]} \n {df.loc[i]["BULLET_POINTS"]} \n {df.loc[i]["BRAND"]}'    
            sentences = txt.split('\n')
            for j in range(len(sentences)):
                sentences[j] = sentences[j].strip()
                if (len(sentences[j]) < 1):
                    continue
                passages.append(sentences[j])
                directory[len(passages)-1] = df['BROWSE_NODE_ID'].loc[i]
        print()
        self.passages = passages
        self.directory = directory
    
    def train(self, df):
        print(f'[INFO] Training init')
        self.preprocess(df)
        print(f'[INFO] Featching corpus to BM25....')
        corpus = [self.tokenize(p) for p in self.passages]
        self.bm25 = BM25(corpus)
        print(f'[INFO] Training finished')

    def test(self, row, topn=10):
        text = f'{row["TITLE"]} \n {row["DESCRIPTION"]} \n {row["BULLET_POINTS"]} \n {row["BRAND"]}'
        text = PreProcessor(text).execute()
        tokens = self.tokenize(text)
        average_idf = sum(map(lambda k: float(self.bm25.idf[k]), self.bm25.idf.keys()))
        scores = self.bm25.get_scores(tokens, average_idf)
        pairs = [(s, i) for i, s in enumerate(scores)]
        pairs.sort(reverse=True)
        indices = [self.directory[i] for _, i in pairs[:topn]]
        scores = [s for s, _ in pairs[:topn]]
        return [scores, indices]

## Train

In [8]:
data = pd.read_csv('s_train.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4


In [9]:
compressor = Compressor(data)
compressor.precompress()
compressed_data = compressor.get_csv()
# compressed_data.head()

[INFO] Compress init
....................................................................................................
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
[INFO] Creating new dataframe
[INFO] data compressed


In [10]:
model = NodeExtractor(nlp)
model.train(compressed_data)

[INFO] Training init
[INFO] compiling data
....................................................................................................
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
[INFO] Featching corpus to BM25....
[INFO] Training finished


## Test

In [18]:
# test_data = pd.read_csv('/content/drive/MyDrive/AmazonMLChallenge/Dataset/test.csv', escapechar = "\\" , quoting = csv.QUOTE_NONE, engine='python')
test_data = pd.read_csv('m_test.csv')
test_data.head()

Unnamed: 0.1,Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


Multiclass Classifier

In [19]:
out = []
l = test_data.shape[0]
x = int(l/100)

for i in range(100):
    print('.', end='')
print()

for i in range(l):
    if i%x==0:
        print('|', end='')
    row = test_data.loc[i]
    scores, indices = model.test(row, 3)
    # string = f"{row['PRODUCT_ID']},{indices[0]}"
    # out.append({"PRODUCT_ID,BROWSE_NODE_ID": string})
    out.append({"PRODUCT_ID" : row['PRODUCT_ID'], "BROWSE_NODE_ID" : indices[0]})
    
print()

prediction = pd.DataFrame.from_dict(out)
# prediction.head()

....................................................................................................
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||


In [20]:
prediction.head()

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,435
1,2,474
2,3,18
3,4,125
4,5,852
