In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split

In [3]:
# Parse the training CSV with backslash as the escape character and with
# quote characters given no special treatment (csv.QUOTE_NONE).
train = pd.read_csv(
    'dataset/train.csv',
    escapechar = "\\",
    quoting = csv.QUOTE_NONE,
)

In [4]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4


In [5]:
# (rows, columns) of the raw training frame.
train.shape

(2903024, 5)

In [8]:
# Keep only the rows that hold at least 3 non-null values,
# i.e. drop every row with more than 2 nulls across its 5 columns
train_thresh = train.dropna(axis = 0, thresh = 3)
train_thresh.head(n = 5)

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
5,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,"[Color: Blue,Sleeve: Full Sleeve,Material: Cot...",Bhavya Enterprise,5


In [9]:
# (rows, columns) left after dropping the sparsely populated rows.
train_thresh.shape

(2855971, 5)

In [10]:
# Per-column dtypes of the raw training frame.
train.dtypes

TITLE             object
DESCRIPTION       object
BULLET_POINTS     object
BRAND             object
BROWSE_NODE_ID     int64
dtype: object

In [11]:
# Count the missing values in every column of the raw frame
train.isna().sum()

TITLE                 71
DESCRIPTION       723664
BULLET_POINTS     166263
BRAND              56737
BROWSE_NODE_ID         0
dtype: int64

In [12]:
# Number of distinct classes / product browse nodes
# (dropna=False keeps a NaN, if any, counted as one value, like unique() does)
train['BROWSE_NODE_ID'].nunique(dropna=False)

9919

In [13]:
# Replace NaN values with empty strings (not spaces) so that every remaining
# cell of the text columns is a plain string.
# fillna('') is used instead of replace(np.nan, '', regex=True): with
# regex=True pandas documents that to_replace must be a string, and no
# pattern matching is needed to fill missing values.
train_replaced_NaN = train_thresh.fillna('')

In [14]:
# Preview the frame after the NaN replacement.
train_replaced_NaN.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
5,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,"[Color: Blue,Sleeve: Full Sleeve,Material: Cot...",Bhavya Enterprise,5


In [15]:
# Separate the text features from the target, then hold out 20% of the rows
# for validation (fixed random_state keeps the split reproducible)
X = train_replaced_NaN.loc[:, ['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND']]
y = train_replaced_NaN.loc[:, "BROWSE_NODE_ID"]

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size = 0.8,
    test_size = 0.2,
    random_state = 0,
)

In [27]:
# Corpus generation function
def generateCorpus(df):
    """Build one text entry per row of ``df`` by concatenating its fields.

    Each field of a row is followed by a comma and a space, and the finished
    entry is terminated by a single newline character.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are all strings. A non-string cell (for example a
        NaN float) raises ``TypeError`` during concatenation.

    Returns
    -------
    list of str
        One entry per row, in row order; an empty list for a frame with no rows.
    """
    separator = ', '
    # Build each entry with join() rather than growing a local named `str`,
    # which shadowed the built-in type for the whole function body.
    return [
        ''.join(field + separator for field in row) + '\n'
        for row in df.to_numpy()
    ]

In [28]:
# Build the text corpus from the training split only.
train_corpus = generateCorpus(X_train)

In [29]:
# Inspect the first three corpus entries.
train_corpus[:3]

['Safex Fire ABC\xa0Fire Extinguisher\xa06kg Pack of 2, ABC powder based fire extinguishers are the most widely used fire extinguishers, filled with MAP (mono ammonium phosphate) dry powder, Suitable for all types of fire i.e. A b c and electrically started fire effectively extinguish by interrupting the chemical reaction of a fire triangle. Suitable for all - car / home / office / pantry / commercial, residential ; industrial buildings, Home/Office/Car/Commercial/Residential Building Purpose., [Material : Iron,Ring Handle Mount,ABC Types Fire extinguishers are effective for all types of fire like Class A, B and C types of fires as well as Electrical fires and also ABC Powder Type (Stored Pressure) Fire Extinguisher, Multipurpose uses,Clear Instruction Label and No Maintenance required,operating temperature (-0) ºC to (+55) ºC], Safex Fire, \n',
 "Shopsmeade Malec Shadowhunters S2 Mug with Electronic Education Record ® Gift Card | Collector Edition Mug | Gifts for Boyfriend Girlfriend 

In [None]:
import os

# open() does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./corpus", exist_ok=True)

# The mode has to be the string "a"; the bare name `a` was undefined and
# raised NameError. NOTE: append mode means re-running this cell adds the
# corpus to the file a second time.
# The `with` block closes the file even if a write fails part-way.
with open("./corpus/train_corpus.txt", "a", encoding = "utf8") as txt_file:
    for desc in train_corpus:
        # Entries produced by generateCorpus already end in "\n", so the extra
        # "\n" leaves one blank line between consecutive entries.
        txt_file.write(desc + "\n")