In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm_notebook as tqdm
import csv
import pickle

In [6]:
filtered = pd.read_csv('filtered_data.csv')
filtered.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,"[Color: Blue,Sleeve: Full Sleeve,Material: Cot...",Bhavya Enterprise,5
3,Glance Women's Wallet (Black) (LW-21),This Black wallet by Glance will be a treasure...,[The Most Comfortable Women's Wallet That You ...,Glance,6
4,Wild Animals Hungry Brain Educational Flash Ca...,Wild Animals are the animals that mostly stays...,[Playful learning: Flash cards develops the lo...,hungry brain,7


In [3]:
filtered.shape

(1903408, 5)

In [4]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [5]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [6]:
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

In [7]:
stemmer = SnowballStemmer('english')

In [8]:
eng_stopwords = set(stopwords.words('english'))

In [9]:
def preprocess(sentence):
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'html.parser').get_text()
    sentence = decontracted(sentence)
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    sentence = " ".join([stemmer.stem(word) for word in sentence.split()])
    sentence = cleanpunc(sentence)
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in eng_stopwords)
    return sentence

In [12]:
pre_processed_text = []

for col in tqdm(filtered.values):
    text = preprocess(col[0]+' '+col[1]+' '+col[2]+' '+col[3])
    pre_processed_text.append(text)

for i,text in enumerate(pre_processed_text[:5],1):
    print(f"{i}) {text}")

HBox(children=(IntProgress(value=0, max=1903408), HTML(value='')))

1) pete cat bedtim blue doll inch pete cat coolest popular cat town new pete cat bedtim blue doll merrymak rock stripe pj red slipper one sleepi cat readi cuddl measur inch tall safe age remov cloth surfac wash onli new pete cat bedtim blue plush doll base popular pete cat book jame dean super cudd readi naptim bedtim safe age perfect age measur inch merrymak
2) new yorker refriger magnet x new yorker handsom cello wrap hard magnet measur width height highlight one mani beauti new yorker cover full color cat tea cup new yorker cover artist gurbuz dogan eksioglu cat tea cup new yorker cover artist gurbuz dogan eksioglu handsom cello wrap hard magnet ideal home offic gift ani new yorker magazin lover highlight one mani beauti new yorker cover full color rigid magnet measur width height new yorker
3) men full sleev raglan shirt denim shirt size onli men full sleev raglan shirt denim shirt size onli color blue sleev full sleev materi cotton neck round bhavya enterpris
4) glanc women wallet

In [17]:
pickle.dump(pre_processed_text, open('filtered_ppt', 'wb'))

In [30]:
pre_processed_text = pickle.load(open('filtered_ppt', 'rb'))
len(pre_processed_text)

1903408

In [32]:
Y_train.shape

(1903408,)

In [3]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 50000, ngram_range = (1,2))
X_train = tfidf.fit_transform(pre_processed_text)
print(X_train.shape)

(1903408, 50000)
Wall time: 6min 50s


In [25]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('test', 168736889),
 ('Y_train', 15227416),
 ('pre_processed_test', 927568),
 ('prod_id', 886352),
 ('all_classes', 22168),
 ('TfidfVectorizer', 2000),
 ('MultinomialNB', 1056),
 ('text', 1019),
 ('train_test_split', 136),
 ('col', 96),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('sns', 80),
 ('X_train', 56),
 ('mnb', 56),
 ('test_final', 56),
 ('tfidf', 56),
 ('i', 28)]

In [26]:
del test
del pre_processed_test

In [7]:
Y_train = filtered['BROWSE_NODE_ID']
print(Y_train.shape)

(1903408,)


In [8]:
del filtered

In [2]:
test = pd.read_csv('test.csv', escapechar = "\\", quoting = csv.QUOTE_NONE)
test.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


In [10]:
pre_processed_test = []

for col in tqdm(test.values):
    text = ""
    if not col[1]: text += "empty "
    else: text += str(col[1])
    if not col[2]: text += " empty "
    else: text += str(col[2])
    if not col[3]: text += " empty "
    else: text += str(col[3])
    if not col[4]: text += " empty"
    else: text += str(col[4])
    pre_processed_test.append(preprocess(text))

for i,text in enumerate(pre_processed_test[:5],1):
    print(f"{i}) {text}")

HBox(children=(IntProgress(value=0, max=110775), HTML(value='')))


1) command small kitchen hook white decor damag free easi easi hook strip multi pack unit pack includ hook small indoor strip hook hold lb organ damag free say goodby hole mark sticki residu wall door cabinet closet command hook easi use help keep wall look beauti tool requir hang hat calendar key measur spoon clean tool jewelri accessori want without nail hammer includ metal wire toggl featur versatil hang strong versatil command wire hook includ hold strong varieti indoor surfac includ paint wall smooth ceil finish wood glass tile metal smooth surfac remov clean reorgan inspir strike wall hook leav sticki adhes behind perfect use insid cabinet closet colleg dorm apart home offic damag free hang hold strong remov clean easi appli remov work varieti surfac command
2) neal jump hardwar jag unisex adult glove black red leather palm doubl layer thumb reinforc hook loop adjust wrist closur silicon print better grip long last flexibl materi perfect fit super lightweight design bold non fad

In [11]:
tfidf = pickle.load(open('tfidf_model_50K', 'rb'))

In [12]:
test_final = tfidf.transform(pre_processed_test)

In [13]:
test_final.shape

(110775, 50000)

In [14]:
pickle.dump(test_final, open('test_tfidf_50K', 'wb'))

In [15]:
pickle.dump(tfidf, open('tfidf_model_50K', 'wb'))

In [16]:
del pre_processed_text

In [15]:
import sys
sys.getsizeof(X_train)

56

In [17]:
pickle.dump((X_train, Y_train), open('filtered_tfidf_50K.pkl', 'wb'))

In [15]:
X_train, Y_train = pickle.load(open('filtered_tfidf_50K.pkl', 'rb'))

In [16]:
print(X_train.shape, Y_train.shape)

(1903408, 50000) (1903408,)


In [17]:
all_classes = Y_train.unique()
len(all_classes)

2759

In [18]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha = 0.00001)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
start = 0
minibatch_size = 50000
for i in tqdm(range(0, (int)(X_train.shape[0]/minibatch_size))):
    
    train_x = X_train[start:(start+minibatch_size), :]
    train_y = Y_train.iloc[start:(start+minibatch_size)]
    mnb.partial_fit(train_x, train_y, classes=all_classes)
    #_, test_x, _, test_y = train_test_split(X_test,Y_test, test_size=0.1)
    #print(f"Accuracy = {mnb.score(test_x, test_y)}")
    start+=minibatch_size

HBox(children=(IntProgress(value=0, max=38), HTML(value='')))




In [21]:
test = pd.read_csv('test.csv', escapechar = "\\", quoting = csv.QUOTE_NONE)
test.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


In [22]:
prod_id = test['PRODUCT_ID']

In [23]:
test_final.shape

(110775, 50000)

In [24]:
pred = mnb.predict(test_final)
pred.shape

(110775,)

In [25]:
submission = pd.DataFrame(zip(prod_id, pred), columns = ['PRODUCT_ID', 'BROWSE_NODE_ID'])
submission.to_csv('submission9.csv', index = False)

In [43]:
from sklearn.linear_model import SGDClassifier
lr = SGDClassifier(loss = 'log', penalty = 'l1')
start = 0
minibatch_size = 50000
for i in tqdm(range(0, (int)(X_train.shape[0]/minibatch_size))):
    
    train_x = X_train[start:(start+minibatch_size), :]
    train_y = Y_train.iloc[start:(start+minibatch_size)]
    lr.partial_fit(train_x, train_y, classes=all_classes)
    #_, test_x, _, test_y = train_test_split(X_test,Y_test, test_size=0.1)
    #print(f"Accuracy = {mnb.score(test_x, test_y)}")
    start+=minibatch_size

HBox(children=(IntProgress(value=0, max=38), HTML(value='')))

In [44]:
predlr = lr.predict(test_final)
predlr.shape

(110775,)

In [45]:
submission = pd.DataFrame(zip(test['PRODUCT_ID'], predlr), columns = ['PRODUCT_ID', 'BROWSE_NODE_ID'])
submission.to_csv('submission4.csv', index = False)

In [46]:
brand.shape

(1903408,)

In [47]:
brand_ohe.shape

(1903408, 213032)

In [49]:
nmnb = MultinomialNB(alpha = 0.0001)

In [51]:
start = 0
minibatch_size = 10000
for i in tqdm(range(0, (int)(X_train.shape[0]/minibatch_size))):
    
    train_x = brand_ohe[start:(start+minibatch_size), :]
    train_y = Y_train.iloc[start:(start+minibatch_size)]
    nmnb.partial_fit(train_x, train_y, classes=all_classes)
    #_, test_x, _, test_y = train_test_split(X_test,Y_test, test_size=0.1)
    #print(f"Accuracy = {mnb.score(test_x, test_y)}")
    start+=minibatch_size

HBox(children=(IntProgress(value=0, max=190), HTML(value='')))

MemoryError: Unable to allocate array with shape (2759, 213032) and data type float64

In [52]:
del brand
del brand_ohe
del predlr

In [54]:
del X_train
del Y_train