In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm_notebook as tqdm
import csv

In [2]:
data = pd.read_csv('train.csv', escapechar = "\\", quoting = csv.QUOTE_NONE)
data.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4


In [3]:
# data shape before and after removing duplicates from the data (if any)

print(f"Data Shape Before removing Duplicates : {data.shape}")

no_dup_data = data.drop_duplicates(subset = 'TITLE', inplace=False,keep='first')

print(f"Data Shape After removing Duplicates : {no_dup_data.shape}")

Data Shape Before removing Duplicates : (2903024, 5)
Data Shape After removing Duplicates : (2751513, 5)


In [4]:
print("total NaN values for different columns in the dataset:\n")
nans = [no_dup_data[name].isna().sum() for name in no_dup_data.columns]
total_pts = no_dup_data.shape[0]
for num,name in zip(nans,no_dup_data.columns):
    print(f"{name} : {(num/total_pts)*100}%")

total NaN values for different columns in the dataset:

TITLE : 3.634364075328737e-05%
DESCRIPTION : 24.782946691511178%
BULLET_POINTS : 5.9447656616559685%
BRAND : 2.05610513197648%
BROWSE_NODE_ID : 0.0%


In [5]:
# we will drop the rows with NaN values
clean_data = no_dup_data.dropna()

In [6]:
print(f"After dropping NaN values, Data shape : {clean_data.shape}")

After dropping NaN values, Data shape : (2002281, 5)


In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [9]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [10]:
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

In [11]:
stemmer = SnowballStemmer('english')

In [12]:
eng_stopwords = set(stopwords.words('english'))

In [13]:
def preprocess(sentence):
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'html.parser').get_text()
    sentence = decontracted(sentence)
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    sentence = " ".join([stemmer.stem(word) for word in sentence.split()])
    sentence = cleanpunc(sentence)
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in eng_stopwords)
    return sentence

In [14]:
xtrain = clean_data.iloc[:500000]
xtest = clean_data.iloc[500000:600000]
ytrain = clean_data['BROWSE_NODE_ID'].iloc[:500000]
ytest = clean_data['BROWSE_NODE_ID'].iloc[500000:600000]
print(xtrain.shape, ytrain.shape)
print(xtest.shape, ytest.shape)

(500000, 5) (500000,)
(100000, 5) (100000,)


In [15]:
X = clean_data.drop(columns = ['BROWSE_NODE_ID'])
Y = clean_data['BROWSE_NODE_ID']

In [16]:
subsetx = X.iloc[:500000]
subsety = Y.iloc[:500000]
print(subsetx.shape, subsety.shape)

(500000, 4) (500000,)


In [18]:
pre_processed_text = []

for col in tqdm(subsetx.values):
    text = preprocess(col[0]+' '+col[1]+' '+col[2])
    pre_processed_text.append(text)

for i,text in enumerate(pre_processed_text[:5],1):
    print(f"{i}) {text}")

HBox(children=(IntProgress(value=0, max=500000), HTML(value='')))


1) pete cat bedtim blue doll inch pete cat coolest popular cat town new pete cat bedtim blue doll merrymak rock stripe pj red slipper one sleepi cat readi cuddl measur inch tall safe age remov cloth surfac wash onli new pete cat bedtim blue plush doll base popular pete cat book jame dean super cudd readi naptim bedtim safe age perfect age measur inch
2) new yorker refriger magnet x new yorker handsom cello wrap hard magnet measur width height highlight one mani beauti new yorker cover full color cat tea cup new yorker cover artist gurbuz dogan eksioglu cat tea cup new yorker cover artist gurbuz dogan eksioglu handsom cello wrap hard magnet ideal home offic gift ani new yorker magazin lover highlight one mani beauti new yorker cover full color rigid magnet measur width height
3) men full sleev raglan shirt denim shirt size onli men full sleev raglan shirt denim shirt size onli color blue sleev full sleev materi cotton neck round
4) glanc women wallet black black wallet glanc treasur ad

In [98]:
pre_processed_text = []

for col in tqdm(X.values):
    text = preprocess(col[0]+' '+col[1]+' '+col[2])
    pre_processed_text.append(text)

for i,text in enumerate(pre_processed_text[:5],1):
    print(f"{i}) {text}")

HBox(children=(IntProgress(value=0, max=2002281), HTML(value='')))

1) pete cat bedtim blue doll inch pete cat coolest popular cat town new pete cat bedtim blue doll merrymak rock stripe pj red slipper one sleepi cat readi cuddl measur inch tall safe age remov cloth surfac wash onli new pete cat bedtim blue plush doll base popular pete cat book jame dean super cudd readi naptim bedtim safe age perfect age measur inch
2) new yorker refriger magnet x new yorker handsom cello wrap hard magnet measur width height highlight one mani beauti new yorker cover full color cat tea cup new yorker cover artist gurbuz dogan eksioglu cat tea cup new yorker cover artist gurbuz dogan eksioglu handsom cello wrap hard magnet ideal home offic gift ani new yorker magazin lover highlight one mani beauti new yorker cover full color rigid magnet measur width height
3) men full sleev raglan shirt denim shirt size onli men full sleev raglan shirt denim shirt size onli color blue sleev full sleev materi cotton neck round
4) glanc women wallet black black wallet glanc treasur add

In [99]:
pickle.dump(pre_processed_text, open('preprocessed_text', 'wb'))

In [41]:
"""X_train = pre_processed_text[:400000]
X_test = pre_processed_text[400000:]
y = subsety
y_train = y.iloc[:400000]
y_test = y.iloc[400000:]
print(y_train.shape)
print(y_test.shape)"""

(400000,)
(100000,)


In [101]:
X_train = pre_processed_text

In [102]:
from sklearn.feature_extraction.text import CountVectorizer
cnt_vec = CountVectorizer(min_df = 10, max_features = 30000)
X_train = cnt_vec.fit_transform(X_train)
#X_test = cnt_vec.transform(X_test)
print(X_train.shape)

(2002281, 30000)


In [111]:
Y.shape

(2002281,)

In [103]:
pickle.dump(X_train, open('bow_unfiltered_train', 'wb'))

In [114]:
pickle.dump(Y, open('unfiltered_labels', 'wb'))

In [104]:
sys.getsizeof(X_train)

56

In [117]:
del no_dup_data
del clean_data
del data
del xtrain
del subsetx
del test
del xtest
del w2v_words
del ytrain
del list_of_sentance
del preprocessed_essays
del test_class_distribution
del train_class_distribution

NameError: name 'xtrain' is not defined

In [97]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('X', 3134391632),
 ('Y', 32036520),
 ('subsety', 8000024),
 ('y', 8000024),
 ('y_train', 6400024),
 ('sub', 1772552),
 ('submission', 1772552),
 ('y_test', 1600024),
 ('ytest', 1600024),
 ('pre_processed_text', 927568),
 ('sorted_yi', 76104),
 ('eng_stopwords', 8416),
 ('sent_vec', 2496),
 ('BeautifulSoup', 2000),
 ('SGDClassifier', 2000),
 ('CountVectorizer', 1464),
 ('KNeighborsClassifier', 1464),
 ('LogisticRegression', 1464),
 ('sent', 1344),
 ('KeyedVectors', 1056),
 ('OneVsRestClassifier', 1056),
 ('SnowballStemmer', 1056),
 ('Word2Vec', 1056),
 ('text', 1015),
 ('OneHotEncoder', 888),
 ('sent_vectors', 432),
 ('sentance', 286),
 ('classification_report', 136),
 ('cleanpunc', 136),
 ('confusion_matrix', 136),
 ('decontracted', 136),
 ('preprocess', 136),
 ('nans', 128),
 ('col', 96),
 ('pred', 96),
 ('row', 96),
 ('vec', 96),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('sns', 80),
 ('name', 63),
 ('X_test', 56),
 ('X_train', 56),
 ('classifier', 56),
 ('cnt_vec', 56),
 ('knn', 56

In [112]:
from sklearn.neighbors import KNeighborsClassifier
  
knn = KNeighborsClassifier(n_neighbors = 1)
  
knn.fit(X_train, Y_train)
pred = knn.predict(X_test)
  
# Predictions and Evaluations
# Let's evaluate our KNN model ! 
from sklearn.metrics import classification_report, confusion_matrix
  
print(classification_report(y_test, pred))

"pred = knn.predict(X_test)\n  \n# Predictions and Evaluations\n# Let's evaluate our KNN model ! \nfrom sklearn.metrics import classification_report, confusion_matrix\n  \nprint(classification_report(y_test, pred))"

In [105]:
test = pd.read_csv('test.csv', escapechar = "\\", quoting = csv.QUOTE_NONE)
test.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


In [106]:
test.shape

(110775, 5)

In [107]:
test.isnull().any()

PRODUCT_ID       False
TITLE             True
DESCRIPTION       True
BULLET_POINTS     True
BRAND             True
dtype: bool

In [108]:
pre_processed_text = []

for col in tqdm(test.values):
    text = ""
    if not col[1]: text += "empty "
    else: text += str(col[1])
    if not col[2]: text += " empty "
    else: text += str(col[2])
    if not col[3]: text += " empty"
    else: text += str(col[3])
    pre_processed_text.append(text)

for i,text in enumerate(pre_processed_text[:5],1):
    print(f"{i}) {text}")

HBox(children=(IntProgress(value=0, max=110775), HTML(value='')))

1) Command 3M Small Kitchen Hooks, White, Decorate Damage Free, Easy On, Easy Off, 9 Hooks, 12 Strips, Multi-Pack - 17067-VPSale Unit: PACK[INCLUDES - 9 hooks and 12 small indoor strips; 1 hook holds 1/2 lb,ORGANIZE DAMAGE-FREE - Say goodbye to holes, marks, or sticky residue on your walls, doors, cabinets, or closets; Command Hooks by 3M are easy to use and help keep your walls looking beautiful,NO TOOLS REQUIRED- Hang hats, calendars, keys, measuring spoons, cleaning tools, jewelry, and accessories where you want without nails or a hammer. Includes metal wire toggle feature for versatile hanging,STRONG AND VERSATILE- Command Wire Hooks include hold strongly on a variety of indoor surfaces including painted walls, smooth ceilings, finished wood, glass, tile, metal, and other smooth surfaces,REMOVES CLEANLY- Reorganize when inspiration strikes; These wall hooks leave no sticky adhesive behind; Perfect to use inside cabinets and closets, in your college dorm, apartment, home, and office

In [109]:
test_final = cnt_vec.transform(pre_processed_text)
print(test_final.shape)

(110775, 30000)


In [118]:
pickle.dump(test_final, open('test_bow', 'wb'))

In [113]:
pred = knn.predict(test_final) 
print(pred.shape)

MemoryError: Unable to allocate array with shape (142249593,) and data type int32

In [115]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('X', 3134391632),
 ('test', 168736889),
 ('Y', 32036520),
 ('subsety', 8000024),
 ('y', 8000024),
 ('y_train', 6400024),
 ('sub', 1772552),
 ('submission', 1772552),
 ('y_test', 1600024),
 ('ytest', 1600024),
 ('pre_processed_text', 927568),
 ('sorted_yi', 76104),
 ('eng_stopwords', 8416),
 ('sent_vec', 2496),
 ('BeautifulSoup', 2000),
 ('SGDClassifier', 2000),
 ('CountVectorizer', 1464),
 ('KNeighborsClassifier', 1464),
 ('LogisticRegression', 1464),
 ('sent', 1344),
 ('KeyedVectors', 1056),
 ('OneVsRestClassifier', 1056),
 ('SnowballStemmer', 1056),
 ('Word2Vec', 1056),
 ('text', 1015),
 ('OneHotEncoder', 888),
 ('sent_vectors', 432),
 ('sentance', 286),
 ('classification_report', 136),
 ('cleanpunc', 136),
 ('confusion_matrix', 136),
 ('decontracted', 136),
 ('preprocess', 136),
 ('nans', 128),
 ('col', 96),
 ('pred', 96),
 ('row', 96),
 ('vec', 96),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('sns', 80),
 ('name', 63),
 ('X_test', 56),
 ('X_train', 56),
 ('classifier', 56),
 ('cnt_

In [82]:
submission = pd.DataFrame(zip(test['PRODUCT_ID'], pred), columns = ['PRODUCT_ID', 'BROWSE_NODE_ID'])
submission.to_csv('submission1.csv', index = False)

In [83]:
sub = pd.read_csv('submission1.csv')
sub.head()

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,15960
1,2,368
2,3,25077
3,4,290
4,5,1724


In [89]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('data', 3747053655),
 ('X', 3134391632),
 ('xtrain', 784578359),
 ('subsetx', 780578359),
 ('test', 168736889),
 ('xtest', 157388850),
 ('Y', 32036520),
 ('w2v_words', 27000112),
 ('subsety', 8000024),
 ('y', 8000024),
 ('ytrain', 8000024),
 ('y_train', 6400024),
 ('list_of_sentance', 4290016),
 ('preprocessed_essays', 4290016),
 ('sub', 1772552),
 ('submission', 1772552),
 ('y_test', 1600024),
 ('ytest', 1600024),
 ('pre_processed_text', 927568),
 ('test_class_distribution', 152040),
 ('train_class_distribution', 126616),
 ('sorted_yi', 76104),
 ('eng_stopwords', 8416),
 ('sent_vec', 2496),
 ('BeautifulSoup', 2000),
 ('SGDClassifier', 2000),
 ('CountVectorizer', 1464),
 ('KNeighborsClassifier', 1464),
 ('LogisticRegression', 1464),
 ('sent', 1344),
 ('KeyedVectors', 1056),
 ('OneVsRestClassifier', 1056),
 ('SnowballStemmer', 1056),
 ('Word2Vec', 1056),
 ('text', 1015),
 ('OneHotEncoder', 888),
 ('sent_vectors', 432),
 ('sentance', 286),
 ('classification_report', 136),
 ('cleanpunc'

In [92]:
import pickle
pickle.dump((X_train, y_train), open('train_data_bow_sample.pkl', 'wb'))
pickle.dump((X_test, y_test), open('test_data_bow_sample.pkl', 'wb'))

In [91]:
# this will be taking so much time try not to run it, download the lr_with_equal_weight.pkl file and use to predict
# This takes about 6-7 hours to run.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1', verbose = 1))
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

print("accuracy :",metrics.accuracy_score(y_test,predictions))
print("macro f1 score :",metrics.f1_score(y_test, predictions, average = 'macro'))
print("micro f1 scoore :",metrics.f1_score(y_test, predictions, average = 'micro'))
print("hamming loss :",metrics.hamming_loss(y_test,predictions))
print("Precision recall report :\n",metrics.classification_report(y_test, predictions))

KeyboardInterrupt: 