In [34]:
import nltk
import string
import re
import numpy as np
import pandas as pd
import pickle
#import lda

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white")

from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


%matplotlib inline


# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used here -- confirm that is intended.
import warnings
warnings.filterwarnings('ignore')
# Raise the "lda" logger's threshold to WARNING so its lower-level messages are not emitted.
# The `import lda` line in this cell is commented out, so this only matters if that package is loaded.
import logging
logging.getLogger("lda").setLevel(logging.WARNING)

In [35]:
import os

# Directory that holds train.tsv and test.tsv.  Set the MERCARI_DATA_DIR environment
# variable to point somewhere else; when it is unset or empty the original location
# is used, so existing runs behave as before.
PATH = os.environ.get("MERCARI_DATA_DIR") or "/home/prateek/Price-Prediction-from-ItemDescription-using-LDA/Prateek/mercari-price-suggestion-challenge/"
# File names are appended directly to PATH below, so it has to end with a separator.
if not PATH.endswith(("/", os.sep)):
    PATH += "/"

# Both files are tab-separated.
train = pd.read_csv(f'{PATH}train.tsv', sep='\t')
test = pd.read_csv(f'{PATH}test.tsv', sep='\t')

In [36]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(train.shape, test.shape, sep="\n")

(1482535, 8)
(693359, 7)


In [37]:
# Display the dtype of every column in the training frame.
train.dtypes

train_id               int64
name                  object
item_condition_id      int64
category_name         object
brand_name            object
price                float64
shipping               int64
item_description      object
dtype: object

In [38]:
# Display the first rows of the training frame.
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [39]:
# English stop words, built once so every tokenize() call reuses the same set.
stop=set(stopwords.words('english'))

# Characters replaced by a space before tokenizing: ASCII punctuation, digits, CR, TAB, LF.
# string.punctuation contains "\" immediately followed by "]"; without re.escape() that
# pair is read as an escaped "]" inside the character class, so the backslash itself is
# never matched and survives into the tokens.
_NON_WORD_CHARS = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
# A token is only kept if it contains at least one ASCII letter.
_HAS_LETTER = re.compile('[a-zA-Z]')


def tokenize(text):
    """Turn a raw text string into a list of lowercase word tokens.

    Steps:
      1. Replace punctuation, digits, CR, TAB and LF with spaces.
      2. Split into sentences, then into word tokens, keeping the original order.
      3. Drop tokens whose lowercase form is in ``stop``, tokens without an ASCII
         letter, and tokens shorter than three characters.
      4. Lowercase what remains.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str, or None
        The filtered tokens.  If processing raises ``TypeError`` (for example when
        ``text`` is not a string), the input and the error are printed and ``None``
        is returned.
    """
    try:
        text = _NON_WORD_CHARS.sub(" ", text)

        # Flatten the per-sentence token lists into one list.
        tokens = [
            token
            for sentence in sent_tokenize(text)
            for token in word_tokenize(sentence)
        ]

        return [
            token.lower()
            for token in tokens
            if token.lower() not in stop
            and _HAS_LETTER.search(token)
            and len(token) >= 3
        ]

    except TypeError as e:
        # Report the offending value and keep the historical "no result" outcome.
        print(text, e)
        return None

In [40]:
# Number of leading rows to tokenize.  Rows beyond it receive NaN in "tokens" because the
# shorter result is aligned on the index when it is assigned.  Set to None to tokenize
# every description (expect a much longer run on the full data).
TOKEN_PREVIEW_ROWS = 5


def _rows_to_tokenize(descriptions):
    """Return the leading TOKEN_PREVIEW_ROWS entries, or all of them when it is None."""
    if TOKEN_PREVIEW_ROWS is None:
        return descriptions
    return descriptions.head(TOKEN_PREVIEW_ROWS)


train['tokens'] = _rows_to_tokenize(train['item_description']).map(tokenize)
test["tokens"] = _rows_to_tokenize(test["item_description"]).map(tokenize)

In [41]:
# Replace each frame's index with a fresh 0..n-1 range, discarding the old index
# (drop=True) and modifying the frame in place.
for frame in (train, test):
    frame.reset_index(inplace=True, drop=True)

In [42]:
# Print each of the first rows' raw description next to its token list,
# with a blank line between rows.
for description, tokens in train[["item_description", "tokens"]].head().itertuples(index=False, name=None):
    print("description:", description)
    print("tokens:", tokens)
    print()
    

description: No description yet
tokens: ['description', 'yet']

description: This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.
tokens: ['keyboard', 'great', 'condition', 'works', 'like', 'came', 'box', 'ports', 'tested', 'work', 'perfectly', 'lights', 'customizable', 'via', 'razer', 'synapse', 'app']

description: Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!
tokens: ['adorable', 'top', 'hint', 'lace', 'key', 'hole', 'back', 'pale', 'pink', 'also', 'available', 'white']

description: New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage
tokens: ['new', 'tags', 'leather', 'horses', 'retail', 'stand', 'foot', 'high', 'sold', 'pair', 'questions', 'please', 

In [43]:
# Count the distinct category strings, then report the total.
n_categories = train['category_name'].nunique()
print("There are %d unique values in the category column." % n_categories)

There are 1287 unique values in the category column.


In [44]:
# Frequency of each category string; show the first five entries.
train["category_name"].value_counts().head(5)

Women/Athletic Apparel/Pants, Tights, Leggings    60177
Women/Tops & Blouses/T-Shirts                     46380
Beauty/Makeup/Face                                34335
Beauty/Makeup/Lips                                29910
Electronics/Video Games & Consoles/Games          26557
Name: category_name, dtype: int64

In [45]:
# Count the rows whose category is missing, then report the total.
n_unlabeled = train['category_name'].isna().sum()
print("There are %d items that do not have a label." % n_unlabeled)

There are 6327 items that do not have a label.


In [46]:
def split_cat(text):
    """Split a "/"-separated category path into its levels.

    Returns the list produced by ``text.split("/")``.  It has one entry per level,
    so it is not necessarily three items long.  Values that cannot be split this
    way (for example a NaN float or ``None``) yield the tuple
    ``("No Label", "No Label", "No Label")`` instead.
    """
    try:
        return text.split("/")
    except (AttributeError, TypeError):
        # AttributeError: the value has no .split() (NaN, None).
        # TypeError: .split() rejects a str separator (bytes).
        # Anything else -- KeyboardInterrupt in particular -- is allowed to propagate
        # rather than being silently turned into "No Label".
        return ("No Label", "No Label", "No Label")

In [58]:
# Split every category string into its levels, then transpose the per-row results
# into per-level sequences and store the first three as new columns.
category_levels = train['category_name'].apply(split_cat)
train['general_cat'], train['subcat_1'], train['subcat_2'] = zip(*category_levels)
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,tokens,general_cat,subcat_1,subcat_2
1482495,1482495,XBOX Bundle,3,Electronics/Video Games & Consoles/Games,Xbox,25.0,1,2 XBOX games Call of duty and Halo,,Electronics,Video Games & Consoles,Games
1482496,1482496,Benefit they're real mascara,1,Beauty/Makeup/Eyes,Benefit,8.0,1,No description yet,,Beauty,Makeup,Eyes
1482497,1482497,Wet Seal Pink Bodysuit size XL,2,Women/Tops & Blouses/Blouse,Wet Seal,14.0,0,"Wet Seal light pink Bodysuit. Light, thin mate...",,Women,Tops & Blouses,Blouse
1482498,1482498,Cupquake beauty rush mist ON HOLD,3,Beauty/Fragrance/Women,,27.0,0,No description yet,,Beauty,Fragrance,Women
1482499,1482499,iPhone 7+,2,Electronics/Cell Phones & Accessories/Cell Pho...,Apple,400.0,1,For Serena,,Electronics,Cell Phones & Accessories,Cell Phones & Smartphones
1482500,1482500,Brandy Melville crop top,2,"Women/Tops & Blouses/Tank, Cami",Brandy Melville,8.0,0,Barely worn,,Women,Tops & Blouses,"Tank, Cami"
1482501,1482501,Jeffrey Star Velour Liquid Lipstick,2,Beauty/Makeup/Lips,,38.0,0,Hey so I ordered these online and finally got ...,,Beauty,Makeup,Lips
1482502,1482502,"MURANO, GLASS FISH PAPERWEIGHT",3,Home/Home Décor/Home Décor Accents,,41.0,0,"Made In Italy, Solid Glass and Very Heavy, Pap...",,Home,Home Décor,Home Décor Accents
1482503,1482503,Bundle!,2,"Women/Jeans/Slim, Skinny",Abercrombie & Fitch,19.0,0,"Brett fit, in great condition. and I will bund...",,Women,Jeans,"Slim, Skinny"
1482504,1482504,Herschel Wallet,1,Men/Men's Accessories/Wallets,,25.0,1,Bi-fold wallet from Herschel Supply Co. Premiu...,,Men,Men's Accessories,Wallets


In [60]:
# Collect the distinct values of the top-level category column and print them.
general_cats = train['general_cat'].unique()
print(general_cats)

['Men' 'Electronics' 'Women' 'Home' 'Sports & Outdoors'
 'Vintage & Collectibles' 'Beauty' 'Other' 'Kids' 'No Label' 'Handmade']


In [82]:
cat_desc=dict()
# Top-level categories to tokenize.  Only "Men" is processed by default; assign
# `general_cats` here to build the token lists for every category.
cats_to_tokenize = ["Men"]
for cat in cats_to_tokenize:
    descriptions = train.loc[train["general_cat"]==cat,"item_description"]
    # Missing descriptions are NaN floats, which str.join() rejects with a TypeError,
    # so they are dropped before the category's text is concatenated.
    text=" ".join(descriptions.dropna().values)
    cat_desc[cat]=tokenize(text)



In [85]:
# Show the first 100 tokens collected for the "Men" category.
men_tokens = cat_desc["Men"]
print(men_tokens[:100])

['description', 'yet', 'percent', 'authentic', 'beaters', 'still', 'lot', 'life', 'original', 'box', 'wear', 'back', 'white', 'spots', 'believe', 'washed', 'men', 'xxl', 'good', 'condition', 'minor', 'mark', 'sleeve', 'worn', 'washed', 'sent', 'rose', 'gold', 'bezel', 'crown', 'engraved', 'logo', 'red', 'green', 'signature', 'band', 'unisex', 'box', 'boys', 'lebron', 'soldier', 'great', 'condition', 'black', 'ferragamo', 'belt', 'fit', 'sizes', 'nothing', 'loose', 'order', 'item', 'described', 'simply', 'return', 'package', 'get', 'money', 'back', 'brand', 'new', 'packaging', 'never', 'worn', 'guaranteed', 'authentic', 'bought', 'nordstrom', 'women', 'never', 'worn', 'comes', 'receipt', 'sleep', 'fresh', 'men', 'nike', 'dri', 'fit', 'shirt', 'blue', 'items', 'come', 'clean', 'smoke', 'pet', 'free', 'home', 'flash', 'sale', 'today', 'sunday', 'like', 'new', 'purchased', 'last', 'may', 'worn', 'times', 'retail']
