In [13]:
%matplotlib inline
import matplotlib.pyplot as plt

import gc
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from time import gmtime, strftime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb

import sys

#Add https://www.kaggle.com/anttip/wordbatch to your kernel Data Sources, 
#until Kaggle admins fix the wordbatch pip package installation
sys.path.insert(0, '../input/wordbatch/wordbatch/')
import wordbatch

from wordbatch.extractors import WordBag, WordHash
from wordbatch.models import FTRL, FM_FTRL

from nltk.corpus import stopwords
import re

In [5]:
# Caps on how many distinct labels are kept per column; cutting() reads these.
NUM_BRANDS = 4500
NUM_CATEGORIES = 1250

# Development-mode switch; set to True by hand when experimenting.
develop = False

# Wall-clock reference for the run, plus a UTC timestamp in the cell output.
start_time = time.time()
print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))

2018-01-29 19:18:51


In [129]:
# Load the raw train and test tables (tab-separated files).
train = pd.read_table('input/train.tsv', engine='c')
test = pd.read_table('input/test.tsv', engine='c')

print('Train shape: ', train.shape)
print('Test shape: ', test.shape)

# Stack train on top of test so every preprocessing step sees both at once.
# ignore_index=True gives each row a unique label; without it the test rows
# reuse the labels 0..len(test)-1 already taken by train rows, and any later
# index-aligned assignment would silently copy train values onto test rows.
merged = pd.concat([train, test], ignore_index=True)
print('Merged shape: ', merged.shape)

# The separate frames are no longer needed once combined.
del train
del test
merged.head()

Train shape:  (1482535, 8)
Test shape:  (693359, 7)
Merged shape:  (2175894, 9)


Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0
3,,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0
4,,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0


In [130]:
## preprocess - splitting category names
##
# category_name is a "/"-separated path (e.g. "Men/Tops/T-shirts"); every level
# of the path becomes its own column.
cat_col_names = ['general_cat','subcat_1','subcat_2','subcat_3','subcat_4']

# Vectorised split that keeps merged's own index, so the assignment below lines
# up row for row even when index labels repeat (train and test concatenated
# without ignore_index). Rows with no category stay NaN on every level instead
# of becoming a literal 'nan' top-level category.
tt = merged['category_name'].str.split('/', expand=True)
if tt.shape[1] > len(cat_col_names):
    raise ValueError(
        'category_name has %d levels but only %d column names are defined'
        % (tt.shape[1], len(cat_col_names))
    )
# Pad with empty columns when the data is shallower than the name list.
tt = tt.reindex(columns=range(len(cat_col_names)))
tt.columns = cat_col_names

merged[cat_col_names] = tt
del tt


In [128]:
### delete rare brands
##
# NOTE: despite the heading, this cell does not remove anything. It only
# collects the brand names that occur 10 times or fewer; `pop_brand` therefore
# holds the *rare* brands, not the popular ones.
pop_brand = merged['brand_name'].value_counts().loc[lambda x: x<=10].index.values
# Preview a few rows instead of rendering the whole frame.
merged.head(10)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,...,subcat_2,subcat_3,subcat_4,mis_general_cat,mis_subcat_1,mis_subcat_2,mis_subcat_3,mis_subcat_4,mis_brand_name,mis_item_description
0,missing,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,Men,...,T-shirts,missing,missing,0,0,0,1,1,1,0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,Electronics,...,Components & Parts,missing,missing,0,0,0,1,1,0,0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,Women,...,Blouse,missing,missing,0,0,0,1,1,0,0
3,missing,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0,Home,...,Home Décor Accents,missing,missing,0,0,0,1,1,1,0
4,missing,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0,Women,...,Necklaces,missing,missing,0,0,0,1,1,1,0
5,missing,Women/Other/Other,3,"Banana republic bottoms, Candies skirt with ma...",Bundled items requested for Ruie,59.0,0,,5.0,Women,...,Other,missing,missing,0,0,0,1,1,1,0
6,Acacia Swimwear,Women/Swimwear/Two-Piece,3,Size small but straps slightly shortened to fi...,Acacia pacific tides santorini top,64.0,0,,6.0,Women,...,Two-Piece,missing,missing,0,0,0,1,1,0,0
7,Soffe,Sports & Outdoors/Apparel/Girls,3,You get three pairs of Sophie cheer shorts siz...,Girls cheer and tumbling bundle of 7,6.0,1,,7.0,Sports & Outdoors,...,Girls,missing,missing,0,0,0,1,1,0,0
8,Nike,Sports & Outdoors/Apparel/Girls,3,Girls Size small Plus green. Three shorts total.,Girls Nike Pro shorts,19.0,0,,8.0,Sports & Outdoors,...,Girls,missing,missing,0,0,0,1,1,0,0
9,missing,Vintage & Collectibles/Collectibles/Doll,3,I realized his pants are on backwards after th...,Porcelain clown doll checker pants VTG,8.0,0,,9.0,Vintage & Collectibles,...,Doll,missing,missing,0,0,0,1,1,1,0


In [125]:
# Peek at the first few brand_name values.
merged['brand_name'].head()

0    missing
1      Razer
2     Target
3    missing
4    missing
Name: brand_name, dtype: object

In [123]:
## replace missing values
# For every category level, the brand and the description: fill NaN with the
# literal 'missing' and record a 0/1 indicator in a companion mis_<column>.
for col_name in cat_col_names + ['brand_name', 'item_description']:
    # Assign the filled column back explicitly. Calling fillna(inplace=True) on
    # merged[col_name] mutates a temporary selection, which newer pandas warns
    # about and, with Copy-on-Write, no longer propagates to `merged`.
    merged[col_name] = merged[col_name].fillna('missing')
    # Flag every row whose value is now 'missing'.
    merged['mis_' + col_name] = (merged[col_name] == 'missing').astype('int64')

merged.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,...,subcat_2,subcat_3,subcat_4,mis_general_cat,mis_subcat_1,mis_subcat_2,mis_subcat_3,mis_subcat_4,mis_brand_name,mis_item_description
0,missing,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,Men,...,T-shirts,missing,missing,0,0,0,1,1,1,0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,Electronics,...,Components & Parts,missing,missing,0,0,0,1,1,0,0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,Women,...,Blouse,missing,missing,0,0,0,1,1,0,0
3,missing,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0,Home,...,Home Décor Accents,missing,missing,0,0,0,1,1,1,0
4,missing,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0,Women,...,Necklaces,missing,missing,0,0,0,1,1,1,0


In [74]:
# The raw category path is no longer needed once it has been split into levels.
merged.drop(columns=['category_name'], inplace=True)

In [75]:
# Preview the first rows only; rendering the full frame bloats the notebook.
merged.head(10)

Unnamed: 0,brand_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,subcat_1,subcat_2,subcat_3,subcat_4,mis_general_cat,mis_subcat_1,mis_subcat_2,mis_subcat_3,mis_subcat_4,mis_brand_name,mis_item_description
0,missing,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,Men,Tops,T-shirts,missing,missing,0,0,0,1,1,1,0
1,Razer,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,Electronics,Computers & Tablets,Components & Parts,missing,missing,0,0,0,1,1,0,0
2,Target,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,Women,Tops & Blouses,Blouse,missing,missing,0,0,0,1,1,0,0
3,missing,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0,Home,Home Décor,Home Décor Accents,missing,missing,0,0,0,1,1,1,0
4,missing,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0,Women,Jewelry,Necklaces,missing,missing,0,0,0,1,1,1,0
5,missing,3,"Banana republic bottoms, Candies skirt with ma...",Bundled items requested for Ruie,59.0,0,,5.0,Women,Other,Other,missing,missing,0,0,0,1,1,1,0
6,Acacia Swimwear,3,Size small but straps slightly shortened to fi...,Acacia pacific tides santorini top,64.0,0,,6.0,Women,Swimwear,Two-Piece,missing,missing,0,0,0,1,1,0,0
7,Soffe,3,You get three pairs of Sophie cheer shorts siz...,Girls cheer and tumbling bundle of 7,6.0,1,,7.0,Sports & Outdoors,Apparel,Girls,missing,missing,0,0,0,1,1,0,0
8,Nike,3,Girls Size small Plus green. Three shorts total.,Girls Nike Pro shorts,19.0,0,,8.0,Sports & Outdoors,Apparel,Girls,missing,missing,0,0,0,1,1,0,0
9,missing,3,I realized his pants are on backwards after th...,Porcelain clown doll checker pants VTG,8.0,0,,9.0,Vintage & Collectibles,Collectibles,Doll,missing,missing,0,0,0,1,1,1,0


In [76]:
def cutting(dataset, num_brands=None, num_categories=None):
    """Collapse infrequent brand and category labels into 'missing', in place.

    For brand_name, general_cat, subcat_1 and subcat_2, the labels are ranked
    by frequency (ignoring the 'missing' placeholder itself). Only the top
    ``num_brands`` brands and the top ``num_categories`` labels of each
    category column are kept; every other value is overwritten with
    'missing'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the four columns above. It is modified in place.
    num_brands : int, optional
        Number of brand labels to keep. Defaults to the module-level
        NUM_BRANDS.
    num_categories : int, optional
        Number of labels to keep per category column. Defaults to the
        module-level NUM_CATEGORIES.

    Returns
    -------
    None
    """
    if num_brands is None:
        num_brands = NUM_BRANDS
    if num_categories is None:
        num_categories = NUM_CATEGORIES

    # Each column is filtered independently of the others, so one loop covers
    # the brand column and the three category columns.
    limits = (
        ('brand_name', num_brands),
        ('general_cat', num_categories),
        ('subcat_1', num_categories),
        ('subcat_2', num_categories),
    )
    for column, limit in limits:
        counts = dataset[column].value_counts()
        # value_counts() sorts by descending frequency, so slicing keeps the
        # most common labels; the placeholder never takes one of the slots.
        keep = counts[counts.index != 'missing'].index[:limit]
        dataset.loc[~dataset[column].isin(keep), column] = 'missing'
    
# Collapse rare brand/category labels on the combined frame; cutting() edits
# `merged` in place and returns nothing.
cutting(merged)

In [83]:
# Frequency of each remaining brand, leaving out the 'missing' placeholder.
merged['brand_name'].value_counts().drop('missing', errors='ignore')

Nike                   79277
PINK                   79092
Victoria's Secret      70508
LuLaRoe                45598
Apple                  25435
FOREVER 21             22327
Nintendo               22156
Lululemon              21391
Michael Kors           20335
American Eagle         19418
Rae Dunn               18031
Sephora                17849
Disney                 15425
Coach                  15309
Bath & Body Works      15187
Adidas                 15002
Funko                  13568
Under Armour           12414
Sony                   11729
Old Navy               11089
Hollister              10182
Carter's                9289
The North Face          9137
Urban Decay             8979
Independent             8681
Too Faced               8479
Xbox                    8406
Brandy Melville         8366
Kate Spade              7863
MAC                     7850
                       ...  
Pro Line                   1
Zen Group                  1
Clairol                    1
SkinCeuticals 

In [None]:
## Reference kernel: https://www.kaggle.com/tunguz/wordbatch-ftrl-fm-lgb-lbl-0-42506/code
##
