In [33]:
%matplotlib inline
import matplotlib.pyplot as plt

import gc
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from time import gmtime, strftime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb

import sys

#Add https://www.kaggle.com/anttip/wordbatch to your kernel Data Sources, 
#until Kaggle admins fix the wordbatch pip package installation
# sys.path.insert(0, '../input/wordbatch/wordbatch/')
##import wordbatch
##from wordbatch.extractors import WordBag, WordHash
##from wordbatch.models import FTRL, FM_FTRL

from nltk.corpus import stopwords
import re

In [34]:
# Notebook-wide constants. NOTE(review): presumably caps on how many distinct
# brands / categories are kept later on -- their use is not visible in this cell.
NUM_BRANDS, NUM_CATEGORIES = 4500, 1250

# Development-mode switch; set to True by hand when needed
# (NOTE(review): what it toggles is not visible in this cell).
develop = False

# Record the start of the run (epoch seconds) and log the current UTC time.
start_time = time.time()
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2018-02-01 13:19:30


In [35]:
# When True, the test split is read as well and stacked underneath the
# training rows in `merged`.
LOAD_TEST = False

# Read the training split (tab-separated) with the C parser.
train = pd.read_table('input/train.tsv', engine='c')
print('Train shape: ', train.shape)
# Rebind instead of `pd.concat([train])`: a single-frame concat only makes a
# full copy of the data without changing it.
merged = train
del train

if LOAD_TEST:
    test = pd.read_table('input/test.tsv', engine='c')
    print('Test shape: ', test.shape)
    # ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without
    # it the test rows keep labels 0..m-1, duplicating the training labels,
    # and any later index-aligned assignment would copy training values onto
    # the test rows.
    merged = pd.concat([merged, test], ignore_index=True)
    del test

print('Merged shape: ', merged.shape)

merged.head()

Train shape:  (593376, 8)
Merged shape:  (593376, 8)


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [36]:
## preprocess - splitting category names
##
## `category_name` is a '/'-separated path (e.g. "Men/Tops/T-shirts"); each
## level goes into its own column, and levels that are absent are left as
## missing values.

cat_col_names = ['general_cat', 'subcat_1', 'subcat_2', 'subcat_3', 'subcat_4']

# The vectorised .str.split keeps a missing category as NaN in every level.
# (Applying str() first would turn NaN into the literal text 'nan', which the
# missing-value handling cannot recognise.)
# n caps the number of splits so there are never more pieces than column names.
tt = merged['category_name'].str.split('/', n=len(cat_col_names) - 1, expand=True)

# Guarantee exactly len(cat_col_names) columns even when the deepest path in
# the data has fewer levels; the added columns are entirely missing.
tt = tt.reindex(columns=range(len(cat_col_names)))

# Assign by position (.values) so the result does not depend on index
# alignment, which would be wrong or fail if `merged` has a non-unique index.
for level, level_name in enumerate(cat_col_names):
    merged[level_name] = tt[level].values

merged.drop('category_name', axis=1, inplace=True)
del tt


In [38]:
## replace missing values
## For each listed column: fill missing entries with the placeholder 'missing'
## and add an integer indicator column `mis_<column>` that is 1 where the
## value equals the placeholder and 0 elsewhere.
for col_name in cat_col_names + ['brand_name', 'item_description']:
    # Assign the filled column back explicitly. Calling fillna(inplace=True)
    # on `merged[col_name]` is chained assignment: it warns in pandas 2.x and
    # leaves `merged` unchanged under Copy-on-Write.
    merged[col_name] = merged[col_name].fillna('missing')
    # Built from scratch on every run, so re-executing the cell is safe.
    merged['mis_' + col_name] = (merged[col_name] == 'missing').astype('int64')

merged.head()

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,general_cat,subcat_1,subcat_2,subcat_3,subcat_4,mis_general_cat,mis_subcat_1,mis_subcat_2,mis_subcat_3,mis_subcat_4,mis_brand_name,mis_item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,missing,10.0,1,No description yet,Men,Tops,T-shirts,missing,missing,0,0,0,1,1,1,0
1,1,Razer BlackWidow Chroma Keyboard,3,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts,missing,missing,0,0,0,1,1,0,0
2,2,AVA-VIV Blouse,1,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse,missing,missing,0,0,0,1,1,0,0
3,3,Leather Horse Statues,1,missing,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents,missing,missing,0,0,0,1,1,1,0
4,4,24K GOLD plated rose,1,missing,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces,missing,missing,0,0,0,1,1,1,0


In [39]:
# A value that occurs at most this many times in its column is treated as rare.
VAL_MISS_MARGIN = 10

### delete rare brands
##
## Rare brands are replaced by the 'missing' placeholder and flagged in
## `mis_brand_name`. (Despite the `pop_` prefix, these arrays hold the RARE
## values; the names are kept as they are for the rest of the notebook.)

pop_brand = merged['brand_name'].value_counts().loc[lambda x: x <= VAL_MISS_MARGIN].index.values
rare_brand_rows = merged['brand_name'].isin(pop_brand)
merged.loc[rare_brand_rows, 'mis_brand_name'] = 1
merged.loc[rare_brand_rows, 'brand_name'] = 'missing'

## delete rare categories
##
pop_category1 = merged['general_cat'].value_counts().loc[lambda x: x <= VAL_MISS_MARGIN].index.values
pop_category2 = merged['subcat_1'].value_counts().loc[lambda x: x <= VAL_MISS_MARGIN].index.values
pop_category3 = merged['subcat_2'].value_counts().loc[lambda x: x <= VAL_MISS_MARGIN].index.values

# Each category level is flagged using ITS OWN rare-value list. The indicator
# columns were previously set from `pop_brand`, so rows with rare categories
# were not flagged while categories that happened to equal a rare brand name
# were. The same mask now drives both the flag and the replacement.
for cat_col, rare_values in (
    ('general_cat', pop_category1),
    ('subcat_1', pop_category2),
    ('subcat_2', pop_category3),
):
    rare_rows = merged[cat_col].isin(rare_values)
    merged.loc[rare_rows, 'mis_' + cat_col] = 1
    merged.loc[rare_rows, cat_col] = 'missing'

print(pop_brand)
print(pop_category1)
print(pop_category2)
print(pop_category3)

['Free Country' 'Speck' 'Call It Spring' ..., 'USA Hockey'
 'American Rebel Boot Company' 'Bruder Toys America']
[]
[]
['Automotive Enthusiast Merchandise' 'Nasal Aspirators' 'Outdoor Safety'
 'Tires & Wheels' 'Grooming & Healthcare Kits' 'Puzzle' 'Radio' 'Teacup'
 'Home Office Furniture' 'Furniture' 'Skirts' 'Action, Adventure' 'Box'
 'Photographs' 'Frame' 'Wallet' 'Car Speakers & Systems' 'Refrigerators'
 'Cream and Sugar Set' 'Three Button' 'Brass Instruments' 'Inflatable Beds'
 'Bookmark' 'Bouquets' 'Case' 'Stained Glass' 'Mixed Media' 'Feeding'
 'Car Video' 'Photography' 'Standard' 'Instrument' 'Cuff Links' 'Tape'
 'Dress Shorts' 'Golf Bags' 'Activity Centers & Entertainers' 'Dress Suit'
 'Belt' 'Lighting & Studio' 'Bed' 'Lightweight' 'Toiletry Kits'
 'Instructional' 'Shams, Bed Skirts & Bed Frame Draperies' 'Magnet'
 'Dinnerware Set' 'Crochet' 'Peasant' 'Housewares' 'How to' 'Tiered'
 'Block' 'Sleep Positioners' 'Performance Parts & Accessories' 'Outerwear'
 'Pillow' 'Dehumidifie

In [40]:
## Reference kernel: https://www.kaggle.com/anttip/wordbatch-ftrl-fm-lgb-lbl-0-42555/code
## Show the first rows of `merged` as it stands at this point in the notebook.
merged.head()

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,general_cat,subcat_1,subcat_2,subcat_3,subcat_4,mis_general_cat,mis_subcat_1,mis_subcat_2,mis_subcat_3,mis_subcat_4,mis_brand_name,mis_item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,missing,10.0,1,No description yet,Men,Tops,T-shirts,missing,missing,0,0,0,1,1,1,0
1,1,Razer BlackWidow Chroma Keyboard,3,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts,missing,missing,0,0,0,1,1,0,0
2,2,AVA-VIV Blouse,1,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse,missing,missing,0,0,0,1,1,0,0
3,3,Leather Horse Statues,1,missing,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents,missing,missing,0,0,0,1,1,1,0
4,4,24K GOLD plated rose,1,missing,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces,missing,missing,0,0,0,1,1,1,0
