In [1]:
import gc
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb

import sys

from multiprocessing import Pool

from functools import reduce
from nltk.corpus import stopwords
# stopWords = []
# for i in """!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'""":
#     stopWords.append(i)
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()
lemma_dict = {}
def lemmatizer(word):
    try:
        word = word.lower()
        if word in lemma_dict:
            return lemma_dict[word]
        else:
            normal_form = wordnet_lemmatizer.lemmatize(word)
            lemma_dict[word] = normal_form
            return normal_form
    except:
        return 'nonascii'
def join_2(x,*args):
    if len(args) == 0:
        return x
    else:
        return x+" " +args[0]
def transform(x):
    try:
        for symbol in stopWords:
            if symbol in x:
                x = x.replace(symbol,"")
        x = x.split()
        x = map(lemmatizer,x)
        #x = map(asc,x)
        x =  reduce(join_2,x)
        return x
    except:
        return "problem"

#Add https://www.kaggle.com/anttip/wordbatch to your kernel Data Sources, 
#until Kaggle admins fix the wordbatch pip package installation
#sys.path.insert(0, '../input/wordbatch/wordbatch/')
import wordbatch

from wordbatch.extractors import WordBag, WordHash
from wordbatch.models import FTRL, FM_FTRL

from nltk.corpus import stopwords
import re

NUM_BRANDS = 4500
NUM_CATEGORIES = 1200

develop = True
# develop= True

def rmsle(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y0), 2)))


def split_cat(text):
    try:
        return text.split("/")
    except:
        return ("No Label", "No Label", "No Label")


def handle_missing_inplace(dataset):
    dataset['general_cat'].fillna(value='m', inplace=True)
    dataset['subcat_1'].fillna(value='m', inplace=True)
    dataset['subcat_2'].fillna(value='m', inplace=True)
    dataset['brand_name'].fillna(value='m', inplace=True)
    dataset['item_description'].fillna(value='m', inplace=True)


def cutting(dataset):
    pop_brand = dataset['brand_name'].value_counts().loc[lambda x: x.index != 'm'].index[:NUM_BRANDS]
    dataset.loc[~dataset['brand_name'].isin(pop_brand), 'brand_name'] = 'm'
    pop_category1 = dataset['general_cat'].value_counts().loc[lambda x: x.index != 'm'].index[:NUM_CATEGORIES]
    pop_category2 = dataset['subcat_1'].value_counts().loc[lambda x: x.index != 'm'].index[:NUM_CATEGORIES]
    pop_category3 = dataset['subcat_2'].value_counts().loc[lambda x: x.index != 'm'].index[:NUM_CATEGORIES]
    dataset.loc[~dataset['general_cat'].isin(pop_category1), 'general_cat'] = 'm'
    dataset.loc[~dataset['subcat_1'].isin(pop_category2), 'subcat_1'] = 'm'
    dataset.loc[~dataset['subcat_2'].isin(pop_category3), 'subcat_2'] = 'm'


def to_categorical(dataset):
    dataset['general_cat'] = dataset['general_cat'].astype('category')
    dataset['subcat_1'] = dataset['subcat_1'].astype('category')
    dataset['subcat_2'] = dataset['subcat_2'].astype('category')
    dataset['item_condition_id'] = dataset['item_condition_id'].astype('category')


# Define helpers for text normalization
stopwords = {x: 1 for x in stopwords.words('english')}
non_alphanums = re.compile(u'[^A-Za-z0-9]+')


def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
         if len(x) > 1 and x not in stopwords])



start_time = time.time()
from time import gmtime, strftime
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# if 1 == 1:
train = pd.read_table('train.tsv', engine='c')
test = pd.read_table('test.tsv', engine='c')

####
print(test.shape)
test_len = test.shape[0]
def simulate_test(test):
    if test.shape[0] < 800000:
        indices = np.random.choice(test.index.values, 2800000)
        test_ = pd.concat([test, test.iloc[indices]], axis=0)
        return test_.copy()
    else:
        return test
#test = simulate_test(test)
print('new shape ', test.shape)
####
#train = pd.read_table('../input/train.tsv', engine='c')
#test = pd.read_table('../input/test.tsv', engine='c')

print('[{}] Finished to load data'.format(time.time() - start_time))
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)
nrow_test = train.shape[0]  # -dftt.shape[0]
dftt = train[(train.price < 1.0)]
train = train.drop(train[(train.price < 1.0)].index)
del dftt['price']
nrow_train = train.shape[0]
# print(nrow_train, nrow_test)
y = np.log1p(train["price"])
merge = pd.concat([train, dftt, test])
#submission: pd.DataFrame = test[['test_id']]

del train
del test
gc.collect()

merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
    zip(*merge['category_name'].apply(lambda x: split_cat(x)))
merge.drop('category_name', axis=1, inplace=True)
print('[{}] Split categories completed.'.format(time.time() - start_time))

handle_missing_inplace(merge)
print('[{}] Handle missing completed.'.format(time.time() - start_time))

cutting(merge)
print('[{}] Cut completed.'.format(time.time() - start_time))

to_categorical(merge)
print('[{}] Convert categorical completed'.format(time.time() - start_time))

wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                              "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                              "idf": None,
                                                              }), procs=8)
wb.dictionary_freeze= True
X_name = wb.fit_transform(merge['name'])
del(wb)
X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

wb = CountVectorizer()
X_category1 = wb.fit_transform(merge['general_cat'])
X_category2 = wb.fit_transform(merge['subcat_1'])
X_category3 = wb.fit_transform(merge['subcat_2'])
print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

# wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                              "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                              "idf": None})
                         , procs=8)
wb.dictionary_freeze= True

# p = Pool(processes=8)
# merge['item_description'] = p.map(transform, merge.item_description.values)
# p.terminate()

X_description = wb.fit_transform(merge['item_description'])
del(wb)
gc.collect()
X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['brand_name'])
print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                      sparse=True).values)
print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
      X_name.shape)
sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()

print('[{}] Create sparse merge completed'.format(time.time() - start_time))

#    pd.to_pickle((sparse_merge, y), "xy.pkl")
# else:
#    nrow_train, nrow_test= 1481661, 1482535
#    sparse_merge, y = pd.read_pickle("xy.pkl")

# Remove features with document frequency <=1
print(sparse_merge.shape)
mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
sparse_merge = sparse_merge[:, mask]
X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_test:]
print(sparse_merge.shape)

gc.collect()
train_X, train_y = X, y
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=100)

2018-02-01 21:33:11
(693359, 7)
new shape  (693359, 7)
[8.033500909805298] Finished to load data
Train shape:  (1482535, 8)
Test shape:  (693359, 7)
[20.874011039733887] Split categories completed.
[21.975393295288086] Handle missing completed.
[41.09307670593262] Cut completed.
[43.25295066833496] Convert categorical completed
Normalize text
Extract wordbags
[165.4384262561798] Vectorize `name` completed.
[194.16664218902588] Count vectorize `categories` completed.
Normalize text
Extract wordbags
[410.29143357276917] Vectorize `item_description` completed.
[425.0072023868561] Label binarize `brand_name` completed.
[430.4015955924988] Get dummies on `item_condition_id` and `shipping` completed.
(2175894, 6) (2175894, 2040339) (2175894, 4501) (2175894, 14) (2175894, 143) (2175894, 977) (2175894, 518467)
[521.7781324386597] Create sparse merge completed
(2175894, 2564447)
(2175894, 2563931)


In [None]:
print('finished')
d_shape = sparse_merge.shape[1]


finished


In [None]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from bayes_opt import BayesianOptimization
from tqdm import tqdm
import math
def FTRL_evaluate(alpha,
                 beta,
                 L1,
                 L2,
                 D_fm,
                 alpha_fm,
                 L2_fm,
                 init_fm,
                 e_noise,
                 iters
                 ):
    try:

        model = FM_FTRL(alpha=alpha, beta=beta, L1=L1, L2=L2, D=d_shape, alpha_fm=alpha_fm, L2_fm=L2_fm, 
            init_fm=init_fm, D_fm=int(D_fm), e_noise=e_noise, iters=int(iters), inv_link="identity", 
            threads=16, seed=2017)
        model.fit(train_X, train_y)
        preds = model.predict(X=valid_X)
        value = -rmsle(np.expm1(valid_y), np.expm1(preds))
        del model, preds
        if math.isnan(value):
            value = -999
        return value
    except:
        pass


num_rounds = 3000
random_state = 2016
num_iter = 1000
init_points = 5
params = {
    'eta': 0.1,
    'silent': 1,
    'eval_metric': 'mae',
    'verbose_eval': True,
    'seed': random_state
}
for i in range(100):
    try:
        tun = BayesianOptimization(FTRL_evaluate, {'alpha': (0, 9.9),
                                                    'beta': (0, 9.9),
                                                    'L1': (0, 9.9),
                                                    'L2': (0, 9.9),
                                                    'alpha_fm': (0.0001, 0.0999),
                                                    'L2_fm': (0.00001, 0.00999),
                                                    'init_fm': (0.00001, 0.00999),
                                                    'e_noise': (0.0001, 0.1),
                                                    'iters': (1, 5),
                                                    'D_fm':(50, 200)

                                                    })

        tun.maximize(init_points=init_points, n_iter=num_iter)
    except:
        continue

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |      D_fm |        L1 |        L2 |     L2_fm |     alpha |   alpha_fm |      beta |   e_noise |   init_fm |     iters | 
    1 | 02m35s | [35m  -0.51580[0m | [32m 113.6078[0m | [32m   8.2711[0m | [32m   3.4215[0m | [32m   0.0070[0m | [32m   8.1961[0m | [32m    0.0897[0m | [32m   5.0111[0m | [32m   0.0593[0m | [32m   0.0090[0m | [32m   3.8731[0m | 
    2 | 01m06s |   -0.55615 |  101.5050 |    3.0079 |    5.5783 |    0.0089 |    4.6533 |     0.0085 |    1.5053 |    0.0306 |    0.0063 |    1.0684 | 
    3 | 02m23s |   -0.62374 |   94.1188 |    7.7098 |    1.1896 |    0.0092 |    8.9889 |     0.0547 |    0.9087 |    0.0718 |    0.0039 |    3.4610 | 
    4 | 01m51s |   -0.54069 |  140.3578 |    7.8253 |    4.7332 |    0.0035 |    8.5519 |     0.0297 |    9.0

  " state: %s" % convergence_dict)


  114 | 04m16s | -999.00000 |  172.4968 |    9.9000 |    9.9000 |    0.0100 |    9.9000 |     0.0999 |    0.0000 |    0.0001 |    0.0100 |    5.0000 | 
  115 | 03m45s |   -0.55629 |  113.6641 |    7.2835 |    3.2911 |    0.0030 |    7.6441 |     0.0096 |    3.9006 |    0.0786 |    0.0011 |    4.2300 | 
  116 | 04m44s |   -0.55034 |  126.6035 |    0.0000 |    9.9000 |    0.0100 |    9.9000 |     0.0999 |    8.1045 |    0.1000 |    0.0025 |    5.0000 | 
  117 | 01m29s |   -0.46158 |   71.4292 |    0.4653 |    9.0592 |    0.0025 |    0.6535 |     0.0597 |    7.5809 |    0.0857 |    0.0075 |    1.7021 | 
  118 | 04m37s | -999.00000 |   89.7514 |    9.9000 |    9.9000 |    0.0000 |    9.9000 |     0.0999 |    0.0000 |    0.0001 |    0.0100 |    5.0000 | 
  119 | 04m41s |   -0.55676 |  140.0230 |    0.0000 |    9.9000 |    0.0100 |    9.9000 |     0.0999 |    9.9000 |    0.1000 |    0.0000 |    5.0000 | 
  120 | 04m29s | -999.00000 |   58.2816 |    9.9000 |    9.9000 |    0.0000 |    0.0000 



  222 | 01m17s |       -inf |   71.2472 |    2.2434 |    0.2766 |    0.0075 |    9.2676 |     0.0286 |    9.8375 |    0.0659 |    0.0060 |    1.2653 | 
[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |      D_fm |        L1 |        L2 |     L2_fm |     alpha |   alpha_fm |      beta |   e_noise |   init_fm |     iters | 
    1 | 01m43s | [35m  -0.53555[0m | [32m  74.0949[0m | [32m   9.4482[0m | [32m   8.1378[0m | [32m   0.0088[0m | [32m   7.8541[0m | [32m    0.0148[0m | [32m   3.0355[0m | [32m   0.0475[0m | [32m   0.0054[0m | [32m   2.7823[0m | 
    2 | 03m22s | [35m  -0.45365[0m | [32m 137.6444[0m | [32m   6.7898[0m | [32m   9.7546[0m | [32m   0.0034[0m | [32m   1.4518[0m | [32m    0.0922[0m | [32m   6.2532[0m | [32m   0.0382[0m | [32m   0.0093[0m | [32m   4.4176[0m | 
    3 | 01

In [None]:
alpha