In [1]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-z505ccs5
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25l- \ done
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-nizmap_u/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib
Installing collected packages: keras-contrib
Successfully installed keras-contrib-2.0.8


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from gensim.models.fasttext import FastText
import string
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import Sequence
from keras.models import Model , load_model
from keras import backend as K
from keras.metrics import top_k_categorical_accuracy
import re
from tqdm import tqdm_notebook

from keras.layers import Bidirectional, CuDNNLSTM, Embedding , \
                            Input, GlobalMaxPooling1D, Conv1D, Lambda, \
                            Dense, Concatenate, Dropout, BatchNormalization,\
                            SpatialDropout1D, CuDNNGRU
from keras.optimizers import Nadam
from keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.metrics import accuracy_score

from keras_contrib.callbacks import CyclicLR
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['ndsc-beginner', 'test-fasttexttrain']


In [3]:
fasttextm = FastText.load('../input/test-fasttexttrain/textshopee.model')

In [4]:
train_data = pd.read_csv('../input/ndsc-beginner/train.csv')
test_data = pd.read_csv('../input/ndsc-beginner/test.csv')

train_data = train_data[train_data['image_path'].str.contains('beauty')]
test_data = test_data[test_data['image_path'].str.contains('beauty')]

In [5]:
train_data['title'] = train_data['title'].apply(lambda s : re.sub(r'[^\w\s]','',s))
train_data['title'] = train_data['title'].apply(lambda s : re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", s))
test_data['title'] = test_data['title'].apply(lambda s : re.sub(r'[^\w\s]','',s))
test_data['title'] = test_data['title'].apply(lambda s : re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", s))

In [6]:
local_cat = {cat : i for i , cat in enumerate(train_data['Category'].unique())}
inverse_local_cat = {cat : i for i , cat in enumerate(train_data['Category'].unique())}

train_data['local_class'] = train_data['Category'].map(local_cat)

In [7]:
train_text = train_data['title'].values
train_label = train_data['local_class'].values

test_text = test_data['title'].values

n_classes = len(train_data['Category'].unique())
#y_test = test_data['local_class'].values

In [8]:
MAX_NB_WORDS = 20000
EMBED_DIM = 300

In [9]:
token = Tokenizer(MAX_NB_WORDS , char_level = False)
token.fit_on_texts(train_text)

In [10]:
word_index = token.word_index
total_words = len(token.word_index)

In [11]:
sequences = token.texts_to_sequences(train_text)
sequences = np.array(sequences)
test_sequences = token.texts_to_sequences(test_text)
test_sequences = np.array(test_sequences)

In [12]:
NUM_WORDS = min(MAX_NB_WORDS, total_words)
embedding_matrix = np.zeros((NUM_WORDS, EMBED_DIM))
words_not_found = []

for word, i in tqdm_notebook(word_index.items()):
    if i >= NUM_WORDS:
        break
    try :
        embedding_matrix[i] = fasttextm.wv[word]
    except :
        words_not_found.append(word)

HBox(children=(IntProgress(value=0, max=28598), HTML(value='')))




In [13]:
class DataGenerator(Sequence) :
    def __init__(self , X_1 , labels = [] , shuffle = True, batch_size = 128) :
        
        ### Input X_1 list of tokenized titles
        ### Input X_2 mentions_array
        self.X_1 = np.array(X_1)
        if len(labels) > 0 :
            assert len(X_1) == len(labels)
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.indexes = np.arange(len(X_1))
        if self.shuffle :
            np.random.shuffle(self.indexes)
    
    def on_epoch_end(self) :
        if self.shuffle :
            np.random.shuffle(self.indexes)
    
    def __getitem__(self , idx) :
        index = self.indexes[idx*self.batch_size :min(len(self.X_1) , (idx+1)*self.batch_size)]
        curr_batch_X_1 = pad_sequences(self.X_1[index])
        #curr_batch_X_2 = self.X_2[index]
        if len(self.labels) :
            curr_batch_labels = self.labels[index]
            return curr_batch_X_1 , curr_batch_labels
        else :
            return curr_batch_X_1
            
    def __len__(self):
        return int(np.ceil(len(self.X_1) / self.batch_size))
    
    

In [14]:
def gen_model(n_classes = 2) :
    inp = Input(shape=(None,))
    x = Embedding(NUM_WORDS, EMBED_DIM, weights=[embedding_matrix], trainable=False, name='EMBEDDING')(inp)
    x = SpatialDropout1D(0.4)(x)
    x1 = Bidirectional(CuDNNLSTM(384, return_sequences=True))(x)
    x2 = Bidirectional(CuDNNGRU(256, return_sequences=True))(x1)
    x1 = Bidirectional(CuDNNLSTM(256, return_sequences=True))(x1)
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    conc = Concatenate()([max_pool1, max_pool2])
    conc = Dropout(0.3)(conc)
    
    ###### Trial
    x_1 = Dense(n_classes , activation = 'softmax')(conc)
    m = Model(inp , x_1)

    m.compile(loss = 'sparse_categorical_crossentropy' , optimizer = 'Nadam' , metrics = ['accuracy'])
    
    return m

In [15]:
K.clear_session()
n_split = 8
kf = StratifiedKFold(n_split,  random_state = 100)
models = [gen_model(n_classes) for i in range(n_split)]

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [16]:
epochs = 30

for i , (train_idx , test_idx) in enumerate(kf.split(sequences, train_label)) :
    train_generator = DataGenerator(sequences[train_idx] , train_label[train_idx], shuffle = True, batch_size = 256)
    val_generator = DataGenerator(sequences[test_idx] , train_label[test_idx] , shuffle = False)
    cb = [
        #ModelCheckpoint('model_{}.h5'.format(i) , save_best_only =True , monitor = 'val_loss', verbose = True),
        CyclicLR(5e-4 , 0.004 , int(epochs*len(train_generator)/2) )
    ]
    hist = models[i].fit_generator(train_generator, 
                                epochs = epochs, 
                                validation_data = val_generator,
                                verbose = 1,
                                callbacks = cb).history
    
    models[i].layers[1].trainable = True
    models[i].compile(loss = 'sparse_categorical_crossentropy' , optimizer = Nadam(5e-4) , metrics = ['accuracy'])
    models[i].fit_generator(train_generator, 
                                epochs = 5, 
                                validation_data = val_generator,
                                verbose = 1,
                                callbacks = cb).history
    
    print('Epochs', i , 'val acc :' , hist['val_acc'][-1] , '     val loss :', hist['val_loss'][-1])
    print('         train acc :', hist['acc'][-1]  , '   train loss :' , hist['loss'][-1])

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
 53/980 [>.............................] - ETA: 37s - loss: 0.6781 - acc: 0.7827

In [17]:
test_generator = DataGenerator(test_sequences , shuffle = False)
train_pred_generator = DataGenerator(sequences, shuffle = False)

In [18]:
pred_train = np.zeros([len(train_data) , n_classes])
pred_test = np.zeros([len(test_data) , n_classes])
for i in range(n_split) :
    pred_train += models[i].predict_generator(train_pred_generator)
    pred_test += models[i].predict_generator(test_generator)
    
pred_train /= n_split
pred_test /= n_split

In [19]:
sub_pred_df = test_data[['itemid']]
for i in range(n_classes) :
    sub_pred_df['{}'.format(i)] = pred_test[:,i]
sub_train_df = train_data[['itemid']]
for i in range(n_classes) :
    sub_train_df['{}'.format(i)] = pred_train[:,i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [20]:
print('Model Training Accuracy :' , accuracy_score(train_label , pred_train.argmax(axis = 1)))

Model Training Accuracy : 0.8314066082077444


In [21]:
sub_pred_df.to_csv('sub_beauty.csv' , index = False)
sub_train_df.to_csv('sub_train_beauty.csv' , index = False)