In [12]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import pickle
from sklearn.multiclass import OneVsRestClassifier
# Load the labelled training set, using the `id` column as the index.
train = pd.read_csv('../input/til2020/TIL_NLP_train_dataset.csv', index_col='id')

# Target matrix for the five label columns, in a fixed column order that the
# prediction/submission cells rely on.
Ytrain = train[
    ["outwear", "top", "trousers", "women dresses", "women skirts"]
].to_numpy()

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram TF-IDF features. fit_transform learns the vocabulary and encodes the
# training text in one call; the fitted vectorizer is reused later to encode
# the test set.
tfidf1 = TfidfVectorizer()
X_text1 = tfidf1.fit_transform(train['word_representation']).toarray()

#pickle.dump(tfidf1, open("tfidf_uni.pickle", "wb"))

# Dense feature matrix wrapped in a DataFrame for the XGBoost model.
Xtrain1 = pd.DataFrame(X_text1)

In [16]:
# Unigram + bigram TF-IDF features (ngram_range=(1, 2)), learned and applied
# to the training text in one call.
tfidf2 = TfidfVectorizer(ngram_range=(1, 2))
X_text2 = tfidf2.fit_transform(train['word_representation']).toarray()

# Dense feature matrix wrapped in a DataFrame for the second XGBoost model.
Xtrain2 = pd.DataFrame(X_text2)

#pickle.dump(tfidf2, open("tfidf_bi.pickle", "wb"))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def get_metrics(y_test, y_predicted):
    """Compute micro-averaged F1, precision and recall, plus accuracy.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels, in any layout accepted by scikit-learn's
        classification metrics.
    y_predicted : array-like
        Predicted labels in the same layout as ``y_test``.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)``.
    """
    # true positives / (true positives + false positives), pooled over labels
    precision = precision_score(y_test, y_predicted, average='micro')
    # true positives / (true positives + false negatives), pooled over labels
    recall = recall_score(y_test, y_predicted, average='micro')

    # Harmonic mean of precision and recall. When both are zero the plain
    # formula evaluates 0/0 and yields NaN, which poisons any later ranking of
    # scores, so F1 is reported as 0.0 in that case.
    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0

    # As returned by accuracy_score; for multilabel indicator input
    # scikit-learn documents this as subset (exact-match) accuracy.
    accuracy = accuracy_score(y_test, y_predicted)
    return f1, precision, recall, accuracy

In [None]:
# One-vs-rest wrapper around an XGBoost classifier, trained on the unigram
# TF-IDF features.
model1 = OneVsRestClassifier(
    XGBClassifier(
        n_estimators=100,
        random_state=0,
        tree_method='gpu_hist',
        gpu_id=0,
    )
)
model1.fit(Xtrain1, Ytrain)

In [None]:
# Persist the trained unigram model. The context manager ensures the file is
# flushed and closed even if pickling fails part-way.
with open('tfidf_uni.sav', 'wb') as model_file:
    pickle.dump(model1, model_file)

In [18]:
# Load a pickled unigram model from the attached dataset.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
# The handle is deliberately not called `model`, so re-running this cell cannot
# rebind the notebook-level Keras `model` variable to a closed file object.
with open("../input/xgboostnlp/tfidf_uni.sav", 'rb') as model_file:
    model1 = pickle.load(model_file)

In [19]:
# Load a pickled bigram model from the attached dataset.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
# The handle is deliberately not called `model`, so re-running this cell cannot
# rebind the notebook-level Keras `model` variable to a closed file object.
with open("../input/xgboostnlp/tfidf_bi.sav", 'rb') as model_file:
    model2 = pickle.load(model_file)

In [20]:
# Load the test set (indexed by `id`) and score it with the unigram model.
test = pd.read_csv('../input/til2020/TIL_NLP_test_dataset.csv', index_col='id')
# Encode with the vectorizer fitted on the training text (transform only, no refit).
X_testtext1 = tfidf1.transform(test['word_representation']).toarray()
Xtest1 = pd.DataFrame(X_testtext1)
# One probability per label column for every test row.
y_pred_prob1 = model1.predict_proba(Xtest1)

In [None]:
# Drop the notebook's reference to the unigram model. Any later use of
# `model1` requires it to be reloaded or retrained first.
del model1

In [None]:
# One-vs-rest wrapper around an XGBoost classifier, trained on the
# unigram + bigram TF-IDF features.
model2 = OneVsRestClassifier(
    XGBClassifier(
        n_estimators=100,
        random_state=0,
        tree_method='gpu_hist',
        gpu_id=0,
    )
)
model2.fit(Xtrain2, Ytrain)

In [None]:
# Persist the trained bigram model. The context manager ensures the file is
# flushed and closed even if pickling fails part-way.
with open('tfidf_bi.sav', 'wb') as model_file:
    pickle.dump(model2, model_file)

In [21]:
# The unigram probabilities (y_pred_prob1) are already produced by the cell
# that loads the test set. They are not recomputed here because `model1` is
# deleted by an intervening cell, which made the duplicate code raise
# NameError when the notebook is run top to bottom.

# Score the test set with the unigram + bigram model.
X_testtext2 = tfidf2.transform(test['word_representation']).toarray()
Xtest2 = pd.DataFrame(X_testtext2)
y_pred_prob2 = model2.predict_proba(Xtest2)

In [None]:
# Inspect the probabilities predicted by the unigram + bigram TF-IDF model.
print(y_pred_prob2)

In [None]:
# Drop the notebook's reference to the bigram model. Any later use of
# `model2` requires it to be reloaded or retrained first.
del model2

In [None]:
def calculate_thresh(Ytest, prob):
    """Pick the decision threshold that maximises F1 on held-out data.

    Thresholds from 0.10 to 0.50 in steps of 0.01 are tried. For each one,
    probabilities strictly greater than the threshold are mapped to 1 and the
    rest to 0, and the result is scored with ``get_metrics``.

    Parameters
    ----------
    Ytest : array-like
        Ground-truth label matrix.
    prob : array-like
        Predicted probabilities, same layout as ``Ytest``.

    Returns
    -------
    float
        The threshold with the highest F1. On ties the lowest threshold wins.
    """
    prob = np.asarray(prob)
    candidates = np.round(np.arange(0.1, 0.501, 0.01), 2)

    best_thresh = candidates[0]
    best_f1 = -np.inf
    for thresh in candidates:
        # Vectorised thresholding replaces the per-element Python loops.
        y_pred_comb = (prob > thresh).astype(int)
        f1 = get_metrics(Ytest, y_pred_comb)[0]
        # Strict '>' keeps the earliest (lowest) threshold on ties, and a NaN
        # score never compares greater, so it can never be selected.
        if f1 > best_f1:
            best_thresh, best_f1 = thresh, f1

    print("Best threshold: ", best_thresh)
    return best_thresh

In [2]:
# Load the pickled word-embedding lookup shipped with the competition data.
# It is queried later with `.get(word)`, so it is expected to be dict-like.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../input/til2020/word_embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

In [3]:
!pip install keras-self-attention

Collecting keras-self-attention
  Downloading keras-self-attention-0.46.0.tar.gz (10 kB)
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25ldone
[?25h  Created wheel for keras-self-attention: filename=keras_self_attention-0.46.0-py3-none-any.whl size=17278 sha256=32155114f0d4dbcaa5afd30d57c26d12f09ec65ffe7b48f3ef699b661587fd24
  Stored in directory: /root/.cache/pip/wheels/ec/f7/48/30de93f8333298bad9202aab9b04db0cfd58dcd379b5a5ef1c
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.46.0


In [4]:
import keras
from keras.layers import Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training text and turn every sentence into a
# list of integer token ids.
sentences = train["word_representation"].to_numpy()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# Longest tokenised sentence; informs the padding length chosen below.
print("max sequence length:", max(map(len, sequences)))

Using TensorFlow backend.


Found 4249 unique tokens.
max sequence length: 47


In [5]:
# Bring every training sequence to a fixed length of 50 tokens. The tokenizer
# cell reported a maximum training sequence length of 47, so 50 leaves a small
# margin. The same length is used for the model input and the test data.
data = pad_sequences(sequences, maxlen=50)

In [6]:
# One 100-dimensional row per tokenizer index. The extra row presumably
# accounts for index 0 not being assigned to any word (confirm against the
# tokenizer). Rows start as zeros, so words missing from `embeddings` keep an
# all-zero vector.
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for token, row in word_index.items():
    vector = embeddings.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [7]:
from keras.layers import Embedding

# Frozen embedding layer initialised from the pre-built embedding matrix.
# It is shared by every recurrent model built further down.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=50,
    trainable=False,
)

In [62]:
from keras.layers import Dense, Input, LSTM, GlobalMaxPool1D, Dropout, GRU
from keras.layers import Bidirectional
from keras.models import Model
from keras_self_attention import SeqSelfAttention

# Input: one padded sequence of 50 integer token ids.
sequence_input = Input(shape=(50,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Bidirectional LSTM that keeps the output of every timestep, followed by
# self-attention over those outputs.
x = Bidirectional(
    LSTM(units=128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
)(embedded_sequences)
x = SeqSelfAttention(attention_activation='sigmoid')(x)

# Collapse the time axis by max pooling, then regularise.
x = GlobalMaxPool1D()(x)
x = Dropout(rate=0.2)(x)

# Five independent sigmoid outputs, one per label column of Ytrain.
x = Dense(units=5, activation="sigmoid")(x)

model = Model(inputs=sequence_input, outputs=x)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [35]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Stop once validation loss has not improved for 10 consecutive epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    verbose=1,
)
# Keep only the weights of the epoch with the lowest validation loss.
mc = ModelCheckpoint(
    'lstm_att.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [36]:
# Train the LSTM model with batches of 128, holding out 10% of the data for
# validation. Early stopping and checkpointing are driven by val_loss.
print('Training model...')
r = model.fit(
    x=data,
    y=Ytrain,
    epochs=200,
    batch_size=128,
    validation_split=0.1,
    callbacks=[es, mc],
)

Training model...
Train on 6642 samples, validate on 738 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.46314, saving model to lstm_att.hdf5
Epoch 2/200

Epoch 00002: val_loss improved from 0.46314 to 0.34356, saving model to lstm_att.hdf5
Epoch 3/200

Epoch 00003: val_loss improved from 0.34356 to 0.25081, saving model to lstm_att.hdf5
Epoch 4/200

Epoch 00004: val_loss improved from 0.25081 to 0.19746, saving model to lstm_att.hdf5
Epoch 5/200

Epoch 00005: val_loss improved from 0.19746 to 0.17212, saving model to lstm_att.hdf5
Epoch 6/200

Epoch 00006: val_loss improved from 0.17212 to 0.15869, saving model to lstm_att.hdf5
Epoch 7/200

Epoch 00007: val_loss improved from 0.15869 to 0.13829, saving model to lstm_att.hdf5
Epoch 8/200

Epoch 00008: val_loss did not improve from 0.13829
Epoch 9/200

Epoch 00009: val_loss improved from 0.13829 to 0.12786, saving model to lstm_att.hdf5
Epoch 10/200

Epoch 00010: val_loss improved from 0.12786 to 0.11777, saving model 


Epoch 00038: val_loss did not improve from 0.08394
Epoch 39/200

Epoch 00039: val_loss did not improve from 0.08394
Epoch 40/200

Epoch 00040: val_loss did not improve from 0.08394
Epoch 41/200

Epoch 00041: val_loss did not improve from 0.08394
Epoch 42/200

Epoch 00042: val_loss did not improve from 0.08394
Epoch 43/200

Epoch 00043: val_loss did not improve from 0.08394
Epoch 44/200

Epoch 00044: val_loss did not improve from 0.08394
Epoch 00044: early stopping


Epoch 34/200
6642/6642 [==============================] - 11s 2ms/step - loss: 0.0592 - accuracy: 0.9815 - val_loss: 0.0839 - val_accuracy: 0.9767

Epoch 00034: val_loss improved from 0.08489 to 0.08394, saving model to lstm_att.hdf5

In [69]:
from keras.models import load_model
#from keras.utils import custom_object_scope
#with custom_object_scope({'Precision': keras.metrics.Precision(), 'binary_recall':keras.metrics.Recall()}):
#    model = load_model('/kaggle/working/lstm_att.hdf5', custom_objects={'SeqSelfAttention': SeqSelfAttention})
# Rebind `model` to the checkpointed LSTM; SeqSelfAttention is a custom layer
# and must be passed via custom_objects so it can be deserialised.
# NOTE(review): this reads lstm_att2.hdf5, which is written by the `mc2`
# checkpoint defined in a cell further down, so on a fresh top-to-bottom run
# the file may not exist yet - confirm the intended cell order.
model = load_model('/kaggle/working/lstm_att2.hdf5', custom_objects={'SeqSelfAttention': SeqSelfAttention})

In [49]:
# Fresh copy of the LSTM + self-attention architecture, used for the second
# training run with a different batch size.
sequence_input = Input(shape=(50,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Bidirectional(
    LSTM(units=128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
)(embedded_sequences)
x = SeqSelfAttention(attention_activation='sigmoid')(x)
# Collapse the time axis by max pooling, then regularise.
x = GlobalMaxPool1D()(x)
x = Dropout(rate=0.2)(x)
# Five independent sigmoid outputs, one per label column of Ytrain.
x = Dense(units=5, activation="sigmoid")(x)

model = Model(inputs=sequence_input, outputs=x)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [50]:
# Separate checkpoint file for the second LSTM run, so the first run's
# weights are not overwritten.
mc2 = ModelCheckpoint(
    'lstm_att2.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [51]:
# Second LSTM run: same data and callbacks' criteria, but batches of 32.
print('Training model...')
r = model.fit(
    x=data,
    y=Ytrain,
    epochs=200,
    batch_size=32,
    validation_split=0.1,
    callbacks=[es, mc2],
)

Training model...
Train on 6642 samples, validate on 738 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.30698, saving model to lstm_att2.hdf5
Epoch 2/200

Epoch 00002: val_loss improved from 0.30698 to 0.19001, saving model to lstm_att2.hdf5
Epoch 3/200

Epoch 00003: val_loss improved from 0.19001 to 0.14916, saving model to lstm_att2.hdf5
Epoch 4/200

Epoch 00004: val_loss improved from 0.14916 to 0.13447, saving model to lstm_att2.hdf5
Epoch 5/200

Epoch 00005: val_loss improved from 0.13447 to 0.12155, saving model to lstm_att2.hdf5
Epoch 6/200

Epoch 00006: val_loss improved from 0.12155 to 0.11229, saving model to lstm_att2.hdf5
Epoch 7/200

Epoch 00007: val_loss improved from 0.11229 to 0.10432, saving model to lstm_att2.hdf5
Epoch 8/200

Epoch 00008: val_loss improved from 0.10432 to 0.10127, saving model to lstm_att2.hdf5
Epoch 9/200

Epoch 00009: val_loss improved from 0.10127 to 0.09837, saving model to lstm_att2.hdf5
Epoch 10/200

Epoch 00010: val_loss imp

Epoch 16/200
6642/6642 [==============================] - 43s 6ms/step - loss: 0.0692 - accuracy: 0.9782 - val_loss: 0.0814 - val_accuracy: 0.9748

Epoch 00016: val_loss improved from 0.08350 to 0.08144, saving model to lstm_att2.hdf5

In [38]:
# Encode the test text with the tokenizer fitted on the training set and pad
# it to the same fixed length (50) the models were built for.
test_sentences = test["word_representation"].to_numpy()
test_sequences = tokenizer.texts_to_sequences(test_sentences)
Xtest_emb = pad_sequences(test_sequences, maxlen=50)

In [70]:
# Test-set probabilities from whichever LSTM is currently bound to `model`.
# NOTE(review): `model` is both trained in place and reloaded from a
# checkpoint elsewhere in the notebook, so the weights used here depend on
# cell execution order - confirm the best checkpoint is loaded first.
y_predembprob = model.predict(Xtest_emb)

In [63]:
# Same architecture as the LSTM model, with a GRU as the recurrent cell.
sequence_input = Input(shape=(50,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Bidirectional(
    GRU(units=128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
)(embedded_sequences)
x = SeqSelfAttention(attention_activation='sigmoid')(x)
# Collapse the time axis by max pooling, then regularise.
x = GlobalMaxPool1D()(x)
x = Dropout(rate=0.2)(x)
# Five independent sigmoid outputs, one per label column of Ytrain.
x = Dense(units=5, activation="sigmoid")(x)

model_gru = Model(inputs=sequence_input, outputs=x)
model_gru.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [64]:
# Checkpoint for the GRU model: keep only the epoch with the lowest val_loss.
mc3 = ModelCheckpoint(
    'gru_att.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [66]:
# Train the GRU model with batches of 32, holding out 10% of the data for
# validation. Early stopping and checkpointing are driven by val_loss.
print('Training model...')
r = model_gru.fit(
    x=data,
    y=Ytrain,
    epochs=200,
    batch_size=32,
    validation_split=0.1,
    callbacks=[es, mc3],
)

Training model...
Train on 6642 samples, validate on 738 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.45033, saving model to gru_att.hdf5
Epoch 2/200

Epoch 00002: val_loss improved from 0.45033 to 0.37605, saving model to gru_att.hdf5
Epoch 3/200

Epoch 00003: val_loss improved from 0.37605 to 0.32150, saving model to gru_att.hdf5
Epoch 4/200

Epoch 00004: val_loss improved from 0.32150 to 0.29461, saving model to gru_att.hdf5
Epoch 5/200

Epoch 00005: val_loss improved from 0.29461 to 0.27051, saving model to gru_att.hdf5
Epoch 6/200

Epoch 00006: val_loss improved from 0.27051 to 0.25513, saving model to gru_att.hdf5
Epoch 7/200

Epoch 00007: val_loss improved from 0.25513 to 0.23344, saving model to gru_att.hdf5
Epoch 8/200

Epoch 00008: val_loss improved from 0.23344 to 0.21604, saving model to gru_att.hdf5
Epoch 9/200

Epoch 00009: val_loss did not improve from 0.21604
Epoch 10/200

Epoch 00010: val_loss improved from 0.21604 to 0.21010, saving model to gru_a


Epoch 00037: val_loss improved from 0.10352 to 0.10128, saving model to gru_att.hdf5
Epoch 38/200

Epoch 00038: val_loss did not improve from 0.10128
Epoch 39/200

Epoch 00039: val_loss improved from 0.10128 to 0.09767, saving model to gru_att.hdf5
Epoch 40/200

Epoch 00040: val_loss did not improve from 0.09767
Epoch 41/200

Epoch 00041: val_loss did not improve from 0.09767
Epoch 42/200

Epoch 00042: val_loss did not improve from 0.09767
Epoch 43/200

Epoch 00043: val_loss improved from 0.09767 to 0.09702, saving model to gru_att.hdf5
Epoch 44/200

Epoch 00044: val_loss did not improve from 0.09702
Epoch 45/200

Epoch 00045: val_loss did not improve from 0.09702
Epoch 46/200

Epoch 00046: val_loss improved from 0.09702 to 0.09574, saving model to gru_att.hdf5
Epoch 47/200

Epoch 00047: val_loss improved from 0.09574 to 0.09306, saving model to gru_att.hdf5
Epoch 48/200

Epoch 00048: val_loss did not improve from 0.09306
Epoch 49/200

Epoch 00049: val_loss did not improve from 0.0930

KeyboardInterrupt: 

In [67]:
# Reload the GRU checkpoint written by `mc3` (lowest validation loss).
# Training above was interrupted manually, so the in-memory weights are not
# necessarily the best ones. SeqSelfAttention is a custom layer and must be
# passed via custom_objects so it can be deserialised.
model_gru = load_model('/kaggle/working/gru_att.hdf5', custom_objects={'SeqSelfAttention': SeqSelfAttention})

In [68]:
# Test-set probabilities from the reloaded GRU model.
y_predgruprob = model_gru.predict(Xtest_emb)

In [74]:
# Inspect the probabilities predicted by the unigram TF-IDF model.
print(y_pred_prob1)

[[1.7546068e-01 9.9387217e-01 9.7000837e-01 2.0070560e-04 6.3888630e-04]
 [9.9884665e-01 9.9845517e-01 1.4060500e-02 5.5575245e-05 3.0075426e-03]
 [9.9818230e-01 1.4843052e-02 9.9365497e-01 1.7182056e-04 6.9535832e-05]
 ...
 [9.8401606e-01 2.4958175e-02 9.9619764e-01 8.1978057e-04 5.0399199e-02]
 [8.5620308e-01 9.8773277e-01 9.9365342e-01 5.6184428e-05 1.5094389e-04]
 [2.1471357e-02 9.7366516e-03 9.8306781e-01 2.6072455e-03 9.9987042e-01]]


In [75]:
# Inspect the probabilities predicted by the unigram + bigram TF-IDF model.
print(y_pred_prob2)

[[2.10789725e-01 9.82939124e-01 9.73868847e-01 1.07290718e-04
  5.78892359e-04]
 [9.97117043e-01 9.99498487e-01 6.70479052e-03 7.46134960e-04
  2.34530703e-03]
 [9.98438776e-01 1.08066425e-02 9.96100783e-01 1.46862396e-04
  9.58087840e-05]
 ...
 [9.83061135e-01 2.01729294e-02 9.96654510e-01 3.56607197e-04
  1.65813938e-02]
 [6.96623385e-01 9.95002329e-01 9.92879450e-01 8.17008840e-05
  2.21126073e-04]
 [2.01649722e-02 4.47501335e-03 9.63162601e-01 1.87176731e-04
  9.99941707e-01]]


In [76]:
# Inspect the probabilities predicted by the LSTM + attention model.
print(y_predembprob)

[[1.4214219e-02 9.9895084e-01 9.9488539e-01 3.4931168e-04 2.3434998e-04]
 [9.8853439e-01 9.9916184e-01 2.3438616e-03 5.0843752e-04 1.4850023e-03]
 [9.9951744e-01 8.9493617e-03 9.9914849e-01 9.6736057e-04 4.3327658e-04]
 ...
 [9.9842322e-01 1.4080577e-02 9.9722469e-01 5.1101629e-04 2.9133479e-04]
 [6.1549615e-02 9.8918962e-01 9.9936348e-01 3.4531410e-04 8.0461803e-05]
 [2.9324330e-02 4.2786676e-02 9.8343444e-01 9.5768814e-04 9.6736532e-01]]


In [77]:
# Inspect the probabilities predicted by the GRU + attention model.
print(y_predgruprob)

[[1.3381267e-02 9.9979299e-01 9.2557770e-01 2.2734902e-04 1.8657146e-03]
 [9.8551047e-01 9.9987984e-01 7.7461742e-04 7.5418281e-04 4.9878992e-03]
 [9.9991119e-01 3.4067675e-03 9.9993360e-01 1.2145685e-04 2.1914857e-05]
 ...
 [9.9997032e-01 4.6680467e-03 9.4775236e-01 1.0693732e-04 1.7644659e-04]
 [4.2718604e-01 9.9452317e-01 9.9911159e-01 7.5989979e-04 8.0967555e-05]
 [3.5760734e-02 2.0039266e-03 9.9944478e-01 3.6785175e-04 9.8892468e-01]]


In [71]:
# Equal-weight ensemble: element-wise average of the four models'
# probability matrices (two TF-IDF/XGBoost models, LSTM and GRU).
ensemble_members = (y_pred_prob1, y_pred_prob2, y_predembprob, y_predgruprob)
y_pred_combprob = sum(ensemble_members) / len(ensemble_members)

In [78]:
# Inspect the averaged ensemble probabilities.
print(y_pred_combprob)

[[1.03461474e-01 9.93888795e-01 9.66085076e-01 2.21164257e-04
  8.29460798e-04]
 [9.92502093e-01 9.99248862e-01 5.97094232e-03 5.16082626e-04
  2.95643788e-03]
 [9.99012470e-01 9.50145628e-03 9.97209430e-01 3.51875089e-04
  1.55134010e-04]
 ...
 [9.91367698e-01 1.59699321e-02 9.84457374e-01 4.48585342e-04
  1.68620925e-02]
 [5.10390520e-01 9.91611958e-01 9.96252000e-01 3.10774805e-04
  1.33374822e-04]
 [2.66803466e-02 1.47505673e-02 9.82277393e-01 1.02999050e-03
  9.89025533e-01]]


In [81]:
# A data-driven threshold would need held-out labels, which the test set does
# not provide, so the call below stays disabled and a fixed cut-off is used.
#threshcomb = calculate_thresh(Ytest, y_pred_combprob)

# Binarise the ensemble probabilities: 1 where the probability is strictly
# above 0.35, else 0. Vectorised, then converted back to a list of lists of
# Python ints so downstream code sees the same structure as before.
y_predcomb = (np.asarray(y_pred_combprob) > 0.35).astype(int).tolist()

In [82]:
# Fill the example submission's label columns with the binarised ensemble
# predictions and write the result without the index column.
# NOTE(review): assumes the example file's rows are in the same order as the
# test set - confirm before submitting.
label_columns = ["outwear", "top", "trousers", "women dresses", "women skirts"]
submission = pd.read_csv('../input/til2020/NLP_submission_example.csv')
submission[label_columns] = y_predcomb
submission.to_csv('submission5.csv', index=False)