In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import pickle
from sklearn.multiclass import OneVsRestClassifier
# Labelled training set; the 'id' column becomes the DataFrame index.
train = pd.read_csv(
    '../input/til2020/TIL_NLP_train_dataset.csv',
    index_col='id',
)

In [2]:
# Multi-label target matrix: one column per clothing category.
label_columns = ["outwear", "top", "trousers", "women dresses", "women skirts"]
Ytrain = train[label_columns].values

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram TF-IDF features. fit_transform learns the vocabulary and vectorises
# the training text in one call (same result as fit followed by transform).
tfidf1 = TfidfVectorizer()
X_text1 = tfidf1.fit_transform(train['word_representation']).toarray()

# NOTE(review): persisting the unigram vectorizer is disabled here, although
# the bigram vectorizer and both models are pickled - confirm this is intended.
#pickle.dump(tfidf1, open("tfidf_uni.pickle", "wb"))

# Dense feature frame consumed by the unigram XGBoost model.
Xtrain1 = pd.DataFrame(X_text1)

In [4]:
# Unigram + bigram TF-IDF features. fit_transform learns the vocabulary and
# vectorises the training text in a single pass over the corpus.
tfidf2 = TfidfVectorizer(ngram_range=(1, 2))
X_text2 = tfidf2.fit_transform(train['word_representation']).toarray()
Xtrain2 = pd.DataFrame(X_text2)

# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed (the bare open() call previously leaked the handle).
with open("tfidf_bi.pickle", "wb") as vectorizer_file:
    pickle.dump(tfidf2, vectorizer_file)

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def get_metrics(y_test, y_predicted):
    """Compute micro-averaged F1, precision and recall, plus accuracy.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_predicted : array-like
        Predicted labels, same shape as ``y_test``.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)``. ``f1`` is ``0.0`` when
        precision and recall are both zero.
    """
    # true positives / (true positives + false positives), pooled over labels
    precision = precision_score(y_test, y_predicted, average='micro')
    # true positives / (true positives + false negatives), pooled over labels
    recall = recall_score(y_test, y_predicted, average='micro')
    # harmonic mean of precision and recall; guard the 0/0 case, which would
    # otherwise raise ZeroDivisionError or yield NaN
    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
    # fraction of correctly predicted samples; for multilabel input sklearn
    # computes subset accuracy (every label of a sample must match)
    accuracy = accuracy_score(y_test, y_predicted)
    return f1, precision, recall, accuracy

In [6]:
# One binary XGBoost classifier per label column, trained on the unigram
# TF-IDF features with the GPU histogram tree method.
model1 = OneVsRestClassifier(
    estimator=XGBClassifier(
        n_estimators=100,
        random_state=0,
        tree_method='gpu_hist',
        gpu_id=0,
    )
)
model1.fit(Xtrain1, Ytrain)

OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=0, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=0, reg_alpha=None,
                                            

In [7]:
# Persist the fitted unigram model; the context manager closes the file
# handle that the bare open() call previously leaked.
with open('tfidf_uni.sav', 'wb') as model_file:
    pickle.dump(model1, model_file)

In [7]:
test = pd.read_csv('../input/til2020/TIL_NLP_test_dataset.csv', index_col='id')
X_testtext1 = tfidf1.transform(test['word_representation']).toarray()
Xtest1 = pd.DataFrame(X_testtext1)
y_pred_prob1 = model1.predict_proba(Xtest1)

In [9]:
# Drop the reference to the fitted unigram model; its test probabilities are
# already held in y_pred_prob1 and the model itself was pickled to disk.
del model1

In [10]:
# Same one-vs-rest XGBoost setup as the unigram model, trained on the
# unigram + bigram TF-IDF features.
model2 = OneVsRestClassifier(
    estimator=XGBClassifier(
        n_estimators=100,
        random_state=0,
        tree_method='gpu_hist',
        gpu_id=0,
    )
)
model2.fit(Xtrain2, Ytrain)

OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=0, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=0, reg_alpha=None,
                                            

In [11]:
# Persist the fitted bigram model; the context manager closes the file
# handle that the bare open() call previously leaked.
with open('tfidf_bi.sav', 'wb') as model_file:
    pickle.dump(model2, model_file)

In [12]:
# Vectorise the test text with the bigram vectorizer fitted on the training
# set (transform only, so the vocabulary and IDF weights are not refit).
X_testtext2 = tfidf2.transform(test['word_representation']).toarray()
Xtest2 = pd.DataFrame(X_testtext2)
# Per-label probabilities from the bigram model: one row per test sample,
# one column per label.
y_pred_prob2 = model2.predict_proba(Xtest2)

In [13]:
# Sanity check: eyeball the bigram model's per-label probabilities.
print(y_pred_prob2)

[[2.10789725e-01 9.82939124e-01 9.73868847e-01 1.07290718e-04
  5.78892359e-04]
 [9.97117043e-01 9.99498487e-01 6.70479052e-03 7.46134960e-04
  2.34530703e-03]
 [9.98438776e-01 1.08066425e-02 9.96100783e-01 1.46862396e-04
  9.58087840e-05]
 ...
 [9.83061135e-01 2.01729294e-02 9.96654510e-01 3.56607197e-04
  1.65813938e-02]
 [6.96623385e-01 9.95002329e-01 9.92879450e-01 8.17008840e-05
  2.21126073e-04]
 [2.01649722e-02 4.47501335e-03 9.63162601e-01 1.87176731e-04
  9.99941707e-01]]


In [14]:
# Drop the reference to the fitted bigram model; its test probabilities are
# already held in y_pred_prob2 and the model itself was pickled to disk.
del model2

In [15]:
def calculate_thresh(Ytest, prob):
    """Pick the probability cut-off that maximises the F1 score.

    Thresholds from 0.10 to 0.50 (step 0.01) are tried; a label is predicted
    positive when its probability is strictly greater than the threshold.

    Parameters
    ----------
    Ytest : array-like
        Ground-truth binary label matrix.
    prob : array-like
        Predicted probabilities, same shape as ``Ytest``.

    Returns
    -------
    float
        The threshold with the highest F1 as reported by ``get_metrics``.
        On ties the lowest threshold wins.
    """
    # float64 keeps the comparison identical to comparing element by element
    prob = np.asarray(prob, dtype=np.float64)
    best_thresh, best_f1 = None, None
    for thresh in np.arange(0.1, 0.501, 0.01):
        # arange accumulates float error; snap back to two decimals
        thresh = np.round(thresh, 2)
        y_pred_comb = (prob > thresh).astype(int)
        res = get_metrics(Ytest, y_pred_comb)[0]
        # strict '>' keeps the first (lowest) threshold among equal scores
        if best_f1 is None or res > best_f1:
            best_thresh, best_f1 = thresh, res
    print("Best threshold: ", best_thresh)
    return best_thresh

In [16]:
# Load the word-embedding lookup shipped with the dataset.
# NOTE(review): pickle.load executes arbitrary code from the file - only use
# it on files from a trusted source.
with open('../input/til2020/word_embeddings.pkl', 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)

In [17]:
import keras
from keras.layers import Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the token vocabulary from the training text only, then encode each
# training sentence as a list of integer token ids.
sentences = train["word_representation"].values
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word_index = tokenizer.word_index

# Vocabulary size and longest sequence inform the padding length used later.
print('Found %s unique tokens.' % len(word_index))
print("max sequence length:", max(map(len, sequences)))

Using TensorFlow backend.


Found 4249 unique tokens.
max sequence length: 47


In [18]:
# Pad (or truncate) every training sequence to a fixed length of 50 tokens;
# the longest training sequence reported when tokenising was 47.
data = pad_sequences(sequences, maxlen=50)

In [19]:
# One 100-dimensional row per token id. The extra row accounts for id 0,
# which word_index never assigns (NOTE(review): presumably the padding id -
# confirm against the Keras Tokenizer docs).
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))
for token, row in word_index.items():
    vector = embeddings.get(token)
    if vector is None:
        # tokens missing from the embedding lookup keep their all-zero row
        continue
    embedding_matrix[row] = vector

In [20]:
from keras.layers import Embedding

# Frozen embedding layer initialised from the pre-built embedding matrix;
# trainable=False keeps those vectors fixed during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=50,
    trainable=False,
)

In [24]:
from keras.layers import Dense, Input, LSTM, GlobalMaxPool1D, Dropout
from keras.layers import Bidirectional
from keras.models import Model

# Multi-label text classifier: frozen embeddings -> BiLSTM -> max-pool over
# time -> dropout -> five independent sigmoid outputs.
sequence_input = Input(shape=(50,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# return_sequences=True exposes the hidden state of every timestep so the
# pooling layer below can reduce over the whole sequence.
x = Bidirectional(LSTM(15, return_sequences=True))(embedded_sequences)
x = GlobalMaxPool1D()(x) # reduce all per-timestep outputs h(t) to a single vector
x = Dropout(0.2)(x)
# Alternative architectures tried earlier (disabled):
# output = Dense(1, activation="sigmoid")(x)
#x = Bidirectional(LSTM(128, return_sequences=True))(embedded_sequences)
#x = Bidirectional(LSTM(64, return_sequences=True))(x)
#x = Attention(50)(x)
#x = Dense(64, activation="relu")(x)
#x = Dropout(0.1)(x)
# Five sigmoid units: one independent probability per label column.
x = Dense(5, activation="sigmoid")(x)

model = Model(sequence_input, x)
# binary_crossentropy scores each of the five outputs as its own yes/no task.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=[keras.metrics.Precision(), keras.metrics.Recall()])

In [25]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Stop training once validation loss has failed to improve for 10 epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    verbose=1,
)
# Overwrite lstm.hdf5 only when validation loss reaches a new minimum.
mc = ModelCheckpoint(
    filepath='lstm.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [26]:
print('Training model...')
# Up to 200 epochs; early stopping normally ends training sooner and the
# checkpoint callback keeps the best weights on disk. validation_split=0.1
# holds out 10% of the samples (738 of 7380 in the recorded run).
r = model.fit(
    x=data,
    y=Ytrain,
    batch_size=128,
    epochs=200,
    validation_split=0.1,
    callbacks=[es, mc],
)

Training model...
Train on 6642 samples, validate on 738 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.58468, saving model to lstm.hdf5
Epoch 2/200

Epoch 00002: val_loss improved from 0.58468 to 0.53150, saving model to lstm.hdf5
Epoch 3/200

Epoch 00003: val_loss improved from 0.53150 to 0.45922, saving model to lstm.hdf5
Epoch 4/200

Epoch 00004: val_loss improved from 0.45922 to 0.38735, saving model to lstm.hdf5
Epoch 5/200

Epoch 00005: val_loss improved from 0.38735 to 0.32596, saving model to lstm.hdf5
Epoch 6/200

Epoch 00006: val_loss improved from 0.32596 to 0.27903, saving model to lstm.hdf5
Epoch 7/200

Epoch 00007: val_loss improved from 0.27903 to 0.24795, saving model to lstm.hdf5
Epoch 8/200

Epoch 00008: val_loss improved from 0.24795 to 0.22117, saving model to lstm.hdf5
Epoch 9/200

Epoch 00009: val_loss improved from 0.22117 to 0.20222, saving model to lstm.hdf5
Epoch 10/200

Epoch 00010: val_loss improved from 0.20222 to 0.18470, saving model t


Epoch 00030: val_loss improved from 0.10530 to 0.10209, saving model to lstm.hdf5
Epoch 31/200

Epoch 00031: val_loss improved from 0.10209 to 0.10206, saving model to lstm.hdf5
Epoch 32/200

Epoch 00032: val_loss did not improve from 0.10206
Epoch 33/200

Epoch 00033: val_loss improved from 0.10206 to 0.09936, saving model to lstm.hdf5
Epoch 34/200

Epoch 00034: val_loss improved from 0.09936 to 0.09829, saving model to lstm.hdf5
Epoch 35/200

Epoch 00035: val_loss did not improve from 0.09829
Epoch 36/200

Epoch 00036: val_loss did not improve from 0.09829
Epoch 37/200

Epoch 00037: val_loss improved from 0.09829 to 0.09715, saving model to lstm.hdf5
Epoch 38/200

Epoch 00038: val_loss improved from 0.09715 to 0.09546, saving model to lstm.hdf5
Epoch 39/200

Epoch 00039: val_loss did not improve from 0.09546
Epoch 40/200

Epoch 00040: val_loss did not improve from 0.09546
Epoch 41/200

Epoch 00041: val_loss improved from 0.09546 to 0.09462, saving model to lstm.hdf5
Epoch 42/200

Ep

In [None]:
# Restore the best checkpoint written by ModelCheckpoint. Use the same
# relative path it was saved under so the cell also works when the working
# directory is not /kaggle/working.
model.load_weights('lstm.hdf5')

In [27]:
# Encode the test text with the tokenizer fitted on the training set and pad
# to the same fixed length (50) the network was built for.
test_sentences = test["word_representation"].values
test_sequences = tokenizer.texts_to_sequences(test_sentences)
Xtest_emb = pad_sequences(test_sequences, maxlen=50)

In [28]:
# Per-label sigmoid outputs of the LSTM model for every test sample.
y_predembprob = model.predict(Xtest_emb)

In [29]:
# Soft-voting ensemble: unweighted mean of the three models' per-label
# probabilities. Dividing by 3 gives the true mean; the earlier 0.33 factor
# scaled every probability by 0.99.
y_pred_combprob = (y_pred_prob1 + y_pred_prob2 + y_predembprob) / 3

In [33]:
# No labelled hold-out set (Ytest) is defined in the visible notebook, so
# calculate_thresh cannot be run here; a fixed cut-off of 0.4 is used.
# The float64 cast keeps the comparison identical to comparing each element
# against the Python float; the result is a list of rows of 0/1 ints.
y_predcomb = (np.asarray(y_pred_combprob, dtype=np.float64) > 0.4).astype(int).tolist()

In [34]:
# Fill the sample submission's label columns with the ensemble predictions.
# NOTE(review): assumes the sample file's rows are in the same order as the
# test set - confirm before submitting.
label_columns = ["outwear", "top", "trousers", "women dresses", "women skirts"]
submission = pd.read_csv('../input/til2020/NLP_submission_example.csv')
submission[label_columns] = y_predcomb
submission.to_csv('submission.csv', index=False)