In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import pickle
from sklearn.multiclass import OneVsRestClassifier
# Read the competition training and test CSVs; the `id` column becomes the index of each frame.
train = pd.read_csv('../input/til2020/TIL_NLP_train_dataset.csv', index_col='id')
test = pd.read_csv('../input/til2020/TIL_NLP_test_dataset.csv', index_col='id')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the training rows (fixed seed) for evaluating the XGBoost models;
# the remaining 90% (`xgtrain`) is what those models are fitted on.
xgtrain, xgtest = train_test_split(train, test_size=0.1, random_state=0)

Training fastai model - creating & finetuning a language model

In [None]:
from fastai.text import *

# Create the language-model DataBunch from the `word_representation` column.
# The 20% validation split is seeded (like the other splits/models in this
# notebook, which use seed 0) so that Restart & Run All holds out the same rows
# and produces the same vocabulary on every run.
data = (TextList.from_df(train, cols='word_representation')
                .split_by_rand_pct(0.2, seed=0)
                .label_for_lm()
                .databunch(bs=48))
data.show_batch()

In [None]:
# Language-model learner on the AWD_LSTM architecture, using the DataBunch built above.
learn = language_model_learner(data,AWD_LSTM, drop_mult=0.3)

# select the appropriate learning rate
learn.lr_find()
learn.recorder.plot(suggestion=True)
# Keep the recorder's suggested learning rate for the first training run below.
min_grad_lr = learn.recorder.min_grad_lr

In [None]:
# First language-model training run: 5 one-cycle epochs at the suggested learning rate.
learn.fit_one_cycle(5, min_grad_lr)

In [None]:
# Unfreeze the language model and train 2 more one-cycle epochs with a 1e-3 learning rate.
learn.unfreeze()
learn.fit_one_cycle(2, 1e-3)

In [None]:
# Save the fine-tuned encoder as 'enc_final'; the classifier learner loads it by this name later.
learn.save_encoder('enc_final')

Training fastai model - creating classifier

In [None]:
# The five target columns; each is one label of the multi-label problem.
label_cols = ["outwear", "top", "trousers", "women dresses", "women skirts"]

# Test items share the language model's vocabulary so token ids line up with the encoder.
test_datalist = TextList.from_df(test, cols='word_representation', vocab=data.vocab)

# Classification DataBunch: same text column and vocabulary as the language model,
# labelled from `label_cols`, with the test items attached for later prediction.
# The 20% validation split is seeded (seed 0, like the other splits in this
# notebook) so that validation metrics are comparable across re-runs.
data_clas = (TextList.from_df(train, cols='word_representation', vocab=data.vocab)
             .split_by_rand_pct(0.2, seed=0)
             .label_from_df(cols=label_cols, classes=label_cols)
             .add_test(test_datalist)
             .databunch(bs=32))

data_clas.show_batch()

In [None]:
# Text classifier on the AWD_LSTM architecture, built on the classification DataBunch.
learn_classifier = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)

# Load the encoder saved from the language-model stage ('enc_final'), then
# freeze the learner before searching for a learning rate.
learn_classifier.load_encoder('enc_final')
learn_classifier.freeze()

# select the appropriate learning rate
learn_classifier.lr_find()

# we typically find the point where the slope is steepest
learn_classifier.recorder.plot(suggestion=True)
# Keep the recorder's suggested learning rate for the first classifier training run.
min_classifier_grad_lr = learn_classifier.recorder.min_grad_lr

In [None]:
# First classifier run (learner still frozen from the previous cell): 5 one-cycle
# epochs at the suggested learning rate.
learn_classifier.fit_one_cycle(5, min_classifier_grad_lr)

# Display a sample of predictions next to their targets.
learn_classifier.show_results()

In [None]:
# Plot the losses the recorder collected during the training run above.
learn_classifier.recorder.plot_losses()

In [None]:
# Gradual unfreezing: freeze_to(-2) leaves only the last two layer groups trainable.
# NOTE(review): slice(5e-3, 2e-3) has start > stop. fastai spreads the rates from
# `start` (earliest layer group) to `stop` (last group), so the earlier group would
# get the larger rate here, the reverse of the usual discriminative pattern.
# Confirm this is intended.
learn_classifier.freeze_to(-2)
learn_classifier.fit_one_cycle(4, slice(5e-3, 2e-3), moms=(0.8,0.7))

In [None]:
# Continue training with the same freeze state for 10 more epochs.
# NOTE(review): slice(5e-3, 2e-3) has start > stop (larger rate for earlier layer
# groups) — confirm this ordering is intended.
learn_classifier.fit_one_cycle(10, slice(5e-3, 2e-3), moms=(0.8,0.7))

In [None]:
# Unfreeze the whole classifier and fine-tune for 10 epochs with learning rates
# spread from 2e-3/100 up to 2e-3 across the layer groups.
learn_classifier.unfreeze()
learn_classifier.fit_one_cycle(10, slice(2e-3/100, 2e-3), moms=(0.8,0.7))

In [None]:
# One extra epoch with the same learning-rate range and momentums as the previous run.
learn_classifier.fit_one_cycle(1, slice(2e-3/100, 2e-3), moms=(0.8,0.7))

In [None]:
# Save the trained classifier under the name 'clas_final'.
learn_classifier.save('clas_final')

In [None]:
# Display a sample of predictions from the fully trained classifier.
learn_classifier.show_results()

In [None]:
# Predictions, targets and per-item losses. No dataset is passed, so get_preds
# uses its default (presumably the validation set — confirm).
preds,y,losses = learn_classifier.get_preds(with_loss=True)
interp = ClassificationInterpretation(learn_classifier, preds, y, losses)
# NOTE(review): the classifier is trained on five label columns at once; confirm
# that plot_confusion_matrix is meaningful for such multi-label targets.
interp.plot_confusion_matrix()

In [None]:
# Predictions for the test items attached via add_test. ordered=True requests them
# in the original item order (assumed to match the row order of `test` — confirm).
# `target` is not used afterwards.
preds, target = learn_classifier.get_preds(DatasetType.Test, ordered=True)
# NumPy copy of the predictions, used for thresholding and blending below.
labels_prob = preds.numpy()

In [None]:
# Binarise the fastai predictions at a 0.5 cut-off: 1 where the value is strictly
# greater than 0.5, else 0. The result is a list of per-row lists of ints.
y_pred = (np.asarray(labels_prob) > 0.5).astype(int).tolist()

Creating XGBoost model for blending with fastai model

In [None]:
# Target matrix for the XGBoost models: the five label columns of the 90% split.
Ytrain = xgtrain.loc[:, ["outwear", "top", "trousers", "women dresses", "women skirts"]].to_numpy()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vocabulary/IDF weights on every training row, then vectorise only
# the rows the XGBoost models are fitted on (as a dense array).
# NOTE(review): the vectoriser is fitted on all of `train`, which includes the
# `xgtest` hold-out rows — confirm this is acceptable for the hold-out metrics.
tfidf1 = TfidfVectorizer().fit(train['word_representation'])
X_text = tfidf1.transform(xgtrain['word_representation']).toarray()

# Wrap the dense features in a DataFrame for the classifiers.
Xtrain1 = pd.DataFrame(data=X_text)

In [None]:
# One-vs-rest wrapper around an XGBoost classifier configured with 1000 estimators,
# a fixed seed and the GPU histogram tree method on device 0.
model1 = OneVsRestClassifier(XGBClassifier(n_estimators=1000, random_state=0, tree_method='gpu_hist', gpu_id=0))

In [None]:
# Fit model1 on the TF-IDF features and label matrix of the 90% split.
model1.fit(Xtrain1, Ytrain)

In [None]:
# Second one-vs-rest XGBoost model: same settings as model1 but with 500 estimators.
# This is the model later used for the test-set probabilities.
model2 = OneVsRestClassifier(XGBClassifier(n_estimators=500, random_state=0, tree_method='gpu_hist', gpu_id=0))
model2.fit(Xtrain1, Ytrain)

In [None]:
# Hold-out targets and TF-IDF features (transform only; the vectoriser is already fitted).
Ytest = xgtest.loc[:, ["outwear", "top", "trousers", "women dresses", "women skirts"]].to_numpy()
X_testtext = tfidf1.transform(xgtest['word_representation']).toarray()
Xtest1 = pd.DataFrame(data=X_testtext)

# model1 on the hold-out rows: hard labels and per-label probabilities.
y_pred_new = model1.predict(Xtest1)
y_pred_prob = model1.predict_proba(Xtest1)

In [None]:
# model2 on the same hold-out features: per-label probabilities and hard labels.
y_pred_prob2 = model2.predict_proba(Xtest1)
y_pred_new2 = model2.predict(Xtest1)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def get_metrics(y_test, y_predicted):
    """Score predicted labels against the true labels.

    Precision and recall are micro-averaged, i.e. computed from the true
    positives, false positives and false negatives pooled over all labels.

    Args:
        y_test: true labels, in any form accepted by the scikit-learn metrics.
        y_predicted: predicted labels, same shape as ``y_test``.

    Returns:
        Tuple ``(f1, precision, recall, accuracy)``. ``f1`` is 0.0 when both
        precision and recall are 0, instead of the NaN that 0/0 would give.
    """
    # true positives / (true positives + false positives)
    precision = precision_score(y_test, y_predicted, average='micro')
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, average='micro')
    # harmonic mean of precision and recall; guard the 0/0 case so callers that
    # rank by F1 never have to compare against NaN
    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
    # fraction of exactly matching samples: for multilabel indicator input,
    # accuracy_score is subset accuracy (every label of a row must match)
    accuracy = accuracy_score(y_test, y_predicted)
    return f1, precision, recall, accuracy

In [None]:
def calculate_thresh(Ytest, prob):
    """Find the probability cut-off that maximises F1 on a labelled set.

    Every threshold from 0.10 to 0.50 in steps of 0.01 is applied to ``prob``
    (a value strictly greater than the threshold becomes 1, otherwise 0), the
    resulting predictions are scored with ``get_metrics`` and the threshold
    with the highest F1 is printed and returned. Ties go to the lowest
    threshold. A NaN F1 is ranked below every real score, so it can never be
    selected over a valid one.

    Args:
        Ytest: true labels, as accepted by ``get_metrics``.
        prob: predicted probabilities, one row per sample (2-D array-like).

    Returns:
        The best threshold, rounded to two decimals.
    """
    prob = np.asarray(prob)
    best_thresh = None
    best_f1 = float('-inf')
    for thresh in np.arange(0.1, 0.501, 0.01):
        # np.arange accumulates float error; round so thresholds print as 0.10, 0.11, ...
        thresh = np.round(thresh, 2)
        y_pred_comb = (prob > thresh).astype(int)
        f1 = get_metrics(Ytest, y_pred_comb)[0]
        if np.isnan(f1):
            f1 = float('-inf')
        # Strict '>' keeps the first (lowest) threshold among equal scores.
        if best_thresh is None or f1 > best_f1:
            best_thresh, best_f1 = thresh, f1
    print("Best threshold: ", best_thresh)
    return best_thresh

In [None]:
# Tune the decision threshold for model1 on the hold-out rows.
thresh1 = calculate_thresh(Ytest, y_pred_prob)

In [None]:
# Tune the decision threshold for model2 on the same hold-out rows.
thresh2 = calculate_thresh(Ytest, y_pred_prob2)

In [None]:
# Binarise model1's hold-out probabilities at its tuned threshold (strictly greater
# than -> 1) and report (f1, precision, recall, accuracy).
y_pred1 = (np.asarray(y_pred_prob) > thresh1).astype(int).tolist()
results1t = get_metrics(Ytest, y_pred1)
print(results1t)

In [None]:
# Binarise model2's hold-out probabilities at its tuned threshold (strictly greater
# than -> 1) and report (f1, precision, recall, accuracy).
y_pred2 = (np.asarray(y_pred_prob2) > thresh2).astype(int).tolist()
results2t = get_metrics(Ytest, y_pred2)
print(results2t)

In [None]:
# Inspect the fastai classifier's test-set predictions.
print(labels_prob)

In [None]:
# TF-IDF features for the competition test set, scored with model2.
# The feature frame gets its own name so that the hold-out frame `Xtest1`
# (used for threshold tuning and hold-out metrics) is not overwritten; re-running
# the evaluation cells after this one would otherwise score the wrong rows.
X_testtext1 = tfidf1.transform(test['word_representation']).toarray()
X_submission1 = pd.DataFrame(X_testtext1)
test_predprob = model2.predict_proba(X_submission1)

In [None]:
# Inspect model2's test-set probabilities.
print(test_predprob)

In [None]:
# Weighted blend of the two models' test-set outputs: three parts fastai to one
# part XGBoost, divided by the total weight of four.
comb = (3 * labels_prob + test_predprob) / 4

In [None]:
# Inspect the blended probabilities.
print(comb)

In [None]:
# Final hard labels for the submission: 1 where the blended probability `comb`
# (fastai + XGBoost) is strictly greater than 0.45, else 0.
# NOTE: the cut-off is applied to the blend, not to the fastai-only `labels_prob`;
# thresholding `labels_prob` here would leave the blend computed above unused.
y_pred = (np.asarray(comb) > 0.45).astype(int).tolist()

In [None]:
# Start from the example submission file and overwrite its five label columns
# with the final predictions.
# NOTE(review): assumes the example file's rows are in the same order as `test` — confirm.
submission = pd.read_csv('../input/til2020/NLP_submission_example.csv')
submission[["outwear", "top", "trousers", "women dresses", "women skirts"]] = y_pred

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission19.csv', index=False)