<a href="https://colab.research.google.com/github/OlegBEZb/NLP_advanced_course/blob/master/HW1/NLP_advance_course_HW3_Distributive_semantic_Blending.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libs import

In [0]:
import os, csv, random, pickle, re, sys
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt

from scipy.stats import ks_2samp

from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score

import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

# Authorization on Google drive and configurings paths

In [0]:
WORKSPACE = 'COLAB' # or 'KAGGLE'

In [11]:
if WORKSPACE == 'COLAB':
    from google.colab import drive
    drive.mount('/content/drive')

    homework_folder = os.path.join('/content/drive/My Drive', 'Advanced NLP/Homework 1: Classical classification task like Kaggle Toxic or Quora')
    data_folder = os.path.join(homework_folder, 'Toxic data')
    embeddings_folder = os.path.join(homework_folder, 'embeddings')
    output_folder = os.path.join(homework_folder, 'output')
elif WORKSPACE == 'KAGGLE':
    data_folder = '../input/jigsaw-toxic-comment-classification-challenge/'
    embeddings_folder = '../input/glove-global-vectors-for-word-representation'
else: # TODO: add computing on premise
    pass

print('data found:', os.listdir(data_folder))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
data found: ['test.csv', 'test_labels.csv', 'train.csv', 'sample_submission.csv', 'submission.csv', 'train_labels.csv.npy', 'train_labels.npy']


In [12]:
test_labels_filepath = os.path.join(data_folder,"test_labels.csv")

files_to_blend = [file for file in os.listdir(output_folder) if '.csv' in file]
files_to_blend

['submission_emb_sklearn.csv',
 'submission_bidirectional_GRU.csv',
 'submission_fine_tuned_bert.csv',
 'submission_fine_tuned_bert2.csv']

# Loading frames

In [0]:
submission_dfs = [pd.read_csv(os.path.join(output_folder,filename)) for filename in files_to_blend]

In [14]:
submission_dfs[0].head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.966263,0.054183,0.831525,0.011145,0.604859,0.075418
1,0000247867823ef7,0.016774,0.008855,0.019123,0.002713,0.022219,0.006187
2,00013b17ad220c46,0.015831,0.001117,0.00704,0.000455,0.008738,0.002202
3,00017563c3f7919a,0.001216,0.000236,0.00133,0.000328,0.001082,0.000201
4,00017695ad8997eb,0.032473,0.000742,0.014291,0.000145,0.007183,0.000405


In [0]:
TARGET_COLS = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]

# Сorrelation

In [0]:
def corr(first_df, second_df):
    # assuming first column is `class_name_id`

    Pearson_values = []
    Kendall_values = []
    Spearman_values = []

    for class_name in TARGET_COLS:
        # all correlations
        print('\n Class: %s' % class_name)

        Pearson_value = first_df[class_name].corr(
                  second_df[class_name], method='pearson')
        Pearson_values.append(Pearson_value)
        print(' Pearson\'s correlation score: %0.6f' % Pearson_value)

        Kendall_value = first_df[class_name].corr(
                  second_df[class_name], method='kendall')
        Kendall_values.append(Kendall_value)
        print(' Kendall\'s correlation score: %0.6f' % Kendall_value)

        Spearman_value = first_df[class_name].corr(
                  second_df[class_name], method='spearman')
        Spearman_values.append(Spearman_value)
        print(' Spearman\'s correlation score: %0.6f' % Spearman_value)

        ks_stat, p_value = ks_2samp(first_df[class_name].values,
                                    second_df[class_name].values)
        print(' Kolmogorov-Smirnov test:    KS-stat = %.6f    p-value = %.3e\n'
              % (ks_stat, p_value))
        
    return Pearson_values, Kendall_values, Spearman_values

In [17]:
Pearson_values, Kendall_values, Spearman_values = corr(submission_dfs[0], submission_dfs[-1])


 Class: toxic
 Pearson's correlation score: 0.789643
 Kendall's correlation score: 0.609889
 Spearman's correlation score: 0.805374
 Kolmogorov-Smirnov test:    KS-stat = 0.495710    p-value = 0.000e+00


 Class: severe_toxic
 Pearson's correlation score: 0.574009
 Kendall's correlation score: 0.524903
 Spearman's correlation score: 0.718356
 Kolmogorov-Smirnov test:    KS-stat = 0.717244    p-value = 0.000e+00


 Class: obscene
 Pearson's correlation score: 0.741392
 Kendall's correlation score: 0.518156
 Spearman's correlation score: 0.710806
 Kolmogorov-Smirnov test:    KS-stat = 0.617038    p-value = 0.000e+00


 Class: threat
 Pearson's correlation score: 0.367464
 Kendall's correlation score: 0.504348
 Spearman's correlation score: 0.695485
 Kolmogorov-Smirnov test:    KS-stat = 0.735107    p-value = 0.000e+00


 Class: insult
 Pearson's correlation score: 0.703885
 Kendall's correlation score: 0.580967
 Spearman's correlation score: 0.778970
 Kolmogorov-Smirnov test:    KS-stat

In [18]:
np.mean(Pearson_values), np.mean(Kendall_values), np.mean(Spearman_values)

(0.6033529727896375, 0.5467539281622168, 0.7418259985054866)

# Blending

## Blending scheme 1

In [0]:
# create blend of submissions
submission = pd.DataFrame()
submission['id'] = submission_dfs[0]['id']

submission[TARGET_COLS] = 0.25*submission_dfs[0][TARGET_COLS]+\
                          0.25*submission_dfs[1][TARGET_COLS]+\
                          0.25*submission_dfs[2][TARGET_COLS]+\
                          0.25*submission_dfs[3][TARGET_COLS]

# Cheaty evaluating on test labels

In [20]:
submission.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.990178,0.323069,0.931331,0.074425,0.857777,0.188664
1,0000247867823ef7,0.008236,0.00223,0.005473,0.000719,0.006186,0.001629
2,00013b17ad220c46,0.006026,0.00031,0.002475,0.000192,0.002874,0.000708
3,00017563c3f7919a,0.000424,5.9e-05,0.000385,8.4e-05,0.000287,5.3e-05
4,00017695ad8997eb,0.010826,0.000196,0.004125,9.5e-05,0.002104,0.000141
5,0001ea8717f6de06,0.000852,0.000122,0.000713,0.000204,0.000769,0.000242
6,00024115d4cbde0f,0.009072,0.000648,0.003344,0.000331,0.003471,0.000776
7,000247e83dcc1211,0.728969,0.00768,0.182837,0.006816,0.172859,0.006552
8,00025358d4737918,0.185688,0.000834,0.012043,0.001015,0.051196,0.001361
9,00026d1092fe71cc,0.008467,0.000669,0.004405,0.000422,0.005647,0.000671


In [0]:
# labels for the test data; value of -1 indicates it was not used for scoring; (Note: file added after competition close!)
test_labels_df = pd.read_csv(test_labels_filepath)

In [22]:
test_labels_df = test_labels_df[(test_labels_df["toxic"] != -1) &
                                (test_labels_df["severe_toxic"] != -1) &
                                (test_labels_df["obscene"] != -1) &
                                (test_labels_df["threat"] != -1) &
                                (test_labels_df["insult"] != -1) &
                                (test_labels_df["identity_hate"] != -1)]
test_labels_df.shape                               

(63978, 7)

In [23]:
submission_to_evaluate = submission[submission['id'].isin(test_labels_df['id'].values)]
submission_to_evaluate.shape

(63978, 7)

In [24]:
scores = []
for class_name in TARGET_COLS:
    train_target = test_labels_df[class_name]
    train_predicted = submission_to_evaluate[class_name]

    cv_score = np.mean(roc_auc_score(train_target.values, train_predicted.values))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

print('Total CV score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.9705846366824878
CV score for class severe_toxic is 0.9883086858360016
CV score for class obscene is 0.9801631424205746
CV score for class threat is 0.9932907771383629
CV score for class insult is 0.977960300831759
CV score for class identity_hate is 0.9879327945464434
Total CV score is 0.9830400562426048


In [0]:
submission.to_csv(os.path.join(output_folder,"blending_1.csv"), index = False)

Жирный шрифт означает изменения в процессе валидации и оценивания и при изменении действует начиная со строчки указания и ниже

* glove twitter 200:  0.98052

**С поднятием колва epochs и global_epochs + RocAucEarlyStopping** скор упал с 0.98052 до 0.97573

* GLOBAL_EPOCHS = 2, EPOCHS = 5, glove.840B.300d.txt   0.97994  
* BPEmb(lang="en", dim=25, vs=20000)   0.97375  
* BPEmb(lang="en", dim=300, vs=20000) without preprocessing   0.97699  
* BPEmb(lang="en", dim=300, vs=20000)   0.97865   
* fasttext crawl-300d-2M-subword 0.95732  
* BPEmb(lang="en", dim=300, vs=20000) **Patience 2->1 SEEDS 1->1**  0.98041   
* flair ELMoEmbeddings('small') embedding len 768  0.98005  
* flair ELMoEmbeddings('medium') embedding len 1536  0.98038  
* flair RoBERTaEmbeddings('roberta-base') embedding len 768  0.97413  
* flair stack of ELMoEmbeddings('small') + RoBERTaEmbeddings('roberta-base') embedding len 1536  0.98069      
* 3 tuned layers in BERT modeldense = tf.keras.layers.Dense(256, activation='relu')(bert_output)pred = tf.keras.layers.Dense(len(TARGET_COLS), activation='sigmoid')(dense)model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])epochs=1,batch_size=32   0.97658
* 10 tuned layers in BERT modeldense = tf.keras.layers.Dense(256, activation='relu')(bert_output)pred = tf.keras.layers.Dense(len(TARGET_COLS), activation='sigmoid')(dense)model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])epochs=1,batch_size=128, 5 epochs+ES(patience=1)  0.97377
* 5 tuned layers in BERT modeldense = tf.keras.layers.Dense(256, activation='relu')(bert_output)pred = tf.keras.layers.Dense(len(TARGET_COLS), activation='sigmoid')(dense)model.compile(loss='binary_crossentropy', optimizer='adam')epochs=10,batch_size=256, ES(patience=1) +save best  0.97346  
* 0.25 of each 'submission_emb_sklearn.csv', 'submission_bidirectional_GRU.csv','submission_fine_tuned_bert.csv', 'submission_fine_tuned_bert2.csv'  0.98308