<a href="https://colab.research.google.com/github/OlegBEZb/NLP_advanced_course/blob/master/HW1/NLP_advance_course_HW3_Distributive_semantic_Blending.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libs import

In [0]:
import os, csv, random, pickle
import numpy as np
import pandas as pd

from scipy.stats import ks_2samp, rankdata

from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score

import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

# Authorization on Google drive and configurings paths

In [0]:
WORKSPACE = 'COLAB' # or 'KAGGLE'

In [10]:
if WORKSPACE == 'COLAB':
    from google.colab import drive
    drive.mount('/content/drive')

    homework_folder = os.path.join('/content/drive/My Drive', 'Advanced NLP/Homework 1: Classical classification task like Kaggle Toxic or Quora')
    data_folder = os.path.join(homework_folder, 'Toxic data')
    output_folder = os.path.join(homework_folder, 'output')
elif WORKSPACE == 'KAGGLE':
    data_folder = '../input/jigsaw-toxic-comment-classification-challenge/'
else: # TODO: add computing on premise
    pass

print('data found:', os.listdir(data_folder))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
data found: ['test.csv', 'test_labels.csv', 'train.csv', 'sample_submission.csv', 'submission.csv', 'train_labels.csv.npy', 'train_labels.npy']


In [25]:
test_labels_filepath = os.path.join(data_folder,"test_labels.csv")

files_to_blend = [file for file in os.listdir(output_folder) if '.csv' in file and 'blending' not in file]
files_to_blend

['submission_emb_sklearn.csv',
 'submission_bidirectional_GRU.csv',
 'submission_fine_tuned_bert.csv',
 'submission_fine_tuned_bert2.csv']

# Loading frames

In [0]:
submission_dfs = [pd.read_csv(os.path.join(output_folder,filename)) for filename in files_to_blend]

In [28]:
submission_dfs[0].head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.966263,0.054183,0.831525,0.011145,0.604859,0.075418
1,0000247867823ef7,0.016774,0.008855,0.019123,0.002713,0.022219,0.006187
2,00013b17ad220c46,0.015831,0.001117,0.00704,0.000455,0.008738,0.002202
3,00017563c3f7919a,0.001216,0.000236,0.00133,0.000328,0.001082,0.000201
4,00017695ad8997eb,0.032473,0.000742,0.014291,0.000145,0.007183,0.000405


In [0]:
TARGET_COLS = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]

# Сorrelation

In [0]:
def corr(first_df, second_df):
    # assuming first column is `class_name_id`

    Pearson_values = []
    Kendall_values = []
    Spearman_values = []

    for class_name in TARGET_COLS:
        # all correlations
        print('\n Class: %s' % class_name)

        Pearson_value = first_df[class_name].corr(
                  second_df[class_name], method='pearson')
        Pearson_values.append(Pearson_value)
        print(' Pearson\'s correlation score: %0.6f' % Pearson_value)

        Kendall_value = first_df[class_name].corr(
                  second_df[class_name], method='kendall')
        Kendall_values.append(Kendall_value)
        print(' Kendall\'s correlation score: %0.6f' % Kendall_value)

        Spearman_value = first_df[class_name].corr(
                  second_df[class_name], method='spearman')
        Spearman_values.append(Spearman_value)
        print(' Spearman\'s correlation score: %0.6f' % Spearman_value)

        ks_stat, p_value = ks_2samp(first_df[class_name].values,
                                    second_df[class_name].values)
        print(' Kolmogorov-Smirnov test:    KS-stat = %.6f    p-value = %.3e\n'
              % (ks_stat, p_value))
        
    return Pearson_values, Kendall_values, Spearman_values

In [9]:
Pearson_values, Kendall_values, Spearman_values = corr(submission_dfs[0], submission_dfs[-1])


 Class: toxic
 Pearson's correlation score: 0.900002
 Kendall's correlation score: 0.837508
 Spearman's correlation score: 0.948873
 Kolmogorov-Smirnov test:    KS-stat = 0.234657    p-value = 0.000e+00


 Class: severe_toxic
 Pearson's correlation score: 0.771720
 Kendall's correlation score: 0.900599
 Spearman's correlation score: 0.967130
 Kolmogorov-Smirnov test:    KS-stat = 0.379123    p-value = 0.000e+00


 Class: obscene
 Pearson's correlation score: 0.868834
 Kendall's correlation score: 0.864926
 Spearman's correlation score: 0.954527
 Kolmogorov-Smirnov test:    KS-stat = 0.325886    p-value = 0.000e+00


 Class: threat
 Pearson's correlation score: 0.667546
 Kendall's correlation score: 0.816863
 Spearman's correlation score: 0.926124
 Kolmogorov-Smirnov test:    KS-stat = 0.388303    p-value = 0.000e+00


 Class: insult
 Pearson's correlation score: 0.854414
 Kendall's correlation score: 0.860028
 Spearman's correlation score: 0.954882
 Kolmogorov-Smirnov test:    KS-stat

In [0]:
np.mean(Pearson_values), np.mean(Kendall_values), np.mean(Spearman_values)

(0.6033529727896375, 0.5467539281622168, 0.7418259985054866)

TODO: add correlation matrix for each class of with averaged values

# Blending

In [0]:
submission = pd.DataFrame()
submission['id'] = submission_dfs[0]['id']

## Blending scheme 1

In [0]:
submission[TARGET_COLS] = 0.25*submission_dfs[0][TARGET_COLS]+\
                          0.25*submission_dfs[1][TARGET_COLS]+\
                          0.25*submission_dfs[2][TARGET_COLS]+\
                          0.25*submission_dfs[3][TARGET_COLS]

## Blending scheme 2
Each value is averaged between submissions

In [0]:
stacked_averages = (submission_dfs[0][TARGET_COLS].stack() + \
                    submission_dfs[1][TARGET_COLS].stack() + \
                    submission_dfs[2][TARGET_COLS].stack() + \
                    submission_dfs[2][TARGET_COLS].stack()
                   ) / 4

In [0]:
submission[TARGET_COLS] = stacked_averages.unstack()

## Blending scheme 3
Each value is an geometric mean between submissions

In [0]:
stacked_geo_means = pow((submission_dfs[0][TARGET_COLS].stack() * \
                    submission_dfs[1][TARGET_COLS].stack() * \
                    submission_dfs[2][TARGET_COLS].stack() * \
                    submission_dfs[2][TARGET_COLS].stack()
                   ), 1/4)

In [0]:
submission[TARGET_COLS] = stacked_geo_means.unstack()

## Blending scheme 4

In [0]:
# probable validation scores
weight_dict = {
    'submission_emb_sklearn.csv': 0.94733,
    'submission_bidirectional_GRU.csv': 0.9878,
    'submission_fine_tuned_bert.csv': 0.9839,
    'submission_fine_tuned_bert2.csv': 0.9820    
}

In [0]:
# subs: list of submission dataframes with two columns (id, value)
# weights: per submission weights; default is equal weighting 
def rank_average(subs, weights = None):
  if weights is None:
    weights = len(subs) * [1.0 / len(subs)]
  else:
    weights = weights / np.sum(weights)
  preds = subs[0].copy()
  preds.iloc[:,1] = np.zeros(len(subs[0]))
  for i, sub in enumerate(subs):
    preds.iloc[:,1] = np.add(preds.iloc[:,1], weights[i] * rankdata(sub.iloc[:,1]) / len(sub))
  return preds

In [0]:
submission = rank_average(submission_dfs, list(weight_dict.values()))

due the fact that validation results (as well as actual results) are lost, this approach got terrible results

# Cheaty evaluating on test labels

In [56]:
submission.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.968447,0.054183,0.831525,0.011145,0.604859,0.075418
1,0000247867823ef7,0.478894,0.008855,0.019123,0.002713,0.022219,0.006187
2,00013b17ad220c46,0.319275,0.001117,0.00704,0.000455,0.008738,0.002202
3,00017563c3f7919a,0.075061,0.000236,0.00133,0.000328,0.001082,0.000201
4,00017695ad8997eb,0.452272,0.000742,0.014291,0.000145,0.007183,0.000405
5,0001ea8717f6de06,0.216728,0.000488,0.00261,0.000808,0.002954,0.000952
6,00024115d4cbde0f,0.437397,0.002586,0.01283,0.001297,0.013505,0.003067
7,000247e83dcc1211,0.842561,0.018339,0.382074,0.014608,0.274177,0.016819
8,00025358d4737918,0.696941,0.002882,0.017591,0.002374,0.020235,0.002853
9,00026d1092fe71cc,0.293411,0.002668,0.017098,0.001654,0.022337,0.00264


In [0]:
# labels for the test data; value of -1 indicates it was not used for scoring; (Note: file added after competition close!)
test_labels_df = pd.read_csv(test_labels_filepath)

In [58]:
test_labels_df = test_labels_df[(test_labels_df["toxic"] != -1) &
                                (test_labels_df["severe_toxic"] != -1) &
                                (test_labels_df["obscene"] != -1) &
                                (test_labels_df["threat"] != -1) &
                                (test_labels_df["insult"] != -1) &
                                (test_labels_df["identity_hate"] != -1)]
test_labels_df.shape                               

(63978, 7)

In [59]:
submission_to_evaluate = submission[submission['id'].isin(test_labels_df['id'].values)]
submission_to_evaluate.shape

(63978, 7)

In [60]:
scores = []
for class_name in TARGET_COLS:
    train_target = test_labels_df[class_name]
    train_predicted = submission_to_evaluate[class_name]

    cv_score = np.mean(roc_auc_score(train_target.values, train_predicted.values))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

print('Total CV score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.969695728051042
CV score for class severe_toxic is 0.9498586799525744
CV score for class obscene is 0.9426970760475595
CV score for class threat is 0.9719721985483732
CV score for class insult is 0.9370787074676544
CV score for class identity_hate is 0.9377735462930369
Total CV score is 0.9515126560600401


In [0]:
submission.to_csv(os.path.join(output_folder,"blending_3.csv"), index = False)

Жирный шрифт означает изменения в процессе валидации и оценивания и при изменении действует начиная со строчки указания и ниже

* glove twitter 200:  0.98052

**С поднятием колва epochs и global_epochs + RocAucEarlyStopping** скор упал с 0.98052 до 0.97573

* GLOBAL_EPOCHS = 2, EPOCHS = 5, glove.840B.300d.txt   0.97994  
* BPEmb(lang="en", dim=25, vs=20000)   0.97375  
* BPEmb(lang="en", dim=300, vs=20000) without preprocessing   0.97699  
* BPEmb(lang="en", dim=300, vs=20000)   0.97865   
* fasttext crawl-300d-2M-subword 0.95732  
* BPEmb(lang="en", dim=300, vs=20000) **Patience 2->1 SEEDS 1->1**  0.98041   
* flair ELMoEmbeddings('small') embedding len 768  0.98005  
* flair ELMoEmbeddings('medium') embedding len 1536  0.98038  
* flair RoBERTaEmbeddings('roberta-base') embedding len 768  0.97413  
* flair stack of ELMoEmbeddings('small') + RoBERTaEmbeddings('roberta-base') embedding len 1536  0.98069      
* 3 tuned layers in BERT modeldense = tf.keras.layers.Dense(256, activation='relu')(bert_output)pred = tf.keras.layers.Dense(len(TARGET_COLS), activation='sigmoid')(dense)model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])epochs=1,batch_size=32   0.97658
* 10 tuned layers in BERT modeldense = tf.keras.layers.Dense(256, activation='relu')(bert_output)pred = tf.keras.layers.Dense(len(TARGET_COLS), activation='sigmoid')(dense)model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])epochs=1,batch_size=128, 5 epochs+ES(patience=1)  0.97377
* 5 tuned layers in BERT modeldense = tf.keras.layers.Dense(256, activation='relu')(bert_output)pred = tf.keras.layers.Dense(len(TARGET_COLS), activation='sigmoid')(dense)model.compile(loss='binary_crossentropy', optimizer='adam')epochs=10,batch_size=256, ES(patience=1) +save best  0.97346  
* 0.25 of each 'submission_emb_sklearn.csv', 'submission_bidirectional_GRU.csv','submission_fine_tuned_bert.csv', 'submission_fine_tuned_bert2.csv'  0.98308
* Each value is averaged between 'submission_emb_sklearn.csv', 'submission_bidirectional_GRU.csv','submission_fine_tuned_bert.csv', 'submission_fine_tuned_bert2.csv'  0.98261
* Each value is a geometric mean between 'submission_emb_sklearn.csv', 'submission_bidirectional_GRU.csv','submission_fine_tuned_bert.csv', 'submission_fine_tuned_bert2.csv'  0.98214
