In [40]:
import pandas as pd
import numpy as np
import torch
from warnings import filterwarnings
filterwarnings('ignore')
from tqdm import tqdm
import torch.nn as nn

In [5]:
df = pd.read_pickle('/kaggle/input/prepared-df-4-cl-cup/prp_df_4.pkl')

### Making validation dataset

In [6]:
df_validation = df[(df['text_id']>42994)] # making validation on the rest half of the dataset

In [7]:
df_validation.head(6)

Unnamed: 0,text_id,text,comment,prp_text,prp_com,score,text_words_qty,comment_words_qty,repeat_words,repeat_rate_words,link,quoted,wr_sum,wr_len,wr_rate,wr_rate_tot
214975,42995,Intel Acquires Basis Watch,Can someone explain to me Intel&#x27;s continu...,intel acquires basis watch,someone explain intel x27 continued fascinatio...,0,4,41,1,0.02439,0,0,215082,19,11320.105469,5245.902344
214976,42995,Intel Acquires Basis Watch,My husband has one and wrote some Nagios check...,intel acquires basis watch,husband one wrote nagios check download data w...,1,4,36,0,0.0,1,1,158339,16,9896.1875,4398.305664
214977,42995,Intel Acquires Basis Watch,I still haven&#x27;t forgiven them the Project...,intel acquires basis watch,still x27 forgiven project offset death http x...,3,4,19,0,0.0,1,0,58402,3,19467.333984,3073.789551
214978,42995,Intel Acquires Basis Watch,Intel - where startups go to die.,intel acquires basis watch,intel startup go die,4,4,4,1,0.25,0,0,29304,3,9768.0,7326.0
214979,42995,Intel Acquires Basis Watch,How does the Basis rank against the other smar...,intel acquires basis watch,basis rank smart watch see x27 priced higher 1...,2,4,29,2,0.068966,0,0,191709,17,11277.0,6610.655273
214980,42996,Intel Acquires Mashery,"Mashery has developers but not the enterprise,...",intel acquires mashery,mashery developer enterprise intel leverage re...,2,3,8,2,0.25,0,0,51420,7,7345.714355,6427.5


### Load ML models and make predictions

In [49]:
import pickle

with open('/kaggle/input/dl-ml-models/ML_model_best.pickle', 'rb') as pkl:
    ctbs = pickle.load(pkl)

with open('/kaggle/input/dl-ml-models/ML_model_worst.pickle', 'rb') as pkl:
    ctbs_w = pickle.load(pkl)   

In [50]:
cols = ['text_words_qty', 
        'comment_words_qty', 
        'repeat_words', 
        'repeat_rate_words', 
        'link', 'quoted', 
        'wr_sum', 
        'wr_len', 
        'wr_rate', 
        'wr_rate_tot'
        ]
cats = ['link', 'quoted']

In [51]:
def ML_predict(ctbs):

    temp = []

    for i in range(len(ctbs)):
        if i == 0:
            temp = ctbs[i].predict_proba(df_validation[cols])[:,1]
        else:
            temp = temp + ctbs[i].predict_proba(df_validation[cols])[:,1]

    return temp / len(ctbs)
    

In [52]:
df_validation['ML_best'] = ML_predict(ctbs)
df_validation['ML_worst'] = ML_predict(ctbs_w)

### Load DL models and make predictions

In [13]:
nn_best = torch.load('/kaggle/input/dl-ml-models/model_01')
nn_worst = torch.load('/kaggle/input/dl-ml-models/model_02')

In [14]:
from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    assert model_name in ['bert', 'roberta', 'distilbert']
    
    checkpoint_names = {
        'bert': 'bert-base-cased',  # https://huggingface.co/bert-base-cased
        'roberta': 'roberta-base',  # https://huggingface.co/roberta-base
        'distilbert': 'distilbert-base-cased'  # https://huggingface.co/distilbert-base-cased
    }
    
    model_classes = {
        'bert': BertModel,
        'roberta': RobertaModel,
        'distilbert': DistilBertModel
    }
    
    return AutoTokenizer.from_pretrained(checkpoint_names[model_name]), model_classes[model_name].from_pretrained(checkpoint_names[model_name])

In [15]:
tokenizer, model = get_model('bert')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)

cuda:0


In [17]:
import datasets
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenization(example):
    return tokenizer.batch_encode_plus(example['text'], add_special_tokens=True, return_token_type_ids=False, truncation=True)


validate_df = pd.concat([df_validation['text'] + ' ' + df_validation['comment'], df_validation['score']], axis=1)

validate_df.columns = ['text', 'label']

validate_dataset = datasets.Dataset.from_pandas(validate_df)

validate_dataset = validate_dataset.map(tokenization, batched=True)

validate_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

validate_loader = DataLoader(validate_dataset, batch_size=64, collate_fn=data_collator, pin_memory=True, shuffle=False)

  0%|          | 0/215 [00:00<?, ?ba/s]

In [31]:
@torch.inference_mode()
def get_embeddings_labels(model, loader):
    model.eval()
    
    total_embeddings = []
    labels = []
    
    for batch in tqdm(loader):
        labels.append(batch['labels'].unsqueeze(1))

        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)



# В качестве предсказаний нам нужно получить значения от 0 до 1. Используем сигмоиду.
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


@torch.inference_mode()
def predict(model, loader):
    i = 0
    model.eval()
    model = model.to(device)
    total_loss = 0
    total_accuracy = 0

    for x in tqdm(loader, desc='Evaluation'):
        x = x[0].to(device)

        output = model(x)
        
        if i == 0:
            result = sigmoid(output.to('cpu'))
        else:
            result = torch.cat((result, sigmoid(output.to('cpu'))), 0)
        i += 1
    return result


In [33]:
model = model.to(device)

validate_embeddings, _ = get_embeddings_labels(model, validate_loader)

  0%|          | 0/3359 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 3359/3359 [55:14<00:00,  1.01it/s]


In [34]:
import pickle

with open('validate_embeddings.pickle', 'wb') as pkl:
    pickle.dump(validate_embeddings, pkl) 

In [35]:
from torch.utils.data import TensorDataset

dataset = TensorDataset(validate_embeddings)

valid_loader = DataLoader(dataset, batch_size=128, pin_memory=True, shuffle=False)

In [45]:
@torch.inference_mode()
def predict(model, loader):
    i = 0
    model.eval()
    model = model.to(device)
    total_loss = 0
    total_accuracy = 0

    for x in tqdm(loader, desc='Evaluation'):
        x = x[0].to(device)

        output = model(x)
        
        if i == 0:
            result = sigmoid(output.to('cpu'))
        else:
            result = torch.cat((result, sigmoid(output.to('cpu'))), 0)
        i += 1
    return result

In [47]:
df_validation['DL_best'] = predict(nn_best, valid_loader).numpy()
df_validation['DL_worst'] = predict(nn_worst, valid_loader).numpy()

Evaluation: 100%|██████████| 1680/1680 [00:01<00:00, 1156.87it/s]
Evaluation: 100%|██████████| 1680/1680 [00:01<00:00, 1138.53it/s]


### Put scores according to predictions of models

In [62]:
df_validation['ML_best_score'] = df_validation.sort_values(['text_id', 'ML_best'], ascending=[True, False]).groupby('text_id').cumcount()
df_validation['ML_worst_score'] = df_validation.sort_values(['text_id', 'ML_worst'], ascending=[True, True]).groupby('text_id').cumcount()
df_validation['DL_best_score'] = df_validation.sort_values(['text_id', 'DL_best'], ascending=[True, False]).groupby('text_id').cumcount()
df_validation['DL_worst_score'] = df_validation.sort_values(['text_id', 'DL_worst'], ascending=[True, True]).groupby('text_id').cumcount()

### Calculation metrics

In [2]:
from sklearn.metrics import ndcg_score

def ndcg(df, col):
    """
    Custom ndcg metric
    """
    y_true_nd = np.zeros(shape=(5,6))
    y_true_nd[np.arange(5), df['score'].values] = 1
    
    y_pred_nd = np.zeros(shape=(5,6))
    y_pred_nd[np.arange(5), df[col].values] = 1
    
    return ndcg_score(y_true_nd, y_pred_nd)

In [4]:
df_validation.head(2)

Unnamed: 0,text_id,text,comment,prp_text,prp_com,score,text_words_qty,comment_words_qty,repeat_words,repeat_rate_words,...,wr_rate_tot,ML_best,ML_worst,DL_best,DL_worst,ML_best_score,ML_worst_score,DL_best_score,DL_worst_score,voted_score
214975,42995,Intel Acquires Basis Watch,Can someone explain to me Intel&#x27;s continu...,intel acquires basis watch,someone explain intel x27 continued fascinatio...,0,4,41,1,0.02439,...,5245.902344,0.203921,0.169322,0.170862,0.209813,0,0,1,1,0
214976,42995,Intel Acquires Basis Watch,My husband has one and wrote some Nagios check...,intel acquires basis watch,husband one wrote nagios check download data w...,1,4,36,0,0.0,...,4398.305664,0.166624,0.178971,0.222585,0.177023,1,1,0,0,1


In [5]:
ndcg_mean = df_validation.groupby('text_id').apply(lambda x: ndcg(x, 'ML_best_score')).mean()

print(f'NDCG of Best comment ML model : {ndcg_mean:0.4f}')

NDCG of Best comment ML model : 0.6353


In [6]:
ndcg_mean = df_validation.groupby('text_id').apply(lambda x: ndcg(x, 'ML_worst_score')).mean()

print(f'NDCG of Worst comment ML model : {ndcg_mean:0.4f}')

NDCG of Worst comment ML model : 0.6350


In [7]:
ndcg_mean = df_validation.groupby('text_id').apply(lambda x: ndcg(x, 'DL_best_score')).mean()

print(f'NDCG of Best comment DL model : {ndcg_mean:0.4f}')

NDCG of Best comment DL model : 0.6338


In [8]:
ndcg_mean = df_validation.groupby('text_id').apply(lambda x: ndcg(x, 'DL_worst_score')).mean()

print(f'NDCG of Worst comment DL model : {ndcg_mean:0.4f}')

NDCG of Worst comment DL model : 0.6326


In [24]:
df_validation[['text_id', 'ML_best_score', 'ML_worst_score', 'DL_best_score', 'DL_worst_score']].head(10)

Unnamed: 0,text_id,ML_best_score,ML_worst_score,DL_best_score,DL_worst_score
214975,42995,0,0,1,1
214976,42995,1,1,0,0
214977,42995,3,3,2,3
214978,42995,4,4,4,4
214979,42995,2,2,3,2
214980,42996,2,1,2,1
214981,42996,4,4,4,4
214982,42996,1,3,3,2
214983,42996,0,0,0,0
214984,42996,3,2,1,3


In [9]:
def voter(df):
    """Function takes results from 4 models and chooses mostly voted variants"""
    
    df = df[['ML_best_score', 'ML_worst_score', 'DL_best_score', 'DL_worst_score']].to_numpy()
    result = [6,6,6,6,6]
    seq = [0,1,2,3,4]
    temp = None
    i = 0
    for i in range(df.shape[0]):
        counts = dict(np.concatenate([np.unique(df[i], return_counts=True)[::-1]], axis=0).T) # Make dictionary {count: unique}

        skipper = 0

        if 4 in counts.keys():              # 4 voted the same
            result[i] = counts[4]
            seq[seq.index(result[i])] = 6

        if 3 in counts.keys():              # 3 voted the same
            result[i] = counts[3]
            seq[seq.index(result[i])] = 6      

        if (2 in counts.keys()) and (1 not in counts.keys()):               # 2 by 2 vote opposit. Use first model choice.
            result[i] = df[i][0]
            seq[seq.index(result[i])] = 6
            skipper = 1

        if (2 in counts.keys()) and (result[i] == 6) and (0 in np.where(df[i] == counts[2])[0]):                # if 2 vote. Use first model choice.
            result[i] = counts[2]
            seq[seq.index(result[i])] = 6
            skipper = 1

        if (2 in counts.keys()) and (skipper==0):                # if 2 vote. Use second model choice.
            if temp is None:
                temp = {counts[2]: i}
            elif (temp is not None) and ~(counts[2] in temp.keys()):
                temp[counts[2]] = i
            else:
                if np.where(df[i] == counts[2])[0][0] < np.where(df[temp[counts[2]]] == counts[2])[0][0]:
                    result[i] = counts[2]
                    seq[seq.index(result[i])] = 6
                    del temp[counts[2]]
                else:
                    result[temp[counts[2]]] = counts[2]
                    seq[seq.index(counts[2])] = 6
                    del temp[counts[2]]



    if temp is not None:
        for j in temp.keys():
            if j in seq:
                result[temp[j]] = j
                seq[seq.index(j)] = 6

    n = 0
    while result.count(6) > 1:
        for k in np.where(np.array(result)==6)[0]:
            if df[k][n] not in result:
                result[k] = df[k][n]
                seq[seq.index(result[k])] = 6
        n+=1
        
        
    if result.count(6) == 1:                # Fill the last value from result
        num = next(iter(set(seq)))
        seq[seq.index(num)] = 6
        result[result.index(6)] = num  

    return result

In [10]:
from itertools import chain

s = df_validation[['text_id', 'ML_best_score', 'ML_worst_score', 'DL_best_score', 'DL_worst_score']].groupby('text_id').apply(voter)

df_validation['voted_score'] = sum(list(chain(s.values)),[])

In [12]:
df_validation['voted_score'].value_counts()

0    42992
1    42992
3    42992
4    42992
2    42992
Name: voted_score, dtype: int64

In [11]:
ndcg_mean = df_validation.groupby('text_id').apply(lambda x: ndcg(x, 'voted_score')).mean()

print(f'NDCG of Combined result of 4 models : {ndcg_mean:0.4f}')

NDCG of Combined result of 4 models : 0.6355


### Check random metric

In [28]:
def random_ndcg(df):
    """
    Random ndcg metric
    """
    y_true_nd = np.zeros(shape=(5,6))
    y_true_nd[np.arange(5), df['score'].values] = 1
    
    y_pred_nd = np.zeros(shape=(5,6))
    y_pred_nd[np.arange(5), np.random.choice(range(5), 5, replace=False)] = 1
    
    return ndcg_score(y_true_nd, y_pred_nd)

In [46]:
ndcg_mean = df_validation.groupby('text_id').apply(random_ndcg).mean()

print(f'Random NDCG score : {ndcg_mean:0.4f}')

Random NDCG score : 0.5686


### Results
We got ensemble solution for ranking comments to any text.

The quality of NDCG metric (Normalized Discounted Cumulative Gain) is **0.6355**