In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, file_name) for file_name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
from tqdm import tqdm
import transformers
%matplotlib inline

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
# Russian stop-word list taken from NLTK's `stopwords` corpus.
# NOTE(review): presumably needs the corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for the target environment.
words = stopwords.words("russian")
# WordNet-based lemmatizer instance.
# NOTE(review): neither `words` nor `lemma` is referenced by any later cell visible here.
lemma = nltk.stem.WordNetLemmatizer()


import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel
from transformers import AdamW, get_linear_schedule_with_warmup

import random
import time

In [None]:
# Rating columns that are passed to the model next to the two review texts.
# The spellings (including 'managment_rating') are used verbatim as DataFrame column keys.
features = [
    'salary_rating',
    'team_rating',
    'managment_rating',
    'career_rating',
    'workplace_rating',
    'rest_recovery_rating',
]

In [None]:
# Load the competition's training set from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/headhunter-employer-review-competition/HeadHunter_train.csv')

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Turn each `target` string into a 9-element multi-hot list: position k is 1 when
# the digit k occurs anywhere in the string, otherwise 0.
train_df['Target_vector'] = train_df.target.apply(
    lambda target: [
        int(label in {int(ch) for ch in target if ch.isdigit()})
        for label in range(9)
    ]
)

In [None]:
# Keep only the reviews in which both free-text fields are present.
train_df = train_df[train_df.positive.notna() & train_df.negative.notna()]

In [None]:
# Report the mean and the maximum character length of each review text column.
for text_column in ('positive', 'negative'):
    char_counts = train_df[text_column].apply(len)
    print(char_counts.mean(), char_counts.max())

In [None]:
# Keep an independent copy of the cleaned training frame; `train_df` is rebound
# to the training split in the next cell.
train_full = train_df.copy()

In [None]:
# Hold out 10% of the rows as a validation set; the fixed random_state keeps the split repeatable.
train_df, val = train_test_split(train_full, test_size=0.1, random_state=42)

## BERT

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [None]:
# 'DeepPavlov/rubert-base-cased' is a *cased* checkpoint, so the text must not be
# lower-cased. With do_lower_case=True, BertTokenizer additionally strips accents
# by default, which folds the Cyrillic letters "й" -> "и" and "ё" -> "е" and makes
# the inputs diverge from what the model was pre-trained on.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased', do_lower_case=False)
# Create a function to tokenize a set of texts

def tokenize_text(data, max_length=512):
    """Encode raw texts into fixed-length BERT inputs.

    Every text is tokenized with the module-level `tokenizer`, wrapped in the
    `[CLS]` / `[SEP]` special tokens, truncated to `max_length` tokens and
    padded up to exactly `max_length`.

    @param    data (iterable of str): Texts to be processed.
    @param    max_length (int): Length every sequence is truncated/padded to.
                  Defaults to 512, the value that used to be hard-coded.
    @return   input_ids (torch.Tensor): Token ids, shape (len(data), max_length).
    @return   attention_masks (torch.Tensor): 1 for real tokens and 0 for
                  padding, same shape as `input_ids`.
    """
    input_ids = []
    attention_masks = []

    for sent in tqdm(data):
        encoded_sent = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,       # add `[CLS]` and `[SEP]`
            max_length=max_length,         # length to truncate/pad to
            truncation=True,
            padding='max_length',          # replaces the deprecated pad_to_max_length=True
            return_attention_mask=True,
        )
        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    # Explicit dtype and shape keep the result a 2-D integer tensor even when
    # `data` is empty (torch.tensor([]) alone would be a 1-D float tensor).
    n_rows = len(input_ids)
    input_ids = torch.tensor(input_ids, dtype=torch.long).reshape(n_rows, max_length)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long).reshape(n_rows, max_length)

    return input_ids, attention_masks

In [None]:
# Training split: tokenize the positive and the negative review text separately,
# convert the rating columns to a tensor and collect the multi-hot label lists.
input_ids_pos, attention_masks_pos = tokenize_text(train_df.positive.values)
input_ids_neg, attention_masks_neg = tokenize_text(train_df.negative.values)
train_features = torch.Tensor(train_df[features].values)
train_labels = train_df.Target_vector.tolist()

# Validation split: same preparation as above.
val_input_ids_pos, val_attention_masks_pos = tokenize_text(val.positive.values)
val_input_ids_neg, val_attention_masks_neg = tokenize_text(val.negative.values)
val_features = torch.Tensor(val[features].values)
val_labels = val.Target_vector.tolist()

In [None]:
# Convert the lists of multi-hot label vectors into tensors (one row per review).
train_labels = torch.Tensor(train_labels)
val_labels = torch.Tensor(val_labels)

In [None]:
# Number of reviews per batch, shared by every DataLoader in this notebook.
batch_size = 8

# Training batches are drawn in random order. Each batch holds six tensors:
# positive ids/mask, negative ids/mask, rating features and labels.
train_data = TensorDataset(input_ids_pos, attention_masks_pos, input_ids_neg, attention_masks_neg, train_features, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the original row order and have the same six-tensor layout.
val_data = TensorDataset(val_input_ids_pos, val_attention_masks_pos, val_input_ids_neg, val_attention_masks_neg, val_features, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:

class BertClassifier(nn.Module):
    """Multi-label classifier built from one shared ruBERT encoder and a small
    feed-forward branch for the numeric rating features.

    The positive and the negative review text are encoded by the same BERT
    instance; their `[CLS]` vectors are concatenated with the transformed
    rating features and passed to a two-layer classification head.
    """

    def __init__(self, freeze_bert=False):
        """
        @param   freeze_bert (bool): When True the BERT weights are excluded
                     from gradient updates; leave False to fine-tune them.
        """
        super().__init__()

        # Encoder width, classifier hidden width, number of output labels.
        bert_dim, head_hidden, n_labels = 768, 30, 9
        # Number of rating features and hidden width of the feature branch.
        n_ratings, ratings_hidden = 6, 30

        self.bert = BertModel.from_pretrained('DeepPavlov/rubert-base-cased')

        # Head input = two [CLS] vectors + the feature-branch output (2 * ratings_hidden).
        self.classifier = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(2 * bert_dim + 2 * ratings_hidden, head_hidden),
            nn.ReLU(),
            nn.Linear(head_hidden, n_labels),
        )

        self.logreg = nn.Sequential(
            nn.Linear(n_ratings, ratings_hidden),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(ratings_hidden, 2 * ratings_hidden),
        )

        # Registered as a sub-module but not applied in forward().
        self.sigmoid = nn.Sigmoid()

        if freeze_bert:
            self.bert.requires_grad_(False)

    def forward(self, input_ids_pos, attention_mask_pos, input_ids_neg, attention_mask_neg, logreg_features):
        """
        Compute raw (pre-sigmoid) logits for a batch.

        @param    input_ids_pos / input_ids_neg (torch.Tensor): token ids of the
                      positive / negative review text, shape (batch_size, max_length)
        @param    attention_mask_pos / attention_mask_neg (torch.Tensor): matching
                      attention masks, shape (batch_size, max_length)
        @param    logreg_features (torch.Tensor): rating features, shape
                      (batch_size, 6)
        @return   logits (torch.Tensor): shape (batch_size, num_labels)
        """
        encoded_pos = self.bert(input_ids=input_ids_pos, attention_mask=attention_mask_pos)
        encoded_neg = self.bert(input_ids=input_ids_neg, attention_mask=attention_mask_neg)
        ratings_repr = self.logreg(logreg_features)

        # Element 0 of the BERT output is the last hidden state; position 0 is `[CLS]`.
        cls_pos = encoded_pos[0][:, 0, :]
        cls_neg = encoded_neg[0][:, 0, :]

        combined = torch.cat((cls_pos, cls_neg, ratings_repr), dim=1)
        return self.classifier(combined)

In [None]:
def initialize_model(epochs=4):
    """Build the classifier together with its optimizer and LR scheduler.

    The model is moved to the module-level `device`. The scheduler decays the
    learning rate linearly (no warm-up) over
    `len(train_dataloader) * epochs` steps, where `train_dataloader` is the
    module-level training loader.

    @param    epochs (int): Number of epochs the schedule is sized for.
    @return   (model, optimizer, scheduler)
    """
    model = BertClassifier(freeze_bert=False)
    model.to(device)

    # lr and eps are set explicitly to 5e-5 and 1e-8.
    optim = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

    n_training_steps = epochs * len(train_dataloader)
    lr_schedule = get_linear_schedule_with_warmup(
        optim,
        num_warmup_steps=0,
        num_training_steps=n_training_steps,
    )
    return model, optim, lr_schedule

In [None]:
# Binary cross-entropy computed directly on raw logits (the sigmoid is part of
# the loss); shared by train() and evaluate().
loss_fn = nn.BCEWithLogitsLoss()

def set_seed(seed_value=42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices)
    with the same value so that runs are repeatable.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.

    Relies on the module-level `optimizer`, `scheduler`, `loss_fn` and `device`
    objects, and on `evaluate()` when `evaluation` is enabled.

    @param    model (nn.Module): model called as
                  model(ids_pos, mask_pos, ids_neg, mask_neg, features).
    @param    train_dataloader: yields six-tensor batches
                  (ids_pos, mask_pos, ids_neg, mask_neg, features, labels).
    @param    val_dataloader: loader with the same batch layout; required when
                  `evaluation` is true.
    @param    epochs (int): number of passes over `train_dataloader`.
    @param    evaluation (bool): evaluate on `val_dataloader` after each epoch.
    @raises   ValueError: if `evaluation` is requested without a `val_dataloader`.
    """
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()

        # total_loss spans the whole epoch; batch_loss/batch_counts are reset
        # after every progress line.
        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids_pos, b_attn_mask_pos, b_input_ids_neg, b_attn_mask_neg, b_features, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            logits = model(b_input_ids_pos, b_attn_mask_pos, b_input_ids_neg, b_attn_mask_neg, b_features)

            loss = loss_fn(logits, b_labels.float())
            batch_loss += loss.item()
            total_loss += loss.item()

            # Back-propagate. Without this call no gradients exist, so the
            # clipping and optimizer step below would leave the weights untouched.
            loss.backward()

            # Clip the gradient norm to 1.0 to guard against exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Progress line every 1000 steps and on the last step of the epoch
            if (step % 1000 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """Measure loss and thresholded accuracy on the validation set.

    Puts the model in eval mode, runs every batch without gradient tracking
    and returns the mean of the per-batch losses and of the per-batch
    accuracies. Uses the module-level `device`, `loss_fn` and `accuracy_thresh`.

    @return   (val_loss, val_accuracy)
    """
    model.eval()

    batch_losses, batch_accuracies = [], []
    for batch in val_dataloader:
        ids_pos, mask_pos, ids_neg, mask_neg, ratings, labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(ids_pos, mask_pos, ids_neg, mask_neg, ratings)

        batch_losses.append(loss_fn(logits, labels.float()).item())
        # Both tensors are viewed as (rows, 9) before the element-wise comparison.
        batch_accuracies.append(accuracy_thresh(logits.view(-1, 9), labels.view(-1, 9)))

    return np.mean(batch_losses), np.mean(batch_accuracies)

def accuracy_thresh(y_pred, y_true, thresh:float=0.5, sigmoid:bool=True):
    """Element-wise accuracy for same-sized prediction and target tensors.

    `y_pred` is optionally passed through a sigmoid, binarized with
    `score > thresh`, and compared with `y_true`; the fraction of matching
    elements is returned as a Python float.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    matches = (scores > thresh) == y_true.byte()
    return matches.float().mean().item()

In [None]:
# Seed every RNG *before* the model is built, so that the randomly initialised
# classifier layers (and not only the batch shuffling) are reproducible.
set_seed(42)
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
# NOTE(review): the scheduler is sized for 2 epochs while only 1 is trained, so
# the learning rate decays to roughly half of its start value instead of to
# zero — confirm this is intended.
train(bert_classifier, train_dataloader, val_dataloader, epochs=1, evaluation=True)

In [None]:
# Load the competition's test set from the Kaggle input directory.
test = pd.read_csv('/kaggle/input/headhunter-employer-review-competition/HeadHunter_test.csv')

In [None]:
# Missing-value marker used by the replacement in the next cell. Taking it from
# NumPy instead of from the first missing `positive` entry avoids an IndexError
# when that column has no missing values.
nan = np.nan

In [None]:
# Replace missing review texts with empty strings so they can be tokenized.
# Only the two text columns are touched: a frame-wide replacement would also
# turn missing values in numeric columns into '' and break their dtype.
test = test.fillna({'positive': '', 'negative': ''})

In [None]:
# Test set: tokenize both review texts and convert the rating columns to a tensor.
input_ids_pos_test, attention_masks_pos_test = tokenize_text(test.positive.values)
input_ids_neg_test, attention_masks_neg_test = tokenize_text(test.negative.values)
test_features = torch.Tensor(test[features].values)

In [None]:
# Test batches keep the original row order and hold five tensors (there are no labels).
test_data = TensorDataset(input_ids_pos_test, attention_masks_pos_test, input_ids_neg_test, attention_masks_neg_test, test_features)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Bundle the model object itself, its weights and the optimizer state.
# NOTE(review): storing the model object (not just the state_dict) pickles a
# reference to the BertClassifier class, so loading presumably requires that
# class to be defined/importable under the same name — confirm before reuse.
checkpoint = {'model': bert_classifier,
          'state_dict': bert_classifier.state_dict(),
          'optimizer' : optimizer.state_dict()}

# Written to the current working directory.
torch.save(checkpoint, 'checkpoint_v4_2.pth')

In [None]:
def load_checkpoint(filepath):
    """Restore a model from a checkpoint file for inference.

    The checkpoint is expected to be a dict with a 'model' entry (the model
    object) and a 'state_dict' entry (its weights). Tensors are mapped to the
    CPU, all parameters are frozen and the model is returned in eval mode.

    NOTE(security): torch.load unpickles the file, which can execute arbitrary
    code — only load checkpoints from a trusted source.
    """
    saved = torch.load(filepath, map_location=torch.device('cpu'))

    restored = saved['model']
    restored.load_state_dict(saved['state_dict'])
    # Inference only: no parameter should track gradients.
    restored.requires_grad_(False)
    restored.eval()
    return restored

# model = load_checkpoint('/kaggle/input/bertclassifier/checkpoint.pth')

In [None]:
def bert_predict(model, test_dataloader):
    """Run the model over a dataloader and return per-label probabilities.

    Only the first five tensors of each batch (ids_pos, mask_pos, ids_neg,
    mask_neg, features) are used, so the function accepts both label-free
    test batches and six-tensor train/validation batches that carry labels
    as a trailing element.

    @param    model (nn.Module): model called as
                  model(ids_pos, mask_pos, ids_neg, mask_neg, features).
    @param    test_dataloader: iterable of batches (sequences of tensors).
    @return   probs (np.ndarray): sigmoid of the logits, one row per sample.
    """
    model.eval()

    # Send the inputs to wherever the model lives; fall back to the module-level
    # `device` for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    all_logits = []

    for step, batch in enumerate(test_dataloader):
        print(step, end='\r')
        # Ignore anything after the five model inputs (e.g. a label tensor).
        model_inputs = tuple(t.to(run_device) for t in batch[:5])

        with torch.no_grad():
            logits = model(*model_inputs)
        all_logits.append(logits)

    all_logits = torch.cat(all_logits, dim=0)
    probs = all_logits.sigmoid().cpu().numpy()

    return probs

In [None]:

# Per-label probabilities for every test review.
test_probs = bert_predict(bert_classifier, test_dataloader)

In [None]:
# Per-label probabilities for the validation reviews, used for the threshold search.
# Note that `val_dataloader` batches carry the label tensor as a sixth element.
val_probs = bert_predict(bert_classifier, val_dataloader)

In [None]:
def logits_to_targets(df, probs ,thresh):
    """Attach thresholded multi-label predictions to `df` (modified in place).

    Adds or overwrites three columns:
      * 'Logits'      - the probability row of each sample, as a list
      * 'Pred_vector' - 0/1 numpy array, 1 where the probability is > `thresh`
      * 'Predict'     - comma-separated indices of the predicted labels; when no
                        probability exceeds `thresh`, the single most probable
                        label is used so that no prediction is empty

    @param    df (pd.DataFrame): frame with one row per row of `probs`.
    @param    probs (array-like): probabilities, shape (len(df), num_labels).
    @param    thresh (float): decision threshold.
    @return   df, the same object that was passed in.
    @raises   ValueError: if `probs` and `df` have different numbers of rows.
    """
    probs_list = [list(x) for x in probs]
    if len(probs_list) != len(df):
        raise ValueError(
            f"probs has {len(probs_list)} rows but df has {len(df)} rows"
        )

    # Item assignment is required here: `df.Logits = ...` would only set a plain
    # attribute on the frame and never create the column.
    df['Logits'] = pd.Series(probs_list, index=df.index, dtype=object)
    df['Pred_vector'] = df['Logits'].apply(
        lambda x: np.array([1 if el > thresh else 0 for el in x])
    )
    df['Predict'] = df['Pred_vector'].apply(
        lambda x: ','.join(str(el) for el in x.nonzero()[0])
    )
    # Fall back to the arg-max label for rows without any label above the threshold.
    df['Predict'] = [
        predicted if predicted != '' else str(int(np.argmax(scores)))
        for predicted, scores in zip(df['Predict'], df['Logits'])
    ]
    return df

In [None]:
# Fix the label set to the nine class ids '0'..'8' instead of learning it from
# the raw target strings: fitting on strings iterates their characters, which
# turns the separator into a class and drops any label absent from `val`.
binar = MultiLabelBinarizer(classes=[str(label) for label in range(9)])
binar = binar.fit(val.target)

In [None]:
# Sweep 51 thresholds in [0, 1] and record the sample-averaged F1 on validation.
# Label strings are reduced to their digit characters before binarizing, so the
# separator is never counted as a label. The ground truth does not depend on
# the threshold and is binarized once, outside the loop.
true_labels = binar.transform(
    [[ch for ch in target if ch.isdigit()] for target in val.target]
)

res = []
for thresh in np.linspace(start = 0, stop=1, num=51):
    a = logits_to_targets(val, val_probs, thresh)
    pred_labels = binar.transform(
        [[ch for ch in predicted if ch.isdigit()] for predicted in a.Predict]
    )
    score = f1_score(true_labels, pred_labels, average='samples')
    res.append([thresh, score])

In [None]:
# Tabulate the (threshold, sample-averaged F1) pairs collected in `res`.
pd.DataFrame(res, columns=['Threshhold','f1_score'])

In [None]:
# Convert the test probabilities into label strings with a fixed threshold of 0.48.
# NOTE(review): presumably picked from the threshold table above; it is
# hard-coded rather than derived from `res` — confirm after retraining.
test = logits_to_targets(test, test_probs, 0.48)

In [None]:
# Load the sample submission file shipped with the competition data.
submit_sample = pd.read_csv('/kaggle/input/headhunter-employer-review-competition/HeadHunter_sample_submit.csv')

In [None]:
# Expose the predicted label strings under the column name 'target' for the submission.
test = test.rename(columns={'Predict': 'target'})

In [None]:
# Write the review ids and predicted targets to 'submit.csv' without the index column.
test[['review_id','target']].to_csv('submit.csv', index=False)

In [None]:
# Write the sample submission frame to 'sub.csv'.
# NOTE(review): no cell visible here modifies `submit_sample`, so this file
# appears to be a plain copy of the sample — confirm it is still needed.
submit_sample.to_csv('sub.csv', index=False)