In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 20.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 61.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 72.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


## Load lib

In [2]:
import re
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from torch.utils.data import (TensorDataset,
                              Dataset, 
                              random_split, 
                              DataLoader, 
                              RandomSampler, 
                              SequentialSampler)

from transformers import (get_linear_schedule_with_warmup,
                          AutoModelForSequenceClassification,
                          get_cosine_schedule_with_warmup,


                          AdamW, 
                          AutoTokenizer)

from sklearn.metrics import (f1_score, 
                             precision_score,
                             recall_score)


In [3]:
import random
import numpy as np
seed_val = 42
random.seed(42)

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

## Load data

In [4]:
!gdown --id 1OhIsmCxOV7ixd10WPbcawkt_IllLeRM2

Downloading...
From: https://drive.google.com/uc?id=1OhIsmCxOV7ixd10WPbcawkt_IllLeRM2
To: /content/database.csv
100% 956k/956k [00:00<00:00, 163MB/s]


In [5]:
!gdown --id 13EUGzAMaP5g_p8DqANI7Z2M1iAvW8vDz

Downloading...
From: https://drive.google.com/uc?id=13EUGzAMaP5g_p8DqANI7Z2M1iAvW8vDz
To: /content/data_v2_19_oct.csv
100% 46.3M/46.3M [00:01<00:00, 45.3MB/s]


### Check cuda

In [6]:
df = pd.read_csv('/content/data_v2_19_oct.csv')

In [7]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Thera are  %d GPU(s) available.' % torch.cuda.device_count())
    print(torch.cuda.get_device_name(device=None))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Thera are  1 GPU(s) available.
Tesla T4


## Create model Deberta_base

In [8]:
model_name = "microsoft/deberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name, 
                                                           num_labels=2,
                                                          output_attentions = False,
                                                            output_hidden_states = False,).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 30
tokenizer.model_max_length = max_length

Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.d

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [9]:
df['sentences'] = df.name_1 + tokenizer.sep_token + df.name_2

In [10]:
df.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate,name_1_upd,name_2_upd,sentences
0,1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,Iko,Enormous Trade,Iko Industries Ltd.[SEP]Enormous Industrial Tr...
1,2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,Apcotex,Technocraft India,Apcotex Industries Ltd.[SEP]Technocraft Indust...
2,3,"Rishichem Distributors Pvt., Ltd.",Dsa,0,Rishichem Distributors,Dsa,"Rishichem Distributors Pvt., Ltd.[SEP]Dsa"
3,4,Powermax Rubber Factory,Co. One,0,Powermax Factory,One,Powermax Rubber Factory[SEP]Co. One
4,5,Tress A/S,Longyou Industries Park Zhejiang,0,Tress,Longyou Park Zhejiang,Tress A/S[SEP]Longyou Industries Park Zhejiang


## Create stratify data

In [11]:
sentences = df.sentences.values
labels = df.is_duplicate.values

In [12]:
max_length = 30

In [13]:
input_ids = []
attention_masks = []

for sent in tqdm(sentences):
  
    encoded_dict = tokenizer.encode_plus(
        sent, 
        add_special_tokens = True,
        max_length = max_length,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

  0%|          | 0/497819 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [14]:
y = torch.tensor(labels)

In [15]:
df_labels = df[['is_duplicate']]

In [16]:
y_train, y_test = train_test_split(df_labels, shuffle=True, stratify=df_labels, train_size = 0.65)

In [17]:
train_index, test_inex = y_train.index, y_test.index

In [18]:
train_dataset = TensorDataset(input_ids[train_index], attention_masks[train_index], y[train_index])
test_dataset = TensorDataset(input_ids[test_inex], attention_masks[test_inex], y[test_inex])

In [19]:
batch_size = 256
train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = batch_size
)

test_dataloader = DataLoader(
    test_dataset,
    sampler = SequentialSampler(test_dataset),
    batch_size = batch_size
)

## Training model

In [20]:
optimizer = AdamW( model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
)



In [21]:
epochs = 4

total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup( optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps= total_steps)

In [22]:
training_stats = []

for epoch_i in (range(1, epochs + 1)):
    total_train_loss = 0
    model.train()
    
    
    for batch in tqdm(train_dataloader):
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        model.zero_grad()
        
        
        res = model(b_input_ids,
                   token_type_ids=None,
                   attention_mask=b_input_mask,
                   labels = b_labels)
            
        loss= res['loss']
        logits = res['logits']
        
        total_train_loss += loss.item()
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        
        optimizer.step()
        
        scheduler.step()
        
    avg_train_loss = total_train_loss / len(train_dataloader)
    
    model.eval()
    
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    
    list_of_logits = None
    list_of_label_ids = None

    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():
            res = model(b_input_ids,
                   token_type_ids=None,
                   attention_mask=b_input_mask,
                   labels = b_labels) 
            
            loss= res['loss']
            logits = res['logits']
             
        total_eval_loss += loss.item()

        logits = logits.detach().to('cpu')
        label_ids = b_labels.to('cpu')

        if list_of_logits is None:
            list_of_logits = logits
            list_of_label_ids = label_ids
        
        else:
            list_of_logits = torch.cat((list_of_logits, logits))
            list_of_label_ids = torch.cat((list_of_label_ids, label_ids))
        
    pred  = torch.Tensor(list_of_logits).argmax(dim=1).to('cpu')

    f1_ = f1_score(list_of_label_ids, pred)
    precision = precision_score(list_of_label_ids, pred)
    recall = recall_score(list_of_label_ids, pred)
    avg_val_loss = total_eval_loss / len(test_dataloader)


    training_stats.append({
        'epoch': epoch_i,
        'training_loss': avg_train_loss,
        'valid_loss': avg_val_loss,
        'valid_f1_score': f1_,
        'valid_precision': precision,
        'valid_recall': recall,
    })
    

  0%|          | 0/1264 [00:00<?, ?it/s]

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


  0%|          | 0/1264 [00:00<?, ?it/s]

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


  0%|          | 0/1264 [00:00<?, ?it/s]

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


  0%|          | 0/1264 [00:00<?, ?it/s]

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


In [23]:
result = pd.DataFrame(training_stats)

In [24]:
result

Unnamed: 0,epoch,training_loss,valid_loss,valid_f1_score,valid_precision,valid_recall
0,1,0.013179,0.004018,0.921023,0.900075,0.942969
1,2,0.002659,0.002869,0.953057,0.96255,0.94375
2,3,0.001283,0.002851,0.961234,0.973558,0.949219
3,4,0.000644,0.003154,0.961373,0.960249,0.9625


In [25]:
model.save_pretrained("model_Deberta.h5")
tokenizer.save_pretrained("token_Deberta.h5")

('token_Deberta.h5/tokenizer_config.json',
 'token_Deberta.h5/special_tokens_map.json',
 'token_Deberta.h5/vocab.json',
 'token_Deberta.h5/merges.txt',
 'token_Deberta.h5/added_tokens.json',
 'token_Deberta.h5/tokenizer.json')