In [1]:
import numpy as np
import pandas as pd
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from sentence_transformers import InputExample
import torch
import math
from tqdm import tqdm
import random

In [2]:
model_name = 'microsoft/deberta-v3-large' 
model = CrossEncoder(model_name, num_labels=1, automodel_args={'ignore_mismatched_sizes':True})

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from datasets import load_dataset

paws_dataset = load_dataset("paws", "labeled_final")

In [4]:
paws_train = paws_dataset["train"].to_pandas()

In [5]:
paws_train.head()

Unnamed: 0,id,sentence1,sentence2,label
0,1,"In Paris , in October 1560 , he secretly met t...","In October 1560 , he secretly met with the Eng...",0
1,2,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1
2,3,"There are also specific discussions , public p...","There are also public discussions , profile sp...",0
3,4,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1
4,5,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1


In [6]:
paws_train_examples = []
for i, row in paws_train.iterrows():
    paws_train_examples.append(InputExample(texts=[row['sentence2'], row['sentence1']], label= int(row['label'])))

In [7]:
len(paws_train_examples)

49401

In [8]:
for i in range(5):
    random.shuffle(paws_train_examples)

In [9]:
from datasets import load_dataset

snli_dataset = load_dataset("snli")

In [10]:
snli_dataset = snli_dataset["train"].to_pandas()

In [11]:
clean_snli_dataset = snli_dataset[snli_dataset["label"] != -1]

In [12]:
clean_snli_dataset["label"].value_counts()

0    183416
2    183187
1    182764
Name: label, dtype: int64

In [13]:
new_snil_labels = []
for i in snli_dataset["label"]:
    if i == 0:
        new_snil_labels.append(1)
    elif i == 1:
        new_snil_labels.append(0.5)
    elif i == 2:
        new_snil_labels.append(0)

In [14]:
clean_snli_dataset["new_label"] = new_snil_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_snli_dataset["new_label"] = new_snil_labels


In [15]:
clean_snli_dataset.head()

Unnamed: 0,premise,hypothesis,label,new_label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1,0.5
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",2,0.0
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0,1.0
3,Children smiling and waving at camera,They are smiling at their parents,1,0.5
4,Children smiling and waving at camera,There are children present,0,1.0


In [16]:
train_examples_snli = []
for i, row in tqdm(clean_snli_dataset.iterrows()):
    train_examples_snli.append(InputExample(texts=[row['premise'], row['hypothesis']], label= row['new_label']))

549367it [00:49, 11197.15it/s]


In [17]:
for i in range(10):
    random.shuffle(train_examples_snli)

In [18]:
from datasets import load_dataset

multi_nli_dataset = load_dataset("multi_nli")

In [19]:
multi_nli_train_dataset = multi_nli_dataset["train"].to_pandas()

In [20]:
multi_nli_train_dataset.head()

Unnamed: 0,promptID,pairID,premise,premise_binary_parse,premise_parse,hypothesis,hypothesis_binary_parse,hypothesis_parse,genre,label
0,31193,31193n,Conceptually cream skimming has two basic dime...,( ( Conceptually ( cream skimming ) ) ( ( has ...,(ROOT (S (NP (JJ Conceptually) (NN cream) (NN ...,Product and geography are what make cream skim...,( ( ( Product and ) geography ) ( ( are ( what...,(ROOT (S (NP (NN Product) (CC and) (NN geograp...,government,1
1,101457,101457e,you know during the season and i guess at at y...,( you ( ( know ( during ( ( ( the season ) and...,(ROOT (S (NP (PRP you)) (VP (VBP know) (PP (IN...,You lose the things to the following level if ...,( You ( ( ( ( lose ( the things ) ) ( to ( the...,(ROOT (S (NP (PRP You)) (VP (VBP lose) (NP (DT...,telephone,0
2,134793,134793e,One of our number will carry out your instruct...,( ( One ( of ( our number ) ) ) ( ( will ( ( (...,(ROOT (S (NP (NP (CD One)) (PP (IN of) (NP (PR...,A member of my team will execute your orders w...,( ( ( A member ) ( of ( my team ) ) ) ( ( will...,(ROOT (S (NP (NP (DT A) (NN member)) (PP (IN o...,fiction,0
3,37397,37397e,How do you know? All this is their information...,( ( How ( ( ( do you ) know ) ? ) ) ( ( All th...,(ROOT (S (SBARQ (WHADVP (WRB How)) (SQ (VBP do...,This information belongs to them.,( ( This information ) ( ( belongs ( to them )...,(ROOT (S (NP (DT This) (NN information)) (VP (...,fiction,0
4,50563,50563n,yeah i tell you what though if you go price so...,( yeah ( i ( ( tell you ) ( what ( ( though ( ...,(ROOT (S (VP (VB yeah) (S (NP (FW i)) (VP (VB ...,The tennis shoes have a range of prices.,( ( The ( tennis shoes ) ) ( ( have ( ( a rang...,(ROOT (S (NP (DT The) (NN tennis) (NNS shoes))...,telephone,1


In [21]:
new_multi_nli_labels = []
for i in multi_nli_train_dataset["label"]:
    if i == 0:
        new_multi_nli_labels.append(1)
    elif i == 1:
        new_multi_nli_labels.append(0.5)
    elif i == 2:
        new_multi_nli_labels.append(0)

In [22]:
multi_nli_train_dataset["new_label"] = new_multi_nli_labels

In [23]:
train_examples_multi_nli = []
for i, row in tqdm(multi_nli_train_dataset.iterrows()):
    train_examples_multi_nli.append(InputExample(texts=[row['premise'], row['hypothesis']], label= row['new_label']))

392702it [00:38, 10120.30it/s]


In [24]:
for i in range(10):
    random.shuffle(train_examples_multi_nli)

In [25]:
all_train_examples =  paws_train_examples + train_examples_snli + train_examples_multi_nli 

In [26]:
for i in range(50):
    random.shuffle(all_train_examples)

In [27]:
test_evaluator = CEBinaryClassificationEvaluator.from_input_examples(paws_train_examples, name='test_eval')

In [None]:
num_epochs = 2
model_save_path = "./model_dump"
train_dataloader = torch.utils.data.DataLoader(all_train_examples, shuffle=True, batch_size=6)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
model.fit(train_dataloader=train_dataloader,
          evaluator=test_evaluator,
          epochs=num_epochs,
          evaluation_steps=10_000,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          show_progress_bar=True)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/165245 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
model.save("ai_train_20_04_2024")