In [1]:
# ! conda install kaggle pandas datasets protobuf ! conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia ! conda install --no-cache-dir transformers sentencepiece
import numpy as np, os, pandas as pd
from pathlib import Path
from datasets import Dataset,DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments,Trainer 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file = open('kagglecreds.txt', 'r')
creds = file.read()
file.close()

In [3]:
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

Run the code below only once

In [4]:

cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok = True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [5]:
path = Path('us-patent-phrase-to-phrase-matching')

In [6]:
if not iskaggle and not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))
    zipfile.ZipFile(f'{path}.zip').extractall(path)

In [7]:
if iskaggle:
    path = Path('../input/us-patent-phrase-to-phrase-matching')
    ! pip install -q datasets

In [8]:
!ls {path}

sample_submission.csv  test.csv  train.csv


In [9]:

df = pd.read_csv(path/'train.csv')
df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [10]:
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,8d135da0b55b8c88,component composite coating,composition,H01
freq,1,152,24,2186


In [11]:
df['input'] = 'TEXT1: ' + df.context + '; TEXT2: ' + df.target + '; ANC1: ' + df.anchor


In [12]:
df.input.head()

0    TEXT1: A47; TEXT2: abatement of pollution; ANC...
1    TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2    TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3    TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4    TEXT1: A47; TEXT2: forest region; ANC1: abatement
Name: input, dtype: object

In [13]:

ds = Dataset.from_pandas(df)

In [14]:
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

In [15]:
model_nm = 'microsoft/deberta-v3-small'

In [16]:

tokz = AutoTokenizer.from_pretrained(model_nm, use_fast=True)



In [17]:
tokz.tokenize("Y'all could never ever find what'll make me talk, and that's that!")

['▁Y',
 "'",
 'all',
 '▁could',
 '▁never',
 '▁ever',
 '▁find',
 '▁what',
 "'",
 'll',
 '▁make',
 '▁me',
 '▁talk',
 ',',
 '▁and',
 '▁that',
 "'",
 's',
 '▁that',
 '!']

In [18]:
tokz.tokenize("My treatment for my condition of Irodocyclitis is treated by my dexamethasone systemic")

['▁My',
 '▁treatment',
 '▁for',
 '▁my',
 '▁condition',
 '▁of',
 '▁I',
 'ro',
 'do',
 'cycl',
 'itis',
 '▁is',
 '▁treated',
 '▁by',
 '▁my',
 '▁dexamethasone',
 '▁systemic']

In [19]:
def tok_func(x): return tokz(x["input"])

In [20]:
tok_ds = ds.map(tok_func, batched=True)

Map: 100%|██████████| 36473/36473 [00:01<00:00, 26540.04 examples/s]


In [21]:
row = tok_ds[0]
row['input'], row['input_ids']

('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2])

The following line only succeeds if the presumed underscore _ is the correct type of underscore. To understand, this next line will fail if you use the normal _ instead of ▁ which is completely different so use:

▁

In [22]:
tokz.vocab['▁of']

265

In [23]:
tok_ds = tok_ds.rename_columns({'score':'labels'})

In [24]:
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [25]:
eval_df = pd.read_csv(path/'test.csv')
eval_df.describe()

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,4112d61851461f60,hybrid bearing,inorganic photoconductor drum,G02
freq,1,2,1,3


In [26]:
eval_df['input'] = 'TEXT1: ' + eval_df.context + '; TEXT2: ' + eval_df.target + '; ANC1: ' + eval_df.anchor
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

Map: 100%|██████████| 36/36 [00:00<00:00, 2846.65 examples/s]


In [27]:
def corr(x,y): return np.corrcoef(x,y)[0][1]
def corr_d(eval_pred): return {'pearson': corr(*eval_pred)}

In [28]:
bs = 64
epochs = 4
lr = 8e-5

In [29]:
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=False,
                          evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2, num_train_epochs=epochs, 
                          weight_decay=0.01, report_to='none')


In [30]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'], tokenizer=tokz, compute_metrics=corr_d)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['pooler.dense.bias', 'classifier.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.026717,0.800512
2,0.043900,0.023912,0.821539
3,0.019400,0.022361,0.832463
4,0.012400,0.022067,0.83389


TrainOutput(global_step=1712, training_loss=0.023405167543999503, metrics={'train_runtime': 13255.1999, 'train_samples_per_second': 8.255, 'train_steps_per_second': 0.129, 'total_flos': 688183481361480.0, 'train_loss': 0.023405167543999503, 'epoch': 4.0})

In [32]:
preds = trainer.predict(eval_ds).predictions.astype(float)
preds

array([[ 0.55825454],
       [ 0.68021327],
       [ 0.54855138],
       [ 0.28229406],
       [-0.03154131],
       [ 0.50266159],
       [ 0.51106459],
       [-0.03682663],
       [ 0.24497098],
       [ 1.08840477],
       [ 0.2491537 ],
       [ 0.25824532],
       [ 0.72380513],
       [ 0.98779285],
       [ 0.76816249],
       [ 0.37395683],
       [ 0.34676331],
       [-0.03385903],
       [ 0.63814491],
       [ 0.3208895 ],
       [ 0.5022403 ],
       [ 0.23948567],
       [ 0.21449547],
       [ 0.24654602],
       [ 0.55353552],
       [-0.02816693],
       [-0.03997635],
       [-0.02863501],
       [-0.02953703],
       [ 0.59507257],
       [ 0.34705564],
       [ 0.02286228],
       [ 0.6928761 ],
       [ 0.53406417],
       [ 0.40826851],
       [ 0.22286388]])

In [33]:
preds = np.clip(preds, 0 , 1)
preds

array([[0.55825454],
       [0.68021327],
       [0.54855138],
       [0.28229406],
       [0.        ],
       [0.50266159],
       [0.51106459],
       [0.        ],
       [0.24497098],
       [1.        ],
       [0.2491537 ],
       [0.25824532],
       [0.72380513],
       [0.98779285],
       [0.76816249],
       [0.37395683],
       [0.34676331],
       [0.        ],
       [0.63814491],
       [0.3208895 ],
       [0.5022403 ],
       [0.23948567],
       [0.21449547],
       [0.24654602],
       [0.55353552],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59507257],
       [0.34705564],
       [0.02286228],
       [0.6928761 ],
       [0.53406417],
       [0.40826851],
       [0.22286388]])

In [34]:
import datasets

submission = datasets.Dataset.from_dict({
    'id' : eval_ds['id'],
    'score' : preds
})

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.49ba/s]


1021