In [1]:
import pandas as pd
import os
from datetime import datetime
from transformers import (BertTokenizer, BertConfig, BertForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling,
                          EarlyStoppingCallback)
from datasets import load_dataset
import random
import wandb
import torch

In [3]:
pip install wandb

[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
print(pd.__version__)

2.2.2


## Load Data

In [2]:
print(os.getcwd())

/opt/jupyterlab/notebooks/DogBERT/Domain Adaptation


In [3]:
# Navigate to pickles directory
os.chdir('../../savsnet_resources/pickles')
print(os.getcwd())

/opt/jupyterlab/notebooks/savsnet_resources/pickles


In [4]:
# Load Narratives Dataframe
df_narratives = pd.read_pickle('narrative_pickle.pkl.gz', compression='gzip').drop_duplicates(subset='savsnet_consult_id', keep='first')

# Load Extras Dataframe
df_extras = pd.read_pickle('extras_df.pkl.gz', compression='gzip').drop_duplicates(subset='savsnet_consult_id')[['savsnet_consult_id','species','breed', 'age_at_consult', 'gender', 'neutered', 'mpc']]

#Filter for just dog records
df_extras = df_extras[df_extras.species == 'dog']

#Join data
df_dataset = df_narratives.set_index('savsnet_consult_id').join(df_extras.set_index('savsnet_consult_id'), how='inner')

df_dataset.reset_index()

Unnamed: 0,savsnet_consult_id,item_text,consult_record,pk,consult_record_date,species,breed,age_at_consult,gender,neutered,mpc
0,71631,"""O worried that she has been limping on RH for...",230515,2040019,2014-06-10 14:10:05+00:00,dog,Crossbreed,10.35,female,no,trauma
1,71644,"""booster and ears. v reactive for exam, snarli...",230516,2040021,2014-06-10 14:14:03+00:00,dog,Heeler (generic),6.31,male,no,vaccination
2,71660,"""o was stroking chest last night and felt bump...",230517,2040023,2014-06-10 14:18:51+00:00,dog,Jack Russell Terrier,3.45,male,no,other_healthy
3,71675,"""1st vacc. bar, biop two weeks, was puppy from...",230518,2040024,2014-06-10 14:25:12+00:00,dog,Crossbreed,0.17,male,no,vaccination
4,88118,"""2nd vacc. bar, doing well at home, nothing ab...",230519,2040025,2014-06-24 14:05:25+00:00,dog,Crossbreed,0.21,male,no,vaccination
...,...,...,...,...,...,...,...,...,...,...,...
6081341,11517687,CONSULT: HPC 6 month check up. Any concerns fr...,10142293,16553510,2024-01-30 12:01:22+00:00,dog,Crossbreed,1.80,male,yes,other_healthy
6081342,11516812,Euthanasia. Booked in for PTS. As soon as went...,10142295,16553511,2024-01-30 10:00:29+00:00,dog,Crossbreed,10.33,male,no,tumour
6081343,11516407,NO MURMUR HEARD TODAY.,10142292,16553513,2024-01-30 09:16:27+00:00,dog,Dachshund,0.97,female,no,other_healthy
6081344,11512021,Following todays examination: Milbemycin Oxime...,10135039,16553515,2024-01-27 15:25:42+00:00,dog,Spaniel (Cocker),8.71,male,no,post_op


In [10]:
print(len(df_dataset['consult_record_date']))

6081346


In [7]:
# Get a list of all narratives and rename column to text
texts = df_dataset['item_text'].tolist()

# remove leading and trailing "" characters
texts = [text.strip('"') for text in texts]

df = pd.DataFrame(texts, columns=['text'])
df = df.dropna()
df = df[df['text'] != ""]
df.reset_index(drop=True)

Unnamed: 0,text
0,O worried that she has been limping on RH for ...
1,"booster and ears. v reactive for exam, snarlin..."
2,o was stroking chest last night and felt bumps...
3,"1st vacc. bar, biop two weeks, was puppy from ..."
4,"2nd vacc. bar, doing well at home, nothing abn..."
...,...
5947245,CONSULT: HPC 6 month check up. Any concerns fr...
5947246,Euthanasia. Booked in for PTS. As soon as went...
5947247,NO MURMUR HEARD TODAY.
5947248,Following todays examination: Milbemycin Oxime...


## Randomly Select 3 million records

In [8]:
print(len(df))

5947250


In [9]:
def random_non_repeating_integers(num_values, min_value=0, max_value=1):
  """Generates a list of `num_values` random integers between min_value (inclusive) and max_value (exclusive) without repetitions.

  Args:
      num_values: The number of random integers to generate.
      min_value: The minimum value (inclusive) for the random integers. Defaults to 0.
      max_value: The maximum value (exclusive) for the random integers. Defaults to 1.

  Returns:
      A list of `num_values` unique random integers.
  """

  if num_values > max_value - min_value:
    raise ValueError("Cannot generate more unique numbers than the range allows.")

  # Use a set to ensure no duplicates are added
  values = set()
  while len(values) < num_values:
    values.add(random.randint(min_value, max_value - 1))

  return list(values)

In [10]:
unique_numbers = random_non_repeating_integers(3000000, max_value=len(df))

In [11]:
print(len(unique_numbers))

3000000


In [12]:
df = df.iloc[unique_numbers]

In [13]:
print(len(df))

3000000


In [14]:
df.reset_index(drop=True)

Unnamed: 0,text
0,O worried that she has been limping on RH for ...
1,o was stroking chest last night and felt bumps...
2,"doing well at the moment, on hypoallergenic di..."
3,had parvo vacc but not any others so start cou...
4,"2nd vacc. bar, doing well at home, grown alot...."
...,...
2999995,"has v+ at least once today, was undigested foo..."
2999996,Licking at lump on tail. HX - has had lump on ...
2999997,Some self trauma over night. has serum dischar...
2999998,NO MURMUR HEARD TODAY.


In [15]:
df_train = df.sample(n=int(len(df)*0.8))

In [16]:
# Get all indices but exclude those in the sample
unsampled_indices = df.index.difference(df_train.index)

# Create a DataFrame of unsampled rows
df_test = df.loc[unsampled_indices]

In [17]:
print(len(df_train))

2400000


In [18]:
print(df_test.head())

                                                 text
9   2nd vacc. bar, doing well at home, grown alot....
17  booster. nothing abnormal detected health chec...
34  arched abdomen lots excess borgorygmi has gone...
72  in for booster but O worried about eyes. L&gt;...
83               <<identifier>> microchip dogs trust.


In [19]:
print(os.getcwd())

/opt/jupyterlab/notebooks/savsnet_resources/pickles


In [20]:
os.chdir('../../DogBERT/Domain Adaptation')

In [21]:
df_train.to_csv('train.csv', index=False)
df_test.to_csv('test.csv', index=False)

## Create Dataset

In [4]:
datasets = load_dataset("csv", data_files={'train': "train.csv",
                                              'eval': "test.csv"})

## Prepare for Training

In [9]:
#train test split
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_fast=True)

model = BertForMaskedLM.from_pretrained('bert-base-uncased')

config = BertConfig.from_pretrained("bert-base-uncased")

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Create tokenizer
def tokenize_function(batch):
    tokenized_batch = tokenizer(batch["text"], truncation=True, max_length=512)
    return tokenized_batch

tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=["text"], num_proc=16)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'datasets' is not defined

In [6]:
# Setup weights and biases stuff
os.environ["WANDB_PROJECT"]="DogBERT"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

In [6]:
import accelerate

In [7]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [8]:
# Do training
train_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16 = True,
    num_train_epochs=100,
    weight_decay=0.01,
    learning_rate=2e-5,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    output_dir= "DogBERT v0.0.2",
    load_best_model_at_end=True,
    report_to="wandb",
    run_name="DogBERT_pretrain_3m",
    save_total_limit=3
)



In [9]:
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=data_collator,
    callbacks=[early_stopping_callback]
)

In [14]:
pip install wandb

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
trainer.train()
trainer.save_model()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss


In [None]:
!nvidia-smi

In [3]:
torch.version.cuda

'12.1'

In [11]:
from numba import cuda

In [13]:
import gc
torch.cuda.empty_cache()
gc.collect()

33

In [16]:
torch.cuda.empty_cache()
gc.collect()

0

In [17]:
model = None
gc.collect()

0