<a href="https://colab.research.google.com/github/Teasotea/BioNER-and-RD/blob/main/Fimetuning%20SciBERT%20with%20SpaCy%20Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import ast
import io
import os

import numpy as np
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer as twt

In [None]:
%mkdir data

In [None]:
# Switch the kernel's working directory to `data`; later `spacy convert` calls
# write their output to `./`, i.e. into this directory.
%cd data

/content/data


# Load Data

## Load datasets

In [None]:
# Read the three CDR splits (train / test / dev) from their CSV exports.
train_df, test_df, dev_df = map(
    pd.read_csv,
    (
        '/content/cdr_dner_train_df.csv',
        '/content/cdr_dner_test_df.csv',
        '/content/cdr_dner_dev_df.csv',
    ),
)


In [None]:
# Read the three IOB-tagged splits (train / test / dev) from their CSV exports.
iob_train, iob_test, iob_dev = map(
    pd.read_csv,
    (
        '/content/iob_train.csv',
        '/content/iob_test.csv',
        '/content/iob_dev.csv',
    ),
)

In [None]:
def parse_list_cell(cell):
  """Turn one CSV cell holding a stringified Python list back into a list of strings.

  Splitting the text on "', '" breaks as soon as an item contains an apostrophe:
  such items are written with double quotes (e.g. "2'"), the separator no longer
  matches, and neighbouring items are glued into one bogus entry. Parsing the
  literal with `ast.literal_eval` handles every quoting style.

  Args:
    cell: The raw cell value, normally a string such as "['a', 'b']". A value
      that is already a list is returned unchanged so the cell can be re-run.

  Returns:
    A list of strings.
  """
  if isinstance(cell, list):
    # Already parsed on a previous run of this cell.
    return cell
  try:
    parsed = ast.literal_eval(cell)
  except (ValueError, SyntaxError):
    parsed = None
  if isinstance(parsed, list):
    return [str(item) for item in parsed]
  # Not a valid list literal: fall back to the original quote-splitting heuristic.
  return cell.strip('][\'').split('\', \'')


# Decode the `tokens` and `iob_tags` columns of every split in place.
for iob_frame in (iob_train, iob_test, iob_dev):
  for column in ('tokens', 'iob_tags'):
    iob_frame[column] = iob_frame[column].apply(parse_list_cell)

In [None]:
def flatten(l):
  """Concatenate the items of every sub-iterable in `l` into one flat list."""
  flat = []
  for sublist in l:
    flat.extend(sublist)
  return flat

In [None]:
# Lower-case prefixes of tokens that are dropped when building the IOB table.
false_ents = [
    # NOTE(review): the original spelled this entry as two adjacent literals
    # ('\'' '2\''), which Python joins into the single string below. If two
    # separate entries were intended, a comma was missing -- confirm.
    "'2'",
    '"",',
    "sensitisation",
    "'2",
    "phenethylamino",
    "3'",
    "'flattened",
    "5'",
    "2'",
]

## Convert to tsv

In [None]:
def to_tsv_iob(df, excluded_prefixes=None):
  """Flatten per-sentence token/tag lists into one token-indexed tag table.

  Args:
    df: DataFrame with a `tokens` column and an `iob_tags` column, each row
      holding a list (one entry per token of that sentence).
    excluded_prefixes: Iterable of lower-case prefixes; a token whose
      lower-cased text starts with any of them is dropped together with its
      tag. Defaults to the module-level `false_ents` list.

  Returns:
    DataFrame indexed by `tokens` with a single `tags` column, in corpus order.

  Note:
    Tokens and tags are paired with `zip`, so if a row's two lists differ in
    length the surplus entries of the longer list are silently ignored.
  """
  if excluded_prefixes is None:
    excluded_prefixes = false_ents
  # str.startswith needs a tuple; build it once instead of once per token.
  prefixes = tuple(excluded_prefixes)

  tokens, tags = [], []
  for sentence_tokens, sentence_tags in zip(df['tokens'], df['iob_tags']):
    for token, tag in zip(sentence_tokens, sentence_tags):
      if not token.lower().startswith(prefixes):
        tokens.append(token)
        tags.append(tag)
  return pd.DataFrame({"tokens": tokens, "tags": tags}).set_index('tokens')

In [None]:
# Sanity check: how many tokens of the training split carry each IOB tag.
to_tsv_iob(iob_train)['tags'].value_counts()

O             89528
B-Chemical     5024
B-Disease      4047
I-Disease      2209
I-Chemical      751
Name: tags, dtype: int64

In [None]:
# Build the flat token/tag table for each split.
train_iob, test_iob, dev_iob = map(to_tsv_iob, (iob_train, iob_test, iob_dev))

In [None]:
# Show the current working directory (the relative `./` outputs below land here).
%pwd

'/content/data'

In [None]:
# Write each token/tag table as a tab-separated file for `spacy convert`.
# NOTE(review): `to_csv` emits a header row by default -- confirm the spaCy
# converter is meant to see that first line.
for iob_table, tsv_path in zip(
    (train_iob, test_iob, dev_iob),
    (
        '/content/data/iob_tsv_train.tsv',
        '/content/data/iob_tsv_test.tsv',
        '/content/data/iob_tsv_dev.tsv',
    ),
):
  iob_table.to_csv(tsv_path, sep='\t')

## Create .json files

In [None]:
# Convert each IOB TSV to spaCy's JSON training format, written to the current
# directory (`./`). `-c conll` selects the converter, `-s` turns on sentence
# segmentation, `-t json` selects the JSON file type.
# The converter's own output suggests `-n 10` to group sentences into documents.
!python -m spacy convert /content/data/iob_tsv_train.tsv ./ -t json -s -c conll
!python -m spacy convert /content/data/iob_tsv_dev.tsv ./ -t json -s -c conll
!python -m spacy convert /content/data/iob_tsv_test.tsv ./ -t json -s -c conll


[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;4mℹ Segmenting sentences with sentencizer. (Use `-b model` for improved
parser-based sentence segmentation.)[0m
[38;5;2m✔ Generated output file (1 documents): iob_tsv_train.json[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;4mℹ Segmenting sentences with sentencizer. (Use `-b model` for improved
parser-based sentence segmentation.)[0m
[38;5;2m✔ Generated output file (1 documents): iob_tsv_dev.json[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;4mℹ Segmenting sentences with sentencizer. (Use `-b model` for improved
parser-based sentence segmentation

In [None]:
# %cd data

## Create .spacy files

In [None]:
# Convert the JSON corpora into binary `.spacy` files in the current directory;
# these are the files the training config is expected to point at.
!python -m spacy convert /content/data/iob_tsv_dev.json ./ -t spacy
!python -m spacy convert /content/data/iob_tsv_train.json ./ -t spacy
!python -m spacy convert /content/data/iob_tsv_test.json ./ -t spacy

[38;5;2m✔ Generated output file (877 documents): iob_tsv_dev.spacy[0m
[38;5;2m✔ Generated output file (855 documents): iob_tsv_train.spacy[0m
[38;5;2m✔ Generated output file (909 documents): iob_tsv_test.spacy[0m


# Initializing SpaCy Pipeline

In [None]:
# Install the CUDA 9.2 toolkit from NVIDIA's local-repository package.
# File names and the key path use plain ASCII hyphens: the previous typographic
# en-dashes made `apt-key add` look for a path that does not match the
# directory the .deb package unpacks.
# `-y` lets apt-get run without waiting for an interactive confirmation.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install -y cuda-9.2

--2022-07-01 13:24:45--  https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64
Resolving developer.nvidia.com (developer.nvidia.com)... 152.195.19.142
Connecting to developer.nvidia.com (developer.nvidia.com)|152.195.19.142|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://developer.nvidia.com/compute/cuda/9.2/prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 [following]
--2022-07-01 13:24:45--  https://developer.nvidia.com/compute/cuda/9.2/prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64
Reusing existing connection to developer.nvidia.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://developer.download.nvidia.com/compute/cuda/9.2/secure/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb?6jRlN0-QgB4CVkP4RyftipgeDnlfKKKaSHSauX_OSUSVkMqKCDeTZEML0ZYGIqF8yVrpHmYYdtJZ9_U-eDpLcXS5rkSrPgSuRP2HRPJ6hWUKZ

In [None]:
# Print the CUDA compiler version to confirm which toolkit is active.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


In [None]:
# Go back up one level (to /content, per the recorded output) before installing
# packages and running training.
%cd ..

/content


In [None]:
# Upgrade spaCy to the latest release and download the English transformer pipeline.
# NOTE(review): the spaCy version is unpinned, so a re-run may pull a different
# release than the one these outputs were produced with.
# NOTE(review): the training log below loads allenai/scibert_scivocab_uncased;
# confirm whether en_core_web_trf is actually needed.
!pip install -U spacy
!python -m spacy download en_core_web_trf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-trf==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.3.0/en_core_web_trf-3.3.0-py3-none-any.whl (460.3 MB)
[K     |████████████████████████████████| 460.3 MB 27 kB/s 
[?25hCollecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.6-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 319 kB/s 
Collecting transformers<4.20.0,>=3.4.0
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 9.2 MB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 61.5 MB/s 
[?25hCollecting t

In [None]:
# Replace the preinstalled PyTorch with the builds compiled for CUDA 9.2 (`+cu92`),
# resolved through the PyTorch wheel index given with `-f`.
!pip install torch==1.7.1+cu92 torchvision==0.8.2+cu92 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.7.1+cu92
  Downloading https://download.pytorch.org/whl/cu92/torch-1.7.1%2Bcu92-cp37-cp37m-linux_x86_64.whl (577.3 MB)
[K     |████████████████████████████████| 577.3 MB 3.4 kB/s 
[?25hCollecting torchvision==0.8.2+cu92
  Downloading https://download.pytorch.org/whl/cu92/torchvision-0.8.2%2Bcu92-cp37-cp37m-linux_x86_64.whl (12.5 MB)
[K     |████████████████████████████████| 12.5 MB 33.1 MB/s 
[?25hCollecting torchaudio==0.7.2
  Downloading torchaudio-0.7.2-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 4.9 MB/s 
Installing collected packages: torch, torchvision, torchaudio
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
  Attempting u

In [None]:
!pip install -U spacy[cuda92,transformers]
!export CUDA_PATH=”/usr/local/cuda-9.2"
!export LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH
!pip install cupy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cupy-cuda92<11.0.0,>=5.0.0b4
  Downloading cupy_cuda92-9.6.0-cp37-cp37m-manylinux1_x86_64.whl (55.0 MB)
[K     |████████████████████████████████| 55.0 MB 1.1 MB/s 
Installing collected packages: cupy-cuda92
Successfully installed cupy-cuda92-9.6.0
/bin/bash: -c: line 0: unexpected EOF while looking for matching `"'
/bin/bash: -c: line 1: syntax error: unexpected end of file
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cupy
  Downloading cupy-10.6.0.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 4.7 MB/s 
Building wheels for collected packages: cupy
  Building wheel for cupy (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for cupy[0m
[?25h  Running setup.py clean for cupy
[31m  ERROR: Failed cleaning build dir for cupy[0m
Failed to build cupy
Installing collected packages: cup

In [None]:
# Install spacy-transformers straight from the default branch of its GitHub repository.
# NOTE(review): no commit or tag is pinned, so the installed code can change between runs.
!pip install git+https://github.com/explosion/spacy-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/explosion/spacy-transformers
  Cloning https://github.com/explosion/spacy-transformers to /tmp/pip-req-build-zk_gocpp
  Running command git clone -q https://github.com/explosion/spacy-transformers /tmp/pip-req-build-zk_gocpp


In [None]:
# Fill in any settings missing from the training config with spaCy's defaults.
# Input and output are the same path, so /content/config.cfg is overwritten in
# place; the file must already exist (it is not created by any visible cell).
!python -m spacy init fill-config /content/config.cfg /content/config.cfg

In [None]:
# Validate the training and development data referenced by the config before training.
!python -m spacy debug data /content/config.cfg

In [None]:
# Swap the source-built cupy package for a prebuilt wheel.
# `-y` skips pip's confirmation prompt, which would otherwise block the cell.
!pip uninstall -y cupy
# NOTE(review): cupy-cuda100 is the wheel for CUDA 10.0, while the earlier cells
# install CUDA 9.2 and cupy-cuda92 -- confirm which CUDA runtime training uses.
!pip install cupy-cuda100

# Fine-tune SciBERT with the spaCy Pipeline

In [None]:
# Train the pipeline described by the config on GPU 0 (`-g 0`); the trained
# pipelines are saved to the current directory (`--output ./`).
!python -m spacy train -g 0 /content/config.cfg --output ./

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-07-01 12:18:22,392] [INFO] Set up nlp object from config
[2022-07-01 12:18:22,402] [INFO] Pipeline: ['transformer', 'ner']
[2022-07-01 12:18:22,406] [INFO] Created vocabulary
[2022-07-01 12:18:22,407] [INFO] Finished initializing nlp object
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This I