In [1]:
!pip install --no-deps seqeval[gpu]

Collecting seqeval[gpu]
  Using cached seqeval-1.2.2-py3-none-any.whl
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [35]:
%who

Adam	 BertAdam	 BertConfig	 BertForTokenClassification	 BertTokenizer	 DataLoader	 F	 FULL_FINETUNING	 MAX_LEN	 
RandomSampler	 SequentialSampler	 TensorDataset	 accuracy_score	 attention_masks	 b_input_ids	 b_input_mask	 b_labels	 batch	 
biluo_tags_from_offsets	 bs	 classification_report	 device	 df	 entity_dict	 epochs	 f1_score	 get_entities	 
get_tokenized_train_data	 get_train_data	 i	 idx2tag	 input_ids	 max_grad_norm	 mergeIntervals	 model	 n_gpu	 
nb_tr_examples	 nb_tr_steps	 nlp	 no_decay	 np	 optimizer	 optimizer_grouped_parameters	 pad_sequences	 param_optimizer	 
pd	 prefix_regex	 prefixes	 sentences	 spacy	 step	 tag2idx	 tag_vals	 tags	 
tokenized_texts	 tokenizer	 torch	 tr_inputs	 tr_loss	 tr_masks	 tr_tags	 train_data	 train_dataloader	 
train_sampler	 train_test_split	 trange	 val_inputs	 val_masks	 val_tags	 valid_data	 valid_dataloader	 valid_sampler	 
word_piece_labels	 


In [36]:
import numpy as np
import pandas as pd

import spacy
from spacy.gold import biluo_tags_from_offsets
nlp = spacy.load("en_core_web_lg")

from tqdm import trange
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
!pip install pytorch-pretrained-bert
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

from seqeval.metrics import classification_report, accuracy_score, f1_score



In [37]:
# Adding '\n' to the default spacy tokenizer

prefixes = ('\\n', ) + nlp.Defaults.prefixes
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search

In [38]:
# Personal Custom Tags Dictionary
entity_dict = {
    'Name': 'NAME', 
    'College Name': 'CLG',
    'Degree': 'DEG',
    'Graduation Year': 'GRADYEAR',
    'Years of Experience': 'YOE',
    'Companies worked at': 'COMPANY',
    'Designation': 'DESIG',
    'Skills': 'SKILLS',
    'Location': 'LOC',
    'Email Address': 'EMAIL'
}

In [39]:
# loading the dataset
df = pd.read_json('Entity Recognition in Resumes.json', lines=True)
df.head()

Unnamed: 0,content,annotation,extras
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...",


In [40]:
# Checking for unique values present in 'extras' column
df['extras'].unique()

array([nan])

In [41]:
# Since, 'extras' column contains no information we can drop the column
df = df.drop(['extras'], axis=1)
df.head()

Unnamed: 0,content,annotation
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12..."
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta..."
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37..."
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80..."
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20..."


In [42]:
def mergeIntervals(intervals):
    sorted_by_lower_bound = sorted(intervals, key=lambda tup: tup[0])
    merged = []

    for higher in sorted_by_lower_bound:
        if not merged:
            merged.append(higher)
        else:
            lower = merged[-1]
            if higher[0] <= lower[1]:
                if lower[2] is higher[2]:
                    upper_bound = max(lower[1], higher[1])
                    merged[-1] = (lower[0], upper_bound, lower[2])
                else:
                    if lower[1] > higher[1]:
                        merged[-1] = lower
                    else:
                        merged[-1] = (lower[0], higher[1], higher[2])
            else:
                merged.append(higher)

    return merged

In [43]:
# From 'annotation' column, we are extracting the starting index, ending index, entity label
# So that we can convert the content in BILOU format

def get_entities(df):
    
    entities = []
    
    for i in range(len(df)):
        entity = []
    
        for annot in df['annotation'][i]:
            try:
                ent = entity_dict[annot['label'][0]]
                start = annot['points'][0]['start']
                end = annot['points'][0]['end'] + 1
                entity.append((start, end, ent))
            except:
                pass
    
        entity = mergeIntervals(entity)
        entities.append(entity)
    
    return entities

In [44]:
# Adding a new column 'entities'
df['entities'] = get_entities(df)
df.head()

Unnamed: 0,content,annotation,entities
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...","[(0, 12, NAME), (13, 46, DESIG), (49, 58, COMP..."
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...","[(0, 14, NAME), (62, 68, LOC), (104, 148, EMAI..."
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...","[(0, 21, NAME), (22, 31, LOC), (65, 117, EMAIL..."
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...","[(0, 12, NAME), (13, 51, DESIG), (54, 60, COMP..."
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...","[(0, 13, NAME), (14, 22, DESIG), (24, 41, COMP..."


In [45]:
def get_train_data(df):
    tags = []
    sentences = []

    for i in range(len(df)):
        text = df['content'][i]
        entities = df['entities'][i]
    
        doc = nlp(text)
    
        tag = biluo_tags_from_offsets(doc, entities)
        tmp = pd.DataFrame([list(doc), tag]).T
        loc = []
        for i in range(len(tmp)):
            if tmp[0][i].text is '.' and tmp[1][i] is 'O':
                loc.append(i)
        loc.append(len(doc))
    
        last = 0
        data = []
        for pos in loc:
            data.append([list(doc)[last:pos], tag[last:pos]])
            last = pos
    
        for d in data:
            tag = ['O' if t is '-' else t for t in d[1]]
            if len(set(tag)) > 1:
                sentences.append(d[0])
                tags.append(tag)
    
    return sentences, tags

  if tmp[0][i].text is '.' and tmp[1][i] is 'O':
  if tmp[0][i].text is '.' and tmp[1][i] is 'O':
  tag = ['O' if t is '-' else t for t in d[1]]


In [46]:
sentences, tags = get_train_data(df)
len(sentences), len(tags)

Active member of IIIT Committee in ..." with entities "[(0, 14, 'NAME'), (62, 68, 'LOC'), (104, 148, 'EMA...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Hyderabad, Telangana - Email..." with entities "[(0, 21, 'NAME'), (22, 31, 'LOC'), (65, 117, 'EMAI...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Operational Analyst (SQL DBA) Enginee..." with entities "[(0, 12, 'NAME'), (13, 51, 'DESIG'), (54, 60, 'COM...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
lecturer - oracle tutori

java developer

Pune, Maharashtra..." with entities "[(0, 16, 'NAME'), (17, 31, 'DESIG'), (33, 37, 'LOC...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Navi Mumbai, Maharashtra - Email me on..." with entities "[(0, 11, 'NAME'), (12, 23, 'LOC'), (51, 101, 'EMAI...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
FI/CO Consultant in Tech Mahindra - ..." with entities "[(0, 13, 'NAME'), (14, 47, 'DESIG'), (50, 58, 'COM...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Associate consult

PeopleSoft Consultant

Bangalore Urban, K..." with entities "[(0, 8, 'NAME'), (32, 47, 'LOC'), (81, 119, 'EMAIL...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
LEAD ENGINEER - CISCO

- Email me on In..." with entities "[(0, 10, 'NAME'), (11, 24, 'DESIG'), (27, 32, 'COM...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Senior System Engineer - Infosys ..." with entities "[(0, 16, 'NAME'), (17, 39, 'DESIG'), (42, 57, 'COM...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Devops Engi

Senior Technical Lead - HCL Cisco

-..." with entities "[(0, 13, 'NAME'), (14, 35, 'DESIG'), (37, 47, 'COM...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Offshore SAP CRM Functional Consultan..." with entities "[(0, 12, 'NAME'), (13, 52, 'DESIG'), (53, 62, 'LOC...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
SAP ABAP Consultant

Hyderabad, Tel..." with entities "[(0, 14, 'NAME'), (15, 23, 'COMPANY'), (24, 35, 'D...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
- Email me on In

Chandigarh, Chandigarh - Email me on I..." with entities "[(0, 11, 'NAME'), (12, 22, 'LOC'), (24, 34, 'LOC')...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Maximo Consultant - Infosys Limited..." with entities "[(0, 14, 'NAME'), (15, 32, 'DESIG'), (35, 50, 'COM...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Pune, Maharashtra - Email me on Ind..." with entities "[(0, 14, 'NAME'), (15, 19, 'LOC'), (55, 99, 'EMAIL...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Tech Support Exe

Kolkata, West Bengal - Email me on In..." with entities "[(0, 12, 'NAME'), (13, 20, 'LOC'), (55, 99, 'EMAIL...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Technical Support Executive - Tele..." with entities "[(0, 15, 'NAME'), (16, 43, 'DESIG'), (46, 62, 'COM...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
chnadigarh - Email me on Indeed:..." with entities "[(0, 17, 'NAME'), (18, 28, 'LOC'), (50, 98, 'EMAIL...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Kharadi, Pune, 411014

Cluster HR Manager - Velammal New

Chennai,..." with entities "[(0, 6, 'NAME'), (7, 26, 'DESIG'), (28, 40, 'COMPA...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Senior Executive (MIS & Audit) - Job ..." with entities "[(0, 12, 'NAME'), (13, 44, 'DESIG'), (61, 93, 'COM...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Jr. ASP.NET Developer in True Vision..." with entities "[(0, 13, 'NAME'), (14, 35, 'DESIG'), (39, 61, 'CLG...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
7 years 

Kottayam, Kerala - Email me on Indeed..." with entities "[(0, 12, 'NAME'), (13, 21, 'LOC'), (52, 94, 'EMAIL...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Engineer / Electrical Supervisor, S..." with entities "[(0, 14, 'NAME'), (15, 101, 'COMPANY'), (419, 452,...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Software Engineer

Bangalore City, Kar..." with entities "[(717, 748, 'CLG'), (6642, 6696, 'DEG'), (6698, 67...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Senior Process

Project Lead for Infosys OpenStack
..." with entities "[(0, 14, 'NAME'), (15, 27, 'DESIG'), (31, 49, 'COM...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Jaipur, Rajasthan - Email me on Indeed: ..." with entities "[(0, 9, 'NAME'), (10, 16, 'LOC'), (49, 89, 'EMAIL'...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
Bangalore, Karnataka - Email me on Indee..." with entities "[(0, 9, 'NAME'), (10, 19, 'LOC'), (52, 92, 'EMAIL'...". Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities (with BILUO tag '-') will be ignored during training.
  tag = biluo_tags_from_offsets(doc, entities)
6+ Exp in

(781, 781)

In [47]:
tag_vals = set(['X', '[CLS]', '[SEP]'])
for i in range(len(tags)):
    tag_vals = tag_vals.union(tags[i])
tag_vals

{'B-CLG',
 'B-COMPANY',
 'B-DEG',
 'B-DESIG',
 'B-EMAIL',
 'B-GRADYEAR',
 'B-LOC',
 'B-NAME',
 'B-SKILLS',
 'B-YOE',
 'I-CLG',
 'I-COMPANY',
 'I-DEG',
 'I-DESIG',
 'I-EMAIL',
 'I-GRADYEAR',
 'I-LOC',
 'I-NAME',
 'I-SKILLS',
 'I-YOE',
 'L-CLG',
 'L-COMPANY',
 'L-DEG',
 'L-DESIG',
 'L-EMAIL',
 'L-GRADYEAR',
 'L-LOC',
 'L-NAME',
 'L-SKILLS',
 'L-YOE',
 'O',
 'U-CLG',
 'U-COMPANY',
 'U-DEG',
 'U-DESIG',
 'U-EMAIL',
 'U-GRADYEAR',
 'U-LOC',
 'U-SKILLS',
 'U-YOE',
 'X',
 '[CLS]',
 '[SEP]'}

In [48]:
tag2idx = {t: i for i, t in enumerate(tag_vals)}
tag2idx

{'I-EMAIL': 0,
 'B-COMPANY': 1,
 'L-LOC': 2,
 'I-YOE': 3,
 'B-SKILLS': 4,
 'B-EMAIL': 5,
 'B-GRADYEAR': 6,
 'I-SKILLS': 7,
 '[SEP]': 8,
 'I-NAME': 9,
 'L-COMPANY': 10,
 'B-YOE': 11,
 'I-LOC': 12,
 'U-EMAIL': 13,
 'U-YOE': 14,
 '[CLS]': 15,
 'U-SKILLS': 16,
 'U-DEG': 17,
 'L-YOE': 18,
 'U-CLG': 19,
 'L-DESIG': 20,
 'B-DEG': 21,
 'U-GRADYEAR': 22,
 'X': 23,
 'I-DESIG': 24,
 'I-CLG': 25,
 'L-CLG': 26,
 'U-LOC': 27,
 'I-COMPANY': 28,
 'U-COMPANY': 29,
 'L-GRADYEAR': 30,
 'L-DEG': 31,
 'B-DESIG': 32,
 'L-SKILLS': 33,
 'B-NAME': 34,
 'I-GRADYEAR': 35,
 'B-CLG': 36,
 'L-NAME': 37,
 'B-LOC': 38,
 'O': 39,
 'U-DESIG': 40,
 'I-DEG': 41,
 'L-EMAIL': 42}

In [49]:
idx2tag = {tag2idx[key] : key for key in tag2idx.keys()}
idx2tag

{0: 'I-EMAIL',
 1: 'B-COMPANY',
 2: 'L-LOC',
 3: 'I-YOE',
 4: 'B-SKILLS',
 5: 'B-EMAIL',
 6: 'B-GRADYEAR',
 7: 'I-SKILLS',
 8: '[SEP]',
 9: 'I-NAME',
 10: 'L-COMPANY',
 11: 'B-YOE',
 12: 'I-LOC',
 13: 'U-EMAIL',
 14: 'U-YOE',
 15: '[CLS]',
 16: 'U-SKILLS',
 17: 'U-DEG',
 18: 'L-YOE',
 19: 'U-CLG',
 20: 'L-DESIG',
 21: 'B-DEG',
 22: 'U-GRADYEAR',
 23: 'X',
 24: 'I-DESIG',
 25: 'I-CLG',
 26: 'L-CLG',
 27: 'U-LOC',
 28: 'I-COMPANY',
 29: 'U-COMPANY',
 30: 'L-GRADYEAR',
 31: 'L-DEG',
 32: 'B-DESIG',
 33: 'L-SKILLS',
 34: 'B-NAME',
 35: 'I-GRADYEAR',
 36: 'B-CLG',
 37: 'L-NAME',
 38: 'B-LOC',
 39: 'O',
 40: 'U-DESIG',
 41: 'I-DEG',
 42: 'L-EMAIL'}

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [51]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [52]:
def get_tokenized_train_data(sentences, tags):

    tokenized_texts = []
    word_piece_labels = []

    for word_list, label in zip(sentences, tags):
    
        # Add [CLS] at the front
        temp_lable = ['[CLS]']
        temp_token = ['[CLS]']
    
        for word, lab in zip(word_list, label):
            token_list = tokenizer.tokenize(word.text)
            for m, token in enumerate(token_list):
                temp_token.append(token)
                if m == 0:
                    temp_lable.append(lab)
                else:
                    temp_lable.append('X')  
                
        # Add [SEP] at the end
        temp_lable.append('[SEP]')
        temp_token.append('[SEP]')
    
        tokenized_texts.append(temp_token)
        word_piece_labels.append(temp_lable)
    
    return tokenized_texts, word_piece_labels

In [53]:
tokenized_texts, word_piece_labels = get_tokenized_train_data(sentences, tags)

In [54]:
print(tokenized_texts[0])
print(word_piece_labels[0])

['[CLS]', 'A', '##b', '##his', '##he', '##k', 'J', '##ha', 'Application', 'Development', 'Associate', '-', 'A', '##cc', '##ent', '##ure', 'Bengal', '##uru', ',', 'Karnataka', '-', 'Em', '##ail', 'me', 'on', 'Indeed', ':', 'indeed', '.', 'com', '/', 'r', '/', 'A', '##b', '##his', '##he', '##k', '-', 'J', '##ha', '/', '10', '##e', '##7', '##a', '##8', '##c', '##b', '##7', '##32', '##b', '##c', '##43', '##a', '•', 'To', 'work', 'for', 'an', 'organization', 'which', 'provides', 'me', 'the', 'opportunity', 'to', 'improve', 'my', 'skills', 'and', 'knowledge', 'for', 'my', 'individual', 'and', 'company', "'", 's', 'growth', 'in', 'best', 'possible', 'ways', '[SEP]']
['[CLS]', 'B-NAME', 'X', 'X', 'X', 'X', 'L-NAME', 'X', 'B-DESIG', 'I-DESIG', 'L-DESIG', 'O', 'U-COMPANY', 'X', 'X', 'X', 'U-LOC', 'X', 'O', 'O', 'O', 'O', 'X', 'O', 'O', 'B-EMAIL', 'I-EMAIL', 'I-EMAIL', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',

In [55]:
MAX_LEN = 512
bs = 4

In [56]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(len(input_ids[0]))
print(input_ids[0])

Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (679 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (977 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (567 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1054 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1231 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length

512
[  101   138  1830 27516  4638  1377   147  2328 22491  3273  9666   118
   138 19515  3452  3313  7756 12328   117 12247   118 18653 11922  1143
  1113 10364   131  5750   119  3254   120   187   120   138  1830 27516
  4638  1377   118   147  2328   120  1275  1162  1559  1161  1604  1665
  1830  1559 17101  1830  1665 25631  1161   794  1706  1250  1111  1126
  2369  1134  2790  1143  1103  3767  1106  4607  1139  4196  1105  3044
  1111  1139  2510  1105  1419   112   188  3213  1107  1436  1936  3242
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     

In [57]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels], maxlen=MAX_LEN, value=tag2idx["O"], 
                     padding="post", dtype="long", truncating="post")
print(len(tags[0]))
print(tags[0])

512
[15 34 23 23 23 23 37 23 32 24 20 39 29 23 23 23 27 23 39 39 39 39 23 39
 39  5  0  0 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 23 39 39 39 39 39  8 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39
 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 3

In [58]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [59]:
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(input_ids, tags, attention_masks, random_state=2020, 
                                                                                 test_size=0.3)

In [60]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [61]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [62]:
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2idx))

In [68]:
# model.cuda()

In [69]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [70]:
epochs = 10
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

Epoch:   0%|                                                                                    | 0/10 [00:00<?, ?it/s]


RuntimeError: CUDA error: unknown error

In [66]:
model.eval()

y_true = []
y_pred = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch

    with torch.no_grad():
        logits = model(input_ids, token_type_ids=None, attention_mask=input_mask,)

    logits = logits.detach().cpu().numpy()
    logits = [list(p) for p in np.argmax(logits, axis=2)]
    
    label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()
    
    for i,mask in enumerate(input_mask):
        temp_1 = [] # Real one
        temp_2 = [] # Predict one
        
        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if idx2tag[label_ids[i][j]] != "X" and idx2tag[label_ids[i][j]] != "[CLS]" and idx2tag[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    temp_1.append(idx2tag[label_ids[i][j]])
                    temp_2.append(idx2tag[logits[i][j]])
            else:
                break
        
            
        y_true.append(temp_1)
        y_pred.append(temp_2)
    
print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

print(classification_report(y_true, y_pred,digits=4))

RuntimeError: CUDA error: unknown error