# Named Entity Recognition (NER) using BERT

In [1]:
import os
import pandas as pd
import numpy as np
import json
import re
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, AutoTokenizer
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import transformers
from tqdm import tqdm
import glob
from sklearn.model_selection import train_test_split
import datetime
import warnings
warnings.filterwarnings('ignore')

# Reference: https://www.kaggle.com/code/statsgary/bert-for-token-classification-ner-tutorial


## Model Config

In [2]:
# File name used for the model weights; stored in config under both
# 'model_path' and 'model_name'.
model_name = 'bert-ner-token-classification.bin'
# Checkpoint name handed to AutoTokenizer.from_pretrained.
bert_path = 'bert-base-uncased'
# Directories with the per-record JSON files. Both end with a path separator
# because read_all_json builds file names as f'{path}{rec_id}.json'; without
# it the test files would resolve to './data/test<id>.json'.
train_path = './data/train/'
test_path = './data/test/'

In [3]:
# Run-wide settings collected in one dict; other cells read them by key
# (e.g. config['train_path'], config['tokenizer']).
config = {
    'MAX_LEN': 128,
    # Tokenizer is instantiated once here from the checkpoint named in bert_path.
    'tokenizer': AutoTokenizer.from_pretrained(bert_path, do_lower_case=True),
    'batch_size': 5,
    'Epoch': 1, 
    'train_path': train_path,
    'test_path': test_path,
    # Use the GPU when torch reports one is available, otherwise the CPU.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    # NOTE(review): 'model_path' and 'model_name' hold the same value —
    # confirm both keys are needed.
    'model_path': model_name,
    'model_name': model_name
}

In [4]:
# Echo the assembled settings; the tokenizer entry is shown via its repr.
print(config)

{'MAX_LEN': 128, 'tokenizer': PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), 'batch_size': 5, 'Epoch': 1, 'train_path': './data/train/', 'test_path': './data/test', 'device': 'cuda', 'model_path': 'bert-ner-token-classification.bin', 'model_name': 'bert-ner-token-classification.bin'}


## Read the train data and combine data labels

In [5]:
# Load the label table and preview its first rows.
# NOTE(review): the CSV path is hard-coded here rather than taken from config.
train = pd.read_csv('data/train.csv')
print(train.head())

                                     Id  \
0  d0fa7568-7d8e-4db9-870f-f9c6f668c17b   
1  2f26f645-3dec-485d-b68d-f013c9e05e60   
2  c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29   
3  5c9a3bc9-41ba-4574-ad71-e25c1442c8af   
4  c754dec7-c5a3-4337-9892-c02158475064   

                                           pub_title  \
0  The Impact of Dual Enrollment on College Degre...   
1  Educational Attainment of High School Dropouts...   
2  Differences in Outcomes for Female and Male St...   
3  Stepping Stone and Option Value in a Model of ...   
4  Parental Effort, School Resources, and Student...   

                           dataset_title  \
0  National Education Longitudinal Study   
1  National Education Longitudinal Study   
2  National Education Longitudinal Study   
3  National Education Longitudinal Study   
4  National Education Longitudinal Study   

                           dataset_label  \
0  National Education Longitudinal Study   
1  National Education Longitudinal Study   
2  Nati

In [6]:
# Collapse the label table to one row per 'Id': 'label_count' counts that
# Id's 'cleaned_label' values and 'label' joins them with '|'.
train_df = train.groupby(['Id']).agg(label_count = ('cleaned_label', 'count'),
                label = ('cleaned_label', '|'.join)).reset_index()
print(train_df.columns)

Index(['Id', 'label_count', 'label'], dtype='object')


In [7]:
# Preview the first rows of the per-Id aggregated table.
train_df.head()

Unnamed: 0,Id,label_count,label
0,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,1,program for the international assessment of ad...
1,0008656f-0ba2-4632-8602-3017b44c2e90,1,trends in international mathematics and scienc...
2,000e04d6-d6ef-442f-b070-4309493221ba,1,agricultural resources management survey
3,000efc17-13d8-433d-8f62-a3932fe4f3b8,2,adni|alzheimer s disease neuroimaging initiati...
4,0010357a-6365-4e5f-b982-582e6d32c3ee,1,genome sequence of covid 19


## Reading all the JSON train files

In [8]:
def read_all_json(df, path):
    """Load the JSON document of every record listed in ``df.Id``.

    Args:
        df: Object exposing an ``Id`` sequence of record identifiers.
        path: Directory containing one ``<Id>.json`` file per record. A
            trailing path separator is optional.

    Returns:
        dict mapping each record id to the parsed content of its JSON file.

    Raises:
        FileNotFoundError: If the JSON file of a record does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    text_data = {}
    for rec_id in tqdm(df.Id, total=len(df.Id)):
        # os.path.join works whether or not ``path`` ends with a separator;
        # plain concatenation silently produced a wrong file name without one.
        location = os.path.join(path, f'{rec_id}.json')

        # Decode explicitly as UTF-8 instead of relying on the platform
        # default encoding.
        with open(location, 'r', encoding='utf-8') as f:
            text_data[rec_id] = json.load(f)

    print('All JSON files read successfully!')
    return text_data

In [9]:
# Read the JSON file of every Id in train_df from the training directory;
# the result is a dict keyed by Id.
train_data_dict = read_all_json(df=train_df, path=config['train_path'])

100%|██████████| 14316/14316 [00:02<00:00, 4821.58it/s]

All JSON files read successfully!





## Data cleaning

In [10]:
def clean_text(text):
    """Lower-case ``text`` and squash non-alphanumeric runs into one space.

    The input is passed through ``str()`` first, so non-string values are
    accepted. Every maximal run of characters outside ``A-Z``, ``a-z`` and
    ``0-9`` becomes a single space; leading/trailing runs are not trimmed.
    """
    lowered = str(text).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [11]:
def data_joining(data_dict_id):
    """Concatenate the ``'text'`` field of every entry into one string.

    Entries are read by position ``0 .. len(data_dict_id) - 1`` and their
    texts are joined with ``'. '``.
    """
    texts = (data_dict_id[idx]['text'] for idx in range(len(data_dict_id)))
    return '. '.join(texts)

In [12]:
def make_shorter_sentence(sentence, max_length=128, overlap=20):
    """Split text into cleaned sentences of at most ``max_length`` words.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    is normalised with ``clean_text``. A sentence with fewer than
    ``max_length`` words is kept whole. A longer one is cut into windows of
    up to ``max_length`` words whose starting positions are
    ``max_length - overlap`` words apart, so consecutive windows share
    ``overlap`` words.

    Args:
        sentence: Raw text, possibly containing several sentences.
        max_length: Maximum number of words per returned string.
        overlap: Number of words shared by consecutive windows of a long
            sentence. Must be smaller than ``max_length``.

    Returns:
        list of cleaned, space-separated strings in document order.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_length``, since
            the window step would then not be positive.
    """
    step = max_length - overlap
    if step <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_length ({max_length})'
        )

    final_sentences = []

    for raw_sentence in sent_tokenize(sentence):
        # clean_text already removes every non-alphanumeric character
        # (periods included) but can leave a space at either end; strip both
        # so no empty token skews the word count or the windows.
        cleaned = clean_text(raw_sentence).strip()
        words = cleaned.split()

        if len(words) < max_length:
            final_sentences.append(cleaned)
            continue

        for start in range(0, len(words), step):
            window = words[start:start + max_length]
            # Re-join with a single space; joining with '. ' put a period
            # between every pair of words.
            final_sentences.append(" ".join(window))

    return final_sentences

## Tokenization and labelling of data

In [None]:
def form_labels(sentence, labels_list):
    matched_keywrds = []
    matched_token = []
    unmatched_kwords = []
    label = []

    tokens = make_shorter_sentence(sentence)

    for tk in tokens:
        tok_split = config['tokenizer'].tokenize(tk)

        for i in range(len(tok_split)):
            if tok_split[i: (1+len(kword))]
