# Named Entity Recognition (NER) Notebook
Clean, commented implementation for training a BiLSTM NER model.

In [None]:
import pandas as pd
# Load the NER dataset from a CSV file resolved relative to the working directory.
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences found in the file; presumably it was chosen to get past bytes that
# are not valid UTF-8 — confirm this is intended rather than e.g. 'latin1'.
data = pd.read_csv('ner_dataset.csv', encoding='unicode_escape')
# Preview the first rows (shown as the notebook cell's output).
data.head()

In [None]:
from itertools import chain

def get_dict_map(data, token_or_tag):
    """Build forward and reverse index mappings for the words or the tags.

    Indices are assigned in order of first appearance in the column, starting
    at 0, so the mapping is identical on every run over the same data. (A plain
    ``set`` iterates strings in an order that changes between interpreter
    sessions, which made previously saved indices/models unusable.)

    Args:
        data: DataFrame with 'Word' and 'Tag' columns.
        token_or_tag: 'token' selects the 'Word' column; any other value
            selects the 'Tag' column.

    Returns:
        A ``(tok2idx, idx2tok)`` tuple: ``tok2idx`` maps each distinct value to
        its integer index and ``idx2tok`` is the inverse mapping.
    """
    column = 'Word' if token_or_tag == 'token' else 'Tag'

    # dict.fromkeys de-duplicates exactly like set() (same hash/equality rules)
    # but keeps first-occurrence order, which makes the indices reproducible.
    vocab = list(dict.fromkeys(data[column].to_list()))

    # NOTE(review): indices start at 0; if sequences are later padded with 0,
    # padding cannot be told apart from the first vocabulary entry — confirm.
    idx2tok = dict(enumerate(vocab))
    tok2idx = {tok: idx for idx, tok in idx2tok.items()}
    return tok2idx, idx2tok

In [None]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):
    """Pad the indexed sentences, one-hot encode the tags and split train/test.

    Despite the name, only a train/test split is produced here; no separate
    validation set is returned.

    Args:
        data_group: Table-like object whose 'Word_idx' and 'Tag_idx' entries
            hold one sequence of integer indices per sentence (presumably
            `data` grouped by sentence — confirm against the caller).
        data: DataFrame with a 'Tag' column; used only to count distinct tags.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split`` with 10% of the sentences held out for testing.
        The label entries are lists holding one one-hot matrix per sentence.
    """
    # Width of the one-hot encoding = number of distinct tags.
    n_tag = len(set(data['Tag'].to_list()))

    # NOTE(review): `max_len` is read from module scope and is not defined in
    # the visible cells; it must exist before this is called and should match
    # the model's input length.
    # Sequences are padded (and truncated) to `max_len`, padding at the end.
    # NOTE(review): no pad value is passed, so pad_sequences' default (0) is
    # used for both words and tags; padded tag positions are therefore
    # one-hot encoded as class 0 — confirm that is acceptable.
    tokens = pad_sequences(data_group['Word_idx'], maxlen=max_len, padding='post')
    tags = pad_sequences(data_group['Tag_idx'], maxlen=max_len, padding='post')

    # One (max_len, n_tag) one-hot matrix per sentence.
    tags = [to_categorical(i, num_classes=n_tag) for i in tags]

    # No random_state is given, so the split is not fixed by this function.
    X_train, X_test, y_train, y_test = train_test_split(tokens, tags, test_size=0.1)

    return X_train, X_test, y_train, y_test

In [None]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional

# Reproducibility: seed NumPy's global random generator and TensorFlow's
# global random generator (note the two seeds are different values).
np.random.seed(1)
tensorflow.random.set_seed(2)

In [None]:
# Hyperparameters
# Embedding vocabulary size: number of distinct words plus one extra index.
# NOTE(review): the +1 presumably reserves a slot for padding/unknown tokens —
# confirm against how word indices are assigned.
input_dim = len(set(data['Word'].to_list())) + 1
# Used both as the embedding width and as the LSTM unit count in
# get_bilstm_lstm_model.
output_dim = 64
# Sequence length declared on the model's Input layer.
input_length = 50  # adjust based on dataset
# Number of distinct tags; size of the per-token softmax output.
n_tags = len(set(data['Tag'].to_list()))

In [None]:
def get_bilstm_lstm_model():
    """Create and compile the BiLSTM + LSTM tagger.

    The architecture is configured from the module-level hyperparameters
    `input_length`, `input_dim`, `output_dim` and `n_tags`.

    Returns:
        A compiled Keras ``Sequential`` model that emits, for every position
        of the input sequence, a softmax over the `n_tags` classes.
    """
    # Layers in forward order: token ids -> embeddings -> bidirectional LSTM
    # -> unidirectional LSTM -> per-timestep tag classifier.
    stack = [
        Input(shape=(input_length,)),
        Embedding(input_dim=input_dim, output_dim=output_dim),
        Bidirectional(LSTM(output_dim, return_sequences=True, dropout=0.2)),
        LSTM(output_dim, return_sequences=True, dropout=0.5),
        TimeDistributed(Dense(n_tags, activation='softmax')),
    ]
    tagger = Sequential(stack)

    # Targets are one-hot encoded, hence categorical cross-entropy.
    tagger.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return tagger

In [None]:
def train_model(X, y, model, *, n_rounds=5, batch_size=256, validation_split=0.2):
    """Fit `model` one epoch at a time and collect each epoch's training loss.

    Args:
        X: Input sequences passed to ``model.fit``.
        y: Targets. A list/tuple is treated as one label array per sample
            (the form produced when one-hot matrices are collected in a Python
            list) and is stacked into a single array before fitting.
        model: Object exposing a Keras-style ``fit`` method.
        n_rounds: Number of single-epoch ``fit`` calls (default 5).
        batch_size: Batch size forwarded to ``fit`` (default 256).
        validation_split: Validation fraction forwarded to ``fit``
            (default 0.2).

    Returns:
        List with the training loss reported by each ``fit`` call, in order.
    """
    # A plain list of per-sample arrays is not a single target array; stack it
    # so every sample lines up with its row in X. Arrays pass through as-is.
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)

    losses = []
    for _ in range(n_rounds):
        # epochs=1 per call: the model keeps training across iterations, and
        # each call yields exactly one loss value.
        history = model.fit(
            X,
            y,
            batch_size=batch_size,
            epochs=1,
            validation_split=validation_split,
        )
        losses.append(history.history['loss'][0])
    return losses

## spaCy Visualization Example

In [None]:
import spacy
from spacy import displacy

# Load the spaCy pipeline named 'en_core_web_sm' (it must already be installed
# in the environment). This demo uses none of the objects defined in the cells
# above.
nlp = spacy.load('en_core_web_sm')

# Run the pipeline on a sample sentence.
doc = nlp("Apple hired John in London")
# Render the detected entities inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)