In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import gensim.downloader as api

# RNN model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Bidirectional, Masking
from tensorflow.keras import backend as K


In [11]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub


  from .autonotebook import tqdm as notebook_tqdm


## If kaggle dataset is not downloaded, run the following code, otherwise uncomment the line and enter the file path to the dataset

In [12]:

path = kagglehub.dataset_download("debasisdotcom/name-entity-recognition-ner-dataset")
# path = "dataset.csv"



## Data collection

In [13]:
df = pd.read_csv(path + '/NER dataset.csv', encoding = "ISO-8859-1")

### Formatting Data

In [74]:
sentences = []
tags = []
for row in df.iterrows():
    row = row[1]
    if row['Sentence #'] is np.nan:
        sentences[-1].append(row['Word'])
        tags[-1].append(row['Tag'])
    else:
        sentences.append([row['Word']])
        tags.append([row['Tag']])


## Word2Vec Embeddings

In [None]:
# Pretrained word2vec model to use for sentiment classification
corpus = api.load('text8') #first 10^9 bytes from english wikipedia dump march 2006
model = Word2Vec(corpus)

In [16]:

# Setting up a random vector for out of vocabulary words
embeddingDim = 100 # The dimension of the word embeddings
oovVector = np.random.uniform(-0.1, 0.1, embeddingDim)  # Random small values

In [17]:
def wordToVec(word, model):
    word = word.lower()
    if word in model.wv:
        return model.wv[word]
    else:
        return oovVector

In [18]:
# Function to convert sentences to feature vectors
# sentence is an array of word vectors, each word vector is a 100-dimensional vector
def sentence_to_features(sentence, model, maxLen):
    x = []

    for ind, word in enumerate(sentence):
        word = str(word)

        prevWord = str(sentence[ind-1]) if ind > 0 else ''
        nextWord = str(sentence[ind+1]) if ind < len(sentence)-1 else ''

        wordArray = wordToVec(word, model)
        
        otherFeatures = np.array([
            word.isupper(), #is upper
            word.istitle(), #is title
            word.isdigit(), #is digit
            len(word), #length of word
            
        ])
        prevWordArray = wordToVec(prevWord,model)
        nextWordArray = wordToVec(nextWord, model)

        wordArray = np.concatenate((wordArray,otherFeatures))
        wordArray = np.concatenate((wordArray,prevWordArray))
        wordArray = np.concatenate((wordArray,nextWordArray))
        
        x.append(wordArray)

    wordLength = len(x[0])
    for i in range(len(x), maxLen):
        x.append(np.zeros(wordLength))

    return np.array(x)

In [None]:
def sentence_to_embeddings(sentence, model, max_len=10):
    embeddings = []
    for word in sentence:
        word = str(word)
        wordVec = wordToVec(word, model)
        otherFeatures = np.array([
            word.isupper(), #is upper
            word.istitle(), #is title
            word.isdigit(), #is digit
            1 if word == sentence[0] else 0, #is first in the sentence
            1 if word == sentence[-1] else 0, #is last in the sentence
            len(word), #length of word
        ])
        wordVec = np.concatenate((wordVec, otherFeatures))
        embeddings.append(wordVec)

    # Pad sequences to fixed length
    if len(embeddings) < max_len:
        padding = [0]*(len(embeddings[0])) # Add the other features as zeros
        embeddings += [padding] * (max_len - len(embeddings))
    return np.array(embeddings[:max_len])  # Truncate if needed (shouldn't happen though, but just in case something goes wrong)


In [76]:
maxLen = max([len(sentence) for sentence in sentences])
X = np.array([sentence_to_embeddings(sentence, model, maxLen) for sentence in sentences])
# X = [sentence_to_features(sentence, model) for sentence in sentences]
y = [tag for tag in tags]


In [77]:
y = [np.concatenate([y, ['O']*(maxLen - len(y))]) for y in y] # Pad the tags to make them all the same length
y = np.array([[0 if tag == 'O' else 1 for tag in tags] for tags in y]) # Convert the tags to binary labels

In [78]:
nonNE = 0 # not a named entity counter
NE = 0 # named entity counter
for tags in y:
    for tag in tags:
        if tag == 0:
            nonNE += 1
        else:
            NE += 1

print("# of Named entities: ", NE)
print("# of Non-Named entities: ", nonNE)
print("Named entities make up ", NE/(NE+nonNE), " of the dataset.")

# of Named entities:  160667
# of Non-Named entities:  4827069
Named entities make up  0.03221241060072145  of the dataset.


In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
dimensionSize = len(X_train[0][0])

(9592, 104, 105)

## RNN

In [81]:
def weighted_binary_crossentropy(zero_weight, one_weight): #Needed since most of the test data is 0s, so need to focus on getting the proper noun right
    def loss(y_true, y_pred):
        weights = y_true * zero_weight + (1 - y_true) * one_weight
        return K.mean(weights * K.binary_crossentropy(y_true, y_pred))
    return loss

In [None]:
# Define model
rnnModel = Sequential([
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2), input_shape=(maxLen, dimensionSize)),
    Bidirectional(LSTM(32, return_sequences=True, dropout=0.2)),
    TimeDistributed(Dense(1, activation="sigmoid")) # Predict each word separately
])

# Compile the model
rnnModel.compile(optimizer="adam", loss=weighted_binary_crossentropy(0.03,0.97), metrics=["accuracy"])

In [None]:
rnnModel.fit(X_train, y_train, epochs=5, batch_size=16)

Epoch 1/5
[1m 257/2398[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m6:24[0m 180ms/step - accuracy: 0.9619 - loss: 0.0662

KeyboardInterrupt: 

In [113]:
predictions = rnnModel.predict(X_test)
predictions = [[1 if pred > 0.5 else 0 for pred in preds] for preds in predictions]

[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step


In [115]:
truePositive = 0
falsePositive = 0
trueNegative = 0
falseNegative = 0

for pred, actual, xInput in zip(predictions, y_test, X_test):
    for predWord, actualWord, inputWord in zip(pred, actual, xInput):
        if inputWord.sum() == 0:#This is so when the input word is a 0 vector, we know that it is a padding word and we can go to the next sentence
            break
        # Getting the confusion matrix
        if predWord == 1:
            if actualWord == 1:
                truePositive += 1
            else:
                falsePositive += 1
        else:
            if actualWord == 1:
                falseNegative += 1
            else:
                trueNegative += 1

print("True Positive: ", truePositive)
print("False Positive: ", falsePositive)
print("True Negative: ", trueNegative)
print("False Negative: ", falseNegative)
print("Predicted Positive Total: ", truePositive + falsePositive)
print("Predicted Negative Total: ", trueNegative + falseNegative)
print("Actual Positive Total: ", truePositive + falseNegative)
print("Actual Negative Total: ", falsePositive + trueNegative)
print("Total: ", truePositive + falsePositive + trueNegative + falseNegative)

True Positive:  24297
False Positive:  148
True Negative:  176729
False Negative:  7761
Predicted Positive Total:  24445
Predicted Negative Total:  184490
Actual Positive Total:  32058
Actual Negative Total:  176877
Total:  208935


In [116]:
precision = truePositive/(truePositive+falsePositive)
recall = truePositive/(truePositive+falseNegative)
f1 = 2*(precision*recall)/(precision+recall)
accuracy = (truePositive+trueNegative)/(truePositive+trueNegative+falsePositive+falseNegative)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Accuracy: {accuracy}')

Precision: 0.993945592145633
Recall: 0.7579075425790754
F1 Score: 0.8600251314089518
Accuracy: 0.962146121999665
