# Named Entity Recognition (NER)

# Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import ast

# Load data

In [None]:
def loading_data(data_path):
    """Load the corpus from a CSV file and drop incomplete rows.

    Args:
        data_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV file.

    Returns:
        DataFrame without rows that contain missing values, re-indexed
        ``0..n-1``.
    """
    data = pd.read_csv(data_path)

    data.dropna(inplace=True)
    # dropna leaves gaps in the index; renumber so that label-based lookups
    # such as data['Sentence'][3] line up with row positions.
    data.reset_index(drop=True, inplace=True)
    print("Number of rows : ",data.shape[0]," and the number of columns : ",data.shape[1])

    return data

In [None]:
# Read the corpus from the Kaggle input directory
data = loading_data("/kaggle/input/named-entity-recognition-ner-corpus/ner.csv")

# Preview the first rows
data.head()

In [None]:
# Inspect the POS entry stored under index label 0
data['POS'][0]

# Data preprocessing

In [None]:
def preprocess_data(data):
    """Turn the 'POS' and 'Tag' columns into real Python lists of strings.

    Each cell is expected to hold the string representation of a list
    (e.g. ``"['NNS', 'IN']"``). Cells that are already lists are accepted
    as well, so running this twice on the same frame is harmless.
    Tags are upper-cased.

    Args:
        data: DataFrame with 'POS' and 'Tag' columns; modified in place.

    Returns:
        The same DataFrame, for call chaining.
    """
    def _parse(cell):
        # Only strings need parsing; anything else is taken as already parsed.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    # Whole-column assignment instead of data[col][i] = ...: chained
    # assignment may write to a temporary copy, and indexing by
    # range(len(data)) breaks when the index is not 0..n-1.
    data['POS'] = data['POS'].apply(lambda cell: [str(pos) for pos in _parse(cell)])
    data['Tag'] = data['Tag'].apply(lambda cell: [str(tag).upper() for tag in _parse(cell)])
    return data

In [None]:
# Convert the POS / Tag columns with preprocess_data and preview the result
data = preprocess_data(data)
data.head()

In [None]:
import re

def lower_text(text: str) -> str:
    """Return *text* converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text: str) -> str:
    """Delete every character that is not an ASCII letter, digit or whitespace.

    The characters are removed outright, not replaced by a space, so
    "hello!nice to meet you" becomes "hellonice to meet you" and
    "u.s." becomes "us".
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    text_nopunct = re.sub(r'[^A-Za-z0-9\s]', '', text)
    return text_nopunct

def remove_multiple_spaces(text: str) -> str:
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space."""
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    text_no_doublespace = re.sub(r'\s+', ' ', text)
    return text_no_doublespace

In [None]:
# Walk one sentence through the cleaning steps and show every intermediate
# result, separated by a line of dashes.
sample_text = data['Sentence'][3]

_lowered = lower_text(sample_text)
_without_punct = remove_punctuation(_lowered)
_single_spaced = remove_multiple_spaces(_without_punct)

print(
    sample_text,
    _lowered,
    _without_punct,
    _single_spaced,
    sep='\n' + '-'*10 + '\n',
)

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk

# English stop-word list from NLTK, stored as a set for fast membership tests
stopWords = set(stopwords.words('english'))

def tokenize_text(text: str) -> list[str]:
    """Split *text* into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stop_words(tokenized_text: list[str]) -> list[str]:
    """Return the tokens that are not contained in ``stopWords``, order kept."""
    kept = []
    for token in tokenized_text:
        if token in stopWords:
            continue
        kept.append(token)
    return kept

def stem_words(tokenized_text: list[str]) -> list[str]:
    """Lemmatize every token with WordNet.

    Despite the name, no stemming is performed: each token is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, tokenized_text))


In [None]:
def preprocessing_stage(text):
    """Clean one sentence and return it as a single space-joined string.

    Steps, in order: lower-case, strip punctuation, collapse whitespace,
    tokenize, lemmatize. Stop words are kept (remove_stop_words is not
    applied).
    """
    cleaned = remove_multiple_spaces(remove_punctuation(lower_text(text)))
    lemmas = stem_words(tokenize_text(cleaned))
    return ' '.join(lemmas)

def clean_text_inplace(df):
    """Run preprocessing_stage over the 'Sentence' column, overwriting it.

    The frame passed in is modified and also returned.
    """
    cleaned_sentences = df['Sentence'].apply(preprocessing_stage)
    df['Sentence'] = cleaned_sentences
    return df

In [None]:
# Download the WordNet resources for the lemmatizer used in stem_words
nltk.download("wordnet")
nltk.download("omw-1.4")
# IPython shell escape: unpack the WordNet archive next to the zip file
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


# Clean the 'Sentence' column and preview the result
data = clean_text_inplace(data)
data.head()

In [None]:
# Keep only the model input (Sentence) and target (Tag) columns
df_final = data[['Sentence','Tag']]

# Hold out 20% of the rows for testing; fixed seed for reproducibility
df_train, df_test = train_test_split(df_final,test_size=0.2,random_state=42)
len(df_train), len(df_test)

# Import model libraries and build the RNN model

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping

In [None]:
# Tag lists for each split
# NOTE(review): the sentences were cleaned (punctuation removed) while Tag
# still carries the original tags, so tokens and tags may no longer line up
# one-to-one — confirm.
train_targets = list(df_train.Tag.values)
test_targets = list(df_test.Tag.values)

# Word vocabulary fitted on the training sentences only; words not seen
# during fitting are encoded with the "UNK" token
tokenizer = Tokenizer(lower=True,oov_token="UNK")
tokenizer.fit_on_texts(df_train['Sentence'])

# Encode both splits as sequences of word indices
train_inputs = tokenizer.texts_to_sequences(df_train['Sentence'])
test_inputs = tokenizer.texts_to_sequences(df_test['Sentence'])

In [None]:
# Word -> index mapping learned by the tokenizer
word2idx = tokenizer.word_index
V = len(word2idx) # Vocab size
print("Found %s unique tokens "%V)

In [None]:
# Collect the distinct tags that occur in each split
train_tags = {tag for tag_seq in train_targets for tag in tag_seq}
test_tags = {tag for tag_seq in test_targets for tag in tag_seq}

print("Unique NER tags in train set: ",train_tags)
print("Unique NER tags in test set: ",test_tags)

In [None]:
# Separate tokenizer that maps tag names to integer ids, fitted on the
# training tags only
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(train_targets)
# Encode the tag sequences of both splits as integer ids
train_tgt_int = tag_tokenizer.texts_to_sequences(train_targets)
test_tgt_int = tag_tokenizer.texts_to_sequences(test_targets)

In [None]:
# Max length
# Longest encoded sentence across both splits (the test split is included)
max_length_train = max(len(sent) for sent in train_inputs)
max_length_test = max(len(sent) for sent in test_inputs)
max_length = max(max_length_train,max_length_test)

# Pad input sequences
# padding="post" appends the padding after the real tokens
train_inputs_final = pad_sequences(train_inputs, maxlen=max_length, padding="post")
print("Shape of train inputs: ",train_inputs_final.shape)

test_inputs_final = pad_sequences(test_inputs, maxlen=max_length, padding="post")
print("Shape of test inputs: ",test_inputs_final.shape)

# Targets are padded to the same length as the inputs
train_targets_final = pad_sequences(train_tgt_int, maxlen=max_length, padding="post")
print("Shape of train targets: ",train_targets_final.shape)

test_targets_final = pad_sequences(test_tgt_int, maxlen=max_length, padding="post")
print("Shape of test targets: ",test_targets_final.shape)

In [None]:
# Number of classes

# One class per known tag plus one extra slot for id 0, the value that
# pad_sequences fills in by default
K = len(tag_tokenizer.word_index)  +1
K

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dropout, LSTM, TimeDistributed, Dense, Bidirectional
from tensorflow.keras.models import Model

# Create a MirroredStrategy for multi-GPU support
strategy = tf.distribute.MirroredStrategy()

# Define the model inside the strategy scope
with strategy.scope():
    # Embedding dimension
    vector_size = 128

    i = Input(shape=(max_length,))
    # V+1 rows so that index 0 (padding) has a slot; mask_zero marks it as padding
    x = Embedding(input_dim=V+1, output_dim=vector_size, mask_zero=True)(i)
    x = Dropout(0.2)(x)
    # Two stacked bidirectional LSTMs that emit one output per time step
    x = Bidirectional(LSTM(256, return_sequences=True, recurrent_dropout=0.2))(x)
    x = Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.2))(x)
    # Per-token softmax over the K tag classes
    x = TimeDistributed(Dense(K, activation='softmax'))(x)

    model = Model(i, x)
    model.summary()

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.utils.class_weight import compute_class_weight

# Set the visible GPUs
# NOTE(review): the MirroredStrategy was created earlier, so the GPUs are
# presumably already initialised and this setting may have no effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Change this to the GPU IDs you want to use

# Limit GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Best effort: report the error and carry on with the default allocation
        print(e)

# Create data pipelines
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs_final, train_targets_final))
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs_final, test_targets_final))

# Define callbacks
# Stop when the validation loss has not improved for 3 consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
# Exponential decay: start at 1e-3 and multiply by 0.9 every epoch
lr_scheduler = LearningRateScheduler(lambda epoch: 0.001 * 0.9 ** epoch)

# Compile the model inside the strategy scope
with strategy.scope():
    # The last layer already applies softmax, so the loss receives
    # probabilities, not logits: from_logits must be False.
    model.compile(optimizer="adam",
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                  metrics=["accuracy"])

# Fit the model
model.fit(train_dataset.batch(32),  # Adjust the batch size based on your GPU memory
          epochs=5,
          validation_data=test_dataset.batch(32),
          callbacks=[early_stopping, lr_scheduler])

# Save the model
model.save('ner_model.h5')


In [None]:
import pickle

# saving
# Persist the fitted word tokenizer so the same vocabulary can be reused later
# NOTE(review): tag_tokenizer is not saved here; decoding predictions in a
# fresh session would presumably need it as well — confirm.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Example sentence for a quick manual check of the trained model
sentence = "Polish Prime Minister Jaroslaw Kaczynski has voiced support for the deployment of 10 U.S. missile interceptors in Poland and guidance technology in the Czech Republic ."
# Apply the same cleaning that was applied to the training sentences
sentence = preprocessing_stage(sentence)
# Encode, pad to max_length and predict per-position class scores
predictions = model.predict(pad_sequences(tokenizer.texts_to_sequences([sentence]),
                                          maxlen=max_length,
                                         padding="post"))
predictions

In [None]:
# Pick the highest-scoring class id along the last axis
prediction_ner = np.argmax(predictions,axis=-1)
prediction_ner

In [None]:
# Map class ids back to tag names. Id 0 is the padding class and has no entry
# in index_word, so fall back to "PAD" instead of raising KeyError.
NER_tags = [tag_tokenizer.index_word.get(int(num), "PAD") for num in prediction_ner.flatten()]
# Show only the positions that belong to real tokens of the sentence
NER_tags[:len(tokenizer.texts_to_sequences([sentence])[0])], sentence