### THE MODEL WHICH WILL TAKE OUT THE IMPORTANT INFORMATION LIKE ANY PERSONS NAME OR LOCATION OR ANY TIME GIVEN IN THE INPUT
Give the most important information gained from the news article like, location, time, and persons name if any or multiple persons name.
# Popularly known as NER model.

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

In [4]:
df = pd.read_csv("ner_dataset.tsv", sep="\t", header=None, names=["text", "label"], keep_default_na=False)
df

Unnamed: 0,text,label
0,Amber,B-PERSON
1,Walker,B-PERSON
2,confirmed,O
3,a,O
4,suspicious,O
...,...,...
13343,using,O
13344,chemical,O
13345,gas,O
13346,at,O


In [8]:
import string
sentences = []
labels = []
sentence = []
label_seq = []
punctuations = [".", "!", "?"]

for i, row in df.iterrows():
    token = row["text"]
    tag = row["label"]

    sentence.append(token)
    label_seq.append(tag)

    if any(p in token for p in punctuations):
        sentences.append(sentence)
        labels.append(label_seq)
        sentence = []
        label_seq = []
print(df.head(20))  # See how tokens look
print([t for t in df["text"].unique() if "." in t])  # See tokens containing .
print([t for t in df["label"]])  # See tokens containing .


               text       label
0             Amber    B-PERSON
1            Walker    B-PERSON
2         confirmed           O
3                 a           O
4        suspicious           O
5           package           O
6               was           O
7             found           O
8              near           O
9   Christopherside  B-LOCATION
10           around           O
11           07:45.      B-TIME
12        midnight:      B-TIME
13     Surveillance           O
14         detected           O
15           Joseph    B-PERSON
16          Elliott    B-PERSON
17     transporting           O
18              IED    B-WEAPON
19          through           O
['07:45.', 'Kimmouth.', 'IED.', 'evening.', 'grenade.', 'drone.', '20:15.', 'Eric.', '03:00.', 'AK-47.', '13:30.', 'Dr.', 'midnight.', 'Angela.', 'Mrs.', 'Amyton.', 'Robert.', 'RPG.', 'Washingtonbury.', 'rifle.', 'Nancyhaven.', 'Thomas.', 'Lydiamouth.', 'Danieltown.', 'Ericburgh.', 'Stephanieland.', 'Benjamin.', 'Greenetown.',

In [9]:
words = list(set([w for s in sentences for w in s]))
tags = list(set([t for ts in labels for t in ts]))

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"] = 0
word2idx["OOV"] = 1

tag2idx = {t: i for i, t in enumerate(tags)}

In [10]:
X = [[word2idx.get(w, word2idx["OOV"]) for w in s] for s in sentences]
y = [[tag2idx[t] for t in ts] for ts in labels]

MAX_LEN = max(len(s) for s in sentences)
X = pad_sequences(X, maxlen=MAX_LEN, padding="post")
y = pad_sequences(y, maxlen=MAX_LEN, padding="post")

y = np.expand_dims(y, -1)  # Needed for sparse_categorical_crossentropy


In [11]:
input_word = Input(shape=(None,))
model = Embedding(input_dim=len(word2idx)+2, output_dim=64)(input_word)
model = Bidirectional(LSTM(units=64, return_sequences=True))(model)
out = TimeDistributed(Dense(len(tag2idx), activation="softmax"))(model)

model = Model(input_word, out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()


In [12]:
model.fit(X, y, batch_size=32, epochs=5, verbose=1)

Epoch 1/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.7492 - loss: 1.0126
Epoch 2/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7739 - loss: 0.6314
Epoch 3/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8424 - loss: 0.4410
Epoch 4/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9405 - loss: 0.2340
Epoch 5/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9708 - loss: 0.1135


<keras.src.callbacks.history.History at 0x28de57be090>

In [None]:
# -------- Prediction After Training --------
idx2tag = {i: t for t, i in tag2idx.items()}

def ner_predict(sentence):
    words = sentence.split()
    seq = [word2idx.get(w, word2idx["OOV"]) for w in words]
    padded = pad_sequences([seq], maxlen=MAX_LEN, padding="post")
    
    pred = model.predict(padded)
    pred_tags = [idx2tag[np.argmax(p)] for p in pred[0][:len(words)]]
    
    return list(zip(words, pred_tags))

# -------- Test Example --------
test_sentence = ""
result = ner_predict(test_sentence)

print("\nNER Prediction:")
for word, tag in result:
    print(f"{word}: {tag}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step

NER Prediction:
Pranay: B-PERSON
Shit: B-PERSON
is: B-PERSON
very: B-PERSON
good: B-PERSON
person: B-PERSON


In [None]:
with open("models/ner_model.json", "w") as json_file:
    json_file.write(model.to_json())

model.save_weights("models/ner_model_weights.h5")

with open("models/ner_word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)

with open("models/ner_tag2idx.pkl", "wb") as f:
    pickle.dump(tag2idx, f)

print("✅ NER Model Trained & Saved")