Simple RNN application on sample data



In [2]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, TimeDistributed
from sklearn.preprocessing import LabelEncoder

# Sample Data
sentences = [
    "Barack Obama was born in Hawaii",
    "Google is based in Mountain View"
]

labels = [
    ["PERSON", "PERSON", "0", "0", "0", "LOCATION"],
    ["ORGANIZATION", "0", "0", "0", "LOCATION", "LOCATION"]
]

# Preprocessing
# Adding oov_token so unknown words don't disappear
tokenizer = Tokenizer(lower=True, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
X = tokenizer.texts_to_sequences(sentences)

# Padding
X = pad_sequences(X, padding='post')

# Encode the labels
label_encoder = LabelEncoder()
# We need to make sure '0', 'PERSON', 'LOCATION', 'ORGANIZATION' are in the encoder
label_encoder.fit(["0", "PERSON", "LOCATION", "ORGANIZATION"])

y = [label_encoder.transform(label) for label in labels]
y = pad_sequences(y, padding="post", maxlen=X.shape[1])
y = np.expand_dims(y, -1)

# Build the RNN Model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50, input_length=X.shape[1]))
model.add(SimpleRNN(units=50, return_sequences=True))
model.add(Dropout(0.1))
model.add(TimeDistributed(Dense(len(label_encoder.classes_), activation='softmax')))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(np.array(X), np.array(y), epochs=100, batch_size=2, verbose=0)
print("Training complete")

# Test the Model
test_sentence = ["Barack Obama went to Hawaii"]

# Because we used oov_token, 'went' and 'to' will be kept as <OOV> instead of deleted
test_sequence = tokenizer.texts_to_sequences(test_sentence)
test_sequence = pad_sequences(test_sequence, padding='post', maxlen=X.shape[1])

predictions = model.predict(test_sequence)
decoded_predictions = label_encoder.inverse_transform(np.argmax(predictions, axis=-1)[0])

print("\nPrediction Results:")
# We limit the loop to the length of the actual sentence to avoid printing padding labels
words = test_sentence[0].split()
for i in range(len(words)):
    print(f"Word: {words[i]:<10} Predicted Label: {decoded_predictions[i]}")



Training complete
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 757ms/step

Prediction Results:
Word: Barack     Predicted Label: PERSON
Word: Obama      Predicted Label: PERSON
Word: went       Predicted Label: 0
Word: to         Predicted Label: 0
Word: Hawaii     Predicted Label: 0


Using NERdataset.csv from kaggle & changing the units to 100, epocs to 50 & learning rates to 0.01 for better results

In [3]:
import pandas as pd
import numpy as np
import ast  # Library to handle stringified lists (e.g., "['O', 'B-geo']")
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, TimeDistributed
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the Dataset
try:
    data = pd.read_csv('/content/ner.csv', encoding='unicode_escape')
except FileNotFoundError:
    print("ERROR: File not found. Please upload 'ner.csv' to the /content/ folder.")
    raise

# The 'Tag' column is stored as a text string like "['O', 'B-geo']".
# We use ast.literal_eval to convert it back into a real Python list.
print("Processing tags")
data['Tag'] = data['Tag'].apply(ast.literal_eval)

# We use the first 5000 sentences to ensure it runs quickly in Colab
# (The full dataset might take too long for a simple tutorial test)
sampled_data = data[:5000]

sentences = sampled_data['Sentence'].astype(str).tolist()
labels = sampled_data['Tag'].tolist()

print(f"Loaded {len(sentences)} sentences.")
# Preprocessing

# A. Tokenize Sentences
tokenizer = Tokenizer(lower=True, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
X = tokenizer.texts_to_sequences(sentences)

# Padding X (Input)
# We set a max length based on the longest sentence in our sample
max_len = max([len(x) for x in X])
X = pad_sequences(X, padding='post', maxlen=max_len)

# B. Encode Labels
# Unlike the simple example, this dataset has many tags (B-geo, I-tim, etc.)
# We must find ALL unique tags in our dataset dynamically
label_encoder = LabelEncoder()
all_tags = [tag for sublist in labels for tag in sublist] # Flatten the list
unique_tags = list(set(all_tags))
label_encoder.fit(unique_tags)

print(f"Detected {len(unique_tags)} unique tags: {unique_tags[:10]}")

# Transform text labels to numbers
y = [label_encoder.transform(seq) for seq in labels]

# Padding y (Labels)
# Labels must be padded to the exact same length as X
y = pad_sequences(y, padding="post", maxlen=max_len, value=label_encoder.transform(["O"])[0])

# Reshape y for the model (samples, time_steps, 1)
y = np.expand_dims(y, -1)

# Split into Training and Validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the RNN Model
print("Building model")
model = Sequential()

# Input dim = vocab size + 1 (for padding)
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50, input_length=max_len))

# SimpleRNN Layer
model.add(SimpleRNN(units=50, return_sequences=True))

# Dropout to prevent overfitting
model.add(Dropout(0.1))

# Output Layer
# Units = number of unique tags
model.add(TimeDistributed(Dense(len(label_encoder.classes_), activation='softmax')))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the Model
print("Training model")
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32)
print("Training complete")

# Test with a New Sentence
test_sentences = [
    "Barack Obama went to nazeer ka ashiyana",
    "London is based in Paris"
]

print("\n Testing Results")
for sentence in test_sentences:
    # Tokenize and Pad
    seq = tokenizer.texts_to_sequences([sentence])
    seq_padded = pad_sequences(seq, padding='post', maxlen=max_len)

    # Predict
    prediction = model.predict(seq_padded)

    # Decode (Convert numbers back to Tag names)
    # argmax finds the highest probability class
    pred_indices = np.argmax(prediction, axis=-1)[0]
    decoded_labels = label_encoder.inverse_transform(pred_indices)

    print(f"\nSentence: {sentence}")
    print(f"{'Word':<15} {'Predicted Label'}")

    words = sentence.split()
    # We zip words with decoded_labels
    # the prediction includes padding 'O's at the end, so we stop when words run out
    for i, word in enumerate(words):
        print(f"{word:<15} {decoded_labels[i]}")

Processing tags
Loaded 5000 sentences.
Detected 17 unique tags: ['O', 'I-tim', 'I-eve', 'B-tim', 'B-art', 'B-per', 'I-gpe', 'I-nat', 'B-geo', 'B-eve']
Building model
Training model
Epoch 1/50




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 104ms/step - accuracy: 0.8828 - loss: 0.7570 - val_accuracy: 0.9479 - val_loss: 0.2669
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9485 - loss: 0.2546 - val_accuracy: 0.9508 - val_loss: 0.2289
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9516 - loss: 0.2145 - val_accuracy: 0.9533 - val_loss: 0.2084
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9545 - loss: 0.1926 - val_accuracy: 0.9544 - val_loss: 0.1940
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9571 - loss: 0.1739 - val_accuracy: 0.9560 - val_loss: 0.1845
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9616 - loss: 0.1517 - val_accuracy: 0.9554 - val_loss: 0.1794
Epoch 7/50
[1m125/125[0m [32m━━━