<a href="https://colab.research.google.com/github/SaatvikP/POS_Tagging_using_Word_Embedding/blob/main/POS_Tagging_BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
from datasets import load_dataset
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder

In [7]:
from datasets import load_dataset

dataset = load_dataset("batterydata/pos_tagging")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/587 [00:00<?, ?B/s]

train.json:   0%|          | 0.00/5.05M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/601k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13054 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1451 [00:00<?, ? examples/s]

In [8]:
sentences_train = dataset['train']['words']
labels_train = dataset['train']['labels']

sentences_test = dataset['test']['words']
labels_test = dataset['test']['labels']

In [13]:
words = list(set(word for sentence in sentences_train for word in sentence))
tags = list(set(tag for label in labels_train for tag in label))


In [15]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx['PAD'] = 0
word2idx['UNK'] = 1

tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx['PAD'] = 0

idx2tag = {i: w for w, i in tag2idx.items()}

In [17]:
X_train = [[word2idx.get(w, word2idx['UNK']) for w in s] for s in sentences_train]
y_train = [[tag2idx[t] for t in l] for l in labels_train]

X_test = [[word2idx.get(w, word2idx['UNK']) for w in s] for s in sentences_test]
y_test = [[tag2idx[t] for t in l] for l in labels_test]


In [19]:
MAX_LEN = 50

X_train = pad_sequences(X_train, maxlen=MAX_LEN, padding='post')
y_train = pad_sequences(y_train, maxlen=MAX_LEN, padding='post')

X_test = pad_sequences(X_test, maxlen=MAX_LEN, padding='post')
y_test = pad_sequences(y_test, maxlen=MAX_LEN, padding='post')

In [21]:
import gensim.downloader

In [22]:
glove_vectors = gensim.downloader.load("glove-wiki-gigaword-100")

embedding_matrix = np.random.normal(size=(len(word2idx), 100))
embedding_matrix[word2idx['PAD']] = np.zeros((100,))

for word, i in word2idx.items():
    if word in glove_vectors:
        embedding_matrix[i] = glove_vectors[word]



In [23]:
y_train = np.array([to_categorical(seq, num_classes=len(tag2idx)) for seq in y_train])
y_test = np.array([to_categorical(seq, num_classes=len(tag2idx)) for seq in y_test])


In [24]:
model = Sequential([
    Embedding(input_dim=len(word2idx), output_dim=100, weights=[embedding_matrix], input_length=MAX_LEN, trainable=False),
    Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(len(tag2idx), activation="softmax"))
])



In [25]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])


In [26]:
history = model.fit(X_train, y_train, batch_size=32, epochs=3, validation_split=0.1)


Epoch 1/3
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 162ms/step - accuracy: 0.7059 - loss: 1.5099 - val_accuracy: 0.9064 - val_loss: 0.3926
Epoch 2/3
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 144ms/step - accuracy: 0.9325 - loss: 0.2735 - val_accuracy: 0.9362 - val_loss: 0.2575
Epoch 3/3
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 152ms/step - accuracy: 0.9530 - loss: 0.1820 - val_accuracy: 0.9480 - val_loss: 0.2077


In [27]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9439 - loss: 0.2332
Test Accuracy: 94.78%
