<a href="https://colab.research.google.com/github/SidElias/IDP-G11/blob/main/NLP_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import urllib.request

# Download and preprocess the dataset
url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-GUM/master/en_gum-ud-train.conllu"
file_path = "en_gum-ud-train.conllu"
urllib.request.urlretrieve(url, file_path)

# Parse the dataset
sentences = []
pos_tags = []
sentence = []
tag = []

with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

for line in lines:
    if line.startswith("#"):
        continue
    if line.strip() == "":
        if sentence:
            sentences.append(sentence)
            pos_tags.append(tag)
            sentence = []
            tag = []
    else:
        parts = line.split("\t")
        if len(parts) > 3:
            sentence.append(parts[1])
            tag.append(parts[3])

if sentence:
    sentences.append(sentence)
    pos_tags.append(tag)

# Create vocabularies for words and tags
vocab_size = 10000
embedding_dim = 16
max_length = 20
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

word_tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
word_tokenizer.fit_on_texts(sentences)

tag_tokenizer = Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(pos_tags)

# Convert words and tags to sequences
word_index = word_tokenizer.word_index
tag_index = tag_tokenizer.word_index

X = word_tokenizer.texts_to_sequences(sentences)
y = tag_tokenizer.texts_to_sequences(pos_tags)

# Pad sequences
X = pad_sequences(X, maxlen=max_length, padding=padding_type, truncating=trunc_type)
y = pad_sequences(y, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert tags to categorical
num_tags = len(tag_index) + 1
y = [to_categorical(i, num_classes=num_tags) for i in y]
y = np.array(y)

# Split the data into training and testing sets
training_size = int(len(X) * 0.8)
X_train, X_test = X[:training_size], X[training_size:]
y_train, y_test = y[:training_size], y[training_size:]

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_tags, activation='softmax'))
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.879400372505188


In [2]:
# Predict POS tags for the input sentence
print("Enter text.")
text = input()
sentences = [text.split()]
sequences = word_tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


predictions = model.predict(padded)
predicted_tags = np.argmax(predictions, axis=-1)

# Convert indices back to words and tags
words = [word for word in text.split()]
tags = [tag_tokenizer.index_word[idx] for idx in predicted_tags[0] if idx != 0]

print("Words:", words)
print("Predicted POS tags:", tags)

Enter text.
Hi! how are you? I hope you are doing well.
Words: ['Hi!', 'how', 'are', 'you?', 'I', 'hope', 'you', 'are', 'doing', 'well.']
Predicted POS tags: ['ADJ', 'ADV', 'AUX', 'VERB', 'PRON', 'VERB', 'PRON', 'AUX', 'VERB', 'ADJ', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN']
