In [1]:
import pandas as pd
import os
import gensim.downloader
from gensim.models import Word2Vec, KeyedVectors
import tensorflow as tf
import numpy as np

from keras import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import InputLayer
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras import regularizers
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

#### Train word2vec model

In [2]:
def getSentences(path):
    file_path = path
    sentences = []
    current_sentence = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():
                    word = line.split()[0]
                    current_sentence.append(word)
                else:
                    if current_sentence:
                        sentences.append(current_sentence)
                    current_sentence = []
            if current_sentence:
                sentences.append(current_sentence)
            return sentences
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

def getTags(path):
    file_path = path
    tags = []
    current_sentence = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():
                    tag = line.split()[3]
                    current_sentence.append(tag)
                else:
                    if current_sentence:
                        tags.append(current_sentence)
                    current_sentence = []
            if current_sentence:
                tags.append(current_sentence)
            return tags
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [3]:
train_sentences = getSentences('./data/eng.train')
train_tags = getTags('./data/eng.train')

In [4]:
train_w2v = Word2Vec(train_sentences, vector_size=100, window=5, min_count=1, workers=4)

#### Load w2v Model

In [5]:
w2v = gensim.downloader.load("word2vec-google-news-300")

#### Preprocess Train, Development and Test Data

In [6]:
train_path = './data/eng.train'
development_path = './data/eng.testa'
test_path = './data/eng.testb'

In [7]:
def get_data(path):
    try:
        with open(path, 'r') as file:
            data = file.readlines()
        file.close()
    except Exception as e:
        data = None
        print(e)
    
    return data

In [8]:
train_data = get_data(train_path)
development_data = get_data(development_path)
test_data = get_data(test_path)

In [9]:
def extract_sentences(data):
    split_data = [line.split(' ') for line in data] if data != None else []
    sentences = []
    current_sentence = []
    for line in split_data:
        if line == ['\n']:
            sentences.append(current_sentence)
            current_sentence = []
        else:
            word = line[0]
            tag = line[-1].replace('\n', '')
            current_sentence.append([word, tag])
    sentences.append(current_sentence)
    return sentences

def extract_words_and_tags(data):
    words = []
    tags = []
    for line in data:
        parts = line.split()
        if parts:
            word = parts[0]
            tag = parts[-1]
            words.append(word)
            tags.append(tag)
    return words, tags

In [10]:
train_sentences = extract_sentences(train_data)
train_words, train_tags = extract_words_and_tags(train_data)
development_sentences = extract_sentences(development_data)
development_words, development_tags = extract_words_and_tags(development_data)
test_sentences = extract_sentences(test_data)
test_words, test_tags = extract_words_and_tags(test_data)

In [11]:
train_voc = np.unique(np.array(train_words))
dev_voc = np.unique(np.array(development_words))
tag_set = np.unique(np.array(train_tags))

In [12]:
tag_set

array(['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER',
       'O'], dtype='<U6')

In [13]:
print("Number of sentences (training):", len(train_sentences))
print("Number of sentences (dev):", len(development_sentences))
print("Number of sentences (test):", len(test_sentences))

Number of sentences (training): 14987
Number of sentences (dev): 3466
Number of sentences (test): 3684


In [14]:
print("Tag set (BIO):", tag_set)

Tag set (BIO): ['B-LOC' 'B-MISC' 'B-ORG' 'I-LOC' 'I-MISC' 'I-ORG' 'I-PER' 'O']


In [15]:
# train_pretrained_weights = train_w2v.wv.vectors
# train_num_tokens, train_embedding_dim = train_pretrained_weights.shape

# word2idx = train_w2v.wv.key_to_index
# word2idx['<UNK>'] = word2idx[list(word2idx.keys())[-1]]+1
# word2idx['<PAD>'] = word2idx[list(word2idx.keys())[-1]]+1

# voc = train_voc
# voc = np.append(voc,'<UNK>')
# voc = np.append(voc,'<PAD>')

# tag2idx = {k: v for v, k in enumerate(tag_set)}

In [16]:
train_pretrained_weights = w2v.vectors
train_num_tokens, train_embedding_dim = train_pretrained_weights.shape

word2idx = w2v.key_to_index
word2idx['<UNK>'] = word2idx[list(word2idx.keys())[-1]]+1
word2idx['<PAD>'] = word2idx[list(word2idx.keys())[-1]]+1
voc = word2idx.keys()
# voc = np.append(voc,'<UNK>')
# voc = np.append(voc,'<PAD>')

tag2idx = {k: v for v, k in enumerate(tag_set)}


In [17]:
word2idx['<UNK>']

3000000

In [18]:
num_tokens = len(voc) + 2
embedding_dim = 50
print(num_tokens)
for key in word2idx:
    if(word2idx[key] == 23629):
        print(key)

3000004
Tressel


In [19]:
def get_x_embeddings(sentences):
  sequence = []
  sent_seq = []
  for s in sentences:
    for w in s:
      if w[0] in word2idx.keys():
        sent_seq.append(word2idx[w[0]])
      else:
        sent_seq.append(word2idx['<UNK>'])
    sequence.append(sent_seq)
    sent_seq = []
  return sequence

In [20]:
x_train = get_x_embeddings(train_sentences)
y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
x_dev = get_x_embeddings(development_sentences)
y_dev = [[tag2idx[w[1]] for w in s] for s in development_sentences]
x_test = get_x_embeddings(test_sentences)
y_test = [[tag2idx[w[1]] for w in s] for s in test_sentences]

x_train = pad_sequences(maxlen=embedding_dim, sequences=x_train, padding="post")
y_train = pad_sequences(maxlen=embedding_dim, sequences=y_train, padding="post", value=tag2idx['O'])
x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post")
y_dev = pad_sequences(maxlen=embedding_dim, sequences=y_dev, padding="post", value=tag2idx['O'])
x_test = pad_sequences(maxlen=embedding_dim, sequences=x_test, padding="post")
y_test = pad_sequences(maxlen=embedding_dim, sequences=y_test, padding="post", value=tag2idx['O'])

y_train = to_categorical(y_train)
y_dev = to_categorical(y_dev)
y_test = to_categorical(y_test)

In [27]:
x_train.shape
#y_train.shape

(14987, 50)

In [69]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential
import numpy as np
from tensorflow.keras.optimizers import Adam,SGD,Adagrad

num_epochs = 1000
batch_size = 50
num_classes = len(tag_set)
sequence_length = 50
output_shape=(sequence_length,num_classes)

model = Sequential([
    layers.Embedding(input_dim=num_tokens, output_dim=embedding_dim, input_length=sequence_length,trainable=False),
    layers.LSTM(sequence_length),
    layers.Dense(sequence_length*num_classes, activation='softmax'),
    layers.Reshape(output_shape)
    
])
desired_learning_rate = 0.001

# Create an Adam optimizer instance with the desired learning rate
optimizer = Adagrad(learning_rate=desired_learning_rate)

# Compile the model with the customized optimizer
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=["accuracy"])
early_stopping = EarlyStopping(patience=10)
# Training (replace with your data)
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, shuffle=True, validation_data=(x_dev, y_dev),
         callbacks = early_stopping, workers = 4)




Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000

KeyboardInterrupt: 

In [70]:
num_classes = len(tag_set)
sequence_length = 50
output_shape=(sequence_length,num_classes)

model = keras.Sequential()
model.add(InputLayer(embedding_dim))
model.add(Embedding(input_dim=num_tokens, output_dim=embedding_dim, input_length = sequence_length, trainable=False,))
model.add(LSTM(units=sequence_length))
model.add(Dense(units=256, activation= "relu", kernel_regularizer=regularizers.L1L2(l1=0.025, l2=0.025)))
model.add(Dropout(0.01))  
model.add(Dense(sequence_length * num_classes, activation='softmax'))
model.add(tf.keras.layers.Reshape(output_shape))

model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])
early_stopping = EarlyStopping(patience=10)
model.summary()

Model: "sequential_36"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_36 (Embedding)    (None, 50, 50)            150000200 
                                                                 
 lstm_36 (LSTM)              (None, 50)                20200     
                                                                 
 dense_41 (Dense)            (None, 256)               13056     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_42 (Dense)            (None, 400)               102800    
                                                                 
 reshape_25 (Reshape)        (None, 50, 8)             0         
                                                                 
Total params: 150136256 (572.72 MB)
Trainable params:

In [71]:
num_epochs = 1000
batch_size = 50

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, shuffle=True, validation_data=(x_dev, y_dev),
         callbacks = early_stopping, workers = 4)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000


<keras.src.callbacks.History at 0x3cc885ed0>

In [64]:
predicted_labels = model.predict(x_test)
print(predicted_labels)

[[[2.97168413e-07 3.58342419e-07 3.76703980e-07 ... 1.46952493e-03
   7.96799723e-04 5.54952258e-03]
  [3.45029071e-06 3.99986584e-06 3.75066463e-07 ... 6.44487329e-04
   9.45195381e-04 5.72303310e-03]
  [1.25970166e-06 4.55194993e-07 4.31053820e-07 ... 6.19496626e-04
   9.55473224e-04 8.07509571e-03]
  ...
  [1.05844390e-06 1.24763585e-06 1.22487268e-06 ... 1.25415236e-05
   1.83292595e-05 4.24492024e-02]
  [1.22808456e-06 1.54944473e-06 1.19046615e-06 ... 7.87406680e-06
   1.42421250e-05 4.76623774e-02]
  [1.56278747e-06 1.43113516e-06 1.67542589e-06 ... 1.58047806e-06
   1.72089688e-06 1.06036514e-01]]

 [[2.97168413e-07 3.58342419e-07 3.76703980e-07 ... 1.46952493e-03
   7.96799723e-04 5.54952258e-03]
  [3.45029071e-06 3.99986584e-06 3.75066463e-07 ... 6.44487329e-04
   9.45195381e-04 5.72303310e-03]
  [1.25970166e-06 4.55194993e-07 4.31053820e-07 ... 6.19496626e-04
   9.55473224e-04 8.07509571e-03]
  ...
  [1.05844390e-06 1.24763585e-06 1.22487268e-06 ... 1.25415236e-05
   1.83292

In [65]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.23075248301029205, Test Accuracy: 0.9565255045890808
