## Sequence Tagging: NER

In [19]:
import gensim
import gensim.downloader
from gensim.models import KeyedVectors

### 1.1 Word Embedding

In [20]:
# Download the pretrained word2vec embeddings and save the model
# Uncomment the lines below to download and save the pretrained model

google_vectors = gensim.downloader.load('word2vec-google-news-300')
google_vectors.save('./data/word2vec.model')


In [21]:
# Load the model
model = KeyedVectors.load('./data/word2vec.model')

In [22]:
# Vector for computer
model['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [23]:
# Cosine similarity
student = model.most_similar('student')[0]
Apple = model.most_similar('Apple')[0]
apple = model.most_similar('apple')[0]

print(f'The most similar word to student is {student[0]} with a cosine similarity of {student[1]}')
print(f'The most similar word to Apple is {Apple[0]} with a cosine similarity of {Apple[1]}')
print(f'The most similar word to apple is {apple[0]} with a cosine similarity of {apple[1]}')

The most similar word to student is students with a cosine similarity of 0.7294867038726807
The most similar word to Apple is Apple_AAPL with a cosine similarity of 0.7456986308097839
The most similar word to apple is apples with a cosine similarity of 0.720359742641449


### 1.2 Data

Question a

In [24]:
def getNoOfSentences(path,tags):
    file_path = path
    sentences=[]
    sentence=""
    try:
        with open(file_path, 'r') as file:
            # Read the entire file as a string
            #file_contents = file.read()
            #print(file_contents[0])
            # Alternatively, you can read the file line by line
            for line in file:
                words=line.split(' ')
                if(words[0]!='\n'):
                    sentence=sentence+words[0].lower()+' '
                    if('\n' in words[-1]):
                        words[-1]=words[-1].replace('\n','')
                    tags.add(words[len(words)-1])
                else:
                    sentences.append(sentence.lower())
                    sentence=""
            # for a in sentences:
            #     print(a)
        sentences.append(sentence.lower())
        return sentences
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [25]:
tags=set()
trainSentences=getNoOfSentences('data/eng.train',tags)
developmentSentences=getNoOfSentences('data/eng.testa',tags)
testSentences=getNoOfSentences('data/eng.testb',tags)

print("No of sentences in training dataset",len(trainSentences))
print("No of sentences in development dataset",len(developmentSentences))
print("No of sentences in test dataset",len(testSentences))

print(tags)

No of sentences in training dataset 14987
No of sentences in development dataset 3466
No of sentences in test dataset 3684
{'I-MISC', 'I-ORG', 'B-LOC', 'I-PER', 'B-MISC', 'O', 'I-LOC', 'B-ORG'}


Question b

In [26]:
def getWordAndTag(path):
    file_path = path
    sentences=[]
    sentence=""
    output={}
    lst=[]
    try:
        with open(file_path, 'r') as file:
            for line in file:
                words=line.split(' ')
                if(words[0]!='\n'):
                    #wordDict={}
                    sentence=sentence+words[0].lower()+' '
                    if('\n' in words[-1]):
                        words[-1]=words[-1].replace('\n','')
                    #wordDict[words[0]]=words[-1]
                    lst.append(words[-1])
                else:
                    sentences.append(sentence.lower())
                    output[sentence]=lst
                    sentence=""
                    lst=[]
        sentences.append(sentence.lower())
        return output
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [27]:
trainingDict=getWordAndTag('data/eng.train')
for key in trainingDict:
    print("Key : ",key," Value: ",trainingDict[key])


Key :  eu rejects german call to boycott british lamb .   Value:  ['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O']
Key :  peter blackburn   Value:  ['I-PER', 'I-PER']
Key :  brussels 1996-08-22   Value:  ['I-LOC', 'O']
Key :  the european commission said on thursday it disagreed with german advice to consumers to shun british lamb until scientists determine whether mad cow disease can be transmitted to sheep .   Value:  ['O', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Key :  germany 's representative to the european union 's veterinary committee werner zwingmann said on wednesday consumers should buy sheepmeat from countries other than britain until the scientific advice was clearer .   Value:  ['I-LOC', 'O', 'O', 'O', 'O', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 

1.3

In [28]:
developmentDict=getWordAndTag('data/eng.testa')
for key in developmentDict:
    print("Key : ",key," Value: ",developmentDict[key])


Key :  cricket - leicestershire take over at top after innings victory .   Value:  ['O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Key :  london 1996-08-30   Value:  ['I-LOC', 'O']
Key :  west indian all-rounder phil simmons took four for 38 on friday as leicestershire beat somerset by an innings and 39 runs in two days to take over at the head of the county championship .   Value:  ['I-MISC', 'I-MISC', 'O', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Key :  their stay on top , though , may be short-lived as title rivals essex , derbyshire and surrey all closed in on victory while kent made up for lost time in their rain-affected match against nottinghamshire .   Value:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'I-ORG', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [29]:
testDict=getWordAndTag('data/eng.testb')
for key in testDict:
    print("Key : ",key," Value: ",testDict[key])

Key :  soccer - japan get lucky win , china in surprise defeat .   Value:  ['O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O']
Key :  nadim ladki   Value:  ['I-PER', 'I-PER']
Key :  al-ain , united arab emirates 1996-12-06   Value:  ['I-LOC', 'O', 'I-LOC', 'I-LOC', 'I-LOC', 'O']
Key :  japan began the defence of their asian cup title with a lucky 2-1 win against syria in a group c championship match on friday .   Value:  ['I-LOC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Key :  but china saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers uzbekistan .   Value:  ['O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O']
Key :  china controlled most of the match and saw several chances missed until the 78th minute when uzbek striker igor shkvyrin too

Question 3

In [30]:
from sklearn.model_selection import ParameterGrid
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from gensim.models import Word2Vec, KeyedVectors

import pandas as pd
import numpy as np
import logging
import os

In [31]:
dev_df = pd.read_csv('data/eng.testa',sep=' ',header=None)
train_df = pd.read_csv('data/eng.train',sep=' ',header=None)
test_df = pd.read_csv('data/eng.testb',sep=' ',header=None)
print("Training Size: ", train_df.shape[0])
print("Development Size: ", dev_df.shape[0])
print("Test Size: ", test_df.shape[0])

Training Size:  204567
Development Size:  51578
Test Size:  46666


In [32]:
model = KeyedVectors.load('./data/word2vec.model')

In [33]:
pretrained_weights = model.vectors
vocab_size, embedding_size = pretrained_weights.shape

print("Vocab Size: ", vocab_size)
print("Embedding Size: ", embedding_size)

Vocab Size:  3000000
Embedding Size:  300


In [34]:
sentences = []  # List of sentences
tags = []

for sentence in trainSentences:
    words=sentence.split(' ')
    temp=[]
    for word in words:
        if(word != ''):
            temp.append(word)
    sentences.append(temp)
for sentence in trainSentences:
    tags.append(trainingDict[sentence])

In [35]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential
import numpy as np
# Data preparation and tokenization (replace with your data)
sentences = []  # List of sentences
tags = []

for sentence in trainSentences:
    words=sentence.split(' ')
    temp=[]
    for word in words:
        if(word != ''):
            temp.append(word)
    sentences.append(temp)
for sentence in trainSentences:
    tags.append(trainingDict[sentence])

# Create a vocabulary and encode tokens
vocab = set(token for sentence in sentences for token in sentence)
word2idx = {}
for word in vocab:
    try:
        word2idx[word]=word2vec[word]
    except:
        word2idx[word] = np.zeros(300)

tag2idx = {tag: idx for idx, tag in enumerate(set(tag for tags in tags for tag in tags))}

X = [[word2idx[word] for word in sentence] for sentence in sentences]
y = [[tag2idx[tag] for tag in tags] for tags in tags]

# Pad sequences to have the same length
X = tf.keras.preprocessing.sequence.pad_sequences(X)
y = tf.keras.preprocessing.sequence.pad_sequences(y)

# Model
model = Sequential([
    layers.Embedding(input_dim=len(vocab), output_dim=128, input_length=X.shape[1]),
    layers.LSTM(128, return_sequences=True),
    layers.Dense(len(tag2idx), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Training (replace with your data)
model.fit(X, y, batch_size=32, epochs=10, validation_split=0.1)

Epoch 1/10


ValueError: in user code:

    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1126, in train_step
        y_pred = self(x, training=True)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/input_spec.py", line 235, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential' (type Sequential).
    
    Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 113, 300, 128)
    
    Call arguments received by layer 'sequential' (type Sequential):
      • inputs=tf.Tensor(shape=(None, 113, 300), dtype=int32)
      • training=True
      • mask=None


In [None]:
# Inference (replace with your input sentence)
print(type(X))
input_sentence = "your months sentence goldman"
input_sequence = [word2idx[word] for word in input_sentence.split()]
predicted_tags = model.predict(input_sequence)

<class 'numpy.ndarray'>


KeyError: 'your'