## Sequence Tagging: NER

In [5]:
import os

import gensim
import gensim.downloader
from gensim.models import KeyedVectors

### 1.1 Word Embedding

In [6]:
# Fetch the pretrained word2vec embeddings once and cache them on disk.
# Later runs reuse the saved copy instead of going through gensim.downloader
# and re-writing the file on every execution of this cell.
WORD2VEC_PATH = './data/word2vec.model'

if os.path.exists(WORD2VEC_PATH):
    google_vectors = KeyedVectors.load(WORD2VEC_PATH)
else:
    # Make sure the target directory exists before saving into it
    os.makedirs(os.path.dirname(WORD2VEC_PATH), exist_ok=True)
    google_vectors = gensim.downloader.load('word2vec-google-news-300')
    google_vectors.save(WORD2VEC_PATH)


In [7]:
# Restore the saved word vectors from disk
word2vec_file = './data/word2vec.model'
model = KeyedVectors.load(word2vec_file)

In [8]:
# Embedding vector stored for the word "computer"
model.get_vector('computer')

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [9]:
# Cosine similarity: report the closest neighbour of each query word
query_words = ('student', 'Apple', 'apple')

# most_similar yields (word, similarity) pairs; [0] keeps the top match.
# The three names are kept so any later cell that uses them still works.
student, Apple, apple = (model.most_similar(word)[0] for word in query_words)

for query, (neighbour, similarity) in zip(query_words, (student, Apple, apple)):
    print(f'The most similar word to {query} is {neighbour} with a cosine similarity of {similarity}')

The most similar word to student is students with a cosine similarity of 0.7294867038726807
The most similar word to Apple is Apple_AAPL with a cosine similarity of 0.7456986308097839
The most similar word to apple is apples with a cosine similarity of 0.720359742641449


### 1.2 Data

Question a

In [10]:
def getNoOfSentences(path,tags):
    """Read a token-per-line file and return its sentences as strings.

    Each non-blank line is split on whitespace: the first field is the word
    and the last field is its tag. Blank (or whitespace-only) lines separate
    sentences.

    Args:
        path: Path of the file to read.
        tags: Set that is updated in place with every tag seen in the file.

    Returns:
        A list with one string per sentence; the words are joined with a
        single space and each sentence keeps a trailing space. Empty
        sentences (consecutive blank lines, a trailing blank line, or an
        empty file) are not included. Returns None if the file cannot be
        read; in that case a message is printed instead of raising.
    """
    sentences = []
    current_words = []  # words of the sentence currently being read
    try:
        with open(path, 'r') as file:
            for line in file:
                fields = line.split()
                if fields:
                    current_words.append(fields[0])
                    tags.add(fields[-1])
                elif current_words:
                    # Blank line closes the sentence; skip it if nothing was read
                    sentences.append(' '.join(current_words) + ' ')
                    current_words = []
        # The file may end without a blank line after the last sentence
        if current_words:
            sentences.append(' '.join(current_words) + ' ')
        return sentences
    except FileNotFoundError:
        print(f"File '{path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return None

In [11]:
# One shared set collects the tags seen across all three splits
tags = set()
trainSentences = getNoOfSentences('data/eng.train', tags)
developmentSentences = getNoOfSentences('data/eng.testa', tags)
testSentences = getNoOfSentences('data/eng.testb', tags)

# Report the size of each split, then the distinct tags
split_reports = (
    ("No of sentences in training dataset", trainSentences),
    ("No of sentences in development dataset", developmentSentences),
    ("No of sentences in test dataset", testSentences),
)
for caption, split_sentences in split_reports:
    print(caption, len(split_sentences))

print(tags)

No of sentences in training dataset 14987
No of sentences in development dataset 3466
No of sentences in test dataset 3684
{'I-PER', 'O', 'B-LOC', 'I-LOC', 'I-MISC', 'I-ORG', 'B-MISC', 'B-ORG'}


Question b

In [4]:
def getWordAndTag(path):
    """Map every sentence of a token-per-line file to its (word, tag) pairs.

    Each non-blank line is split on whitespace: the first field is the word
    and the last field is its tag. Blank (or whitespace-only) lines separate
    sentences.

    Args:
        path: Path of the file to read.

    Returns:
        A dict whose keys are the sentence texts (words joined with a single
        space, with a trailing space) and whose values are lists of
        one-entry ``{word: tag}`` dicts in sentence order. Because sentences
        are keyed by their text, a repeated sentence overwrites the earlier
        entry. Returns None if the file cannot be read; in that case a
        message is printed instead of raising.
    """
    output = {}
    current_words = []  # words of the sentence currently being read
    current_pairs = []  # matching {word: tag} entries
    try:
        with open(path, 'r') as file:
            for line in file:
                fields = line.split()
                if fields:
                    current_words.append(fields[0])
                    current_pairs.append({fields[0]: fields[-1]})
                elif current_pairs:
                    # Blank line closes the sentence; skip it if nothing was read
                    output[' '.join(current_words) + ' '] = current_pairs
                    current_words = []
                    current_pairs = []
        # Store the last sentence when the file does not end with a blank line
        if current_pairs:
            output[' '.join(current_words) + ' '] = current_pairs
        return output
    except FileNotFoundError:
        print(f"File '{path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return None

In [6]:
# Named trainWordTags rather than `dict` so the builtin dict type is not
# shadowed for the rest of the notebook session.
trainWordTags = getWordAndTag('data/eng.train')
for key, value in trainWordTags.items():
    print("Key : ",key," Value: ",value)


Key :  EU rejects German call to boycott British lamb .   Value:  [{'EU': 'I-ORG'}, {'rejects': 'O'}, {'German': 'I-MISC'}, {'call': 'O'}, {'to': 'O'}, {'boycott': 'O'}, {'British': 'I-MISC'}, {'lamb': 'O'}, {'.': 'O'}]
Key :  Peter Blackburn   Value:  [{'Peter': 'I-PER'}, {'Blackburn': 'I-PER'}]
Key :  BRUSSELS 1996-08-22   Value:  [{'BRUSSELS': 'I-LOC'}, {'1996-08-22': 'O'}]
Key :  The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .   Value:  [{'The': 'O'}, {'European': 'I-ORG'}, {'Commission': 'I-ORG'}, {'said': 'O'}, {'on': 'O'}, {'Thursday': 'O'}, {'it': 'O'}, {'disagreed': 'O'}, {'with': 'O'}, {'German': 'I-MISC'}, {'advice': 'O'}, {'to': 'O'}, {'consumers': 'O'}, {'to': 'O'}, {'shun': 'O'}, {'British': 'I-MISC'}, {'lamb': 'O'}, {'until': 'O'}, {'scientists': 'O'}, {'determine': 'O'}, {'whether': 'O'}, {'mad': 'O'}, {'cow': 'O'}, {'disease': 'O'}

1.3