In [1]:
# !pip install datasets



## Imports

In [2]:
from datasets import load_dataset
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer #use TF IDF transformer to change text vector created by count vectorizer
from sklearn.svm import SVC# Support Vector Machine
from sklearn.metrics import *
import re
from gensim.models import KeyedVectors
import pickle
from string import punctuation

dataset = load_dataset("conll2003")


Reusing dataset conll2003 (/Users/param/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 150.44it/s]


In [3]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import conlltags2tree, tree2conlltags


[nltk_data] Downloading package punkt to /Users/param/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/param/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/param/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/param/nltk_data...
[nltk_data]   Package words is already up-to-date!


### Data Processing and vecotrizing using Tf-Idf vectorizer

In [6]:
NEI_dataset = dataset.map(lambda example: {'ner_tags': [1 if x > 0 else 0 for x in example["ner_tags"] ]})#{'sentence1': 'My sentence: ' + example['sentence1']
Tfidf_vect = TfidfVectorizer(lowercase=False,token_pattern=r".*")
Tfidf_vect.fit([word for sublist in dataset["train"]["tokens"]  for word in sublist]+[word for sublist in dataset["validation"]["tokens"]  for word in sublist]+[word for sublist in dataset["test"]["tokens"]  for word in sublist])
Train_X_Tfidf = Tfidf_vect.transform([word for sublist in dataset["train"]["tokens"]  for word in sublist])
Test_X_Tfidf = Tfidf_vect.transform([word for sublist in dataset["test"]["tokens"]  for word in sublist])
Test_Y = [word for sublist in NEI_dataset["test"]["ner_tags"]  for word in sublist]


Loading cached processed dataset at /Users/param/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6/cache-2ff79afbbe477ce3.arrow
Loading cached processed dataset at /Users/param/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6/cache-3c067e784a57e25f.arrow
Loading cached processed dataset at /Users/param/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6/cache-34919c5cb479a53e.arrow


## SVM model using Tf-Idf Sparse Matrix representation

In [37]:
# Classifier - SVM
# Vectorizer - Tf-IDF
# Kernel - rbf
# Fit the training dataset on the classifier
SVM_tf_idf_rbf = SVC(kernel='rbf')
SVM_tf_idf_rbf.fit(Train_X_Tfidf,[word for sublist in NEI_dataset["train"]["ner_tags"]  for word in sublist])
# Predict the labels on test dataset
predictions_SVM_tf_idf_rbf = SVM_tf_idf_rbf.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_tf_idf_rbf, Test_Y)*100)


SVM Accuracy Score ->  93.56519866480025


### Data Processing and vecotrizing using word embedding vectorizer

In [10]:
model_w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',
binary=True)

In [11]:
train_embeddings = []
test_embeddings = []

train_words = [word for sublist in dataset["train"]["tokens"]  for word in sublist]#+[word for sublist in dataset["validation"]["tokens"]  for word in sublist]+[word for sublist in dataset["test"]["tokens"]  for word in sublist]
test_words = [word for sublist in dataset["test"]["tokens"]  for word in sublist]
count = 0
for word in train_words:
    if word in model_w2v:
        train_embeddings.append(model_w2v.get_vector(word))
    else:
        # print("OHH NOO!!!!")
        count +=1
        train_embeddings.append(np.zeros(300))

for word in test_words:
    if word in model_w2v:
        test_embeddings.append(model_w2v.get_vector(word))
    else:
        # print("OHH NOO!!!!")
        # count +=1
        test_embeddings.append(np.zeros(300))


## SVM model using Word embeddings

In [40]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM_embedding_rbf = SVC(kernel='rbf')
SVM_embedding_rbf.fit(train_embeddings,[word for sublist in NEI_dataset["train"]["ner_tags"]  for word in sublist])
# predict the labels on validation dataset
predictions_SVM_embedding_rbf = SVM_embedding_rbf.predict(test_embeddings)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_embedding_rbf, Test_Y)*100)

SVM Accuracy Score ->  97.45235275115753


In [38]:
# Results for SVM model trained on Tf-Idf representations
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  93.56519866480025
SVM Precision Score ->  65.81607495069034
SVM Recall Score ->  96.12891609650703
SVM F1 Score ->  78.13551880579541


In [39]:
# Results for SVM model trained on word embeddings
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_2, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM_2, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM_2, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM_2, Test_Y)*100)

SVM Accuracy Score ->  97.45235275115753
SVM Precision Score ->  90.53254437869822
SVM Recall Score ->  94.6513725995618
SVM F1 Score ->  92.5461533614769


## Feature function with tf-idf vectorizer

In [12]:
def tf_idf_features(dataset,datatype):
    Tfidf_vect_suffix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
    Tfidf_vect_suffix.fit([word[-3:] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[-3:] for sublist in dataset["validation"]["tokens"]  for word in sublist]
    +[word[-3:] for sublist in dataset["test"]["tokens"]  for word in sublist])

    Tfidf_vect_prefix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
    Tfidf_vect_prefix.fit([word[:3] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[:3] for sublist in dataset["validation"]["tokens"]  for word in sublist]
    +[word[:3] for sublist in dataset["test"]["tokens"]  for word in sublist])


    # [(word) for sublist in dataset["train"]["tokens"]  for word in sublist]
    features = []
    tokens = dataset[datatype]["tokens"]
    pos_tags = dataset[datatype]["pos_tags"]
    for sublist in range(len(tokens[:])):
        for word_ind in range(len(tokens[sublist])):
            word = tokens[sublist][word_ind]
            # print(word)
            vec = []
            # vec.append(word)

            #TF_IDF value of word
            vec.append(Tfidf_vect.vocabulary_[word])

            #Whether word is first word of sentence.
            if word_ind==0:
                vec.append(1)
            else:
                vec.append(0)

            #Whether word is last word of sentence.
            if word_ind==(len(tokens[sublist])-1):
                vec.append(1)
            else:
                vec.append(0)

            #Whether word is title or has first letter capitalised.
            
            if word.istitle():
                vec.append(1)
            else:
                vec.append(0)
            
            #TF_IDF vale of suffix of word.
            vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])

            #TF_IDF vale of prefix of word.
            vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])


            #POS tag of word.
            vec.append(pos_tags[sublist][word_ind])

            #POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
            if word_ind>0:
                vec.append(pos_tags[sublist][word_ind-1])
            else:
                vec.append(47)
            

            #POS tag of suceeding word.
            if word_ind < (len(tokens[sublist])-1):
                vec.append(pos_tags[sublist][word_ind+1])
            else:
                vec.append(47)

 
            #Whether word contains any digit.
            if bool(re.search(r'\d', word)):
                vec.append(1)
            else:
                vec.append(0)

            features.append(np.array(vec))
    features = np.array(features)
    return features

In [16]:
# Classifier - SVM
# Vectorizer - Tf-IDF
# Kernel - rbf
# Fit the training dataset on the classifier
SVM_tf_idf_features_rbf = SVC(kernel='rbf')
SVM_tf_idf_features_rbf.fit(tf_idf_features(dataset,"train"),[word for sublist in NEI_dataset["train"]["ner_tags"]  for word in sublist])
# Predict the labels on test dataset
predictions_SVM_tf_idf_features_rbf = SVM_tf_idf_features_rbf.predict(tf_idf_features(dataset,"test"))
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_tf_idf_features_rbf, Test_Y)*100)


SVM Accuracy Score ->  92.4927317756003


### Feature function with word-embedding vectorizer

In [13]:
def embedding_features(dataset,datatype):
    Tfidf_vect_suffix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
    Tfidf_vect_suffix.fit([word[-3:] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[-3:] for sublist in dataset["validation"]["tokens"]  for word in sublist]+[word[-3:] for sublist in dataset["test"]["tokens"]  for word in sublist])

    Tfidf_vect_prefix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
    Tfidf_vect_prefix.fit([word[:3] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[:3] for sublist in dataset["validation"]["tokens"]  for word in sublist]+[word[:3] for sublist in dataset["test"]["tokens"]  for word in sublist])


    # [(word) for sublist in dataset["train"]["tokens"]  for word in sublist]
    features = []
    tokens = dataset[datatype]["tokens"]
    pos_tags = dataset[datatype]["pos_tags"]
    for sublist in range(len(tokens[:])):
        for word_ind in range(len(tokens[sublist])):
            word = tokens[sublist][word_ind]
            # print(word)
            vec = []
            # vec.append(word)

            #word embedding vector of word
            if word in model_w2v:
                vec.extend(list((model_w2v.get_vector(word))))
            else:
                vec.extend(list((np.zeros(300))))
            
            #Whether word is first word of sentence.
            if word_ind==0:
                vec.append(1)
            else:
                vec.append(0)

            #Whether word is last word of sentence.
            if word_ind==(len(tokens[sublist])-1):
                vec.append(1)
            else:
                vec.append(0)

            #Whether word is title or has first letter capitalised.
            
            if word.istitle():
                vec.append(1)
            else:
                vec.append(0)
            
            #TF_IDF vale of suffix of word.
            # vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])
            vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]]/Tfidf_vect_suffix.vocabulary_.__len__())


            #TF_IDF vale of prefix of word.
            # vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])
            vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]]/Tfidf_vect_prefix.vocabulary_.__len__())



            #POS tag of word.
            vec.append(pos_tags[sublist][word_ind]/47)

            #POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
            if word_ind>0:
                vec.append(pos_tags[sublist][word_ind-1]/47)
            else:
                vec.append(47/47)
            

            #POS tag of suceeding word.
            if word_ind < (len(tokens[sublist])-1):
                vec.append(pos_tags[sublist][word_ind+1]/47)
            else:
                vec.append(47/47)

 
            #Whether word contains any digit.
            if bool(re.search(r'\d', word)):
                vec.append(1)
            else:
                vec.append(0)

            features.append(np.array(vec))
    features = np.array(features)
    return features

In [60]:
# Classifier - SVM
# Vectorizer - Word Embedding
# Kernel - rbf
# Fit the training dataset on the classifier
SVM_embedding_features_rbf = SVC(kernel='rbf')
SVM_embedding_features_rbf.fit(embedding_features(dataset,"train"),[word for sublist in NEI_dataset["train"]["ner_tags"]  for word in sublist])
# Predict the labels on test dataset
predictions_SVM_embedding_features_rbf = SVM_embedding_features_rbf.predict(embedding_features(dataset,"test"))
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_embedding_features_rbf, Test_Y)*100)


SVM Accuracy Score ->  98.38699257025951


In [61]:
# Results for SVM model on different input features
models = ["SVM_tf_idf_rbf","SVM_embedding_rbf","SVM_tf_idf_features_rbf","SVM_embedding_features_rbf"]
predictions = [predictions_SVM_tf_idf_rbf,predictions_SVM_embedding_rbf,predictions_SVM_tf_idf_features_rbf,predictions_SVM_embedding_features_rbf]
for i in range(len(models)):
    print(models[i],accuracy_score(predictions[i], Test_Y)*100,precision_score(predictions[i], Test_Y)*100,recall_score(predictions[i], Test_Y)*100,f1_score(predictions[i], Test_Y)*100,sep="\t")


SVM_tf_idf_rbf	93.56519866480025	65.81607495069034	96.12891609650703	78.13551880579541
SVM_embedding_rbf	97.45235275115753	90.53254437869822	94.6513725995618	92.5461533614769
SVM_tf_idf_features_rbf	92.4927317756003	83.72781065088756	75.82049564634963	79.57820738137082
SVM_embedding_features_rbf	98.38699257025951	95.67307692307693	95.1219512195122	95.39671808739475


## NEW Features added

In [14]:
with open('Person_names.pkl','rb') as handle:
    person_name = pickle.load(handle)
with open('Location_names.pkl','rb') as handle:
    location_name = pickle.load(handle)
with open('Organization_names.pkl','rb') as handle:
    organization_name = pickle.load(handle)
with open('Country_Abb_CAPS.pkl','rb') as handle:
    loc_ABBV = pickle.load(handle)

In [15]:
def is_number(n):
    try:
        float(n)
    except ValueError:
        return False
    return True

def containsDigit(s):
    num = [i for i in range(0,10)]
    num = [str(i) for i in num]
    for n in num:
      if n in s:
        return True
    return False

punc = set(punctuation)
def containsSpecialChar(s):
    for p in punc:
      if p in s:
        return True
    return False

def isPerson(word):
    return word.lower() in person_name

def isOrganization(word):
    return word.lower() in organization_name

def isLocation(word):
    if word.lower() in location_name:
        return True
    return word in loc_ABBV
    

In [16]:
def embedding_updated_features(dataset,datatype):
    Tfidf_vect_suffix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
    Tfidf_vect_suffix.fit([word[-3:] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[-3:] for sublist in dataset["validation"]["tokens"]  for word in sublist]+[word[-3:] for sublist in dataset["test"]["tokens"]  for word in sublist])

    Tfidf_vect_prefix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
    Tfidf_vect_prefix.fit([word[:3] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[:3] for sublist in dataset["validation"]["tokens"]  for word in sublist]+[word[:3] for sublist in dataset["test"]["tokens"]  for word in sublist])


    # [(word) for sublist in dataset["train"]["tokens"]  for word in sublist]
    features = []
    tokens = dataset[datatype]["tokens"]
    pos_tags = dataset[datatype]["pos_tags"]
    chunk_tags = dataset[datatype]["chunk_tags"]
    for sublist in range(len(tokens[:])):
        for word_ind in range(len(tokens[sublist])):
            word = tokens[sublist][word_ind]
            # print(word)
            vec = []
            # vec.append(word)

            #word embedding vector of word
            if word in model_w2v:
                vec.extend(list((model_w2v.get_vector(word))))
            else:
                vec.extend(list((np.zeros(300))))
            
            #Whether word is first word of sentence.
            if word_ind==0:
                vec.append(1)
            else:
                vec.append(0)

            #Whether word is last word of sentence.
            if word_ind==(len(tokens[sublist])-1):
                vec.append(1)
            else:
                vec.append(0)

            #Whether word is title or has first letter capitalised.
            
            if word.istitle():
                vec.append(1)
            else:
                vec.append(0)
            
            #TF_IDF vale of suffix of word.
            # vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])
            vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]]/Tfidf_vect_suffix.vocabulary_.__len__())


            #TF_IDF vale of prefix of word.
            # vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])
            vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]]/Tfidf_vect_prefix.vocabulary_.__len__())



            #POS tag of word.
            vec.append(pos_tags[sublist][word_ind]/47)

            #POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
            if word_ind>0:
                vec.append(pos_tags[sublist][word_ind-1]/47)
            else:
                vec.append(47/47)
            

            #POS tag of suceeding word.
            if word_ind < (len(tokens[sublist])-1):
                vec.append(pos_tags[sublist][word_ind+1]/47)
            else:
                vec.append(47/47)

 
            #Whether word contains any digit.
            if bool(re.search(r'\d', word)):
                vec.append(1)
            else:
                vec.append(0)



            '''
            Newly added features
            # next,prev Caps; index word location; word length, ancestor, chunk tag prev next curr
            '''

            # word location normalised
            vec.append(word_ind/len(tokens[sublist]))

            # word length
            vec.append(len(word)/20)

            # prev to prev pos
            if word_ind>1:
                vec.append(pos_tags[sublist][word_ind-2]/47)
            else:
                vec.append(47/47)

            # next to next pos
            if word_ind < (len(tokens[sublist])-2):
                vec.append(pos_tags[sublist][word_ind+2]/47)
            else:
                vec.append(47/47)

            # chunk tag of word all normalised
            vec.append(chunk_tags[sublist][word_ind]/22)

            # chunk tag of preceeding word. If first word, then non-exisitng chunk_tag value 0 taken to outside chunk.
            if word_ind>0:
                vec.append(chunk_tags[sublist][word_ind-1]/22)
            else:
                vec.append(0)
            
            # chunk tag of next word
            if word_ind < (len(tokens[sublist])-1):
                vec.append(chunk_tags[sublist][word_ind+1]/22)
            else:
                vec.append(0)

            # prev to prev chunk
            if word_ind>1:
                vec.append(chunk_tags[sublist][word_ind-2]/22)
            else:
                vec.append(0)
            
            # chunk tag of next to next word
            if word_ind < (len(tokens[sublist])-2):
                vec.append(chunk_tags[sublist][word_ind+2]/22)
            else:
                vec.append(0)


            # if previous word is capital word
            if word_ind>0:
                next_word = tokens[sublist][word_ind-1]
                if next_word.istitle():
                    vec.append(1)
                else:
                    vec.append(0)
            else:
                vec.append(0)

            # if next word is capital word
            if word_ind < (len(tokens[sublist])-1):
                next_word = tokens[sublist][word_ind+1]
                if next_word.istitle():
                    vec.append(1)
                else:
                    vec.append(0)
            else:
                vec.append(0)

            # if previous to previous word is capital word
            if word_ind>1:
                next_word = tokens[sublist][word_ind-2]
                if next_word.istitle():
                    vec.append(1)
                else:
                    vec.append(0)
            else:
                vec.append(0)

            # if next to next word is capital word
            if word_ind < (len(tokens[sublist])-2):
                next_word = tokens[sublist][word_ind+2]
                if next_word.istitle():
                    vec.append(1)
                else:
                    vec.append(0)
            else:
                vec.append(0)



            c = 1 #100
            # is number
            if is_number(word):
                vec.append(c*1)
            else:
                vec.append(0)
            
            # containsDigit
            if containsDigit(word):
                vec.append(c*1)
            else:
                vec.append(0)

            # contains special characters
            if containsSpecialChar(word):
                vec.append(c*1)
            else:
                vec.append(0)


            '''
            Gazetteer List
            '''
            c = 1 # 400
            if isPerson(word):
                vec.append(c*1)
            else:
                vec.append(0)

            if isOrganization(word):
                vec.append(c*1)
            else:
                vec.append(0)

            if isLocation(word):
                vec.append(c*1)
            else:
                vec.append(0)

            features.append(np.array(vec))
    features = np.array(features)
    return features

In [None]:
# Classifier - SVM
# Vectorizer - Word Embedding
# Kernel - rbf
# Fit the training dataset on the classifier
SVM_embedding_updated_features_rbf = SVC(kernel='rbf')
SVM_embedding_updated_features_rbf.fit(embedding_updated_features(dataset,"train"),[word for sublist in NEI_dataset["train"]["ner_tags"]  for word in sublist])
# Predict the labels on test dataset
predictions_SVM_embedding_updated_features_rbf = SVM_embedding_updated_features_rbf.predict(embedding_updated_features(dataset,"test"))
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_embedding_updated_features_rbf, Test_Y)*100)


## Store models as pickle object to be reused later without retraining:


In [9]:
# models = ["SVM_tf_idf_rbf","SVM_embedding_rbf","SVM_tf_idf_features_rbf","SVM_embedding_features_rbf","SVM_embedding_updated_features_rbf"]
# actual_models = [SVM_tf_idf_rbf,SVM_embedding_rbf,SVM_tf_idf_features_rbf,SVM_embedding_features_rbf,SVM_embedding_updated_features_rbf]

# for i in range(len(models)):
#     pickle.dump(actual_models[i], open(models[i]+".pkl", 'wb'))

# Load models stored as pickle objects as training SVM models is slow

In [17]:
models = ["SVM_tf_idf_rbf","SVM_embedding_rbf","SVM_tf_idf_features_rbf","SVM_embedding_features_rbf","SVM_embedding_updated_features_rbf"]
predictions_str = ["predictions_SVM_tf_idf_rbf","predictions_SVM_embedding_rbf","predictions_SVM_tf_idf_features_rbf","predictions_SVM_embedding_features_rbf","predictions_SVM_embedding_updated_features_rbf"]

for i in range(len(models)):
    exec(models[i] + " = " + 'pickle.load(open("' + models[i] + '.pkl", "rb"))')

actual_models = [SVM_tf_idf_rbf,SVM_embedding_rbf,SVM_tf_idf_features_rbf,SVM_embedding_features_rbf,SVM_embedding_updated_features_rbf]

predictions_SVM_tf_idf_rbf = SVM_tf_idf_rbf.predict(Test_X_Tfidf)
predictions_SVM_embedding_rbf = SVM_embedding_rbf.predict(test_embeddings)
predictions_SVM_tf_idf_features_rbf = SVM_tf_idf_features_rbf.predict(tf_idf_features(dataset,"test"))
predictions_SVM_embedding_features_rbf = SVM_embedding_features_rbf.predict(embedding_features(dataset,"test"))
predictions_SVM_embedding_updated_features_rbf = SVM_embedding_updated_features_rbf.predict(embedding_updated_features(dataset,"test"))
predictions = [predictions_SVM_tf_idf_rbf,predictions_SVM_embedding_rbf,predictions_SVM_tf_idf_features_rbf,predictions_SVM_embedding_features_rbf,predictions_SVM_embedding_updated_features_rbf]

for i in range(len(models)):
    print(models[i],accuracy_score(predictions[i], Test_Y)*100,precision_score(predictions[i], Test_Y)*100,recall_score(predictions[i], Test_Y)*100,f1_score(predictions[i], Test_Y)*100,sep="\t")


SVM_tf_idf_rbf	93.56519866480025	65.81607495069034	96.12891609650703	78.13551880579541
SVM_embedding_rbf	97.45235275115753	90.53254437869822	94.6513725995618	92.5461533614769
SVM_tf_idf_features_rbf	92.4927317756003	83.72781065088756	75.82049564634963	79.57820738137082
SVM_embedding_features_rbf	98.38699257025951	95.67307692307693	95.1219512195122	95.39671808739475
SVM_embedding_updated_features_rbf	98.52051254441693	96.30177514792899	95.27991218441272	95.78811844767336


### Function to generate chunk and pos tags for unseen sentences

In [18]:
pos_tokenizer = {}
chunk_tokenizer = {}
chunk_max_token = {}
pos_max_token = {}
def generate_chunk_pos_tags():
    for sent,pos,chunks in zip(
        dataset['train']['tokens']+dataset['validation']['tokens']+dataset['test']['tokens'], 
        dataset['train']['pos_tags']+dataset['validation']['pos_tags']+dataset['test']['pos_tags'], 
        dataset['train']['chunk_tags']+dataset['validation']['chunk_tags']+dataset['test']['chunk_tags']):
        tree = ne_chunk(pos_tag(word_tokenize(" ".join(sent))))
        iob_tags = tree2conlltags(tree)
        for i in range(len(iob_tags)):
            chunk_pos_id = iob_tags[i][1]+'_'+iob_tags[i][2]

            try:
                if chunk_pos_id in pos_tokenizer.keys():
                    if pos[i] in pos_tokenizer[chunk_pos_id].keys():
                        pos_tokenizer[chunk_pos_id][pos[i]] += 1 
                    else:
                        pos_tokenizer[chunk_pos_id][pos[i]] = 1
                else: 
                    pos_tokenizer[chunk_pos_id] = {pos[i]:1}
            except Exception as e:
                continue  
            
            if chunk_pos_id in chunk_tokenizer.keys():
                if chunks[i] in chunk_tokenizer[chunk_pos_id].keys():
                    chunk_tokenizer[chunk_pos_id][chunks[i]] += 1 
                else:
                    chunk_tokenizer[chunk_pos_id][chunks[i]] = 1
            else: 
                chunk_tokenizer[chunk_pos_id] = {chunks[i]:1}


    for k,v in chunk_tokenizer.items():
        chunk_max_token[k] = max(v, key=v.get)
    for k,v in pos_tokenizer.items():
        pos_max_token[k] = max(v, key=v.get)

generate_chunk_pos_tags()

In [19]:
def make_features(sent_list, pass1_prediction = None):
    sent_list_pos = []
    sent_list_chunk = []
    tree = ne_chunk(pos_tag(word_tokenize(" ".join(sent_list))))
    iob_tags = tree2conlltags(tree)
    prev_prediction = pass1_prediction
    pred_ind = 0
    for w,p,c in iob_tags:
        key_id = p+'_'+c
        if key_id in pos_max_token.keys():
            sent_list_pos.append(pos_max_token[key_id])
        else:
            sent_list_pos.append(0)

        if key_id in chunk_max_token.keys():
            sent_list_chunk.append(chunk_max_token[key_id])
        else:
            sent_list_chunk.append(0)



    Tfidf_vect_suffix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
    Tfidf_vect_suffix.fit([word[-3:] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[-3:] for sublist in dataset["validation"]["tokens"]  for word in sublist]+[word[-3:] for sublist in dataset["test"]["tokens"]  for word in sublist])

    Tfidf_vect_prefix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
    Tfidf_vect_prefix.fit([word[:3] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[:3] for sublist in dataset["validation"]["tokens"]  for word in sublist]+[word[:3] for sublist in dataset["test"]["tokens"]  for word in sublist])

    # [(word) for sublist in dataset["train"]["tokens"]  for word in sublist]
    features = []
    # print(sent_list)
    tokens = []
    tokens.append(list(sent_list.split(" ")))
    # print(tokens)
    pos_tags = [sent_list_pos]
    chunk_tags = [sent_list_chunk]
    for sublist in range(len(tokens[:])):
        for word_ind in range(len(tokens[sublist])):
            word = tokens[sublist][word_ind]
            # print(word)
            vec = []
            # vec.append(word)

            #word embedding vector of word
            if word in model_w2v:
                vec.extend(list((model_w2v.get_vector(word))))
            else:
                vec.extend(list((np.zeros(300))))
            
            #Whether word is first word of sentence.
            if word_ind==0:
                vec.append(1)
            else:
                vec.append(0)

            #Whether word is last word of sentence.
            if word_ind==(len(tokens[sublist])-1):
                vec.append(1)
            else:
                vec.append(0)

            #Whether word is title or has first letter capitalised.
            
            if word.istitle():
                vec.append(1)
            else:
                vec.append(0)
            
            #TF_IDF vale of suffix of word.
            # vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])
            if word[-3:] in Tfidf_vect_suffix.vocabulary_:
                vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]]/Tfidf_vect_suffix.vocabulary_.__len__())
            else:
                vec.append(0)


            #TF_IDF vale of prefix of word.
            # vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])
            if word[:3] in Tfidf_vect_prefix.vocabulary_:
                vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]]/Tfidf_vect_prefix.vocabulary_.__len__())
            else:
                vec.append(0)


            #POS tag of word.
            vec.append(pos_tags[sublist][word_ind]/47)

            #POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
            if word_ind>0:
                vec.append(pos_tags[sublist][word_ind-1]/47)
            else:
                vec.append(47/47)
            

            #POS tag of suceeding word.
            if word_ind < (len(tokens[sublist])-1):
                vec.append(pos_tags[sublist][word_ind+1]/47)
            else:
                vec.append(47/47)


            #Whether word contains any digit.
            if bool(re.search(r'\d', word)):
                vec.append(1)
            else:
                vec.append(0)



            '''
            Newly added features
            # next,prev Caps; index word location; word length, ancestor, chunk tag prev next curr
            '''

            # word location normalised
            vec.append(word_ind/len(tokens[sublist]))

            # word length
            vec.append(len(word)/20)

            # prev to prev pos
            if word_ind>1:
                vec.append(pos_tags[sublist][word_ind-2]/47)
            else:
                vec.append(47/47)

            # next to next pos
            if word_ind < (len(tokens[sublist])-2):
                vec.append(pos_tags[sublist][word_ind+2]/47)
            else:
                vec.append(47/47)

            # chunk tag of word all normalised
            vec.append(chunk_tags[sublist][word_ind]/22)

            # chunk tag of preceeding word. If first word, then non-exisitng chunk_tag value 0 taken to outside chunk.
            if word_ind>0:
                vec.append(chunk_tags[sublist][word_ind-1]/22)
            else:
                vec.append(0)
            
            # chunk tag of next word
            if word_ind < (len(tokens[sublist])-1):
                vec.append(chunk_tags[sublist][word_ind+1]/22)
            else:
                vec.append(0)

            # prev to prev chunk
            if word_ind>1:
                vec.append(chunk_tags[sublist][word_ind-2]/22)
            else:
                vec.append(0)
            
            # chunk tag of next to next word
            if word_ind < (len(tokens[sublist])-2):
                vec.append(chunk_tags[sublist][word_ind+2]/22)
            else:
                vec.append(0)


            # if previous word is capital word
            if word_ind>0:
                next_word = tokens[sublist][word_ind-1]
                if next_word.istitle():
                    vec.append(1)
                else:
                    vec.append(0)
            else:
                vec.append(0)

            # if next word is capital word
            if word_ind < (len(tokens[sublist])-1):
                next_word = tokens[sublist][word_ind+1]
                if next_word.istitle():
                    vec.append(1)
                else:
                    vec.append(0)
            else:
                vec.append(0)

            # if previous to previous word is capital word
            if word_ind>1:
                next_word = tokens[sublist][word_ind-2]
                if next_word.istitle():
                    vec.append(1)
                else:
                    vec.append(0)
            else:
                vec.append(0)

            # if next to next word is capital word
            if word_ind < (len(tokens[sublist])-2):
                next_word = tokens[sublist][word_ind+2]
                if next_word.istitle():
                    vec.append(1)
                else:
                    vec.append(0)
            else:
                vec.append(0)



            c = 1 #100
            # is number
            if is_number(word):
                vec.append(c*1)
            else:
                vec.append(0)
            
            # containsDigit
            if containsDigit(word):
                vec.append(c*1)
            else:
                vec.append(0)

            # contains special characters
            if containsSpecialChar(word):
                vec.append(c*1)
            else:
                vec.append(0)


            '''
            Gazetteer List
            '''
            c = 1 # 400
            if isPerson(word):
                vec.append(c*1)
            else:
                vec.append(0)

            if isOrganization(word):
                vec.append(c*1)
            else:
                vec.append(0)

            if isLocation(word):
                vec.append(c*1)
            else:
                vec.append(0)

            features.append(np.array(vec))
    features = np.array(features)
    return features

In [20]:
def predict_sentence(sentence):
    return SVM_embedding_updated_features_rbf.predict(make_features(sentence))

In [21]:
SVM_embedding_updated_features_rbf.predict(make_features('The State Bank of India is the largest bank in the country'))

array([0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [22]:
print(predict_sentence('The State Bank of India is the largest bank in the country'))

print(predict_sentence('Pushpak Bhattacharyya teaches us CS626'))

print(predict_sentence('Pushpak Bhattacharyya teaches us CS'))

print(predict_sentence('India got its freedom on 15th August 1947'))

print(predict_sentence('India got its freedom on 15-8-1947'))

print(predict_sentence('India got its freedom on 15/8/1947'))

print(predict_sentence('India got its freedom on 15.8.1947'))

[0 1 1 0 1 0 0 0 0 0 0 0]
[1 1 0 0 0]
[1 1 0 0 1]
[1 0 0 0 0 0 0 0]
[1 0 0 0 0 0]
[1 0 0 0 0 0]
[1 0 0 0 0 0]


In [23]:
print(predict_sentence('Schools reopened after Covid.'))
print(predict_sentence('Delhi reopened after Covid.'))


[0 0 0 1]
[1 0 0 1]


In [24]:
print(predict_sentence('india has won the match against afganistan.'))

[1 0 0 0 0 0 0]
