In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 5.3 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 43.5 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.5 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 47.9 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 75.8 MB/s 
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)
[K   

In [2]:
from datasets import load_dataset
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer # Use TF IDF transformer to change text vector created by count vectorizer
from sklearn.svm import SVC # Support Vector Machine
from sklearn.metrics import *
import re
import pickle
from string import punctuation


dataset = load_dataset("conll2003")

Reusing dataset conll2003 (/home/ajayr/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 10.23it/s]


In [5]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import conlltags2tree, tree2conlltags


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


# For unseen new sentence generate chunk and pos tags

In [6]:
pos_tokenizer = {}
chunk_tokenizer = {}
chunk_max_token = {}
pos_max_token = {}
def generate_chunk_pos_tags():
    for sent,pos,chunks in zip(
        dataset['train']['tokens']+dataset['validation']['tokens']+dataset['test']['tokens'], 
        dataset['train']['pos_tags']+dataset['validation']['pos_tags']+dataset['test']['pos_tags'], 
        dataset['train']['chunk_tags']+dataset['validation']['chunk_tags']+dataset['test']['chunk_tags']):
        tree = ne_chunk(pos_tag(word_tokenize(" ".join(sent))))
        iob_tags = tree2conlltags(tree)
        for i in range(len(iob_tags)):
            chunk_pos_id = iob_tags[i][1]+'_'+iob_tags[i][2]

            try:
                if chunk_pos_id in pos_tokenizer.keys():
                    if pos[i] in pos_tokenizer[chunk_pos_id].keys():
                        pos_tokenizer[chunk_pos_id][pos[i]] += 1 
                    else:
                        pos_tokenizer[chunk_pos_id][pos[i]] = 1
                else: 
                    pos_tokenizer[chunk_pos_id] = {pos[i]:1}
            except Exception as e:
                continue  
            
            if chunk_pos_id in chunk_tokenizer.keys():
                if chunks[i] in chunk_tokenizer[chunk_pos_id].keys():
                    chunk_tokenizer[chunk_pos_id][chunks[i]] += 1 
                else:
                    chunk_tokenizer[chunk_pos_id][chunks[i]] = 1
            else: 
                chunk_tokenizer[chunk_pos_id] = {chunks[i]:1}


    for k,v in chunk_tokenizer.items():
        chunk_max_token[k] = max(v, key=v.get)
    for k,v in pos_tokenizer.items():
        pos_max_token[k] = max(v, key=v.get)

# generate_chunk_pos_tags()

# load pre-defined chunk and pos mapping

In [3]:
with open('data/chunk_max_token.pkl','rb') as handle:
    chunk_max_token = pickle.load(handle)

with open('data/pos_max_token.pkl','rb') as handle:
    pos_max_token = pickle.load(handle)


# Load Gazetteer Lists

In [2]:
with open('data/Person_names.pkl','rb') as handle:
    person_name = pickle.load(handle)
with open('data/Location_names.pkl','rb') as handle:
    location_name = pickle.load(handle)
with open('data/Organization_names.pkl','rb') as handle:
    organization_name = pickle.load(handle)
with open('data/Country_Abb_CAPS.pkl','rb') as handle:
    loc_ABBV = pickle.load(handle)

# New Feature Functions added

In [9]:
def is_number(n):
    try:
        float(n)
    except ValueError:
        return False
    return True

def containsDigit(s):
    num = [i for i in range(0,10)]
    num = [str(i) for i in num]
    for n in num:
      if n in s:
        return True
    return False

punc = set(punctuation)
def containsSpecialChar(s):
    for p in punc:
      if p in s:
        return True
    return False

def isPerson(word):
    return word.lower() in person_name

def isOrganization(word):
    return word.lower() in organization_name

def isLocation(word):
    if word.lower() in location_name:
        return True
    return word in loc_ABBV
    

# Pass 1:

# TF-IDF sparse matrix representation of data

In [10]:
updated_dataset = dataset.map(lambda example: {'ner_tags': [1 if x > 0 else 0 for x in example["ner_tags"] ]})#{'sentence1': 'My sentence: ' + example['sentence1']
Tfidf_vect = TfidfVectorizer(lowercase=False,token_pattern=r".*")
Tfidf_vect.fit([word for sublist in dataset["train"]["tokens"]  for word in sublist]+[word for sublist in dataset["validation"]["tokens"]  for word in sublist]
+[word for sublist in dataset["test"]["tokens"]  for word in sublist])
Train_X_Tfidf = Tfidf_vect.transform([word for sublist in dataset["train"]["tokens"]  for word in sublist])
Test_X_Tfidf = Tfidf_vect.transform([word for sublist in dataset["test"]["tokens"]  for word in sublist])
Test_Y = [word for sublist in updated_dataset["test"]["ner_tags"]  for word in sublist]


  0%|          | 0/14041 [00:00<?, ?ex/s]

  0%|          | 0/3250 [00:00<?, ?ex/s]

  0%|          | 0/3453 [00:00<?, ?ex/s]

# Simple SVM Model on TF-IDF spare matrix
### Can train your own model using below code; just uncomment
### Run next cell to load trained model directly

In [None]:
def train_tf_idf()
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    SVM = SVC(kernel='rbf')
    SVM.fit(Train_X_Tfidf,[word for sublist in updated_dataset["train"]["ner_tags"]  for word in sublist])
    # predict the labels on validation dataset
    predictions_SVM = SVM.predict(Test_X_Tfidf)
    # Use accuracy_score function to get the accuracy
    print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
    return SVM

# SVM = train_tf_idf()

SVM Accuracy Score ->  93.56519866480025


# Load Trained Model

In [11]:
with open('models/svm_tdidf_sparse.pkl','rb') as handle:
  SVM = pickle.load(handle)
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  93.56519866480025


## Generating features for second model of SVM


In [12]:
Tfidf_vect_suffix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
Tfidf_vect_suffix.fit([word[-3:] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[-3:] for sublist in dataset["validation"]["tokens"]  for word in sublist]
+[word[-3:] for sublist in dataset["test"]["tokens"]  for word in sublist])

Tfidf_vect_prefix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
Tfidf_vect_prefix.fit([word[:3] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[:3] for sublist in dataset["validation"]["tokens"]  for word in sublist]
+[word[:3] for sublist in dataset["test"]["tokens"]  for word in sublist])


features = []
tokens = dataset["train"]["tokens"]
pos_tags = dataset["train"]["pos_tags"]
chunk_tags = dataset["train"]["chunk_tags"]
for sublist in range(len(tokens[:])):
  for word_ind in range(len(tokens[sublist])):
    word = tokens[sublist][word_ind]
    vec = []

    # TF_IDF value of word
    vec.append(Tfidf_vect.vocabulary_[word])

    # Whether word is first word of sentence.
    if word_ind==0:
      vec.append(1)
    else:
      vec.append(0)

    # Whether word is last word of sentence.
    if word_ind==(len(tokens[sublist])-1):
      vec.append(1)
    else:
      vec.append(0)

    # Whether word is title or has first letter capitalised.
    
    if word.istitle():
      vec.append(1)
    else:
      vec.append(0)
    
    # TF_IDF vale of suffix of word.
    vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])

    # TF_IDF vale of prefix of word.
    vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])


    # POS tag of word.
    vec.append(pos_tags[sublist][word_ind])

    # POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(pos_tags[sublist][word_ind-1])
    else:
      vec.append(47)

    
      

    # POS tag of suceeding word.
    if word_ind < (len(tokens[sublist])-1):
      vec.append(pos_tags[sublist][word_ind+1])
    else:
      vec.append(47)


    # Whether word contains any digit.
    if bool(re.search(r'\d', word)):
      vec.append(1)
    else:
      vec.append(0)


    '''
    Newly added features
    # next,prev Caps; index word location; word length, ancestor, chunk tag prev next curr
    '''
    # word location
    vec.append(word_ind)

    # word length
    vec.append(len(word))

    # prev to prev pos
    if word_ind>1:
      vec.append(pos_tags[sublist][word_ind-2])
    else:
      vec.append(47)

    # next to next pos
    if word_ind < (len(tokens[sublist])-2):
      vec.append(pos_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # chunk tag of word
    vec.append(chunk_tags[sublist][word_ind])

    # chunk tag of preceeding word. If first word, then non-exisitng chunk_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(chunk_tags[sublist][word_ind-1])
    else:
      vec.append(47)
      
    # chunk tag of next word
    if word_ind < (len(tokens[sublist])-1):
      vec.append(chunk_tags[sublist][word_ind+1])
    else:
      vec.append(47)

    # prev to prev chunk
    if word_ind>1:
      vec.append(chunk_tags[sublist][word_ind-2])
    else:
      vec.append(47)
      
    # chunk tag of next to next word
    if word_ind < (len(tokens[sublist])-2):
      vec.append(chunk_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # if previous word is capital word
    if word_ind>0:
      next_word = tokens[sublist][word_ind-1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next word is capital word
    if word_ind < (len(tokens[sublist])-1):
      next_word = tokens[sublist][word_ind+1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if previous to previous word is capital word
    if word_ind>1:
      next_word = tokens[sublist][word_ind-2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next to next word is capital word
    if word_ind < (len(tokens[sublist])-2):
      next_word = tokens[sublist][word_ind+2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    c = 1 #100
    # is number
    if is_number(word):
      vec.append(c*1)
    else:
      vec.append(0)
    
    # containsDigit
    if containsDigit(word):
      vec.append(c*1)
    else:
      vec.append(0)

    # contains special characters
    if containsSpecialChar(word):
      vec.append(c*1)
    else:
      vec.append(0)


    '''
    Gazetteer List
    '''
    c = 1 # 400
    if isPerson(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isOrganization(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isLocation(word):
      vec.append(c*1)
    else:
      vec.append(0)


    features.append(np.array(vec))
features = np.array(features)
# features



test_features = []
test_tokens = dataset["test"]["tokens"]
test_pos_tags = dataset["test"]["pos_tags"]
test_chunk_tags = dataset["test"]["chunk_tags"]
for sublist in range(len(test_tokens[:])):
  for word_ind in range(len(test_tokens[sublist])):
    word = test_tokens[sublist][word_ind]
    # print(word)
    vec = []
    # vec.append(word)

    #TF_IDF value of word
    vec.append(Tfidf_vect.vocabulary_[word])

    #Whether word is first word of sentence.
    if word_ind==0:
      vec.append(1)
    else:
      vec.append(0)

    #Whether word is last word of sentence.
    if word_ind==(len(test_tokens[sublist])-1):
      vec.append(1)
    else:
      vec.append(0)

    #Whether word is title or has first letter capitalised.
    
    if word.istitle():
      vec.append(1)
    else:
      vec.append(0)
    
    #TF_IDF vale of suffix of word.
    vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])

    #TF_IDF vale of prefix of word.
    vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])


    #POS tag of word.
    vec.append(test_pos_tags[sublist][word_ind])

    #POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(test_pos_tags[sublist][word_ind-1])
    else:
      vec.append(47)
      

    #POS tag of suceeding word.
    if word_ind < (len(test_tokens[sublist])-1):
      vec.append(test_pos_tags[sublist][word_ind+1])
    else:
      vec.append(47)


    #Whether word contains any digit.
    if bool(re.search(r'\d', word)):
      vec.append(1)
    else:
      vec.append(0)

    '''
    Newly added features
    # next,prev Caps; index word location; word length, ancestor, chunk tag prev next curr
    '''
    # word location
    vec.append(word_ind)

    # word length
    vec.append(len(word))

    # prev to prev pos
    if word_ind>1:
      vec.append(test_pos_tags[sublist][word_ind-2])
    else:
      vec.append(47)

    # next to next pos
    if word_ind < (len(test_tokens[sublist])-2):
      vec.append(test_pos_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # chunk tag of word
    vec.append(test_chunk_tags[sublist][word_ind])

    # chunk tag of preceeding word. If first word, then non-exisitng chunk_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(test_chunk_tags[sublist][word_ind-1])
    else:
      vec.append(47)
      
    # chunk tag of next word
    if word_ind < (len(test_tokens[sublist])-1):
      vec.append(test_chunk_tags[sublist][word_ind+1])
    else:
      vec.append(47)

    # prev to prev chunk
    if word_ind>1:
      vec.append(test_chunk_tags[sublist][word_ind-2])
    else:
      vec.append(47)
      
    # chunk tag of next to next word
    if word_ind < (len(test_tokens[sublist])-2):
      vec.append(test_chunk_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # if previous word is capital word
    if word_ind>0:
      next_word = test_tokens[sublist][word_ind-1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next word is capital word
    if word_ind < (len(test_tokens[sublist])-1):
      next_word = test_tokens[sublist][word_ind+1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if previous to previous word is capital word
    if word_ind>1:
      next_word = test_tokens[sublist][word_ind-2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next to next word is capital word
    if word_ind < (len(test_tokens[sublist])-2):
      next_word = test_tokens[sublist][word_ind+2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    c = 1 # 100
    # is number
    if is_number(word):
      vec.append(c*1)
    else:
      vec.append(0)
    
    # containsDigit
    if containsDigit(word):
      vec.append(c*1)
    else:
      vec.append(0)

    # contains special characters
    if containsSpecialChar(word):
      vec.append(c*1)
    else:
      vec.append(0)

    '''
    Gazetteer List
    '''
    c = 1 #400
    if isPerson(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isOrganization(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isLocation(word):
      vec.append(c*1)
    else:
      vec.append(0)

    test_features.append(np.array(vec))
test_features = np.array(test_features)
# test_features

# Train SVM Model with new added features for your own data
### Just uncomment
### Or load our trained model in next cell directly

In [None]:
def train_svm_feature_model()
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    SVM_newFeatures = SVC(kernel='rbf')
    SVM_newFeatures.fit(features,[word for sublist in updated_dataset["train"]["ner_tags"]  for word in sublist])
    # predict the labels on validation dataset
    predictions_SVM_newFeatures = SVM_newFeatures.predict(test_features)
    # Use accuracy_score function to get the accuracy
    print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_newFeatures, Test_Y)*100)

# SVM_newFeatures = train_svm_feature_model()

SVM Accuracy Score ->  93.39506837514806


## Load Trained Model

In [17]:
with open('models/svm-linear-scaled-pass1-B.pkl','rb') as handle:
    SVM_newFeatures = pickle.load(handle)
predictions_SVM_newFeatures = SVM_newFeatures.predict(test_features)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_newFeatures, Test_Y)*100)

SVM Accuracy Score ->  93.30246581242598


### Combining the postive cases from both the models

In [18]:

predictions_SVM_3 = predictions_SVM_newFeatures
predictions_SVM_4 = []
for i in range(len(predictions_SVM_3)):
  if predictions_SVM_3[i]==1 or predictions_SVM[i]==1:
    predictions_SVM_4.append(1)
  else:
    predictions_SVM_4.append(0)

### Metric for SVM model trained on just the tf-idf spare matrix  representation

In [15]:
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM, Test_Y)*100)


print("Confusion_matrix \n",confusion_matrix(Test_Y, predictions_SVM))


SVM Accuracy Score ->  93.56519866480025
SVM Precision Score ->  65.81607495069034
SVM Recall Score ->  96.12891609650703
SVM F1 Score ->  78.13551880579541
Confusion_matrix 
 [[38108   215]
 [ 2773  5339]]


### Metrics for New Feature SVM Model


In [19]:
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_3, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM_3, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM_3, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM_3, Test_Y)*100)


print("Confusion_matrix \n",confusion_matrix(Test_Y, predictions_SVM_3))


SVM Accuracy Score ->  93.30246581242598
SVM Precision Score ->  80.23915187376726
SVM Recall Score ->  81.2000998003992
SVM F1 Score ->  80.71676587301587
Confusion_matrix 
 [[36816  1507]
 [ 1603  6509]]


### Metrics for combined Model

In [20]:
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_4, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM_4, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM_4, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM_4, Test_Y)*100)


print("Confusion_matrix \n",confusion_matrix(Test_Y, predictions_SVM_4))


SVM Accuracy Score ->  95.29234413696564
SVM Precision Score ->  92.92406311637082
SVM Recall Score ->  82.38251366120218
SVM F1 Score ->  87.33634573050631
Confusion_matrix 
 [[36711  1612]
 [  574  7538]]


# Pass 2: 
## Using predictions of pass 1 as input to pass 2

In [21]:
Tfidf_vect_suffix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
Tfidf_vect_suffix.fit([word[-3:] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[-3:] for sublist in dataset["validation"]["tokens"]  for word in sublist]
+[word[-3:] for sublist in dataset["test"]["tokens"]  for word in sublist])

Tfidf_vect_prefix = TfidfVectorizer(lowercase=False,token_pattern=r".*")
Tfidf_vect_prefix.fit([word[:3] for sublist in dataset["train"]["tokens"]  for word in sublist]+[word[:3] for sublist in dataset["validation"]["tokens"]  for word in sublist]
+[word[:3] for sublist in dataset["test"]["tokens"]  for word in sublist])

label = [word for sublist in updated_dataset["train"]["ner_tags"]  for word in sublist]
label_ind = 0
# [(word) for sublist in dataset["train"]["tokens"]  for word in sublist]
features = []
tokens = dataset["train"]["tokens"]
pos_tags = dataset["train"]["pos_tags"]
chunk_tags = dataset["train"]["chunk_tags"]
for sublist in range(len(tokens[:])):
  for word_ind in range(len(tokens[sublist])):
    word = tokens[sublist][word_ind]
    # print(word)
    vec = []
    # vec.append(word)

    #TF_IDF value of word
    vec.append(Tfidf_vect.vocabulary_[word])

    #Whether word is first word of sentence.
    if word_ind==0:
      vec.append(1)
    else:
      vec.append(0)

    #Whether word is last word of sentence.
    if word_ind==(len(tokens[sublist])-1):
      vec.append(1)
    else:
      vec.append(0)

    #Whether word is title or has first letter capitalised.
    
    if word.istitle():
      vec.append(1)
    else:
      vec.append(0)
    
    #TF_IDF vale of suffix of word.
    vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])

    #TF_IDF vale of prefix of word.
    vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])


    #POS tag of word.
    vec.append(pos_tags[sublist][word_ind])

    #POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(pos_tags[sublist][word_ind-1])
    else:
      vec.append(47)

    #POS tag of suceeding word.
    if word_ind < (len(tokens[sublist])-1):
      vec.append(pos_tags[sublist][word_ind+1])
    else:
      vec.append(47)


    #Whether word contains any digit.
    if bool(re.search(r'\d', word)):
      vec.append(1)
    else:
      vec.append(0)


    '''
    Newly added features
    # next,prev Caps; index word location; word length, ancestor, chunk tag prev next curr
    '''
    # word location
    vec.append(word_ind)

    # word length
    vec.append(len(word))

    # prev to prev pos
    if word_ind>1:
      vec.append(pos_tags[sublist][word_ind-2])
    else:
      vec.append(47)

    # next to next pos
    if word_ind < (len(tokens[sublist])-2):
      vec.append(pos_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # chunk tag of word
    vec.append(chunk_tags[sublist][word_ind])

    # chunk tag of preceeding word. If first word, then non-exisitng chunk_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(chunk_tags[sublist][word_ind-1])
    else:
      vec.append(47)
      
    # chunk tag of next word
    if word_ind < (len(tokens[sublist])-1):
      vec.append(chunk_tags[sublist][word_ind+1])
    else:
      vec.append(47)

    # prev to prev chunk
    if word_ind>1:
      vec.append(chunk_tags[sublist][word_ind-2])
    else:
      vec.append(47)
      
    # chunk tag of next to next word
    if word_ind < (len(tokens[sublist])-2):
      vec.append(chunk_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # if previous word is capital word
    if word_ind>0:
      next_word = tokens[sublist][word_ind-1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next word is capital word
    if word_ind < (len(tokens[sublist])-1):
      next_word = tokens[sublist][word_ind+1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if previous to previous word is capital word
    if word_ind>1:
      next_word = tokens[sublist][word_ind-2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next to next word is capital word
    if word_ind < (len(tokens[sublist])-2):
      next_word = tokens[sublist][word_ind+2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    '''
    Next and prev NEs
    '''
    c = 1 #100
    # prev to prev word NE
    if word_ind>1 and label_ind >1 :
      vec.append(c*label[label_ind - 2])
    else:
      vec.append(0)

    # prev word NE
    if word_ind>0 and label_ind >0 :
      vec.append(c*label[label_ind - 1])
    else:
      vec.append(0)

    # next word NE
    if word_ind < (len(tokens[sublist])-1) and (label_ind + 1 )< len(label):
      vec.append(c*label[label_ind + 1])
    else:
      vec.append(0)

    # next to next word NE
    if word_ind < (len(tokens[sublist])-2) and (label_ind + 2 )< len(label):
      vec.append(c*label[label_ind + 2])
    else:
      vec.append(0)

    c = 1 # 50
    # is number
    if is_number(word):
      vec.append(c*1)
    else:
      vec.append(0)
    
    # containsDigit
    if containsDigit(word):
      vec.append(c*1)
    else:
      vec.append(0)

    # contains special characters
    if containsSpecialChar(word):
      vec.append(c*1)
    else:
      vec.append(0)

    '''
    Gazetteer List
    '''
    c = 1 # 200
    if isPerson(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isOrganization(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isLocation(word):
      vec.append(c*1)
    else:
      vec.append(0)

    label_ind += 1

    features.append(np.array(vec))
features = np.array(features)

## Train Model on your own data
## Or Load trained model from below cell

In [None]:
def train_svm_with_NE():
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    svm_with_NE = SVC(kernel='linear')
    svm_with_NE.fit(features,label)

# svm_with_NE = train_svm_with_NE()

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Trained Model for SVM with NEI predictions from Pass 1

In [None]:
with open('models/svm-linear-pass2-A-no-scale.pkl','rb') as handle:
    svm_with_NE = pickle.load(handle)

## Test Features

## uses previous predictions for next NE label

In [23]:
pred_ind = 0
prev_prediction = predictions_SVM_4
test_features = []
test_tokens = dataset["test"]["tokens"]
test_pos_tags = dataset["test"]["pos_tags"]
test_chunk_tags = dataset["test"]["chunk_tags"]
c = 100
for sublist in range(len(test_tokens[:])):
  for word_ind in range(len(test_tokens[sublist])):
    word = test_tokens[sublist][word_ind]
    # print(word)
    vec = []
    # vec.append(word)

    #TF_IDF value of word
    vec.append(Tfidf_vect.vocabulary_[word])

    #Whether word is first word of sentence.
    if word_ind==0:
      vec.append(1)
    else:
      vec.append(0)

    #Whether word is last word of sentence.
    if word_ind==(len(test_tokens[sublist])-1):
      vec.append(1)
    else:
      vec.append(0)

    #Whether word is title or has first letter capitalised.
    
    if word.istitle():
      vec.append(1)
    else:
      vec.append(0)
    
    #TF_IDF vale of suffix of word.
    vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])

    #TF_IDF vale of prefix of word.
    vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])


    #POS tag of word.
    vec.append(test_pos_tags[sublist][word_ind])

    #POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(test_pos_tags[sublist][word_ind-1])
    else:
      vec.append(47)
      

    #POS tag of suceeding word.
    if word_ind < (len(test_tokens[sublist])-1):
      vec.append(test_pos_tags[sublist][word_ind+1])
    else:
      vec.append(47)


    #Whether word contains any digit.
    if bool(re.search(r'\d', word)):
      vec.append(1)
    else:
      vec.append(0)

    '''
    Newly added features
    # next,prev Caps; index word location; word length, ancestor, chunk tag prev next curr
    '''
    # word location
    vec.append(word_ind)

    # word length
    vec.append(len(word))

    # prev to prev pos
    if word_ind>1:
      vec.append(test_pos_tags[sublist][word_ind-2])
    else:
      vec.append(47)

    # next to next pos
    if word_ind < (len(test_tokens[sublist])-2):
      vec.append(test_pos_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # chunk tag of word
    vec.append(test_chunk_tags[sublist][word_ind])

    # chunk tag of preceeding word. If first word, then non-exisitng chunk_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(test_chunk_tags[sublist][word_ind-1])
    else:
      vec.append(47)
      
    # chunk tag of next word
    if word_ind < (len(test_tokens[sublist])-1):
      vec.append(test_chunk_tags[sublist][word_ind+1])
    else:
      vec.append(47)

    # prev to prev chunk
    if word_ind>1:
      vec.append(test_chunk_tags[sublist][word_ind-2])
    else:
      vec.append(47)
      
    # chunk tag of next to next word
    if word_ind < (len(test_tokens[sublist])-2):
      vec.append(test_chunk_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # if previous word is capital word
    if word_ind>0:
      next_word = test_tokens[sublist][word_ind-1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next word is capital word
    if word_ind < (len(test_tokens[sublist])-1):
      next_word = test_tokens[sublist][word_ind+1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if previous to previous word is capital word
    if word_ind>1:
      next_word = test_tokens[sublist][word_ind-2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next to next word is capital word
    if word_ind < (len(test_tokens[sublist])-2):
      next_word = test_tokens[sublist][word_ind+2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    '''
    Next and prev NEs
    '''
    c = 1 # 100
    # prev to prev word NE
    if word_ind>1 and pred_ind >1 :
      vec.append(c*prev_prediction[pred_ind - 2])
    else:
      vec.append(0)

    # prev word NE
    if word_ind>0 and pred_ind >0 :
      vec.append(c*prev_prediction[pred_ind - 1])
    else:
      vec.append(0)

    # next word NE
    if word_ind < (len(test_tokens[sublist])-1) and (pred_ind + 1 )< len(prev_prediction):
      vec.append(c*prev_prediction[pred_ind + 1])
    else:
      vec.append(0)

    # next to next word NE
    if word_ind < (len(test_tokens[sublist])-2) and (pred_ind + 2 )< len(prev_prediction):
      vec.append(c*prev_prediction[pred_ind + 2])
    else:
      vec.append(0)

    c = 1 # 50
    # is number
    if is_number(word):
      vec.append(c*1)
    else:
      vec.append(0)
    
    # containsDigit
    if containsDigit(word):
      vec.append(c*1)
    else:
      vec.append(0)

    # contains special characters
    if containsSpecialChar(word):
      vec.append(c*1)
    else:
      vec.append(0)

    '''
    Gazetteer List
    '''
    c = 1 # 200
    if isPerson(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isOrganization(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isLocation(word):
      vec.append(c*1)
    else:
      vec.append(0)

    pred_ind += 1

    test_features.append(np.array(vec))
test_features = np.array(test_features)

## Metric for SVM model with NE label as features

In [24]:
predictions_SVM_with_nextprev_NE = svm_with_NE.predict(test_features)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_with_nextprev_NE, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM_with_nextprev_NE, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM_with_nextprev_NE, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM_with_nextprev_NE, Test_Y)*100)
print("Confusion_matrix \n",confusion_matrix(Test_Y, predictions_SVM_with_nextprev_NE))

SVM Accuracy Score ->  93.3670722515344
SVM Precision Score ->  79.46252465483235
SVM Recall Score ->  82.01017811704835
SVM F1 Score ->  80.71625344352617
Confusion_matrix 
 [[36909  1414]
 [ 1666  6446]]


## Metrics for above Model combined with tf-idf SVM model

In [25]:
predictions_SVM_5 = [] 
for i in range(len(predictions_SVM_newFeatures)):
  if predictions_SVM[i]==1 or predictions_SVM_with_nextprev_NE[i]==1:
    predictions_SVM_5.append(1)
  else:
    predictions_SVM_5.append(0)



In [26]:

print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_5, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM_5, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM_5, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM_5, Test_Y)*100)
print("Confusion_matrix \n",confusion_matrix(Test_Y, predictions_SVM_5))

SVM Accuracy Score ->  95.41078927533111
SVM Precision Score ->  92.48027613412229
SVM Recall Score ->  83.14307879862574
SVM F1 Score ->  87.56346658885322
Confusion_matrix 
 [[36802  1521]
 [  610  7502]]



## Testing if Improvement if ground truth labels were used
### : Performance is better

In [27]:
pred_ind = 0
prev_prediction = Test_Y
test_features = []

test_tokens = dataset["test"]["tokens"]
test_pos_tags = dataset["test"]["pos_tags"]
test_chunk_tags = dataset["test"]["chunk_tags"]
c = 100
for sublist in range(len(test_tokens[:])):
  for word_ind in range(len(test_tokens[sublist])):
    word = test_tokens[sublist][word_ind]
    # print(word)
    vec = []
    # vec.append(word)

    # TF_IDF value of word
    vec.append(Tfidf_vect.vocabulary_[word])

    # Whether word is first word of sentence.
    if word_ind==0:
      vec.append(1)
    else:
      vec.append(0)

    # Whether word is last word of sentence.
    if word_ind==(len(test_tokens[sublist])-1):
      vec.append(1)
    else:
      vec.append(0)

    # Whether word is title or has first letter capitalised.
    
    if word.istitle():
      vec.append(1)
    else:
      vec.append(0)
    
    # TF_IDF vale of suffix of word.
    vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])

    # TF_IDF vale of prefix of word.
    vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])


    # POS tag of word.
    vec.append(test_pos_tags[sublist][word_ind])

    # POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(test_pos_tags[sublist][word_ind-1])
    else:
      vec.append(47)
      

    # POS tag of suceeding word.
    if word_ind < (len(test_tokens[sublist])-1):
      vec.append(test_pos_tags[sublist][word_ind+1])
    else:
      vec.append(47)


    # Whether word contains any digit.
    if bool(re.search(r'\d', word)):
      vec.append(1)
    else:
      vec.append(0)

    '''
    Newly added features
    # next,prev Caps; index word location; word length, ancestor, chunk tag prev next curr
    '''
    # word location
    vec.append(word_ind)

    # word length
    vec.append(len(word))

    # prev to prev pos
    if word_ind>1:
      vec.append(test_pos_tags[sublist][word_ind-2])
    else:
      vec.append(47)

    # next to next pos
    if word_ind < (len(test_tokens[sublist])-2):
      vec.append(test_pos_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # chunk tag of word
    vec.append(test_chunk_tags[sublist][word_ind])

    # chunk tag of preceeding word. If first word, then non-exisitng chunk_tag value 47 taken to indicate boundary.
    if word_ind>0:
      vec.append(test_chunk_tags[sublist][word_ind-1])
    else:
      vec.append(47)
      
    # chunk tag of next word
    if word_ind < (len(test_tokens[sublist])-1):
      vec.append(test_chunk_tags[sublist][word_ind+1])
    else:
      vec.append(47)

    # prev to prev chunk
    if word_ind>1:
      vec.append(test_chunk_tags[sublist][word_ind-2])
    else:
      vec.append(47)
      
    # chunk tag of next to next word
    if word_ind < (len(test_tokens[sublist])-2):
      vec.append(test_chunk_tags[sublist][word_ind+2])
    else:
      vec.append(47)

    # if previous word is capital word
    if word_ind>0:
      next_word = test_tokens[sublist][word_ind-1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next word is capital word
    if word_ind < (len(test_tokens[sublist])-1):
      next_word = test_tokens[sublist][word_ind+1]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if previous to previous word is capital word
    if word_ind>1:
      next_word = test_tokens[sublist][word_ind-2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    # if next to next word is capital word
    if word_ind < (len(test_tokens[sublist])-2):
      next_word = test_tokens[sublist][word_ind+2]
      if next_word.istitle():
        vec.append(1)
      else:
        vec.append(0)
    else:
      vec.append(0)

    '''
    Next and prev NEs
    '''
    c = 1 # 100
    # prev to prev word NE
    if word_ind>1 and pred_ind >1 :
      vec.append(c*prev_prediction[pred_ind - 2])
    else:
      vec.append(0)

    # prev word NE
    if word_ind>0 and pred_ind >0 :
      vec.append(c*prev_prediction[pred_ind - 1])
    else:
      vec.append(0)

    # next word NE
    if word_ind < (len(test_tokens[sublist])-1) and (pred_ind + 1 )< len(prev_prediction):
      vec.append(c*prev_prediction[pred_ind + 1])
    else:
      vec.append(0)

    # next to next word NE
    if word_ind < (len(test_tokens[sublist])-2) and (pred_ind + 2 )< len(prev_prediction):
      vec.append(c*prev_prediction[pred_ind + 2])
    else:
      vec.append(0)

    c = 1 # 50
    # is number
    if is_number(word):
      vec.append(c*1)
    else:
      vec.append(0)
    
    # containsDigit
    if containsDigit(word):
      vec.append(c*1)
    else:
      vec.append(0)

    # contains special characters
    if containsSpecialChar(word):
      vec.append(c*1)
    else:
      vec.append(0)

    '''
    Gazetteer List
    '''
    c = 1 # 200
    if isPerson(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isOrganization(word):
      vec.append(c*1)
    else:
      vec.append(0)

    if isLocation(word):
      vec.append(c*1)
    else:
      vec.append(0)

    pred_ind += 1

    test_features.append(np.array(vec))
test_features = np.array(test_features)

# Metrics for Pass 2
### Just ground truth labels used instead of Pass 1 predictions

In [29]:
predictions_SVM_with_nextprev_NE = svm_with_NE.predict(test_features)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_with_nextprev_NE, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM_with_nextprev_NE, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM_with_nextprev_NE, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM_with_nextprev_NE, Test_Y)*100)
print("Confusion_matrix \n",confusion_matrix(Test_Y, predictions_SVM_with_nextprev_NE))
predictions_SVM_5 = [] 
for i in range(len(predictions_SVM_newFeatures)):
  if predictions_SVM[i]==1 or predictions_SVM_with_nextprev_NE[i]==1:
    predictions_SVM_5.append(1)
  else:
    predictions_SVM_5.append(0)

SVM Accuracy Score ->  93.69871863895767
SVM Precision Score ->  79.65976331360946
SVM Recall Score ->  83.50995089170328
SVM F1 Score ->  81.53943217665615
Confusion_matrix 
 [[37047  1276]
 [ 1650  6462]]


## After Merging with TF-IDF

In [30]:
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM_5, Test_Y)*100)
print("SVM Precision Score -> ",precision_score(predictions_SVM_5, Test_Y)*100)
print("SVM Recall Score -> ",recall_score(predictions_SVM_5, Test_Y)*100)
print("SVM F1 Score -> ",f1_score(predictions_SVM_5, Test_Y)*100)
print("Confusion_matrix \n",confusion_matrix(Test_Y, predictions_SVM_5))

SVM Accuracy Score ->  95.71228599117046
SVM Precision Score ->  92.68984220907298
SVM Recall Score ->  84.3220814175171
SVM F1 Score ->  88.30818016325092
Confusion_matrix 
 [[36925  1398]
 [  593  7519]]


In [None]:
def make_features(sent_list, pass1_prediction = None):
  sent_list_pos = []
  sent_list_chunk = []
  tree = ne_chunk(pos_tag(word_tokenize(" ".join(sent_list))))
  iob_tags = tree2conlltags(tree)
  prev_prediction = pass1_prediction
  pred_ind = 0
  for w,p,c in iob_tags:
      key_id = p+'_'+c
      if key_id in pos_max_token.keys():
          sent_list_pos.append(pos_max_token[key_id])
      else:
          sent_list_pos.append(0)

      if key_id in chunk_max_token.keys():
          sent_list_chunk.append(chunk_max_token[key_id])
      else:
          sent_list_chunk.append(0)

  test_sent_feature = []
  for word_ind in range(len(sent_list)):
      word = sent_list[word_ind]
      # print(word)
      vec = []
      # vec.append(word)

      # TF_IDF value of word
      alpha = 0
      try:
        vec.append(Tfidf_vect.vocabulary_[word])
      except:
        vec.append(alpha)

      # Whether word is first word of sentence.
      if word_ind==0:
        vec.append(1)
      else:
        vec.append(0)

      # Whether word is last word of sentence.
      if word_ind==(len(sent_list)-1):
        vec.append(1)
      else:
        vec.append(0)

      # Whether word is title or has first letter capitalised.
      
      if word.istitle():
        vec.append(1)
      else:
        vec.append(0)
      
      # TF_IDF vale of suffix of word.
      try:
        vec.append(Tfidf_vect_suffix.vocabulary_[word[-3:]])
      except:
        vec.append(0)

      # TF_IDF vale of prefix of word.
      try: 
        vec.append(Tfidf_vect_prefix.vocabulary_[word[:3]])
      except:
        vec.append(0)


      # POS tag of word.
      vec.append(sent_list_pos[word_ind])

      # POS tag of preceeding word. If first word, then non-exisitng pos_tag value 47 taken to indicate boundary.
      if word_ind>0:
        vec.append(sent_list_pos[word_ind-1])
      else:
        vec.append(47)
        

      # POS tag of suceeding word.
      if word_ind < (len(sent_list)-1):
        vec.append(sent_list_pos[word_ind+1])
      else:
        vec.append(47)


      # Whether word contains any digit.
      if bool(re.search(r'\d', word)):
        vec.append(1)
      else:
        vec.append(0)

      '''
      Newly added features
      # next,prev Caps; index word location; word length, ancestor, chunk tag prev next curr
      '''
      # word location
      vec.append(word_ind)

      # word length
      vec.append(len(word))

      # prev to prev pos
      if word_ind>1:
        vec.append(sent_list_pos[word_ind-2])
      else:
        vec.append(47)

      # next to next pos
      if word_ind < (len(sent_list)-2):
        vec.append(sent_list_pos[word_ind+2])
      else:
        vec.append(47)

      # chunk tag of word
      vec.append(sent_list_chunk[word_ind])

      # chunk tag of preceeding word. If first word, then non-exisitng chunk_tag value 47 taken to indicate boundary.
      if word_ind>0:
        vec.append(sent_list_chunk[word_ind-1])
      else:
        vec.append(47)
        
      # chunk tag of next word
      if word_ind < (len(sent_list)-1):
        vec.append(sent_list_chunk[word_ind+1])
      else:
        vec.append(47)

      # prev to prev chunk
      if word_ind>1:
        vec.append(sent_list_chunk[word_ind-2])
      else:
        vec.append(47)
        
      # chunk tag of next to next word
      if word_ind < (len(sent_list)-2):
        vec.append(sent_list_chunk[word_ind+2])
      else:
        vec.append(47)

      # if previous word is capital word
      if word_ind>0:
        next_word = sent_list[word_ind-1]
        if next_word.istitle():
          vec.append(1)
        else:
          vec.append(0)
      else:
        vec.append(0)

      # if next word is capital word
      if word_ind < (len(sent_list)-1):
        next_word = sent_list[word_ind+1]
        if next_word.istitle():
          vec.append(1)
        else:
          vec.append(0)
      else:
        vec.append(0)

      # if previous to previous word is capital word
      if word_ind>1:
        next_word = sent_list[word_ind-2]
        if next_word.istitle():
          vec.append(1)
        else:
          vec.append(0)
      else:
        vec.append(0)

      # if next to next word is capital word
      if word_ind < (len(sent_list)-2):
        next_word = sent_list[word_ind+2]
        if next_word.istitle():
          vec.append(1)
        else:
          vec.append(0)
      else:
        vec.append(0)


      '''
      Used only when pass 1 output given
      '''
      if pass1_prediction is not None:
        c = 1 # 100
        # prev to prev word NE
        if word_ind>1 and pred_ind >1 :
          vec.append(c*prev_prediction[pred_ind - 2])
        else:
          vec.append(0)

        # prev word NE
        if word_ind>0 and pred_ind >0 :
          vec.append(c*prev_prediction[pred_ind - 1])
        else:
          vec.append(0)

        # next word NE
        if word_ind < (len(sent_list)-1) and (pred_ind + 1 )< len(prev_prediction):
          vec.append(c*prev_prediction[pred_ind + 1])
        else:
          vec.append(0)

        # next to next word NE
        if word_ind < (len(sent_list)-2) and (pred_ind + 2 )< len(prev_prediction):
          vec.append(c*prev_prediction[pred_ind + 2])
        else:
          vec.append(0)

        pred_ind += 1
      '''
      Next NEs end
      '''

      c = 1 # 100
      # is number
      if is_number(word):
        vec.append(c*1)
      else:
        vec.append(0)
      
      # containsDigit
      if containsDigit(word):
        vec.append(c*1)
      else:
        vec.append(0)

      # contains special characters
      if containsSpecialChar(word):
        vec.append(c*1)
      else:
        vec.append(0)

      '''
      Gazetteer List
      '''
      c = 1 #400
      if isPerson(word):
        vec.append(c*1)
      else:
        vec.append(0)

      if isOrganization(word):
        vec.append(c*1)
      else:
        vec.append(0)

      if isLocation(word):
        vec.append(c*1)
      else:
        vec.append(0)

      test_sent_feature.append(np.array(vec))

  return test_sent_feature
    

In [None]:
def predict_sentence(sent):
    sent_list = sent.lower().split()
    sent_tfidf = Tfidf_vect.transform(sent_list)
    sent_pred1 = SVM.predict(sent_tfidf)

    # sent_list = sent.split()
    sent_feature = make_features(sent_list)
    sent_pred2 = SVM_newFeatures.predict(sent_feature)

    pass1_res = []
    for pred1,pred2 in zip(sent_pred1, sent_pred2):
        if pred1 or pred2:
            pass1_res.append(1)
        else:
            pass1_res.append(0)

    sent_new_NE_feature = make_features(sent_list, pass1_res)
    sent_pred3 = svm_with_NE.predict(sent_new_NE_feature)

    pass2_res = []
    for pred1,pred2 in zip(sent_pred2, sent_pred3):
        if pred1 or pred2:
            pass2_res.append(1)
        else:
            pass2_res.append(0)
    return pass2_res

In [None]:
predict_sentence('The State Bank of India is the largest bank in the country')

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

In [None]:
predict_sentence('Pushpak Bhattacharyya teaches us CS626')

[1, 1, 1, 0, 0]

In [None]:
predict_sentence('India got its freedom on 15th August 1947')

[1, 0, 0, 0, 0, 0, 1, 0]

In [None]:
predict_sentence('India got its freedom on 15-8-1947')

[1, 0, 0, 0, 0, 1]

In [None]:
predict_sentence('India got its freedom on 15/8/1947')

[1, 0, 0, 0, 0, 0]

In [None]:
predict_sentence('India got its freedom on 15.8.1947')

[1, 0, 0, 0, 0, 0]

In [None]:
feature_list = [
'tfidf', 'first_word', 'last_word', 'caps',
'word_suffix', 'word_prefix', 'pos', 'prev_pos',
'next_pos', 'digitsWithSpecialChar', 'word_loc', 'word_len',
'prev_prev_pos', 'next_next_pos', 'chunk_tag', 
'prev_chunk_tag', 'next_chunk_tag',
'prev_prev_chunk_tag', 'next_next_chunk_tag',
'prev_caps','next_caps','prev_prev_caps',
'next_next_caps', 
 'prev_prev_NE', 'prev_NE', 'next_NE', 'next_next_NE', # pass 2
'is_number', 'containsDigit','containsSpecialChar',
'isPerson', 'isOrganization', 'isLocation'
]

# Feature Analysis

In [None]:
feature_wt = {}
for k,v in zip(feature_list, svm_with_NE.coef_[0]):
    feature_wt[k] = v
feature_wt

{'caps': 1799.490648335171,
 'chunk_tag': -16.373605242442864,
 'containsDigit': -239.62581877378705,
 'containsSpecialChar': -38.48916502772374,
 'digitsWithSpecialChar': -239.62581877378705,
 'first_word': 163.46877295035043,
 'isLocation': 1490.0,
 'isOrganization': 5.6251539989608546,
 'isPerson': 462.03634746161936,
 'is_number': -34.713888118493,
 'last_word': 111.7576588397026,
 'next_NE': 850.0008000194485,
 'next_caps': 245.46835838195614,
 'next_chunk_tag': 5.373479522186244,
 'next_next_NE': 64.46147694882666,
 'next_next_caps': -90.53852305117334,
 'next_next_chunk_tag': 21.265752206061734,
 'next_next_pos': -19.024241053273727,
 'next_pos': -15.206174396658753,
 'pos': 43.46058487471237,
 'prev_NE': 736.688068448909,
 'prev_caps': 196.93040960920615,
 'prev_chunk_tag': 1.6821546255523572,
 'prev_pos': -28.850670244224602,
 'prev_prev_NE': 238.8214371242916,
 'prev_prev_caps': 45.321926861789564,
 'prev_prev_chunk_tag': 20.48429693326034,
 'prev_prev_pos': -27.7894473852356

In [None]:
def mod(n):
    if n<0: return -1*n
    return n

# Top k Features

In [None]:
k = 20
sorted(feature_wt, key=lambda k: mod(feature_wt[k]))[::-1][:k]

['caps',
 'isLocation',
 'next_NE',
 'prev_NE',
 'isPerson',
 'word_len',
 'next_caps',
 'containsDigit',
 'digitsWithSpecialChar',
 'prev_prev_NE',
 'prev_caps',
 'first_word',
 'last_word',
 'next_next_caps',
 'next_next_NE',
 'prev_prev_caps',
 'pos',
 'containsSpecialChar',
 'is_number',
 'prev_pos']