## Predict on Test Set 

This script assumes that the test dataset file is named `ner_test.txt`, that it is located in a folder called `Datasets` (the same folder in which the training data was placed), and that this script sits in the parent folder of `Datasets`. If any of these assumptions does not hold, adjust the file path passed to `predict_on_test_set` below.

In [None]:
import pickle
import string
import pandas as pd
from sklearn import metrics
from nltk import word_tokenize, pos_tag

In [None]:
# Load the pickled classifier that the prediction functions in this file use.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# comes from a trusted source.
# The with-statement closes the file even if unpickling raises.
with open('ne_clf_svc.pickle', 'rb') as saved_classifier:
    ne_clf = pickle.load(saved_classifier)

In [None]:
def predict_on_test_set(filepath):
    """Evaluate the loaded classifier on a space-separated test file.

    Each row of the file is read as ``token pos_tag chunk_tag ne_tag``. One
    feature dict is built per token (the token itself, its POS tag, a window
    of two neighbours on each side, the previous row's NE tag and a few
    spelling flags), all feature dicts are passed to the module-level
    ``ne_clf`` in a single ``predict`` call, and a classification report
    comparing the predictions with the ``ne_tag`` column is printed.

    Args:
        filepath: Location of the test file, passed to ``pandas.read_csv``.

    Returns:
        None. The classification report is written to standard output.
    """
    import csv  # local import: only needed for the quoting constant below

    # QUOTE_NONE keeps a literal '"' token as data. With the default quoting
    # a line such as `" " O O` is read as one quoted field, ends up one column
    # short and is then silently removed by dropna().
    df = pd.read_csv(filepath, sep=' ', header=None, quoting=csv.QUOTE_NONE)
    df.columns = ['token', 'pos_tag', 'chunk_tag', 'ne_tag']
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    y = df['ne_tag']

    # Plain lists avoid the cost of per-element pandas indexing in the loop.
    tokens = df['token'].tolist()
    pos_tags = df['pos_tag'].tolist()
    ne_tags = df['ne_tag'].tolist()
    last = len(tokens) - 1

    def features(index):
        """Build the feature dict for the token at position ``index``."""
        word = tokens[index]
        first_letter = word[0]
        at_start = index == 0
        near_start = index == 0 or index == 1
        at_end = index == last
        near_end = index == last or index == last - 1
        return {'token': word,
                'pos': pos_tags[index],
                'prev_token': '' if at_start else tokens[index-1],
                'prev_pos': '' if at_start else pos_tags[index-1],
                # NOTE(review): the previous tag comes from the gold ne_tag
                # column, so the reported scores assume the preceding tag is
                # known; predict_on_texts feeds back predicted tags instead.
                'prev_ne': '' if at_start else ne_tags[index-1],
                'next_token': '' if at_end else tokens[index+1],
                'next_pos': '' if at_end else pos_tags[index+1],
                'prev_prev_token': '' if near_start else tokens[index-2],
                'prev_prev_pos': '' if near_start else pos_tags[index-2],
                'next_next_token': '' if near_end else tokens[index+2],
                'next_next_pos': '' if near_end else pos_tags[index+2],
                'is_capitalized': first_letter.upper() in string.ascii_uppercase and first_letter.upper() == first_letter,
                'is_numeric': word.isdigit(),
                'is_all_caps': word.upper() == word,
                'caps_inside': word[1:].lower() != word[1:]
                }

    X = [features(index) for index in range(len(tokens))]

    predicted = ne_clf.predict(X)
    print(metrics.classification_report(y, predicted))

In [None]:
# Path of the test set, relative to the folder this script is run from.
file = 'Datasets/ner_test.txt'
# Prints the classification report for the test set.
predict_on_test_set(file)

## Predict on Texts

In [None]:
# Sample news sentences used as input for predict_on_texts.
news_paragraphs = [
    'She vowed to get the deal signed off in Brussels and put it to a vote of MPs.',
    'It follows a string of ministerial resignations and talk of a no-confidence vote from Tory MPs.',
    'Brexit Secretary Dominic Raab and Work and Pensions Secretary Esther McVey both quit earlier in protest at the withdrawal agreement, along with two junior ministers.',
]

In [None]:
def predict_on_texts(texts):
    """Tag every token of the given texts and print ``token => tag`` lines.

    The texts are joined with single spaces, newlines are replaced by spaces,
    and the result is tokenized and POS-tagged with NLTK. Tokens are then
    classified one at a time with the module-level ``ne_clf``, because the
    features of each token include the tag predicted for the token before it.

    Args:
        texts: Iterable of strings to tag.

    Returns:
        None. One ``token => tag`` line per token is written to standard
        output.
    """
    joined = ' '.join(texts).replace('\n', ' ')
    tokens = word_tokenize(joined)
    # Extract the POS tags once; rebuilding this list for every token made
    # the prediction loop quadratic in the number of tokens.
    pos_tags = [tag for _, tag in pos_tag(tokens)]
    ne_tag_hist = []  # tags predicted so far, one per processed token
    last = len(tokens) - 1

    def features(index):
        """Build the feature dict for the token at position ``index``."""
        word = tokens[index]
        first_letter = word[0]
        at_start = index == 0
        near_start = index == 0 or index == 1
        at_end = index == last
        near_end = index == last or index == last - 1
        return {'token': word,
                'pos': pos_tags[index],
                'prev_token': '' if at_start else tokens[index-1],
                'prev_pos': '' if at_start else pos_tags[index-1],
                # Previous tag is the classifier's own earlier prediction.
                'prev_ne': '' if at_start else ne_tag_hist[index-1],
                'next_token': '' if at_end else tokens[index+1],
                'next_pos': '' if at_end else pos_tags[index+1],
                'prev_prev_token': '' if near_start else tokens[index-2],
                'prev_prev_pos': '' if near_start else pos_tags[index-2],
                'next_next_token': '' if near_end else tokens[index+2],
                'next_next_pos': '' if near_end else pos_tags[index+2],
                'is_capitalized': first_letter.upper() in string.ascii_uppercase and first_letter.upper() == first_letter,
                'is_numeric': word.isdigit(),
                'is_all_caps': word.upper() == word,
                'caps_inside': word[1:].lower() != word[1:]
                }

    for index in range(len(tokens)):
        X = features(index)
        predicted = ne_clf.predict(X)
        # The first element of the result is the tag for this token; it is
        # stored so the next token can use it as its 'prev_ne' feature.
        ne_tag_hist.append(predicted[0])

    for token, predicted_tag in zip(tokens, ne_tag_hist):
        print('%s => %s' % (token, predicted_tag))

In [None]:
# Print the predicted tag for every token of the sample sentences.
predict_on_texts(news_paragraphs)