### I. Import Libraries

In [None]:
import time
import pickle
import string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier

%matplotlib inline
# Configure seaborn in a single call. Every sns.set() call resets the options
# it is not given back to their defaults, so three separate calls would leave
# only the last one in effect and silently drop font_scale=1.25.
sns.set(style='whitegrid', font_scale=1.25, color_codes=True)

### II. Import and Examine Dataset

In [None]:
# Load the space-separated NER file; it has no header row, so column names are
# assigned explicitly afterwards.
# NOTE(review): with default settings pd.read_csv skips blank lines, treats `"`
# as a quote character and reads strings such as 'NA'/'null' as missing values.
# Rows that show up as incomplete may therefore be real tokens that were
# mis-parsed, not sentence separators -- confirm against the raw file
# (quoting=csv.QUOTE_NONE and keep_default_na=False would disable both).
ner_data = 'Datasets/ner_dataset.txt'
df = pd.read_csv(ner_data, sep=' ', header=None)
df.columns = ['token', 'pos_tag', 'chunk_tag', 'ne_tag']
df.head()

In [None]:
# Column dtypes and non-null counts; columns whose count is below the row total contain missing values.
df.info()

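Remove the rows with missing values: 6 are missing tokens, and 2,818 are missing named entity tags. Because `pd.read_csv` skips blank lines by default, these are probably not the empty lines after every sentence; more likely they are real tokens that were mis-parsed (for example, a `"` token consumed as a quote character, or a token such as `NA` or `null` read as a missing value). This is worth confirming against the raw file before dropping them.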

In [None]:
# Boolean mask of missing cells, computed once and reused below.
missing_mask = df.isnull()
null_columns = df.columns[missing_mask.any()]

# Number of missing values in each affected column.
print(missing_mask[null_columns].sum())

# Preview of the rows with at least one missing value (affected columns only).
df.loc[missing_mask.any(axis=1), null_columns].head()

In [None]:
# Drop incomplete rows, then renumber the index 0..n-1 so that positional
# look-ups such as token[index-1] in features() line up with row order.
df = df.dropna().reset_index(drop=True)

# Confirm that the missing values have been dropped.
df.info()

In [None]:
# Frequency of each part-of-speech tag, most common first.
df['pos_tag'].value_counts()

In [None]:
# Bar chart of POS tag counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 8))
ax.set_title('POS Tag Frequency')
sns.countplot(data=df, x='pos_tag', ax=ax);

In [None]:
# Frequency of each chunk tag, most common first.
df['chunk_tag'].value_counts()

In [None]:
# Bar chart of chunk tag counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Chunk Tag Frequency')
sns.countplot(data=df, x='chunk_tag', ax=ax);

The vast majority of tokens are not named entities: 210,679 out of 253,321 (83.17%). This gives one baseline for classifier accuracy: always predicting the non-entity tag would already be correct 83.17% of the time.

In [None]:
# Frequency of each named entity tag, most common first.
df['ne_tag'].value_counts()

In [None]:
# Bar chart of named entity tag counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Named Entity Tag Frequency')
sns.countplot(data=df, x='ne_tag', ax=ax);

To better understand the tags for feature engineering (what are the qualities/patterns that certain tags have?), generate examples of each.

General named entity feature trends: capitalisation; POS tag is noun of some kind.

In [None]:
# Show 20 random rows for one tag; change the tag in the filter below to
# inspect another one. Observations noted per tag:
#   B-ORG  - First word in organisation name. Some tokens are abbreviations in all caps (e.g., 'EU').
#   I-ORG  - Second word in organisation name.
#   B-MISC - Not sure how this differs from B-ORG. Many are adjectives.
#   I-MISC - Second part of B-MISC.
#   B-PER  - First names of people.
#   I-PER  - Surnames of people. Mostly capitalised but not always (e.g., 'van').
#   B-LOC  - Country and city names.
#   I-LOC  - Second part of country and city names.
df[df['ne_tag'] == 'B-PER'].sample(20)

### III. Feature Engineering and Preparation of Training and Test Sets

In [None]:
def features(token, index, pos_tag, ne_tag):
    """Build the feature dictionary for the token at position ``index``.

    Parameters
    ----------
    token, pos_tag, ne_tag : sequence
        Parallel sequences holding the tokens, POS tags and named entity tags.
        They must support ``len()`` and look-up by integer position (a list,
        or a pandas Series whose index runs 0..n-1).
    index : int
        Position of the token to describe.

    Returns
    -------
    dict
        The token and its POS tag, the tokens/POS tags up to two positions
        before and after it, the NE tag of the previous token, and four
        boolean surface-form flags. Context that falls outside the sequence
        is represented by an empty string.

    Notes
    -----
    * Sequence boundaries are taken from ``len(token)``, so the function works
      on any sequence passed in rather than only on the notebook's global frame.
    * ``prev_ne`` is read straight from ``ne_tag``; whoever calls this at
      prediction time has to supply a tag for the preceding token.
    * Chunk tags are deliberately not used as a feature.
    """
    n_tokens = len(token)
    word = token[index]
    # Slicing yields '' for an empty token instead of raising IndexError.
    first_letter = word[:1]

    # Boundary flags: which neighbours exist for this position.
    at_start = index == 0
    near_start = index in (0, 1)
    at_end = index == n_tokens - 1
    near_end = index in (n_tokens - 1, n_tokens - 2)

    feats = {'token': word,
             'pos': pos_tag[index],
             'prev_token': '' if at_start else token[index-1],
             'prev_pos': '' if at_start else pos_tag[index-1],
             'prev_ne': '' if at_start else ne_tag[index-1],
             'next_token': '' if at_end else token[index+1],
             'next_pos': '' if at_end else pos_tag[index+1],
             'prev_prev_token': '' if near_start else token[index-2],
             'prev_prev_pos': '' if near_start else pos_tag[index-2],
             'next_next_token': '' if near_end else token[index+2],
             'next_next_pos': '' if near_end else pos_tag[index+2],
             # True only when the first character is an ASCII capital letter.
             'is_capitalized': first_letter != '' and first_letter in string.ascii_uppercase,
             'is_numeric': word.isdigit(),
             # Also True for tokens without letters (digits, punctuation),
             # because upper() leaves them unchanged.
             'is_all_caps': word.upper() == word,
             # True when any character after the first changes under lower().
             'caps_inside': word[1:].lower() != word[1:]
             }
    return feats
    
# One feature dictionary per row of df, in row order.
X = [features(df.token, index, df.pos_tag, df.ne_tag)
     for index in range(len(df.token))]

In [None]:
# Inspect the feature dictionaries generated for the first five tokens.
X[:5]

In [None]:
# Target: the named entity tag of every token.
y = df['ne_tag']

# Hold out 10% of the tokens for testing. df.index is split alongside X and y
# so that each sample can be traced back to its row in df.
# NOTE(review): the split shuffles individual tokens, while the features
# include neighbouring tokens and the previous NE tag, so test tokens share
# context with training tokens -- scores may be optimistic.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(X, y, df.index, test_size=0.1, random_state=0)

# The labels name what is actually counted: feature dicts (X) and NE tags (y).
print("Size of training set (features):", len(X_train))
print("Size of test set (features):", len(X_test))
print("Size of training set (NE tags):", len(y_train))
print("Size of test set (NE tags):", len(y_test))

### IV. Classifier Training

In [None]:
# Experiment log: score recorded for each feature set tried in features().
# ('' appears to stand for "the feature set on the previous line, plus ...".)
#   token, pos: 0.8932555327159474
#   token, chunk: 0.8494862890783083 (took a LOT longer)
#   token, pos, chunk: 0.8957114704338746 (chunk doesn't help)
#   token, pos, previous token/pos: 0.8995017958096173
#   token, pos, previous token/pos, next token/pos: 0.9055109354730966
#   token, pos, previous token/pos, next token/pos, is_capitalised: 0.9115229634875965
#   token, pos, previous token/pos, previous previous token/pos, next token/pos, next next token/pos, is_capitalised: 0.9128023148431639
#   '', is_numeric: 0.9127205040729689
#   '', is_all_caps: 0.9132049059426229
#   '', caps_inside: 0.9142095785522023
#   '', prev_ne: 0.9387959473000405

start_time = time.time()

# Decision tree on one-hot encoded feature dictionaries (dense matrix).
dt_steps = [
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(random_state=0, criterion='entropy')),
]
clf_dt = Pipeline(steps=dt_steps)

# Only the first 10,000 training samples are used for this model.
clf_dt.fit(X_train[:10000], y_train[:10000])

print("Total time:", time.time() - start_time)

# Evaluate on the full held-out set.
predicted_dt = clf_dt.predict(X_test)
f1_dt = metrics.f1_score(y_test, predicted_dt, average='weighted')
print("Mean F1 score (weighted):", f1_dt)

In [None]:
start_time = time.time()

# Multinomial naive Bayes (alpha=0.01) on one-hot encoded feature dictionaries.
nb_steps = [
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', MultinomialNB(alpha=0.01)),
]
clf_nb = Pipeline(steps=nb_steps)

# Train on the first 50,000 training samples.
clf_nb.fit(X_train[:50000], y_train[:50000])

print("Total time:", time.time() - start_time)

# Evaluate on the full held-out set.
predicted_nb = clf_nb.predict(X_test)
f1_nb = metrics.f1_score(y_test, predicted_nb, average='weighted')
print("Mean F1 score (weighted):", f1_nb)

In [None]:
start_time = time.time()

# Logistic regression (liblinear solver, balanced class weights) on one-hot
# encoded feature dictionaries.
lr_steps = [
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LogisticRegression(random_state=0, class_weight='balanced', solver='liblinear')),
]
clf_lr = Pipeline(steps=lr_steps)

# Train on the first 50,000 training samples.
clf_lr.fit(X_train[:50000], y_train[:50000])

print("Total time:", time.time() - start_time)

# Evaluate on the full held-out set.
predicted_lr = clf_lr.predict(X_test)
f1_lr = metrics.f1_score(y_test, predicted_lr, average='weighted')
print("Mean F1 score (weighted):", f1_lr)

In [None]:
start_time = time.time()

# Linear SVM (balanced class weights, up to 10,000 iterations) on one-hot
# encoded feature dictionaries.
svc_steps = [
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LinearSVC(random_state=0, class_weight='balanced', max_iter=10000)),
]
clf_svc = Pipeline(steps=svc_steps)

# Train on the first 50,000 training samples.
clf_svc.fit(X_train[:50000], y_train[:50000])

print("Total time:", time.time() - start_time)

# Evaluate on the full held-out set.
predicted_svc = clf_svc.predict(X_test)
f1_svc = metrics.f1_score(y_test, predicted_svc, average='weighted')
print("Mean F1 score (weighted):", f1_svc)

In [None]:
# Per-tag precision, recall and F1 for the linear SVM predictions.
report_svc = metrics.classification_report(y_test, predicted_svc)
print(report_svc)

In [None]:
# Persist the fitted SVM pipeline (vectorizer + classifier) to disk.
# The context manager closes the file even if pickle.dump raises.
with open('ne_clf_svc.pickle', 'wb') as save_classifier:
    pickle.dump(clf_svc, save_classifier)