## 1 Set Up Environment

We require the following packages and versions:
* scikit-learn<0.24 (note that sklearn-crfsuite is incompatible with scikit-learn>=0.24)
* sklearn-crfsuite==0.3.6 (https://github.com/TeamHG-Memex/sklearn-crfsuite)

In [None]:
# make sure the required python packages are installed

# install nltk (we'll use 3.6.7 in Spring 2022)
!pip install nltk==3.6.7 --upgrade

# install spacy (we'll use 3.2.1 in Spring 2022)
!pip install spacy==3.2.1 --upgrade

# upgrade scikit-learn 0.23.x (note that sklearn-crfsuite is inconsistent with scikit-learn>=0.24)
!pip install 'scikit-learn>=0.23.0,<0.24.0' --upgrade

# upgrade scikit-learn 0.3.6
!pip install 'sklearn-crfsuite==0.3.6' --upgrade

# download the spacy en_core_web_sm model (3.2.0 version)
!python -m spacy download en_core_web_sm-3.2.0 --direct

In [17]:
conda install -c conda-forge sklearn-crfsuite

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/arabi/anaconda3

  added / updated specs:
    - sklearn-crfsuite


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python-crfsuite-0.9.7      |   py38h2531618_1         181 KB
    sklearn-crfsuite-0.3.6     |     pyh9f0ad1d_0          12 KB  conda-forge
    tabulate-0.8.9             |     pyhd8ed1ab_0          26 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         219 KB

The following NEW packages will be INSTALLED:

  python-crfsuite    pkgs/main/linux-64::python-crfsuite-0.9.7-py38h2531618_1
  sklearn-crfsuite   conda-forge/noarch::sklearn-crfsuite-0.3.6-pyh9f0ad1d_0
  tabulate           conda-forge/noarch::tabulate-0.8.9-pyhd8ed1ab_0



Downloading and Extracting P

In [1]:
from sklearn.metrics import f1_score
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

## 2 Download the CoNLL2003 NER dataset and the Twitter NER dataset

CoNLL2003 NER dataset:
* training set: https://jiepujiang.github.io/data/CoNLL2003_ner_train.txt
* test set: https://jiepujiang.github.io/data/CoNLL2003_ner_test.txt

Twitter NER dataset:
* whole dataset: https://jiepujiang.github.io/data/twitter_ner.txt

In [2]:
# A utility function for reading the dataset (from the web).
# Only load the first (word) and last columns (IOB label).

import re
import requests

# regularize the name for person and geo-loc tags on different datasets
def transform_label(old):
    if old=='B-PER' or old=='B-person':
        return 'B-PER'
    if old=='I-PER' or old=='I-person':
        return 'I-PER'
    if old=='B-LOC' or old=='B-geo-loc':
        return 'B-LOC'
    if old=='I-LOC' or old=='I-geo-loc':
        return 'I-LOC'
    return old
def transform_human(old):
    for i in range(len(old)):
        for j in range(len(old[i])):
            if old[i][j] != 'B-PER' and old[i][j] != 'I-PER':
                old[i][j] = 'O'
    return old
def load_dataset(url):
    sents = []
    response = requests.get(url)
    sent = []
    for line in response.iter_lines(decode_unicode=True):
        line = line.strip()
        if len(line)==0 or line[0:10]=='-DOCSTART-':
            if len(sent)>0:
                sents.append(sent)
                sent = []
        else:
            tokens = re.split('\\s+', line)
            sent.append([tokens[0], transform_label(tokens[-1])])
    return sents
def ext_occurrence(list_labels, type):
    left,right = -1,-1
    lim = []
    for i in range(len(list_labels)):
        if list_labels[i]=='B-'+type: #beginning of new entity
            if left>=0 and right>=0: #save the previous one if existed
                lim.append((left, right))
            left, right = i,i
        elif list_labels[i]=='I-'+type: #onging entity: forward the right edge
            right = i
        else: ##anything else: end of an entity or continued "O"
            if left>=0 and right>=0:
                lim.append((left, right))
                left,right = -1,-1
    if left>=0 and right>=0:
        lim.append((left, right))
    return lim

def eval(Y, Y_pred, type):
    total_y = 0
    total_y_pred = 0
    correct = 0
    for (y, y_pred) in zip(Y, Y_pred):
        oc_y = ext_occurrence(y, type)
        oc_y_pred = ext_occurrence(y_pred, type)
        total_y += len(oc_y)
        total_y_pred += len(oc_y_pred)
        correct += len([ oc for oc in oc_y if oc in oc_y_pred ])
    return {
        'F1(%s)' % type: 0.0 if correct==0 else float("{0:.3f}".format(2.0*correct/(total_y+total_y_pred))),
        'Pre(%s)' % type: 0.0 if correct==0 else float("{0:.3f}".format(correct/total_y_pred)),
        'Rec(%s)' % type: 0.0 if correct==0 else float("{0:.3f}".format(correct/total_y)),
        
    }

In [3]:
# load the dataset files

url_conll2003_train = 'https://jiepujiang.github.io/data/CoNLL2003_ner_train.txt'
url_conll2003_test = 'https://jiepujiang.github.io/data/CoNLL2003_ner_test.txt'
url_twitter_ner = 'https://jiepujiang.github.io/data/twitter_ner.txt'

conll2003_train = load_dataset(url_conll2003_train)
conll2003_test = load_dataset(url_conll2003_test)
twitter_ner = load_dataset(url_twitter_ner)

print( len(conll2003_train), len(conll2003_test), len(twitter_ner) )

14401 3546 2467


In [4]:
# sentence[4] in conll2003_train

conll2003_train[4]

[['Germany', 'B-LOC'],
 ["'s", 'O'],
 ['representative', 'O'],
 ['to', 'O'],
 ['the', 'O'],
 ['European', 'B-ORG'],
 ['Union', 'I-ORG'],
 ["'s", 'O'],
 ['veterinary', 'O'],
 ['committee', 'O'],
 ['Werner', 'B-PER'],
 ['Zwingmann', 'I-PER'],
 ['said', 'O'],
 ['on', 'O'],
 ['Wednesday', 'O'],
 ['consumers', 'O'],
 ['should', 'O'],
 ['buy', 'O'],
 ['sheepmeat', 'O'],
 ['from', 'O'],
 ['countries', 'O'],
 ['other', 'O'],
 ['than', 'O'],
 ['Britain', 'B-LOC'],
 ['until', 'O'],
 ['the', 'O'],
 ['scientific', 'O'],
 ['advice', 'O'],
 ['was', 'O'],
 ['clearer', 'O'],
 ['.', 'O']]

In [5]:
# sentence[4]'s first token in conll2003_train is 'Germany' and its IOB-style tag is 'B-LOC'
conll2003_train[4][0]

['Germany', 'B-LOC']

In [6]:
import re

# w[offset]=? for the ith word in a sentence
# e.g., w[0] = current word
#       w[-1] = the previous word
#       w[1] = the next word
#       w[2], w[-2], w[3], w[-3]
def feat_w(sent, i, offset):
    if i+offset<0:
        return { 'w[%d]' % offset : '<S>' }
    if i+offset>=len(sent):
        return { 'w[%d]' % offset : '</S>' }
    else:
        return { 'w[%d]' % offset : sent[i+offset][0] }
    
def feat_w_casefold(sent, i, offset):
    if i+offset<0:
        return { 'w[%d]' % offset : '<S>' }
    if i+offset>=len(sent):
        return { 'w[%d]' % offset : '</S>' }
    else:
        return { 'w[%d].lower()' % offset : sent[i+offset][0].lower() }

# whether the word is alphanumeric (consisting of both letters and digits)
def feat_alphanumeric(sent, i):
    val = re.fullmatch('[a-zA-Z0-9]+', sent[i][0])!=None and re.fullmatch('[a-zA-Z]+', sent[i][0])==None and re.fullmatch('[0-9]+', sent[i][0])==None
    return { 'alphanumeric': val }

# whether the word has a dash
def feat_hasdash(sent, i):
    return { 'hasdash': ( '-' in sent[i][0] ) }

# prefix of length $len
def feat_prefix(sent, i, len):
    return { 'prefix[len=%d]' % len: sent[i][0][:len] }

def feat_prefix_casefold(sent, i, len):
    return { 'prefix[len=%d].lower()' % len: sent[i][0][:len].lower() }



# suffix of length $len
def feat_suffix(sent, i, len):
    return { 'suffix[len=%d]' % len: sent[i][0][-len:] }

def feat_suffix_casefold(sent, i, len):
    return { 'suffix[len=%d].lower()' % len: sent[i][0][-len:].lower() }

# character k-gram for the ith word in a sentence
def feat_charngram(sent, i, k):
    return { 'char-%dgram=%s' % (k,sent[i][0][j:j+k]):True for j in range(len(sent[i][0])-k+1) }

# a simple word shape feature: uppercase letter-->X, lowercase-->x, digit-->d
def feat_wordshape(sent, i):
    shape = re.sub('[A-Z]', 'X', sent[i][0])
    shape = re.sub('[a-z]', 'x', shape)
    shape = re.sub('[0-9]', 'd', shape)
    return { 'shape':shape }

# a shorter word shape feature that collapses consecutive identical characters into one
def feat_wordshape_short(sent, i):
    shape = feat_wordshape(sent, i)['shape'];
    shape = re.sub('X+', 'X', shape)
    shape = re.sub('x+', 'x', shape)
    shape = re.sub('d+', 'd', shape)
    return { 'shapeshort':shape }

In [7]:
def feat_sent(sent,allfet,word,presuf,shp,case_fold):
    
    sent_feats = []

    for i in range(len(sent)):

        feats = {}
        
        if (allfet == 1 and case_fold == 0):
            # word features
            feats.update( feat_w(sent, i, 0) )
            feats.update( feat_w(sent, i, 1) )
            feats.update( feat_w(sent, i, -1) )

            # prefix and suffix
            feats.update( feat_prefix(sent, i, 3) )
            feats.update( feat_prefix(sent, i, 4) )
            feats.update( feat_prefix(sent, i, 5) )
            feats.update( feat_suffix(sent, i, 3) )
            feats.update( feat_suffix(sent, i, 4) )
            feats.update( feat_suffix(sent, i, 5) )


            # word shape
            feats.update( feat_alphanumeric(sent, i) )
            feats.update( feat_hasdash(sent, i) )
            feats.update( feat_wordshape(sent, i) )
            feats.update( feat_wordshape_short(sent, i) )
            
        elif (allfet == 1 and case_fold == 1):
            #print("yes!")
            # word features
            feats.update( feat_w(sent, i, 0) )
            feats.update( feat_w(sent, i, 1) )
            feats.update( feat_w(sent, i, -1) )
            feats.update( feat_w_casefold(sent, i, 0) )
            feats.update( feat_w_casefold(sent, i, 1) )
            feats.update( feat_w_casefold(sent, i, -1) )

            # prefix and suffix
            feats.update( feat_prefix(sent, i, 3) )
            feats.update( feat_prefix(sent, i, 4) )
            feats.update( feat_prefix(sent, i, 5) )
            feats.update( feat_suffix(sent, i, 3) )
            feats.update( feat_suffix(sent, i, 4) )
            feats.update( feat_suffix(sent, i, 5) )
            feats.update( feat_prefix_casefold(sent, i, 3))
            feats.update( feat_prefix_casefold(sent, i, 4))
            feats.update( feat_prefix_casefold(sent, i, 5))
            feats.update( feat_suffix_casefold(sent, i, 3) )
            feats.update( feat_suffix_casefold(sent, i, 4) )
            feats.update( feat_suffix_casefold(sent, i, 5))


            # word shape
            feats.update( feat_alphanumeric(sent, i) )
            feats.update( feat_hasdash(sent, i) )
            feats.update( feat_wordshape(sent, i) )
            feats.update( feat_wordshape_short(sent, i) )
        
        elif (allfet == 0 and word == 1):
            feats.update( feat_w(sent, i, 0) )
            feats.update( feat_w(sent, i, 1) )
            feats.update( feat_w(sent, i, -1) )
            
            if case_fold == 1:
                feats.update( feat_w_casefold(sent, i, 0))
                feats.update( feat_w_casefold(sent, i, 1) )
                feats.update( feat_w_casefold(sent, i, -1) )
                
        elif (allfet == 0 and presuf == 1):
            feats.update( feat_prefix(sent, i, 3) )
            feats.update( feat_prefix(sent, i, 4) )
            feats.update( feat_prefix(sent, i, 5) )
            feats.update( feat_suffix(sent, i, 3) )
            feats.update( feat_suffix(sent, i, 4) )
            feats.update( feat_suffix(sent, i, 5) )
            
            if case_fold == 1:
                feats.update( feat_prefix_casefold(sent, i, 3))
                feats.update( feat_prefix_casefold(sent, i, 4))
                feats.update( feat_prefix_casefold(sent, i, 5))
                feats.update( feat_suffix_casefold(sent, i, 3) )
                feats.update( feat_suffix_casefold(sent, i, 4) )
                feats.update( feat_suffix_casefold(sent, i, 5))
                
        elif (allfet == 0 and shp == 1):
            feats.update( feat_alphanumeric(sent, i) )
            feats.update( feat_hasdash(sent, i) )
            feats.update( feat_wordshape(sent, i) )
            feats.update( feat_wordshape_short(sent, i) )
            
        else:
            if case_fold == 0:
                if word == -1:
                    
                    feats.update( feat_prefix(sent, i, 3) )
                    feats.update( feat_prefix(sent, i, 4) )
                    feats.update( feat_prefix(sent, i, 5) )
                    feats.update( feat_suffix(sent, i, 3) )
                    feats.update( feat_suffix(sent, i, 4) )
                    feats.update( feat_suffix(sent, i, 5) )


                    # word shape
                    feats.update( feat_alphanumeric(sent, i) )
                    feats.update( feat_hasdash(sent, i) )
                    feats.update( feat_wordshape(sent, i) )
                    feats.update( feat_wordshape_short(sent, i) )
                    
                elif presuf == -1:
                    feats.update( feat_w(sent, i, 0) )
                    feats.update( feat_w(sent, i, 1) )
                    feats.update( feat_w(sent, i, -1) )


                    # word shape
                    feats.update( feat_alphanumeric(sent, i) )
                    feats.update( feat_hasdash(sent, i) )
                    feats.update( feat_wordshape(sent, i) )
                    feats.update( feat_wordshape_short(sent, i) )
                    
                elif shp == -1:
                    feats.update( feat_w(sent, i, 0) )
                    feats.update( feat_w(sent, i, 1) )
                    feats.update( feat_w(sent, i, -1) )

                    # prefix and suffix
                    feats.update( feat_prefix(sent, i, 3) )
                    feats.update( feat_prefix(sent, i, 4) )
                    feats.update( feat_prefix(sent, i, 5) )
                    feats.update( feat_suffix(sent, i, 3) )
                    feats.update( feat_suffix(sent, i, 4) )
                    feats.update( feat_suffix(sent, i, 5) )
                    
            elif case_fold == 1:
                if word == -1:
                    feats.update( feat_prefix(sent, i, 3) )
                    feats.update( feat_prefix(sent, i, 4) )
                    feats.update( feat_prefix(sent, i, 5) )
                    feats.update( feat_suffix(sent, i, 3) )
                    feats.update( feat_suffix(sent, i, 4) )
                    feats.update( feat_suffix(sent, i, 5) )
                    feats.update( feat_prefix_casefold(sent, i, 3))
                    feats.update( feat_prefix_casefold(sent, i, 4))
                    feats.update( feat_prefix_casefold(sent, i, 5))
                    feats.update( feat_suffix_casefold(sent, i, 3) )
                    feats.update( feat_suffix_casefold(sent, i, 4) )
                    feats.update( feat_suffix_casefold(sent, i, 5))

                    # word shape
                    feats.update( feat_alphanumeric(sent, i) )
                    feats.update( feat_hasdash(sent, i) )
                    feats.update( feat_wordshape(sent, i) )
                    feats.update( feat_wordshape_short(sent, i) )
                    
                elif presuf == -1:
                    feats.update( feat_w(sent, i, 0) )
                    feats.update( feat_w(sent, i, 1) )
                    feats.update( feat_w(sent, i, -1) )
                    feats.update( feat_w_casefold(sent, i, 0))
                    feats.update( feat_w_casefold(sent, i, 1) )
                    feats.update( feat_w_casefold(sent, i, -1) )

                    # word shape
                    feats.update( feat_alphanumeric(sent, i) )
                    feats.update( feat_hasdash(sent, i) )
                    feats.update( feat_wordshape(sent, i) )
                    feats.update( feat_wordshape_short(sent, i) )
                    
                elif shp == -1:
                    feats.update( feat_w(sent, i, 0) )
                    feats.update( feat_w(sent, i, 1) )
                    feats.update( feat_w(sent, i, -1) )
                    feats.update( feat_w_casefold(sent, i, 0))
                    feats.update( feat_w_casefold(sent, i, 1) )
                    feats.update( feat_w_casefold(sent, i, -1) )

                    # prefix and suffix
                    feats.update( feat_prefix(sent, i, 3) )
                    feats.update( feat_prefix(sent, i, 4) )
                    feats.update( feat_prefix(sent, i, 5) )
                    feats.update( feat_suffix(sent, i, 3) )
                    feats.update( feat_suffix(sent, i, 4) )
                    feats.update( feat_suffix(sent, i, 5) )
                    
                    
            
            

        sent_feats.append(feats)

    return sent_feats

In [8]:
def label_sent(sent):
    return [ token[1] for token in sent ]

In [9]:
'''import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

def performmm(ctrain_X,ctrain_Y,ctest_X,ctest_Y):
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs', # default; Gradient descent using the L-BFGS method
        min_freq=5, # the minimum freq of a feature in the training set
        max_iterations=100, # set this value to stop the training after k epochs
    )

    # train a CRF model on the training set
    try:
        crf.fit(ctrain_X, ctrain_Y)
    except AttributeError:
        pass
    cY_pred = crf.predict(ctest_X)
    cY_pred = transform_label(cY_pred)
    ctest_Y = transform_label(ctest_Y)
    eval(ctest_Y, cY_pred, "LOC")'''

'import sklearn_crfsuite\nfrom sklearn_crfsuite import scorers\nfrom sklearn_crfsuite import metrics\n\ndef performmm(ctrain_X,ctrain_Y,ctest_X,ctest_Y):\n    crf = sklearn_crfsuite.CRF(\n        algorithm=\'lbfgs\', # default; Gradient descent using the L-BFGS method\n        min_freq=5, # the minimum freq of a feature in the training set\n        max_iterations=100, # set this value to stop the training after k epochs\n    )\n\n    # train a CRF model on the training set\n    try:\n        crf.fit(ctrain_X, ctrain_Y)\n    except AttributeError:\n        pass\n    cY_pred = crf.predict(ctest_X)\n    cY_pred = transform_label(cY_pred)\n    ctest_Y = transform_label(ctest_Y)\n    eval(ctest_Y, cY_pred, "LOC")'

In [12]:
print("Table 1")
#table_1 row_1
ctrain_X = [ feat_sent(sent,1,0,0,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
ctest_X = [ feat_sent(sent,1,0,0,0,0) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_2
ctrain_X = [ feat_sent(sent,0,-1,0,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
ctest_X = [ feat_sent(sent,0,-1,0,0,0) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_3
ctrain_X = [ feat_sent(sent,0,0,-1,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,-1,0,0) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_4
ctrain_X = [ feat_sent(sent,0,0,0,-1,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,0,-1,0) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_5
ctrain_X = [ feat_sent(sent,0,1,0,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,1,0,0,0) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_6
ctrain_X = [ feat_sent(sent,0,0,1,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,1,0,0) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_7
ctrain_X = [ feat_sent(sent,0,0,0,1,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,0,1,0) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)



print("Table 2")
#table_1 row_1
ctrain_X = [ feat_sent(sent,1,0,0,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
ctest_X = [ feat_sent(sent,1,0,0,0,1) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_2
ctrain_X = [ feat_sent(sent,0,-1,0,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
ctest_X = [ feat_sent(sent,0,-1,0,0,1) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_3
ctrain_X = [ feat_sent(sent,0,0,-1,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,-1,0,1) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_4
ctrain_X = [ feat_sent(sent,0,0,0,-1,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,0,-1,1) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_5
ctrain_X = [ feat_sent(sent,0,1,0,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,1,0,0,1) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_6
ctrain_X = [ feat_sent(sent,0,0,1,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,1,0,1) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_1 row_7
ctrain_X = [ feat_sent(sent,0,0,0,1,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,0,1,1) for sent in conll2003_test ]
ctest_Y = [ label_sent(sent) for sent in conll2003_test ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)



print("table 3")
#table_3 row_1
ctrain_X = [ feat_sent(sent,1,0,0,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
ctest_X = [ feat_sent(sent,1,0,0,0,0) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_2
ctrain_X = [ feat_sent(sent,0,-1,0,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
ctest_X = [ feat_sent(sent,0,-1,0,0,0) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_3
ctrain_X = [ feat_sent(sent,0,0,-1,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,-1,0,0) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_4
ctrain_X = [ feat_sent(sent,0,0,0,-1,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,0,-1,0) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_5
ctrain_X = [ feat_sent(sent,0,1,0,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,1,0,0,0) for sent in twitter_ner]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_6
ctrain_X = [ feat_sent(sent,0,0,1,0,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,1,0,0) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_7
ctrain_X = [ feat_sent(sent,0,0,0,1,0) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,0,1,0) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

print("table 4")
#table_3 row_1
ctrain_X = [ feat_sent(sent,1,0,0,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
ctest_X = [ feat_sent(sent,1,0,0,0,1) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_2
ctrain_X = [ feat_sent(sent,0,-1,0,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
ctest_X = [ feat_sent(sent,0,-1,0,0,1) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_3
ctrain_X = [ feat_sent(sent,0,0,-1,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,-1,0,1) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_4
ctrain_X = [ feat_sent(sent,0,0,0,-1,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,0,-1,1) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_5
ctrain_X = [ feat_sent(sent,0,1,0,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,1,0,0,1) for sent in twitter_ner]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_6
ctrain_X = [ feat_sent(sent,0,0,1,0,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,1,0,1) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#table_3 row_7
ctrain_X = [ feat_sent(sent,0,0,0,1,1) for sent in conll2003_train ]
ctrain_Y = [ label_sent(sent) for sent in conll2003_train ]
#ctrain_Y = transform_label(ctrain_Y)
ctest_X = [ feat_sent(sent,0,0,0,1,1) for sent in twitter_ner ]
ctest_Y = [ label_sent(sent) for sent in twitter_ner ]
perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y)

#ttest_X = [ feat_sent(sent,1,0,0,0,0) for sent in twitter_ner ]
#ttest_Y = [ label_sent(sent) for sent in twitter_ner ]
#ttest_Y = transform_label(ttest_Y)


Table 1
{'F1(LOC)': 0.832, 'Pre(LOC)': 0.856, 'Rec(LOC)': 0.81} {'F1(PER)': 0.851, 'Pre(PER)': 0.832, 'Rec(PER)': 0.871}
{'F1(LOC)': 0.79, 'Pre(LOC)': 0.801, 'Rec(LOC)': 0.779} {'F1(PER)': 0.786, 'Pre(PER)': 0.777, 'Rec(PER)': 0.795}
{'F1(LOC)': 0.782, 'Pre(LOC)': 0.827, 'Rec(LOC)': 0.742} {'F1(PER)': 0.805, 'Pre(PER)': 0.779, 'Rec(PER)': 0.832}
{'F1(LOC)': 0.801, 'Pre(LOC)': 0.88, 'Rec(LOC)': 0.736} {'F1(PER)': 0.776, 'Pre(PER)': 0.836, 'Rec(PER)': 0.724}
{'F1(LOC)': 0.703, 'Pre(LOC)': 0.931, 'Rec(LOC)': 0.565} {'F1(PER)': 0.657, 'Pre(PER)': 0.817, 'Rec(PER)': 0.549}
{'F1(LOC)': 0.775, 'Pre(LOC)': 0.847, 'Rec(LOC)': 0.715} {'F1(PER)': 0.675, 'Pre(PER)': 0.716, 'Rec(PER)': 0.639}
{'F1(LOC)': 0.427, 'Pre(LOC)': 0.387, 'Rec(LOC)': 0.477} {'F1(PER)': 0.561, 'Pre(PER)': 0.568, 'Rec(PER)': 0.555}
Table 2
{'F1(LOC)': 0.843, 'Pre(LOC)': 0.848, 'Rec(LOC)': 0.839} {'F1(PER)': 0.853, 'Pre(PER)': 0.836, 'Rec(PER)': 0.871}
{'F1(LOC)': 0.811, 'Pre(LOC)': 0.814, 'Rec(LOC)': 0.808} {'F1(PER)': 0.786,

In [58]:
ctrain_X[0]

[{'w[0]': 'EU',
  'w[1]': 'rejects',
  'w[-1]': '<S>',
  'w[0].lower()': 'eu',
  'w[1].lower()': 'rejects',
  'prefix[len=3]': 'EU',
  'prefix[len=4]': 'EU',
  'prefix[len=5]': 'EU',
  'suffix[len=3]': 'EU',
  'suffix[len=4]': 'EU',
  'suffix[len=5]': 'EU',
  'prefix[len=3].lower()': 'eu',
  'prefix[len=4].lower()': 'eu',
  'prefix[len=5].lower()': 'eu',
  'suffix[len=3].lower()': 'eu',
  'suffix[len=4].lower()': 'eu',
  'suffix[len=5].lower()': 'eu',
  'alphanumeric': False,
  'hasdash': False,
  'shape': 'XX',
  'shapeshort': 'X'},
 {'w[0]': 'rejects',
  'w[1]': 'German',
  'w[-1]': 'EU',
  'w[0].lower()': 'rejects',
  'w[1].lower()': 'german',
  'w[-1].lower()': 'eu',
  'prefix[len=3]': 'rej',
  'prefix[len=4]': 'reje',
  'prefix[len=5]': 'rejec',
  'suffix[len=3]': 'cts',
  'suffix[len=4]': 'ects',
  'suffix[len=5]': 'jects',
  'prefix[len=3].lower()': 'rej',
  'prefix[len=4].lower()': 'reje',
  'prefix[len=5].lower()': 'rejec',
  'suffix[len=3].lower()': 'cts',
  'suffix[len=4].lo

In [11]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

def perform(ctrain_X,ctrain_Y,ctest_X,ctest_Y):
    #print("in")
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs', # default; Gradient descent using the L-BFGS method
        min_freq=5, # the minimum freq of a feature in the training set
        max_iterations=100, # set this value to stop the training after k epochs
    )

    # train a CRF model on the training set
    try:
        crf.fit(ctrain_X, ctrain_Y)
    except AttributeError:
        pass
    #print("inside")
    cY_pred = crf.predict(ctest_X)
    cY_pred = transform_label(cY_pred)
    ctest_Y = transform_label(ctest_Y)
    a = eval(ctest_Y, cY_pred, "LOC")
    b = eval(ctest_Y, cY_pred, "PER")
    print(a,b)

In [60]:

#c_hum = transform_human(cY_pred)
#ctest_human = transform_human(ctest_Y)

[['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O'],
 ['B-PER', 'I-PER'],
 ['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  

In [33]:
#from sklearn_crfsuite.utils import flatten

#y_true_flat = flatten(ctest_human)
#y_pred_flat = flatten(c_hum)
x = ["B-PER",'I-PER',"B-LOC",'I-LOC']
for label in x:
    print( label, metrics.flat_f1_score(ctest_Y, cY_pred, average='weighted', labels=label) )
    print( label, metrics.flat_precision_score(ctest_Y, cY_pred, average='weighted', labels=label) )
    print( label, metrics.flat_recall_score(ctest_Y, cY_pred, average='weighted', labels=label) )
    



B-PER 0.8090715535236194
B-PER 0.8226583625448578
B-PER 0.7976291278577476
I-PER 0.832318866428076
I-PER 0.8376584062142055
I-PER 0.8293650793650794
B-LOC 0.8090715535236194
B-LOC 0.8226583625448578
B-LOC 0.7976291278577476
I-LOC 0.8090715535236193
I-LOC 0.822658362544858
I-LOC 0.7976291278577476


{'Precision (LOC)': 0.8478787878787879,
 'Recall (LOC)': 0.8387290167865707,
 'F1 (LOC)': 0.8432790837854129}