<a href="https://colab.research.google.com/github/Shuyuan301/Shuyuan301.github.io/blob/main/hw4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1 Set Up Environment

We require the following packages and versions:
* scikit-learn<0.24 (note that sklearn-crfsuite is incompatible with scikit-learn>=0.24)
* sklearn-crfsuite==0.3.6 (https://github.com/TeamHG-Memex/sklearn-crfsuite)

In [None]:
# Install & upgrade packages on Google Colab

# upgrade pip
!pip3 install pip --upgrade

# install scikit-learn 0.23.x (sklearn-crfsuite is incompatible with scikit-learn>=0.24)
!pip3 install 'scikit-learn>=0.23.0,<0.24.0' --upgrade

# install sklearn-crfsuite 0.3.6
!pip3 install 'sklearn-crfsuite==0.3.6' --upgrade

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 6.7MB/s 
[?25hInstalling collected packages: pip
  Found existing installation: pip 19.3.1
    Uninstalling pip-19.3.1:
      Successfully uninstalled pip-19.3.1
Successfully installed pip-21.0.1
Collecting scikit-learn<0.24.0,>=0.23.0
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 6.9 MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.23.2 thre

## 2 Download the CoNLL2003 NER dataset and the Twitter NER dataset

CoNLL2003 NER dataset:
* training set: https://jiepujiang.github.io/data/CoNLL2003_ner_train.txt
* test set: https://jiepujiang.github.io/data/CoNLL2003_ner_test.txt

Twitter NER dataset:
* whole dataset: https://jiepujiang.github.io/data/twitter_ner.txt

In [None]:
# A utility function for reading the dataset (from the web).
# Only load the first (word) and last columns (IOB label).

import re
import requests

# regularize the name for person and geo-loc tags on different datasets
def transform_label(old):
    """Map dataset-specific person/location tags onto one shared tag set.

    ``B-person``/``I-person`` become ``B-PER``/``I-PER`` and
    ``B-geo-loc``/``I-geo-loc`` become ``B-LOC``/``I-LOC``. Every other
    label, including the ones already in the shared form, is returned as is.
    """
    aliases = {
        'B-person': 'B-PER',
        'I-person': 'I-PER',
        'B-geo-loc': 'B-LOC',
        'I-geo-loc': 'I-LOC',
    }
    return aliases.get(old, old)

def load_dataset(url):
    """Download a CoNLL-style file and return its sentences.

    Sentences are separated by empty lines or by lines starting with
    ``-DOCSTART-``. From each token line only the first whitespace-separated
    column (the word) and the last column (the label, passed through
    ``transform_label``) are kept.

    Args:
        url: Address of the dataset file.

    Returns:
        A list of sentences; each sentence is a list of ``[word, label]``
        pairs.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Fail loudly instead of parsing an HTTP error page as if it were data.
    response.raise_for_status()
    # iter_lines(decode_unicode=True) only decodes when an encoding is known;
    # without a fallback it yields bytes and the string handling below breaks.
    if response.encoding is None:
        response.encoding = 'utf-8'

    sents = []
    sent = []
    for line in response.iter_lines(decode_unicode=True):
        line = line.strip()
        if len(line) == 0 or line.startswith('-DOCSTART-'):
            # Sentence boundary: store the tokens collected so far, if any.
            if sent:
                sents.append(sent)
                sent = []
        else:
            tokens = re.split('\\s+', line)
            sent.append([tokens[0], transform_label(tokens[-1])])
    # Keep the final sentence when the file does not end with a blank line.
    if sent:
        sents.append(sent)
    return sents

In [None]:
# load the dataset files

# Locations of the three corpora.
url_conll2003_train = 'https://jiepujiang.github.io/data/CoNLL2003_ner_train.txt'
url_conll2003_test = 'https://jiepujiang.github.io/data/CoNLL2003_ner_test.txt'
url_twitter_ner = 'https://jiepujiang.github.io/data/twitter_ner.txt'

# Download and parse each corpus with load_dataset (one call per corpus).
conll2003_train = load_dataset(url_conll2003_train)
conll2003_test = load_dataset(url_conll2003_test)
twitter_ner = load_dataset(url_twitter_ner)

# Number of sentences in each corpus.
print( len(conll2003_train), len(conll2003_test), len(twitter_ner) )

14041 3453 2394


## 3 Compute values for Table 1

In [187]:
import re
def feat_w(sent, i, offset):
    """Word-identity feature for the token `offset` positions away from `i`.

    Returns a one-entry dict keyed ``w[<offset>]``. Positions before the
    sentence map to ``'<S>'`` and positions past its end to ``'</S>'``.
    """
    pos = i + offset
    key = 'w[%d]' % offset
    if pos < 0:
        value = '<S>'
    elif pos >= len(sent):
        value = '</S>'
    else:
        value = sent[pos][0]
    return {key: value}

# prefix
def feat_prefix(sent, i, len):
    """Prefix feature: the first `len` characters of token `i`'s word.

    Note: the parameter name shadows the built-in ``len`` inside this
    function; it is kept so existing callers remain valid.
    """
    key = 'prefix[len=%d]' % len
    word = sent[i][0]
    return {key: word[:len]}
# suffix
def feat_suffix(sent, i, len):
    """Suffix feature: the last `len` characters of token `i`'s word.

    Note: the parameter name shadows the built-in ``len`` inside this
    function; it is kept so existing callers remain valid.
    """
    key = 'suffix[len=%d]' % len
    word = sent[i][0]
    return {key: word[-len:]}

# a simple word shape feature: uppercase letter-->X, lowercase-->x, digit-->d
def feat_wordshape(sent, i):
    """Word-shape feature for token `i`.

    ASCII uppercase letters become ``X``, ASCII lowercase letters ``x`` and
    ASCII digits ``d``; every other character is left unchanged.
    """
    shape = sent[i][0]
    # Order matters: letters are rewritten before digits introduce new 'd's.
    for char_class, symbol in (('[A-Z]', 'X'), ('[a-z]', 'x'), ('[0-9]', 'd')):
        shape = re.sub(char_class, symbol, shape)
    return {'shape': shape}
# shortened word shape
def feat_wordshape_short(sent, i):
    """Shortened word shape: the shape from feat_wordshape with every run of
    ``X``, ``x`` or ``d`` collapsed to a single character.
    """
    shape = feat_wordshape(sent, i)['shape']
    for symbol in ('X', 'x', 'd'):
        shape = re.sub(symbol + '+', symbol, shape)
    return {'shapeshort': shape}

In [188]:
# prepare features for a sentence
def feat_sent(sent):
    """Build one feature dict per token using all feature groups:
    word identity, prefixes/suffixes and word shape.
    """
    sent_feats = []
    for i in range(len(sent)):
        feats = {}
        # current, next and previous word
        for offset in (0, 1, -1):
            feats.update(feat_w(sent, i, offset))
        # prefixes first, then suffixes, each of length 3 to 5
        for affix_feat in (feat_prefix, feat_suffix):
            for size in (3, 4, 5):
                feats.update(affix_feat(sent, i, size))
        # full and shortened word shape
        for shape_feat in (feat_wordshape, feat_wordshape_short):
            feats.update(shape_feat(sent, i))
        sent_feats.append(feats)
    return sent_feats

In [189]:
def label_sent(sent):
    """Return the label (second field) of every token in `sent`, in order."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels

In [190]:
# Features (X) and gold labels (Y) for the CoNLL-2003 training and test splits.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, conll2003_test))
test_Y = list(map(label_sent, conll2003_test))

In [191]:

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# CRF tagger created with sklearn-crfsuite's default settings.
crf = sklearn_crfsuite.CRF()

# train a CRF model on the training set
# (last expression of the cell, so its return value is echoed as output)
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [192]:
# Predict one label sequence per test sentence with the current `crf` model.
test_Y_pred = crf.predict(test_X)

In [193]:
# Predicted PER spans as (sentence index, first token, last token), last token inclusive.
mylist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        mylist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        mylist.append((sent_idx, start, end))


In [194]:
# Predicted LOC spans as (sentence index, first token, last token), last token inclusive.
myLOClist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        myLOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        myLOClist.append((sent_idx, start, end))

In [195]:
# Gold PER spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_PERlist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        true_PERlist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        true_PERlist.append((sent_idx, start, end))

In [197]:
# Gold LOC spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_LOClist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        true_LOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        true_LOClist.append((sent_idx, start, end))

In [198]:
def evla(predlist,truelist):
    """Score predicted spans against gold spans by exact match.

    A predicted span counts as correct when an equal item is present in
    `truelist`.

    Args:
        predlist: Predicted spans.
        truelist: Gold spans.

    Returns:
        A dict with the keys ``'precison'`` (spelling kept so existing
        callers and outputs stay valid), ``'recall'`` and ``'F1_score'``.
        A score whose denominator would be zero (no predictions, no gold
        spans, or no overlap at all) is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    hits = len([span for span in predlist if span in truelist])
    recall = hits / len(truelist) if len(truelist) > 0 else 0.0
    preci = hits / len(predlist) if len(predlist) > 0 else 0.0
    # Precision and recall are both 0 when nothing matches; F1 is then 0 too.
    total = recall + preci
    f1 = (2 * recall * preci) / total if total > 0 else 0.0
    return {'precison': preci, 'recall': recall, 'F1_score': f1}

In [199]:
evla(mylist,true_PERlist) # Table 1, all features, PER

{'F1_score': 0.852760736196319,
 'precison': 0.8460133901399878,
 'recall': 0.8596165739022882}

In [200]:
evla(myLOClist,true_LOClist) # Table 1, all features, LOC

{'F1_score': 0.8587809293904647,
 'precison': 0.8645200486026732,
 'recall': 0.8531175059952039}

**Remove the word feature**

In [201]:
# remove the word feature 

def feat_sent(sent):
    """Build one feature dict per token without the word-identity features:
    only prefixes/suffixes and word shape are used.
    """
    sent_feats = []
    for i in range(len(sent)):
        feats = {}
        # prefixes first, then suffixes, each of length 3 to 5
        for affix_feat in (feat_prefix, feat_suffix):
            for size in (3, 4, 5):
                feats.update(affix_feat(sent, i, size))
        # full and shortened word shape
        for shape_feat in (feat_wordshape, feat_wordshape_short):
            feats.update(shape_feat(sent, i))
        sent_feats.append(feats)
    return sent_feats

In [202]:
def label_sent(sent):
    """Return the label (second field) of every token in `sent`, in order."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels

In [203]:

# Rebuild features and labels with the current feat_sent, then fit a fresh CRF.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, conll2003_test))
test_Y = list(map(label_sent, conll2003_test))

crf = sklearn_crfsuite.CRF()

# Fit on the training split; as the cell's last expression the result is echoed.
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [204]:
# Predict one label sequence per test sentence with the current `crf` model.
test_Y_pred = crf.predict(test_X)

In [205]:
# Predicted PER spans as (sentence index, first token, last token), last token inclusive.
mylist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        mylist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        mylist.append((sent_idx, start, end))

In [206]:
# Predicted LOC spans as (sentence index, first token, last token), last token inclusive.
myLOClist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        myLOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        myLOClist.append((sent_idx, start, end))

In [207]:
# Gold PER spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_PERlist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        true_PERlist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        true_PERlist.append((sent_idx, start, end))


In [208]:
# Gold LOC spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_LOClist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        true_LOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        true_LOClist.append((sent_idx, start, end))

In [209]:
evla(mylist,true_PERlist) # Table 1, word features removed, PER

{'F1_score': 0.8067018305926157,
 'precison': 0.8094645080946451,
 'recall': 0.8039579468150897}

In [210]:
evla(myLOClist,true_LOClist) # Table 1, word features removed, LOC

{'F1_score': 0.8190193164933135,
 'precison': 0.8120212139068945,
 'recall': 0.8261390887290168}

**Remove prefix/suffix**

In [211]:
def feat_sent(sent):
    """Build one feature dict per token without the prefix/suffix features:
    only word identity and word shape are used.
    """
    sent_feats = []
    for i in range(len(sent)):
        feats = {}
        # current, next and previous word
        for offset in (0, 1, -1):
            feats.update(feat_w(sent, i, offset))
        # full and shortened word shape
        for shape_feat in (feat_wordshape, feat_wordshape_short):
            feats.update(shape_feat(sent, i))
        sent_feats.append(feats)
    return sent_feats

In [212]:
def label_sent(sent):
    """Return the label (second field) of every token in `sent`, in order."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Rebuild features and labels with the current feat_sent, then fit a fresh CRF.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, conll2003_test))
test_Y = list(map(label_sent, conll2003_test))

crf = sklearn_crfsuite.CRF()

# Fit on the training split; as the cell's last expression the result is echoed.
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [214]:
# Predict one label sequence per test sentence with the current `crf` model.
test_Y_pred = crf.predict(test_X)

In [215]:
# Predicted PER spans as (sentence index, first token, last token), last token inclusive.
mylist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        mylist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        mylist.append((sent_idx, start, end))

In [216]:
# Predicted LOC spans as (sentence index, first token, last token), last token inclusive.
myLOClist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        myLOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        myLOClist.append((sent_idx, start, end))

In [217]:
# Gold PER spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_PERlist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        true_PERlist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        true_PERlist.append((sent_idx, start, end))

In [218]:
# Gold LOC spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_LOClist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        true_LOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        true_LOClist.append((sent_idx, start, end))

In [219]:
evla(mylist,true_PERlist) # Table 1, prefix/suffix features removed, PER

{'F1_score': 0.8190709046454767,
 'precison': 0.8096676737160121,
 'recall': 0.8286951144094001}

In [220]:
evla(myLOClist,true_LOClist) # Table 1, prefix/suffix features removed, LOC

{'F1_score': 0.7977563103770645,
 'precison': 0.8306294613887086,
 'recall': 0.7673860911270983}

**Remove shape**

In [221]:
def feat_sent(sent):
    """Build one feature dict per token without the word-shape features:
    only word identity and prefixes/suffixes are used.
    """
    sent_feats = []
    for i in range(len(sent)):
        feats = {}
        # current, next and previous word
        for offset in (0, 1, -1):
            feats.update(feat_w(sent, i, offset))
        # prefixes first, then suffixes, each of length 3 to 5
        for affix_feat in (feat_prefix, feat_suffix):
            for size in (3, 4, 5):
                feats.update(affix_feat(sent, i, size))
        sent_feats.append(feats)
    return sent_feats

In [222]:
def label_sent(sent):
    """Return the label (second field) of every token in `sent`, in order."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Rebuild features and labels with the current feat_sent, then fit a fresh CRF.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, conll2003_test))
test_Y = list(map(label_sent, conll2003_test))

crf = sklearn_crfsuite.CRF()

# Fit on the training split; as the cell's last expression the result is echoed.
crf.fit(train_X, train_Y)




CRF(keep_tempfiles=None)

In [223]:
# Predict one label sequence per test sentence with the current `crf` model.
test_Y_pred = crf.predict(test_X)

In [224]:
# Predicted PER spans as (sentence index, first token, last token), last token inclusive.
mylist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        mylist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        mylist.append((sent_idx, start, end))

In [225]:
# Predicted LOC spans as (sentence index, first token, last token), last token inclusive.
myLOClist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        myLOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        myLOClist.append((sent_idx, start, end))

In [226]:
# Gold PER spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_PERlist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        true_PERlist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        true_PERlist.append((sent_idx, start, end))

In [227]:
# Gold LOC spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_LOClist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        true_LOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        true_LOClist.append((sent_idx, start, end))

In [228]:
evla(mylist,true_PERlist) # Table 1, shape features removed, PER

{'F1_score': 0.7796495956873316,
 'precison': 0.8564026646928201,
 'recall': 0.7155225726654298}

In [229]:
evla(myLOClist,true_LOClist) # Table 1, shape features removed, LOC

{'F1_score': 0.8291118948380891,
 'precison': 0.8911095796002757,
 'recall': 0.7751798561151079}

**Only word feature**

In [230]:
def feat_sent(sent):
    """Build one feature dict per token from the word-identity features only
    (current, next and previous word).
    """
    sent_feats = []
    for i in range(len(sent)):
        feats = {}
        for offset in (0, 1, -1):
            feats.update(feat_w(sent, i, offset))
        sent_feats.append(feats)
    return sent_feats

In [231]:
def label_sent(sent):
    """Return the label (second field) of every token in `sent`, in order."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Rebuild features and labels with the current feat_sent, then fit a fresh CRF.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, conll2003_test))
test_Y = list(map(label_sent, conll2003_test))

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

crf = sklearn_crfsuite.CRF()

# Fit on the training split; as the cell's last expression the result is echoed.
crf.fit(train_X, train_Y)




CRF(keep_tempfiles=None)

In [232]:
# Predict one label sequence per test sentence with the current `crf` model.
test_Y_pred = crf.predict(test_X)

In [233]:
# Predicted PER spans as (sentence index, first token, last token), last token inclusive.
mylist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        mylist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        mylist.append((sent_idx, start, end))

In [234]:
# Predicted LOC spans as (sentence index, first token, last token), last token inclusive.
myLOClist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        myLOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        myLOClist.append((sent_idx, start, end))

In [235]:
# Gold PER spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_PERlist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        true_PERlist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        true_PERlist.append((sent_idx, start, end))

In [236]:
# Gold LOC spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_LOClist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        true_LOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        true_LOClist.append((sent_idx, start, end))

In [237]:
evla(mylist,true_PERlist) # Table 1, word features only, PER

{'F1_score': 0.6801195814648731,
 'precison': 0.8593012275731823,
 'recall': 0.5627705627705628}

In [238]:
evla(myLOClist,true_LOClist) # Table 1, word features only, LOC

{'F1_score': 0.7303903684786576,
 'precison': 0.9328984156570364,
 'recall': 0.6001199040767387}

**prefix/suffix only**

In [239]:
def feat_sent(sent):
    """Build one feature dict per token from the prefix/suffix features only
    (lengths 3 to 5).
    """
    sent_feats = []
    for i in range(len(sent)):
        feats = {}
        # prefixes first, then suffixes
        for affix_feat in (feat_prefix, feat_suffix):
            for size in (3, 4, 5):
                feats.update(affix_feat(sent, i, size))
        sent_feats.append(feats)
    return sent_feats

In [240]:
def label_sent(sent):
    """Return the label (second field) of every token in `sent`, in order."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Rebuild features and labels with the current feat_sent, then fit a fresh CRF.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, conll2003_test))
test_Y = list(map(label_sent, conll2003_test))

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

crf = sklearn_crfsuite.CRF()

# Fit on the training split; as the cell's last expression the result is echoed.
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [241]:
# Predict one label sequence per test sentence with the current `crf` model.
test_Y_pred = crf.predict(test_X)

In [242]:
# Predicted PER spans as (sentence index, first token, last token), last token inclusive.
mylist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        mylist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        mylist.append((sent_idx, start, end))

In [243]:
# Predicted LOC spans as (sentence index, first token, last token), last token inclusive.
myLOClist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        myLOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        myLOClist.append((sent_idx, start, end))

In [244]:
# Gold PER spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_PERlist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        true_PERlist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        true_PERlist.append((sent_idx, start, end))

In [245]:
# Gold LOC spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_LOClist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        true_LOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        true_LOClist.append((sent_idx, start, end))

In [246]:
evla(mylist,true_PERlist) # Table 1, prefix/suffix features only, PER

{'F1_score': 0.7177033492822966,
 'precison': 0.8021390374331551,
 'recall': 0.6493506493506493}

In [247]:
evla(myLOClist,true_LOClist) # Table 1, prefix/suffix features only, LOC

{'F1_score': 0.8153846153846154,
 'precison': 0.8375474083438685,
 'recall': 0.7943645083932853}

**Shape only**

In [248]:
def feat_sent(sent):
    """Build one feature dict per token from the word-shape features only
    (full and shortened shape).
    """
    sent_feats = []
    for i in range(len(sent)):
        feats = {}
        for shape_feat in (feat_wordshape, feat_wordshape_short):
            feats.update(shape_feat(sent, i))
        sent_feats.append(feats)
    return sent_feats

In [249]:
def label_sent(sent):
    """Return the label (second field) of every token in `sent`, in order."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Rebuild features and labels with the current feat_sent, then fit a fresh CRF.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, conll2003_test))
test_Y = list(map(label_sent, conll2003_test))

crf = sklearn_crfsuite.CRF()

# Fit on the training split; as the cell's last expression the result is echoed.
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [250]:
# Predict one label sequence per test sentence with the current `crf` model.
test_Y_pred = crf.predict(test_X)

In [251]:
# Predicted PER spans as (sentence index, first token, last token), last token inclusive.
mylist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        mylist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        mylist.append((sent_idx, start, end))

In [252]:
# Predicted LOC spans as (sentence index, first token, last token), last token inclusive.
myLOClist = []
for sent_idx, tags in enumerate(test_Y_pred):
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        myLOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        myLOClist.append((sent_idx, start, end))

In [253]:
# Gold PER spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_PERlist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-PER on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-PER':
        true_PERlist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-PER':
            continue
        # Extend the span over the I-PER tags that directly follow the B-PER.
        end = start
        while end < last and tags[end + 1] == 'I-PER':
            end += 1
        true_PERlist.append((sent_idx, start, end))

In [254]:
# Gold LOC spans of the CoNLL-2003 test set as (sentence index, first token, last token), inclusive.
true_LOClist = []
for sent_idx, sent in enumerate(conll2003_test):
    tags = [token[1] for token in sent]
    last = len(tags) - 1
    # A B-LOC on the final token is a one-token span; it is recorded first.
    if tags[last] == 'B-LOC':
        true_LOClist.append((sent_idx, last, last))
    for start in range(last):
        if tags[start] != 'B-LOC':
            continue
        # Extend the span over the I-LOC tags that directly follow the B-LOC.
        end = start
        while end < last and tags[end + 1] == 'I-LOC':
            end += 1
        true_LOClist.append((sent_idx, start, end))

In [255]:
evla(mylist,true_PERlist) # Table 1, shape features only, PER

{'F1_score': 0.5708978328173375,
 'precison': 0.5716057036577805,
 'recall': 0.5701917130488559}

In [256]:
evla(myLOClist,true_LOClist) # Table 1, shape features only, LOC

{'F1_score': 0.4319097502014504,
 'precison': 0.39124087591240875,
 'recall': 0.48201438848920863}



## 4 Compute values for Table 3

**Table3**

In [None]:
# all features
def feat_sent(sent):
    """Build one feature dict per token using all feature groups:
    word identity, prefixes/suffixes and word shape.
    """
    sent_feats = []
    for i in range(len(sent)):
        feats = {}
        # current, next and previous word
        for offset in (0, 1, -1):
            feats.update(feat_w(sent, i, offset))
        # prefixes first, then suffixes, each of length 3 to 5
        for affix_feat in (feat_prefix, feat_suffix):
            for size in (3, 4, 5):
                feats.update(affix_feat(sent, i, size))
        # full and shortened word shape
        for shape_feat in (feat_wordshape, feat_wordshape_short):
            feats.update(shape_feat(sent, i))
        sent_feats.append(feats)
    return sent_feats

In [None]:
def label_sent(sent):
    """Return the element at index 1 of every token in ``sent``, in order (the label sequence)."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Features and labels: the model is fitted on conll2003_train and applied to twitter_ner.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, twitter_ner))
test_Y = list(map(label_sent, twitter_ner))

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# CRF constructed without arguments, i.e. with the library defaults
crf = sklearn_crfsuite.CRF()

# fit on the training set; kept as the last expression so the cell echoes the estimator
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [None]:
# Predict one tag sequence per sentence of test_X with the fitted CRF (all features).
test_Y_pred = crf.predict(test_X)

In [None]:
# Predicted PER and LOC spans as (sentence_idx, start, end) tuples with an inclusive
# end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that directly
# follow it; 'I-X' tags without a preceding 'B-X' are not collected. Spans are
# appended in sentence order, left to right.
mylist = []     # predicted PER spans
myLOClist = []  # predicted LOC spans
for sent_idx, sent_tags in enumerate(test_Y_pred):
    for ent_type, span_out in (('PER', mylist), ('LOC', myLOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [None]:
# Gold PER and LOC spans of twitter_ner as (sentence_idx, start, end) tuples with an
# inclusive end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that
# directly follow it. These lists depend only on twitter_ner, not on the model.
true_PERlist = []
true_LOClist = []
for sent_idx, sent in enumerate(twitter_ner):
    sent_tags = [token[1] for token in sent]  # index 1 of a token holds its tag
    for ent_type, span_out in (('PER', true_PERlist), ('LOC', true_LOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [None]:
# PER: predicted spans vs. gold Twitter spans; evla reports precision, recall and F1.
# NOTE(review): the section heading says Table 3 but the trailing comment says table 2 -- confirm.
evla(mylist,true_PERlist) #table2,all features

{'F1_score': 0.39558573853989815,
 'precison': 0.3196159122085048,
 'recall': 0.5189309576837416}

In [None]:
# LOC: predicted spans vs. gold Twitter spans; evla reports precision, recall and F1.
# NOTE(review): the section heading says Table 3 but the trailing comment says table 2 -- confirm.
evla(myLOClist,true_LOClist) #table_2,all features

{'F1_score': 0.4545454545454546,
 'precison': 0.4391891891891892,
 'recall': 0.47101449275362317}

**Remove word feature**

In [137]:
def feat_sent(sent):
    """Build one feature dict per token position of ``sent`` (ablation: no word features).

    Prefix, suffix and word-shape extractor outputs are merged in call order,
    so on a key collision the later extractor wins.

    Args:
        sent: the sentence; it is passed unchanged to every extractor.

    Returns:
        A list with ``len(sent)`` feature dicts, one per position.
    """
    sent_feats = []
    for pos in range(len(sent)):
        token_feats = {}
        # prefixes first, then suffixes (third argument is presumably the affix length -- TODO confirm)
        for affix_fn in (feat_prefix, feat_suffix):
            for size in (3, 4, 5):
                token_feats.update(affix_fn(sent, pos, size))
        # word shape
        for shape_fn in (feat_wordshape, feat_wordshape_short):
            token_feats.update(shape_fn(sent, pos))
        sent_feats.append(token_feats)
    return sent_feats

In [138]:
def label_sent(sent):
    """Return the element at index 1 of every token in ``sent``, in order (the label sequence)."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Features and labels: the model is fitted on conll2003_train and applied to twitter_ner.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, twitter_ner))
test_Y = list(map(label_sent, twitter_ner))


# CRF constructed without arguments, i.e. with the library defaults
crf = sklearn_crfsuite.CRF()

# fit on the training set; kept as the last expression so the cell echoes the estimator
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [139]:
# Predict one tag sequence per sentence of test_X with the fitted CRF (no word features).
test_Y_pred = crf.predict(test_X)

In [140]:
# Predicted PER and LOC spans as (sentence_idx, start, end) tuples with an inclusive
# end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that directly
# follow it; 'I-X' tags without a preceding 'B-X' are not collected. Spans are
# appended in sentence order, left to right.
mylist = []     # predicted PER spans
myLOClist = []  # predicted LOC spans
for sent_idx, sent_tags in enumerate(test_Y_pred):
    for ent_type, span_out in (('PER', mylist), ('LOC', myLOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [141]:
# Gold PER and LOC spans of twitter_ner as (sentence_idx, start, end) tuples with an
# inclusive end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that
# directly follow it. These lists depend only on twitter_ner, not on the model.
true_PERlist = []
true_LOClist = []
for sent_idx, sent in enumerate(twitter_ner):
    sent_tags = [token[1] for token in sent]  # index 1 of a token holds its tag
    for ent_type, span_out in (('PER', true_PERlist), ('LOC', true_LOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [142]:
# PER, word features removed: predicted vs. gold Twitter spans (precision, recall, F1).
evla(mylist,true_PERlist)

{'F1_score': 0.24303232998885174,
 'precison': 0.1620817843866171,
 'recall': 0.48552338530066813}

In [143]:
# LOC, word features removed: predicted vs. gold Twitter spans (precision, recall, F1).
evla(myLOClist,true_LOClist)

{'F1_score': 0.2638146167557932,
 'precison': 0.2596491228070175,
 'recall': 0.26811594202898553}

**Remove prefix/suffix feature**

In [144]:
def feat_sent(sent):
    """Build one feature dict per token position of ``sent`` (ablation: no prefix/suffix features).

    Word and word-shape extractor outputs are merged in call order, so on a
    key collision the later extractor wins.

    Args:
        sent: the sentence; it is passed unchanged to every extractor.

    Returns:
        A list with ``len(sent)`` feature dicts, one per position.
    """
    sent_feats = []
    for pos in range(len(sent)):
        token_feats = {}
        # word features (third argument is presumably a relative token offset -- TODO confirm)
        for offset in (0, 1, -1):
            token_feats.update(feat_w(sent, pos, offset))
        # word shape
        for shape_fn in (feat_wordshape, feat_wordshape_short):
            token_feats.update(shape_fn(sent, pos))
        sent_feats.append(token_feats)
    return sent_feats

In [145]:
def label_sent(sent):
    """Return the element at index 1 of every token in ``sent``, in order (the label sequence)."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Features and labels: the model is fitted on conll2003_train and applied to twitter_ner.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, twitter_ner))
test_Y = list(map(label_sent, twitter_ner))


# CRF constructed without arguments, i.e. with the library defaults
crf = sklearn_crfsuite.CRF()

# fit on the training set; kept as the last expression so the cell echoes the estimator
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [146]:
# Predict one tag sequence per sentence of test_X with the fitted CRF (no prefix/suffix features).
test_Y_pred = crf.predict(test_X)

In [148]:
# Predicted PER and LOC spans as (sentence_idx, start, end) tuples with an inclusive
# end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that directly
# follow it; 'I-X' tags without a preceding 'B-X' are not collected. Spans are
# appended in sentence order, left to right.
mylist = []     # predicted PER spans
myLOClist = []  # predicted LOC spans
for sent_idx, sent_tags in enumerate(test_Y_pred):
    for ent_type, span_out in (('PER', mylist), ('LOC', myLOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [149]:
# Gold PER and LOC spans of twitter_ner as (sentence_idx, start, end) tuples with an
# inclusive end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that
# directly follow it. These lists depend only on twitter_ner, not on the model.
true_PERlist = []
true_LOClist = []
for sent_idx, sent in enumerate(twitter_ner):
    sent_tags = [token[1] for token in sent]  # index 1 of a token holds its tag
    for ent_type, span_out in (('PER', true_PERlist), ('LOC', true_LOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [150]:
# PER, prefix/suffix features removed: predicted vs. gold Twitter spans (precision, recall, F1).
evla(mylist,true_PERlist)

{'F1_score': 0.34855545200372784,
 'precison': 0.29967948717948717,
 'recall': 0.41648106904231624}

In [151]:
# LOC, prefix/suffix features removed: predicted vs. gold Twitter spans (precision, recall, F1).
evla(myLOClist,true_LOClist)

{'F1_score': 0.4155405405405405,
 'precison': 0.38924050632911394,
 'recall': 0.44565217391304346}

**Remove shape feature**

In [257]:
def feat_sent(sent):
    """Build one feature dict per token position of ``sent`` (ablation: no word-shape features).

    Word, prefix and suffix extractor outputs are merged in call order, so on
    a key collision the later extractor wins.

    Args:
        sent: the sentence; it is passed unchanged to every extractor.

    Returns:
        A list with ``len(sent)`` feature dicts, one per position.
    """
    sent_feats = []
    for pos in range(len(sent)):
        token_feats = {}
        # word features (third argument is presumably a relative token offset -- TODO confirm)
        for offset in (0, 1, -1):
            token_feats.update(feat_w(sent, pos, offset))
        # prefixes first, then suffixes (third argument is presumably the affix length -- TODO confirm)
        for affix_fn in (feat_prefix, feat_suffix):
            for size in (3, 4, 5):
                token_feats.update(affix_fn(sent, pos, size))
        sent_feats.append(token_feats)
    return sent_feats

In [258]:
def label_sent(sent):
    """Return the element at index 1 of every token in ``sent``, in order (the label sequence)."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Features and labels: the model is fitted on conll2003_train and applied to twitter_ner.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, twitter_ner))
test_Y = list(map(label_sent, twitter_ner))


# CRF constructed without arguments, i.e. with the library defaults
crf = sklearn_crfsuite.CRF()

# fit on the training set; kept as the last expression so the cell echoes the estimator
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [259]:
# Predict one tag sequence per sentence of test_X with the fitted CRF (no word-shape features).
test_Y_pred = crf.predict(test_X)

In [260]:
# Predicted PER and LOC spans as (sentence_idx, start, end) tuples with an inclusive
# end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that directly
# follow it; 'I-X' tags without a preceding 'B-X' are not collected. Spans are
# appended in sentence order, left to right.
mylist = []     # predicted PER spans
myLOClist = []  # predicted LOC spans
for sent_idx, sent_tags in enumerate(test_Y_pred):
    for ent_type, span_out in (('PER', mylist), ('LOC', myLOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [261]:
# Gold PER and LOC spans of twitter_ner as (sentence_idx, start, end) tuples with an
# inclusive end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that
# directly follow it. These lists depend only on twitter_ner, not on the model.
true_PERlist = []
true_LOClist = []
for sent_idx, sent in enumerate(twitter_ner):
    sent_tags = [token[1] for token in sent]  # index 1 of a token holds its tag
    for ent_type, span_out in (('PER', true_PERlist), ('LOC', true_LOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [262]:
# PER, word-shape features removed: predicted vs. gold Twitter spans (precision, recall, F1).
evla(mylist,true_PERlist)

{'F1_score': 0.38545454545454544,
 'precison': 0.4228723404255319,
 'recall': 0.35412026726057905}

In [263]:
# LOC, word-shape features removed: predicted vs. gold Twitter spans (precision, recall, F1).
evla(myLOClist,true_LOClist)

{'F1_score': 0.43280182232346237,
 'precison': 0.5828220858895705,
 'recall': 0.3442028985507246}

**Only word feature**

In [165]:
def feat_sent(sent):
    """Build one feature dict per token position of ``sent`` (word features only).

    The three ``feat_w`` results are merged in call order, so on a key
    collision the later call wins.

    Args:
        sent: the sentence; it is passed unchanged to ``feat_w``.

    Returns:
        A list with ``len(sent)`` feature dicts, one per position.
    """
    sent_feats = []
    for pos in range(len(sent)):
        token_feats = {}
        # word features (third argument is presumably a relative token offset -- TODO confirm)
        for offset in (0, 1, -1):
            token_feats.update(feat_w(sent, pos, offset))
        sent_feats.append(token_feats)
    return sent_feats

In [166]:
def label_sent(sent):
    """Return the element at index 1 of every token in ``sent``, in order (the label sequence)."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Features and labels: the model is fitted on conll2003_train and applied to twitter_ner.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, twitter_ner))
test_Y = list(map(label_sent, twitter_ner))

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# CRF constructed without arguments, i.e. with the library defaults
crf = sklearn_crfsuite.CRF()

# fit on the training set; kept as the last expression so the cell echoes the estimator
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [167]:
# Predict one tag sequence per sentence of test_X with the fitted CRF (word features only).
test_Y_pred = crf.predict(test_X)

In [168]:
# Predicted PER and LOC spans as (sentence_idx, start, end) tuples with an inclusive
# end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that directly
# follow it; 'I-X' tags without a preceding 'B-X' are not collected. Spans are
# appended in sentence order, left to right.
mylist = []     # predicted PER spans
myLOClist = []  # predicted LOC spans
for sent_idx, sent_tags in enumerate(test_Y_pred):
    for ent_type, span_out in (('PER', mylist), ('LOC', myLOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [170]:
# Gold PER and LOC spans of twitter_ner as (sentence_idx, start, end) tuples with an
# inclusive end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that
# directly follow it. These lists depend only on twitter_ner, not on the model.
true_PERlist = []
true_LOClist = []
for sent_idx, sent in enumerate(twitter_ner):
    sent_tags = [token[1] for token in sent]  # index 1 of a token holds its tag
    for ent_type, span_out in (('PER', true_PERlist), ('LOC', true_LOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [171]:
# PER, word features only: predicted vs. gold Twitter spans (precision, recall, F1).
evla(mylist,true_PERlist)

{'F1_score': 0.16859504132231404,
 'precison': 0.3269230769230769,
 'recall': 0.11358574610244988}

In [172]:
# LOC, word features only: predicted vs. gold Twitter spans (precision, recall, F1).
evla(myLOClist,true_LOClist)

{'F1_score': 0.25210084033613445,
 'precison': 0.5555555555555556,
 'recall': 0.16304347826086957}

**Only prefix/suffix feature**

In [173]:
def feat_sent(sent):
    """Build one feature dict per token position of ``sent`` (prefix/suffix features only).

    Prefix results are merged before suffix results, each for the sizes 3, 4
    and 5 in that order, so on a key collision the later call wins.

    Args:
        sent: the sentence; it is passed unchanged to every extractor.

    Returns:
        A list with ``len(sent)`` feature dicts, one per position.
    """
    sent_feats = []
    for pos in range(len(sent)):
        token_feats = {}
        # prefixes first, then suffixes (third argument is presumably the affix length -- TODO confirm)
        for affix_fn in (feat_prefix, feat_suffix):
            for size in (3, 4, 5):
                token_feats.update(affix_fn(sent, pos, size))
        sent_feats.append(token_feats)
    return sent_feats

In [174]:
def label_sent(sent):
    """Return the element at index 1 of every token in ``sent``, in order (the label sequence)."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Features and labels: the model is fitted on conll2003_train and applied to twitter_ner.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, twitter_ner))
test_Y = list(map(label_sent, twitter_ner))


# CRF constructed without arguments, i.e. with the library defaults
crf = sklearn_crfsuite.CRF()

# fit on the training set; kept as the last expression so the cell echoes the estimator
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [175]:
# Predict one tag sequence per sentence of test_X with the fitted CRF (prefix/suffix features only).
test_Y_pred = crf.predict(test_X)

In [176]:
# Predicted PER and LOC spans as (sentence_idx, start, end) tuples with an inclusive
# end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that directly
# follow it; 'I-X' tags without a preceding 'B-X' are not collected. Spans are
# appended in sentence order, left to right.
mylist = []     # predicted PER spans
myLOClist = []  # predicted LOC spans
for sent_idx, sent_tags in enumerate(test_Y_pred):
    for ent_type, span_out in (('PER', mylist), ('LOC', myLOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [177]:
# Gold PER and LOC spans of twitter_ner as (sentence_idx, start, end) tuples with an
# inclusive end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that
# directly follow it. These lists depend only on twitter_ner, not on the model.
true_PERlist = []
true_LOClist = []
for sent_idx, sent in enumerate(twitter_ner):
    sent_tags = [token[1] for token in sent]  # index 1 of a token holds its tag
    for ent_type, span_out in (('PER', true_PERlist), ('LOC', true_LOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [178]:
# PER, prefix/suffix features only: predicted vs. gold Twitter spans (precision, recall, F1).
evla(mylist,true_PERlist)

{'F1_score': 0.21341107871720116,
 'precison': 0.14454976303317535,
 'recall': 0.40757238307349664}

In [179]:
# LOC, prefix/suffix features only: predicted vs. gold Twitter spans (precision, recall, F1).
evla(myLOClist,true_LOClist)

{'F1_score': 0.3227272727272727,
 'precison': 0.4329268292682927,
 'recall': 0.2572463768115942}

**Only Shape feature**

In [180]:
def feat_sent(sent):
    """Build one feature dict per token position of ``sent`` (word-shape features only).

    The ``feat_wordshape`` result is merged before the ``feat_wordshape_short``
    result, so on a key collision the latter wins.

    Args:
        sent: the sentence; it is passed unchanged to both extractors.

    Returns:
        A list with ``len(sent)`` feature dicts, one per position.
    """
    sent_feats = []
    for pos in range(len(sent)):
        token_feats = {}
        # word shape
        for shape_fn in (feat_wordshape, feat_wordshape_short):
            token_feats.update(shape_fn(sent, pos))
        sent_feats.append(token_feats)
    return sent_feats

In [181]:
def label_sent(sent):
    """Return the element at index 1 of every token in ``sent``, in order (the label sequence)."""
    labels = []
    for token in sent:
        labels.append(token[1])
    return labels
    
# Features and labels: the model is fitted on conll2003_train and applied to twitter_ner.
train_X = list(map(feat_sent, conll2003_train))
train_Y = list(map(label_sent, conll2003_train))

test_X = list(map(feat_sent, twitter_ner))
test_Y = list(map(label_sent, twitter_ner))


# CRF constructed without arguments, i.e. with the library defaults
crf = sklearn_crfsuite.CRF()

# fit on the training set; kept as the last expression so the cell echoes the estimator
crf.fit(train_X, train_Y)



CRF(keep_tempfiles=None)

In [182]:
# Predict one tag sequence per sentence of test_X with the fitted CRF (word-shape features only).
test_Y_pred = crf.predict(test_X)

In [183]:
# Predicted PER and LOC spans as (sentence_idx, start, end) tuples with an inclusive
# end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that directly
# follow it; 'I-X' tags without a preceding 'B-X' are not collected. Spans are
# appended in sentence order, left to right.
mylist = []     # predicted PER spans
myLOClist = []  # predicted LOC spans
for sent_idx, sent_tags in enumerate(test_Y_pred):
    for ent_type, span_out in (('PER', mylist), ('LOC', myLOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [184]:
# Gold PER and LOC spans of twitter_ner as (sentence_idx, start, end) tuples with an
# inclusive end index. A span opens at a 'B-X' tag and covers the 'I-X' tags that
# directly follow it. These lists depend only on twitter_ner, not on the model.
true_PERlist = []
true_LOClist = []
for sent_idx, sent in enumerate(twitter_ner):
    sent_tags = [token[1] for token in sent]  # index 1 of a token holds its tag
    for ent_type, span_out in (('PER', true_PERlist), ('LOC', true_LOClist)):
        begin_tag, inside_tag = 'B-' + ent_type, 'I-' + ent_type
        for start, tag in enumerate(sent_tags):
            if tag != begin_tag:
                continue
            end = start
            # The bounds check handles entities that reach the end of the sentence
            # and keeps empty sentences from raising IndexError.
            while end + 1 < len(sent_tags) and sent_tags[end + 1] == inside_tag:
                end += 1
            span_out.append((sent_idx, start, end))

In [185]:
# PER, word-shape features only: predicted vs. gold Twitter spans (precision, recall, F1).
evla(mylist,true_PERlist)

{'F1_score': 0.16131558339859042,
 'precison': 0.12439613526570048,
 'recall': 0.22939866369710468}

In [186]:
# LOC, word-shape features only: predicted vs. gold Twitter spans (precision, recall, F1).
evla(myLOClist,true_LOClist)

{'F1_score': 0.14395393474088292,
 'precison': 0.097911227154047,
 'recall': 0.2717391304347826}