<a href="https://www.kaggle.com/code/pawankumargunjan/named-entity-recognition-with-sklearn-crfsuite?scriptVersionId=112342570" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Importing Necessary Libraries

In [1]:
!pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git #egg=sklearn_crfsuite

Collecting git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git
  Cloning https://github.com/MeMartijn/updated-sklearn-crfsuite.git to /tmp/pip-req-build-nish26n7
  Running command git clone --filter=blob:none --quiet https://github.com/MeMartijn/updated-sklearn-crfsuite.git /tmp/pip-req-build-nish26n7
  Resolved https://github.com/MeMartijn/updated-sklearn-crfsuite.git to commit 675038761b4405f04691a83339d04903790e2b95
  Preparing metadata (setup.py) ... [?25l- done
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.4/965.4 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sklearn-crfsuite
  Building wheel for sklearn-crfsuite (setup.py) ... [?25l- \ done
[?25h  Created wheel for sklearn-crfsuite: filename=sklearn_crfsuite-0.3.6-py2.py3-none-any.whl size=10889 sha25

In [2]:
#Make the necessary imports
from nltk.tag import pos_tag
from sklearn_crfsuite import CRF, metrics
from sklearn.metrics import make_scorer,confusion_matrix
from pprint import pprint
from sklearn.metrics import f1_score,classification_report
from sklearn.pipeline import Pipeline
import string
import os
import warnings
warnings.filterwarnings('ignore')

## Load the training/testing data. 
**input:** conll format data, but with only 2 tab separated colums - words and NEtags.

**output:** A list where each item is 2 lists.  sentence as a list of tokens, NER tags as a list for each token.

In [3]:
def load_data_conll(file_path):
    myoutput,words,tags = [],[],[]
    fh = open(file_path)
    for line in fh:
        line = line.strip()
        if "\t" not in line:
            #Sentence ended.
            myoutput.append([words,tags])
            words,tags = [],[]
        else:
            word, tag = line.split("\t")
            words.append(word)
            tags.append(tag)
    fh.close()
    return myoutput

In [4]:
train = load_data_conll('/kaggle/input/conll2003/train.txt')
test = load_data_conll('/kaggle/input/conll2003/test.txt')

In [5]:
print(train[0])
for i in range(len(train[0][0])):
    print(train[0][0][i],'-->',train[0][1][i])

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']]
EU --> B-ORG
rejects --> O
German --> B-MISC
call --> O
to --> O
boycott --> O
British --> B-MISC
lamb --> O
. --> O


## Get features for all words in the sentence

#### Features :
    - word context: a window of 2 words on either side of the current word, and current word.
    
    - POS context: a window of 2 POS tags on either side of the current word, and current tag. 


- **input:** sentence as a list of tokens.
- **output:** list of dictionaries. each dict represents features for that word.

In [6]:
def sent2feats(sentence):
    features = []
    sentence_tags = pos_tag(sentence)            #This format is specific to this POS tagger!
    for i in range(0,len(sentence)):
        word = sentence[i]
        word_features = {}
        
        #word features: word, prev 2 words, next 2 words in the sentence.
        word_features['word'] = word
        
        if i == 0:
            word_features["prevWord"] = word_features["prevSecondWord"] = ""
        elif i==1:
            word_features["prevWord"] = sentence[0]
            word_features["prevSecondWord"] = ""
        else:
            word_features["prevWord"] = sentence[i-1]
            word_features["prevSecondWord"] = sentence[i-2]
            
        #next two words as features
        if i == len(sentence)-2:
            word_features["nextWord"] = sentence[i+1]
            word_features["nextNextWord"] = ""
        elif i==len(sentence)-1:
            word_features["nextWord"] = ""
            word_features["nextNextWord"] = ""
        else:
            word_features["nextWord"] = sentence[i+1]
            word_features["nextNextWord"] = sentence[i+2]
        
        #POS tag features: current tag, previous and next 2 tags.
        word_features['tag'] = sentence_tags[i][1]
        if i == 0:
            word_features["prevTag"] = word_features["prevSecondTag"] = ""
        elif i == 1:
            word_features["prevTag"] = sentence_tags[0][1]
            word_features["prevSecondTag"] = ""
        else:
            word_features["prevTag"] = sentence_tags[i - 1][1]

            word_features["prevSecondTag"] = sentence_tags[i - 2][1]
            
        # Last two words as features
        if i == len(sentence) - 2:
            word_features["nextTag"] = sentence_tags[i + 1][1]
            word_features["nextNextTag"] = ""
        elif i == len(sentence) - 1:
            word_features["nextTag"] = ""
            word_features["nextNextTag"] = ""
        else:
            word_features["nextTag"] = sentence_tags[i + 1][1]
            word_features["nextNextTag"] = sentence_tags[i + 2][1]
        #That is it! You can add whatever you want!
        features.append(word_features)
    return features

## Extract features from the conll data, after loading it.

In [7]:
def get_feats_conll(conll_data):
    feats = []
    labels = []
    for sentence in conll_data:
        feats.append(sent2feats(sentence[0]))
        labels.append(sentence[1])
    return feats, labels

In [8]:
features, labels = get_feats_conll(train)
test_featuress, test_labels = get_feats_conll(test)

print('Train',len(features),len(labels))
print('Test',len(test_featuress),len(test_labels))

Train 14041 14041
Test 3453 3453


In [9]:
print('First Sentence :',' '.join(train[0][0]))
for i in range(len(labels[0])):
    print('-'*125)
    for Key in features[0][i].keys():
        if Key == 'word':
            print(Key,'-->',features[0][i][Key], ' Label >>>',labels[0][i])
        else:
            print(Key,'-->',features[0][i][Key],end=' | ')
    print()


First Sentence : EU rejects German call to boycott British lamb .
-----------------------------------------------------------------------------------------------------------------------------
word --> EU  Label >>> B-ORG
prevWord -->  | prevSecondWord -->  | nextWord --> rejects | nextNextWord --> German | tag --> NNP | prevTag -->  | prevSecondTag -->  | nextTag --> VBZ | nextNextTag --> JJ | 
-----------------------------------------------------------------------------------------------------------------------------
word --> rejects  Label >>> O
prevWord --> EU | prevSecondWord -->  | nextWord --> German | nextNextWord --> call | tag --> VBZ | prevTag --> NNP | prevSecondTag -->  | nextTag --> JJ | nextNextTag --> NN | 
-----------------------------------------------------------------------------------------------------------------------------
word --> German  Label >>> B-MISC
prevWord --> rejects | prevSecondWord --> EU | nextWord --> call | nextNextWord --> to | tag --> JJ | prevTa

## Build the Model

[sklearn_crfsuite](https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html)

In [10]:
# crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=50, all_possible_states=True)
crf = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)#, all_possible_states=True)

In [11]:
#Just to fit on training data
crf.fit(features, labels)
labels = list(crf.classes_)

In [12]:
print(labels)
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(sorted_labels)

['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']
['O', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']


## Evaluation

In [13]:
# test_featuress, test_labels
y_pred = crf.predict(test_featuress)

In [14]:
print(test_labels[0])
y_pred[0]

['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']


['O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O']

## f1-score

In [15]:
metrics.flat_f1_score(test_labels, y_pred, average='weighted', labels=labels)

0.9254909683914324

## Classification Report

In [16]:
print(metrics.flat_classification_report(test_labels, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

       B-ORG      0.674     0.559     0.611      1661
           O      0.973     0.981     0.977     38323
      B-MISC      0.643     0.308     0.416       702
       B-PER      0.766     0.772     0.769      1617
       I-PER      0.813     0.890     0.850      1156
       B-LOC      0.706     0.759     0.732      1668
       I-ORG      0.559     0.701     0.622       835
      I-MISC      0.632     0.500     0.558       216
       I-LOC      0.756     0.482     0.589       257

    accuracy                          0.928     46435
   macro avg      0.725     0.661     0.680     46435
weighted avg      0.926     0.928     0.925     46435



## Confusion Matrix

In [17]:
def print_cm(cm, labels):
    print("\n")
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        sum = 0
        for j in range(len(labels)):
            cell = "%{0}.0f".format(columnwidth) % cm[i, j]
            sum =  sum + int(cell)
            print(cell, end=" ")
        print(sum) #Prints the total number of instances per cat at the end.

In [18]:
#python-crfsuite does not have a confusion matrix function, 
#so writing it using sklearn's confusion matrix and print_cm from github
def get_confusion_matrix(y_true,y_pred,labels):
    trues,preds = [], []
    for yseq_true, yseq_pred in zip(y_true, y_pred):
        trues.extend(yseq_true)
        preds.extend(yseq_pred)
        cm = confusion_matrix(trues,preds)
    print_cm(cm,labels)

In [19]:
get_confusion_matrix(test_labels, y_pred, labels=sorted_labels)



                O  B-LOC  I-LOC B-MISC I-MISC  B-ORG  I-ORG  B-PER  I-PER 
         O   1266     38     91    100      1      1     13      6    152 1668
     B-LOC     45    216     53     21      1      2     11      3    350 702
     I-LOC    226     47    928    160      0      2     21      6    271 1661
    B-MISC    120      6     91   1249      0      2     44     13     92 1617
    I-MISC      6      1      0      0    124      7     51     36     32 257
     B-ORG      1      4      2      0      3    108     27     12     59 216
     I-ORG     14      2     20      8     17     14    585     97     78 835
     B-PER      1      0      2      0     14      4     79   1029     27 1156
     I-PER    114     22    189     92      4     31    216     64  37591 38323
