<a href="https://colab.research.google.com/github/PawinData/TM/blob/main/TM_A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
# Install the CRF wrapper imported further down (sklearn_crfsuite); the version is not pinned,
# and the recorded output of this cell shows that sklearn-crfsuite 0.3.6 was resolved.
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 4.7MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [None]:

# Standard-library helper
from itertools import chain
# Plotting: select matplotlib's 'ggplot' style sheet right after importing pyplot
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# NLP toolkit, statistics, and scikit-learn model-selection / scoring utilities
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV



# Pre-processing

Build the [dataset reader](https://www.nltk.org/_modules/nltk/corpus/reader/conll.html), extract features and labels from the training, development, and test data, and represent them in a structure suitable for a Conditional Random Field (CRF).

In [1]:
import nltk
from nltk.corpus.reader.conll import ConllCorpusReader

# Fetch the tagger model; nltk.pos_tag is called on every sentence further down.
nltk.download('averaged_perceptron_tagger')

# Reader for CoNLL-style column files located in the working directory.
# The cells below always pass the file name explicitly to READER.sents(...) /
# READER.tagged_sents(...), so the `fileids` pattern is not what selects the data.
READER = ConllCorpusReader(
    root="./",
    fileids=".conll",
    columntypes=('words', 'pos', 'tree', 'chunk', 'ne', 'srl', 'ignore'),
)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
# extract features and labels
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Parameters
    ----------
    sent : sequence
        Sentence given as a sequence of items whose element 0 is the word
        (a string) and whose element 1 is its POS tag (a string).
    i : int
        Index of the token to describe.

    Returns
    -------
    dict
        Features of the token itself (lower-cased form, last three and last
        two characters, casing / digit flags, POS tag and its first two
        characters), the lower-cased form, casing flags and POS information
        of the previous and next tokens when they exist, and the flags
        ``'BOS'`` / ``'EOS'`` when the token has no previous / next token.
    """
    token, tag = sent[i][0], sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context: flag the beginning of the sentence, otherwise describe the previous token.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context: flag the end of the sentence, otherwise describe the next token.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return the list of ``word2features`` dictionaries for ``sent``, one per token, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label (third field) of every ``(token, postag, label)`` triple in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

In [17]:
# represent data in structure suitable for CRF
def represent(filename):
    """Load ``filename`` through ``READER`` and return CRF-ready features and labels.

    Each sentence from ``READER.sents(filename)`` is POS-tagged with
    ``nltk.pos_tag``. The resulting ``(word, pos)`` pairs are aligned by
    position with the ``(word, tag)`` pairs from ``READER.tagged_sents(filename)``,
    whose second element is used as the label, giving ``(word, pos, label)``
    triples.

    Parameters
    ----------
    filename : str
        File to read, resolved by ``READER``.

    Returns
    -------
    tuple
        ``(features, labels)``: per sentence, the list produced by
        ``sent2features`` and the list produced by ``sent2labels``.
    """
    pos_tagged = [nltk.pos_tag(tokens) for tokens in READER.sents(filename)]
    label_tagged = list(READER.tagged_sents(filename))

    # Merge the two parallel taggings into (word, pos, label) triples, sentence by sentence.
    data_sents = [
        [(word, pos, label) for (word, pos), (_, label) in zip(pos_pairs, label_pairs)]
        for pos_pairs, label_pairs in zip(pos_tagged, label_tagged)
    ]

    features = [sent2features(triples) for triples in data_sents]
    labels = [sent2labels(triples) for triples in data_sents]
    return features, labels

In [22]:
# structured training set
# CRF-ready features and labels of the training split
X_train, y_train = represent(filename="wnut17train.conll")

# CRF-ready features and labels of the annotated test split
X_test, y_test = represent(filename="emerging.test.annotated")

In [None]:
# sentences of the development file, materialised into a list
dev_sents = list(READER.sents("emerging.dev.conll"))

In [None]:
# tagged sentences of the development file: one list of (word, tag) pairs per sentence
word_obi = list(READER.tagged_sents("emerging.dev.conll"))

In [34]:
# POS-tag every development sentence with NLTK
word_pos = [nltk.pos_tag(tokens) for tokens in dev_sents]

# merge each (word, POS) pair with the tag of the pair at the same position in word_obi
data_sents = [
    [(word, pos, label) for (word, pos), (_, label) in zip(pos_pairs, label_pairs)]
    for pos_pairs, label_pairs in zip(word_pos, word_obi)
]

# structured development set
X_dev = [sent2features(triples) for triples in data_sents]
y_dev = [sent2labels(triples) for triples in data_sents]

# Baseline

In [41]:
from sklearn_crfsuite import CRF, metrics

In [None]:
# training
baseline = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
baseline.fit(X_train, y_train)

In [45]:
# evaluate
y_pred = baseline.predict(X_test)

labels = list(baseline.classes_)
labels.remove('O')
sorted_labels = sorted(labels, key = lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=4))

                 precision    recall  f1-score   support

  B-corporation     0.0000    0.0000    0.0000        66
  I-corporation     0.0000    0.0000    0.0000        22
B-creative-work     0.3333    0.0352    0.0637       142
I-creative-work     0.2963    0.0367    0.0653       218
        B-group     0.3000    0.0364    0.0649       165
        I-group     0.3571    0.0714    0.1190        70
     B-location     0.3846    0.2333    0.2905       150
     I-location     0.2308    0.0638    0.1000        94
       B-person     0.5514    0.1375    0.2201       429
       I-person     0.5472    0.2214    0.3152       131
      B-product     0.6000    0.0236    0.0455       127
      I-product     0.3750    0.0476    0.0845       126

      micro avg     0.4297    0.0931    0.1530      1740
      macro avg     0.3313    0.0756    0.1141      1740
   weighted avg     0.4009    0.0931    0.1422      1740



# Hyperparameter Optimization

# Experiments with Features