# Loading the MilliyetNER training data and parsing it into sentences for building a CRF model

In [1]:
# Parse the training file into a list of sentences. Each non-blank line holds
# whitespace-separated columns: the first is the token, the second its tag.
# A blank line ends the current sentence. Every sentence is stored as a list
# of (token, tag) pairs.
words = []
entity = []
train_sentences = []
with open("./train.txt", encoding="utf8") as f:
    for line in f:
        line_split = line.split()
        if not line_split:
            # Blank or whitespace-only line: close the current sentence.
            # (Whitespace-only lines used to crash with IndexError.)
            train_sentences.append(list(zip(words, entity)))
            words = []
            entity = []
            continue
        words.append(line_split[0])
        entity.append(line_split[1])

# Flush the last sentence when the file does not end with a blank line;
# without this the final sentence would be silently dropped.
if words:
    train_sentences.append(list(zip(words, entity)))
    words = []
    entity = []


In [2]:
# Show the first parsed sentence as a sanity check: a list of (token, tag) pairs.
train_sentences[0]

[('ATİLLA', 'B-PERSON'),
 ('Mutman', 'I-PERSON'),
 (',', 'O'),
 ('İzmir', 'B-LOCATION'),
 ('milletvekili', 'O')]

# Defining the feature-extraction functions

In [3]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Only the token text (the first element of each item in ``sent``) is read,
    so the function works for labeled ``(token, label)`` pairs as well as for
    unlabeled items such as ``(token,)``. The gold label is never used as a
    feature.

    Args:
        sent: Sequence of items whose first element is the token string.
        i: Index of the token to describe; must be a valid index into ``sent``.

    Returns:
        A dict with features of the token itself (lower-cased form, last three
        and two characters, case/digit flags, constant bias), the same
        case/lower features for the previous and next tokens when they exist,
        and ``'BOS'`` / ``'EOS'`` flags at the sentence boundaries.
    """
    word = sent[i][0]

    # NOTE: str.lower() is not Turkish-aware; e.g. 'İ' becomes 'i' followed by
    # a combining dot (U+0307). The behavior is kept as-is so that feature
    # values stay the same.
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    if i > 0:
        # Context features from the previous token.
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        # First token: mark beginning of sentence.
        features['BOS'] = True

    if i < len(sent)-1:
        # Context features from the next token.
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        # Last token: mark end of sentence.
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return the feature dicts of all tokens in ``sent``, in sentence order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

In [4]:
# Show the feature dict of the first token of the first training sentence.
sent2features(train_sentences[0])[0]

{'bias': 1.0,
 'word.lower()': 'ati̇lla',
 'word[-3:]': 'LLA',
 'word[-2:]': 'LA',
 'word.isupper()': True,
 'word.istitle()': False,
 'word.isdigit()': False,
 'BOS': True,
 '+1:word.lower()': 'mutman',
 '+1:word.istitle()': True,
 '+1:word.isupper()': False}

# Loading the test data to evaluate our model and obtain predictions

In [5]:
# Parse the test file into a list of sentences. Each non-blank line holds
# whitespace-separated columns: the first is the token, the second its tag.
# A blank line ends the current sentence. Every sentence is stored as a list
# of (token, tag) pairs.
test_sentences = []
words = []
entity = []
with open("./test.txt", encoding="utf8") as f:
    for line in f:
        line_split = line.split()
        if not line_split:
            # Blank or whitespace-only line: close the current sentence.
            # (Whitespace-only lines used to crash with IndexError.)
            test_sentences.append(list(zip(words, entity)))
            words = []
            entity = []
            continue
        words.append(line_split[0])
        entity.append(line_split[1])

# Flush the last sentence when the file does not end with a blank line;
# without this the final sentence would be silently dropped.
if words:
    test_sentences.append(list(zip(words, entity)))
    words = []
    entity = []

In [6]:
# Convert every training sentence into its feature sequence (X) and its
# label sequence (y).
X_train = list(map(sent2features, train_sentences))
y_train = list(map(sent2labels, train_sentences))

# Building CRF model

In [7]:
import warnings

import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# CRF trained with L-BFGS. Per the sklearn-crfsuite documentation, c1 and c2
# are the L1 and L2 regularization coefficients, and all_possible_transitions
# also generates transition features for label pairs that never occur in the
# training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
try:
    crf.fit(X_train, y_train)
except AttributeError as err:
    # Deliberately best-effort: the notebook continues after an AttributeError
    # from fit(). Presumably this works around a library version
    # incompatibility -- TODO confirm. The error is reported instead of being
    # silently discarded, so an unrelated failure does not go unnoticed.
    warnings.warn(
        "crf.fit raised AttributeError and was ignored: {!r}".format(err)
    )

# There are many more O tags than entity tags in the data set, but we are more interested in the entity tags, so O is excluded from the evaluation.

In [8]:
# Every tag in crf.classes_ except 'O'; this list restricts the
# classification reports below to the entity tags.
labels = [*crf.classes_]
del labels[labels.index('O')]
labels

['B-PERSON',
 'I-PERSON',
 'B-LOCATION',
 'B-ORGANIZATION',
 'I-ORGANIZATION',
 'I-LOCATION']

# Extracting features for test data

In [9]:
# Convert every test sentence into its feature sequence (X) and its
# label sequence (y).
X_test = list(map(sent2features, test_sentences))
y_test = list(map(sent2labels, test_sentences))

# Getting prediction for test data

In [10]:
# Predict labels for the feature sequences currently held in X_test.
y_pred = crf.predict(X_test)     

# Printing the evaluation results for the test data

In [11]:
# Classification report for the test split, restricted to the tags in
# `labels` and printed with three decimal places.
metric_table_test_crf = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=labels,
    digits=3,
)
print(metric_table_test_crf)

                precision    recall  f1-score   support

      B-PERSON      0.926     0.871     0.898      1603
      I-PERSON      0.929     0.928     0.928       803
    B-LOCATION      0.946     0.893     0.919      1126
B-ORGANIZATION      0.908     0.855     0.880       873
I-ORGANIZATION      0.906     0.838     0.871       864
    I-LOCATION      0.891     0.697     0.782       211

     micro avg      0.924     0.870     0.896      5480
     macro avg      0.918     0.847     0.880      5480
  weighted avg      0.923     0.870     0.895      5480



# Getting dev data and extracting features

In [12]:
# Parse the dev file into a list of sentences. Each non-blank line holds
# whitespace-separated columns: the first is the token, the second its tag.
# A blank line ends the current sentence. Every sentence is stored as a list
# of (token, tag) pairs.
dev_sentences = []
words = []
entity = []
with open("./dev.txt", encoding="utf8") as f:
    for line in f:
        line_split = line.split()
        if not line_split:
            # Blank or whitespace-only line: close the current sentence.
            # (Whitespace-only lines used to crash with IndexError.)
            dev_sentences.append(list(zip(words, entity)))
            words = []
            entity = []
            continue
        words.append(line_split[0])
        entity.append(line_split[1])

# Flush the last sentence when the file does not end with a blank line;
# without this the final sentence would be silently dropped.
if words:
    dev_sentences.append(list(zip(words, entity)))
    words = []
    entity = []

In [13]:
# Convert every dev sentence into its feature sequence (X) and its label
# sequence (y).
# NOTE: this rebinds X_test / y_test to the dev split; the values built from
# test_sentences are no longer reachable under these names after this cell.
X_test = list(map(sent2features, dev_sentences))
y_test = list(map(sent2labels, dev_sentences))

# Getting prediction values for dev data

In [14]:
# Predict labels for the feature sequences currently held in X_test
# (rebuilt from dev_sentences in the previous cell).
y_pred = crf.predict(X_test)

# Printing the evaluation results for the dev data

In [15]:
# Classification report for the dev split, restricted to the tags in
# `labels` and printed with three decimal places.
metric_table_dev_crf = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=labels,
    digits=3,
)
print(metric_table_dev_crf)

                precision    recall  f1-score   support

      B-PERSON      0.947     0.926     0.937      1400
      I-PERSON      0.918     0.943     0.930       680
    B-LOCATION      0.970     0.930     0.950       942
B-ORGANIZATION      0.943     0.924     0.933       842
I-ORGANIZATION      0.876     0.932     0.903       589
    I-LOCATION      0.930     0.869     0.899       107

     micro avg      0.936     0.929     0.932      4560
     macro avg      0.931     0.921     0.925      4560
  weighted avg      0.937     0.929     0.933      4560



# Comparing with the metric tables from the previous assignment (HMM)

In [47]:
class color:
    """ANSI escape sequences used to style the printed table headings."""
    BLUE = '\033[94m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'


# Heading, then the CRF report computed earlier for the test split.
print(''.join((
    '\t\t\t    ',
    color.UNDERLINE, color.BOLD, color.BLUE,
    'TEST DATA CRF MODEL',
    color.END,
    '\n',
)))

print(metric_table_test_crf)

# Heading, then the HMM report read from metric_table_test.txt.
print(''.join((
    '\t\t\t\t',
    color.UNDERLINE, color.BOLD, color.BLUE,
    'TEST DATA HMM',
    color.END,
    '\n',
)))
with open('metric_table_test.txt') as f:
    contents = f.read()
print(contents)

			    [4m[1m[94mTEST DATA CRF MODEL[0m

                precision    recall  f1-score   support

      B-PERSON      0.926     0.871     0.898      1603
      I-PERSON      0.929     0.928     0.928       803
    B-LOCATION      0.946     0.893     0.919      1126
B-ORGANIZATION      0.908     0.855     0.880       873
I-ORGANIZATION      0.906     0.838     0.871       864
    I-LOCATION      0.891     0.697     0.782       211

     micro avg      0.924     0.870     0.896      5480
     macro avg      0.918     0.847     0.880      5480
  weighted avg      0.923     0.870     0.895      5480

				[4m[1m[94mTEST DATA HMM[0m

                precision    recall  f1-score   support

      B-PERSON       0.89      0.75      0.82      1603
      I-PERSON       0.91      0.52      0.67       803
    B-LOCATION       0.79      0.86      0.83      1126
B-ORGANIZATION       0.85      0.55      0.67       873
I-ORGANIZATION       0.63      0.15      0.24       864
    I-LOCATION     

In [48]:
# Heading, then the CRF report computed earlier for the dev split.
print(''.join((
    '\t\t\t    ',
    color.UNDERLINE, color.BOLD, color.BLUE,
    'DEV DATA CRF MODEL',
    color.END,
    '\n',
)))
print(metric_table_dev_crf)

# Heading, then the HMM report read from metric_table_dev.txt.
print(''.join((
    '\t\t\t\t',
    color.UNDERLINE, color.BOLD, color.BLUE,
    'DEV DATA HMM',
    color.END,
    '\n',
)))
with open('metric_table_dev.txt') as f:
    contents = f.read()
print(contents)

			    [4m[1m[94mDEV DATA CRF MODEL[0m

                precision    recall  f1-score   support

      B-PERSON      0.947     0.926     0.937      1400
      I-PERSON      0.918     0.943     0.930       680
    B-LOCATION      0.970     0.930     0.950       942
B-ORGANIZATION      0.943     0.924     0.933       842
I-ORGANIZATION      0.876     0.932     0.903       589
    I-LOCATION      0.930     0.869     0.899       107

     micro avg      0.936     0.929     0.932      4560
     macro avg      0.931     0.921     0.925      4560
  weighted avg      0.937     0.929     0.933      4560

				[4m[1m[94mDEV DATA HMM[0m

                precision    recall  f1-score   support

      B-PERSON       0.90      0.86      0.88      1400
      I-PERSON       0.90      0.68      0.78       680
    B-LOCATION       0.81      0.92      0.86       942
B-ORGANIZATION       0.87      0.66      0.75       842
I-ORGANIZATION       0.68      0.26      0.37       589
    I-LOCATION       

# Conclusion
As we can see from the tables, the CRF model performs better than the HMM. As with the HMM, the CRF model achieves better results on the dev data than on the test data.