# CS 445 Natural Language Processing
## Project 4: Named Entity Recognition
### Due Date: January 24, 23:55


## Mount Drive

In [10]:
from google.colab import drive
drive.mount("gdrive")

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).


## Importing Necessary Modules

In [11]:
!pip install sklearn-crfsuite
!pip install eli5
import sys
import os

py_file_location = "/content/gdrive/MyDrive/CS-445"
sys.path.append(os.path.abspath(py_file_location))
from Gazetteers import gazetteers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import cross_validate
import eli5



## Importing dataset and morphological analysis of the words in it

In [12]:
with open("/content/gdrive/MyDrive/CS-445/NE.txt",encoding="utf-8",) as file:
  lines = file.readlines()

In [13]:
with open("/content/gdrive/MyDrive/CS-445/NE.ma.txt",encoding="utf-8",) as file_ma:
  lines_ma = file_ma.readlines()

Following features will be used while developing the CRF model. Most of these features were already existing in the morphological analysis of the training set. In order to train the model, it was a necessity to parse those features.

- Root (Stem)
- Part-of-Speech (POS)
- Proper Noun (PROP)
- Noun Case (NCS)
- Orthographic Case (OCS)
- All Inflectional Features (INF)
- Start of the Sentence (SS)


## Pre-processing

### Label Parsing

In [14]:
train_sents = []
for index,line in enumerate(lines):
  splitted = line.split(" ")
  splitted = splitted[:-1]
  processed = []
  while(len(splitted)!=0):
    el = splitted.pop(0)
    if (el.find("<b_enamex")==-1 and el!=''):
      if (el.find("'")!=-1 and el.find("' '")!=-1):
        processed[-1][0]+=el
      else:
        processed.append([el,{},"O"])
    elif (el!=''):
      concat = el
      while (el.find("<e_enamex")==-1):
        el = splitted.pop(0)
        concat = concat + " " +el
      ext = re.findall(r'>([\w\d\s\W\D\S]*)<e_enamex>',concat)
      id = re.findall(r'TYPE="(\w{3})',concat)[0]
      if (len(ext[0].split(" "))>1):
        for index, e in enumerate(ext[0].split(" ")):
          if (index == 0):
            processed.append([e,{},f"B-{id}"])
          elif (index == len(ext[0].split(" "))-1):
            processed.append([e,{},f"L-{id}"])
          else:
            processed.append([e,{},f"I-{id}"]) 
      else:
        processed.append([ext[0],{},f"U-{id}"])
  train_sents.append(processed)

In [15]:
print((train_sents[9998]))

[['Başörtülülerin', {}, 'O'], [',', {}, 'O'], ['üniversitelere', {}, 'O'], ['kayıt', {}, 'O'], ['yaptıramaması', {}, 'O'], ['da', {}, 'O'], [',', {}, 'O'], ['DSP', {}, 'U-ORG'], ["'ye", {}, 'O'], ['-', {}, 'O'], ['bir', {}, 'O'], ['ölçüde', {}, 'O'], ['-', {}, 'O'], ['prim', {}, 'O'], ['kazandırdı', {}, 'O']]


### Bining the dataset
k-Fold process developed as manually.

In [16]:
folds_X = [[],[],[],[],[]]
folds_y = [[],[],[],[],[]]
for index,inst in enumerate(train_sents):
  folds_X[index%5].append(inst)
train_sents_ = folds_X[0] + folds_X[1] + folds_X[2] + folds_X[3] + folds_X[4] 


In [17]:
print((train_sents_[9998]))

[['Hükümetin', {}, 'O'], ['en', {}, 'O'], ['başarılı', {}, 'O'], ['olduğu', {}, 'O'], ['8', {}, 'O'], ['yıl', {}, 'O'], ['kesintisiz', {}, 'O'], ['eğitim', {}, 'O'], ['yasası', {}, 'O'], [',', {}, 'O'], ['Milli', {}, 'O'], ['Eğitim', {}, 'O'], ['Bakanı', {}, 'O'], ['Hikmet', {}, 'B-PER'], ['Uluğbay', {}, 'L-PER'], ['nedeniyle', {}, 'O'], [',', {}, 'O'], ['DSP', {}, 'U-ORG'], ["'ye", {}, 'O'], ['puan', {}, 'O'], ['kazandırdı', {}, 'O']]


### Extracting a full list of the features

In [18]:
feature_set = set()
for line in lines_ma:
  splitted = line.split(" ")
  MAs = splitted[2].split("+")[1:]
  for MA in MAs:
    MA = MA.rstrip()
    if(MA not in feature_set):
      feature_set.add(MA) 

In [19]:
val = list(feature_set)

## Part-Of-Speech Tagging 
###(From Morph. Analysis)

In [20]:
POS_dict = {}
for index,line in enumerate(lines_ma):
  line = line.rstrip()
  splitted = line.split(" ")
  splitted[0] = int(splitted[0]) 
  MA = splitted[2].split("+")
  stem = MA[0]
  MA = set(MA[1:])
  MA
  tmp = {
      'stem':stem,
  }
  for v in val:
    tmp[v] = 1 if (v in MA) else 0
  if (splitted[0] not in POS_dict):
    POS_dict[splitted[0]] = {splitted[1]:tmp}
  else:
    POS_dict[splitted[0]][splitted[1]]=tmp

In [21]:
for index,train_sent in enumerate(train_sents):
  p_index= index + 1
  for ind,word in enumerate(train_sent):
    train_sents[index][ind][1] = POS_dict[p_index][word[0]] if (word[0] in POS_dict[p_index]) else ""

In [22]:
train_sents[0][0]

['Müzik',
 {'A1pl': 0,
  'A1sg': 0,
  'A2pl': 0,
  'A2sg': 0,
  'A3pl': 0,
  'A3pl^DB': 0,
  'A3sg': 1,
  'A3sg^DB': 0,
  'Abl': 0,
  'Abl^DB': 0,
  'Able': 0,
  'Able^DB': 0,
  'Acc': 0,
  'Acquire': 0,
  'Acquire^DB': 0,
  'ActOf': 0,
  'Adamantly': 0,
  'Adj': 0,
  'Adj^DB': 0,
  'Adverb': 0,
  'AfterDoingSo': 0,
  'Agt': 0,
  'Aor': 0,
  'AorPart': 0,
  'AorPart^DB': 0,
  'Aor^DB': 0,
  'AsIf': 0,
  'AsIf^DB': 0,
  'AsLongAs': 0,
  'Become': 0,
  'Become^DB': 0,
  'ByDoingSo': 0,
  'Card': 0,
  'Card^DB': 0,
  'Caus': 0,
  'Caus^DB': 0,
  'Cond': 0,
  'Conj': 0,
  'Cop': 0,
  'Dat': 0,
  'Dat^DB': 0,
  'Demons': 0,
  'Desr': 0,
  'Det': 0,
  'Dim': 0,
  'Dist': 0,
  'Dist^DB': 0,
  'Distrib': 0,
  'Dup': 0,
  'Equ': 0,
  'EverSince': 0,
  'EverSince^DB': 0,
  'FeelLike': 0,
  'Fut': 0,
  'FutPart': 0,
  'Gen': 0,
  'Gen^DB': 0,
  'Hastily': 0,
  'Imp': 0,
  'InBetween': 0,
  'Inf1': 0,
  'Inf2': 0,
  'Inf3': 0,
  'Ins': 0,
  'Ins^DB': 0,
  'Interj': 0,
  'Loc': 0,
  'Loc^DB': 0,
  

In [23]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.gazetteers()':1 if (word in gazetteers) else 0
    }
    features.update(postag)
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        postag_before = {}
        for pos in postag1:
          postag_before[pos+"_before"] = postag1[pos]
        features.update(postag_before)
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isdigit()': word.isdigit(),
            '-1:word.gazetteers()':1 if (word in gazetteers) else 0
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        postag_after = {}
        for pos in postag1:
          postag_after[pos+"_after"] = postag1[pos]
        features.update(postag_after)
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.isdigit()': word.isdigit(),
            '+1:word.gazetteers()':1 if (word in gazetteers) else 0

        })
    else:
        features['EOS'] = True
    
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]


X_train = [sent2features(s) for s in train_sents]

y_train = [sent2labels(s) for s in train_sents]
'''
X_test = X_train[:2000]
y_test = y_train[:2000]

X_train = X_train[2000:]
y_train = y_train[2000:]

'''

It can be observed that all features including gazetteers are implemented in the above cell. Initially, we transfer the raw dataset with words containing morphological analysis and labels into the *sent2features* function which returns words with their features (in dictionary format). In the next step, we transfer the same raw data to *sent2labels* function which simply extracts the label of the sentence.

## Checking whether the method and model works

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train,y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

Model takes a very long training time. Without accessing a higher RAM, it is very hard to fine-tune CRF.

### Precision, Recall and F-1 Score

In [None]:
y_pred = crf.predict(X_test)

In [None]:
print(metrics.flat_classification_report(y_test,y_pred,digits=3))

              precision    recall  f1-score   support

       B-LOC      0.712     0.529     0.607        70
       B-ORG      0.731     0.677     0.703       229
       B-PER      0.822     0.898     0.858       432
       I-LOC      0.833     0.541     0.656        37
       I-ORG      0.722     0.670     0.695       194
       I-PER      0.667     0.750     0.706        32
       L-LOC      0.808     0.600     0.689        70
       L-ORG      0.745     0.690     0.717       229
       L-PER      0.828     0.905     0.865       432
           O      0.986     0.990     0.988     27990
       U-LOC      0.858     0.862     0.860       645
       U-ORG      0.873     0.774     0.821       372
       U-PER      0.858     0.776     0.815       539

    accuracy                          0.970     31271
   macro avg      0.803     0.743     0.768     31271
weighted avg      0.969     0.970     0.969     31271



Results are close to results which has been stated by Yeniterzi in her article. In the following parts of this notebook, we will try to increase the performance of the model.

### Model Weights

In [None]:
eli5.show_weights(crf, top=30)



From \ To,O,B-LOC,I-LOC,L-LOC,U-LOC,B-ORG,I-ORG,L-ORG,U-ORG,B-PER,I-PER,L-PER,U-PER
O,2.426,0.098,-1.907,-2.533,1.126,0.461,-3.37,-3.294,1.052,1.697,-2.405,-3.192,1.49
B-LOC,-3.064,-0.081,5.889,6.182,-1.651,-0.314,-0.436,-0.699,-0.305,-0.665,-0.365,-0.717,-1.173
I-LOC,-3.358,-0.386,5.118,5.097,-0.883,-0.455,-0.154,-0.631,-0.248,-0.761,0.0,-0.54,-0.92
L-LOC,0.252,0.837,-0.291,-0.285,0.0,-0.172,-0.356,-0.526,-0.547,-0.907,0.0,-0.747,-0.81
U-LOC,1.505,0.93,-1.13,-1.502,1.181,-0.867,-2.316,-1.852,0.194,-0.863,-0.436,-1.606,-1.043
B-ORG,-4.25,-0.935,-0.488,-0.913,-2.071,-1.231,5.376,4.525,-1.301,-1.507,-0.499,-1.586,-2.455
I-ORG,-4.808,-0.958,-0.644,-0.988,-1.866,-2.234,4.939,3.762,-2.123,-1.752,-0.846,-1.263,-1.993
L-ORG,0.654,-0.034,-0.325,-0.352,-0.039,-1.245,-1.364,-1.419,-0.267,-1.188,-0.016,-0.771,-1.455
U-ORG,1.698,0.959,-0.0,-0.34,0.71,-0.434,-1.196,-1.316,-0.368,-0.466,0.0,-0.807,-0.481
B-PER,-4.203,-0.91,-0.813,-0.765,-2.209,-1.231,-1.344,-1.191,-1.421,-2.478,4.326,5.36,-3.473

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12
+5.926,word.lower():o,,,,,,,,,,,
+5.235,word.lower():başkan,,,,,,,,,,,
+4.510,Adverb,,,,,,,,,,,
+4.414,word.lower():b,,,,,,,,,,,
+4.239,word.lower():g-7,,,,,,,,,,,
+4.072,word.lower():x,,,,,,,,,,,
+4.063,bias,,,,,,,,,,,
+3.933,Conj,,,,,,,,,,,
+3.599,word.lower():başbakan,,,,,,,,,,,
+3.512,word.lower():i̇talyanlar,,,,,,,,,,,

Weight?,Feature
+5.926,word.lower():o
+5.235,word.lower():başkan
+4.510,Adverb
+4.414,word.lower():b
+4.239,word.lower():g-7
+4.072,word.lower():x
+4.063,bias
+3.933,Conj
+3.599,word.lower():başbakan
+3.512,word.lower():i̇talyanlar

Weight?,Feature
+1.900,+1:word.lower():cumhuriyeti
+1.854,+1:word.lower():mahallesi
+1.817,+1:word.lower():avrupa
+1.597,+1:word.lower():kore
+1.563,+1:word.lower():sokak
+1.526,+1:word.lower():park
+1.473,stem:yeni
+1.428,stem_after:meyhane
+1.423,+1:word.lower():meyhanesi
+1.420,+1:word.lower():havalimanı

Weight?,Feature
+1.484,+1:word.lower():bahçe
+1.010,+1:word.lower():salonu
+0.876,Conj
+0.862,+1:word.lower():caddesi
+0.857,stem:yıl
+0.828,word.lower():yıl
+0.811,+1:word.lower():stadı
+0.765,+1:word.lower():galirisi
+0.730,+1:word.lower():evleri
+0.722,-1:word.lower():yurt

Weight?,Feature
+2.679,+1:word.lower():'nda
+2.233,word.lower():avrupa
+1.787,word.lower():cumhuriyeti
+1.615,word.lower():afrika
+1.559,word.lower():kore
+1.537,word.lower():ereğli̇
+1.529,stem:Ereğli
+1.514,word.lower():park
+1.448,word.lower():meyhanesi
+1.447,stem:bar

Weight?,Feature
+7.119,word.lower():ıstanbul
+6.224,word.lower():türkiye
+5.785,word.lower():abd
+5.085,word.lower():türkıye
+5.000,word.lower():fenerbahçe
+4.947,word.lower():i̇stanbul
+4.545,word.lower():galatasaray
+4.300,word.lower():rusya
+4.241,word.lower():beşiktaş
+4.204,word.lower():ankara

Weight?,Feature
+2.761,+1:word.lower():üniversitesi
+2.179,+1:word.lower():dgm
+1.956,+1:word.lower():saray
+1.941,+1:word.lower():partisi
+1.893,+1:word.lower():bankası
+1.772,+1:word.lower():parti
+1.761,word.lower():
+1.742,+1:word.lower():belediyesi
+1.648,+1:word.lower():ofisi
+1.642,+1:word.lower():konseyi

Weight?,Feature
+2.497,-1:word.lower():
+2.229,+1:word.lower():i̇ş
+1.615,+1:word.lower():başkanlığı
+1.214,+1:word.lower():komisyonu
+1.158,word.lower():&
+1.156,Card_before
+1.117,stem_before:Türkiye
+1.108,+1:word.lower():müzesi
+1.108,stem_after:ış
+1.108,+1:word.lower():ış

Weight?,Feature
+3.005,word.lower():kurulu
+2.854,word.lower():dgm
+2.546,word.lower():saray
+2.369,word.lower():parti
+2.265,word.lower():belediyesi
+2.150,word.lower():partisi
+2.118,word.lower():merkezi
+2.117,word.lower():hastanesi
+2.095,word.lower():bakanlığı
+2.013,+1:word.lower():'nce

Weight?,Feature
+5.699,word.lower():meclis
+5.424,word.lower():juventus
+5.389,word.lower():trabzonspor
+5.301,word.lower():milliyet
+5.300,word.lower():erdemir
+5.110,word.lower():dıe
+5.006,word.lower():shell
+4.999,word.lower():parma
+4.910,word.lower():bursaspor
+4.783,word.lower():kongre

Weight?,Feature
+1.630,stem_before:başbakan
+1.530,+1:word.lower():çiller
+1.524,+1:word.lower():clinton
+1.486,BOS
+1.408,PresPart_after
+1.393,word.lower():i̇hsan
+1.330,+1:word.lower():terim
+1.259,stem:Arif
+1.206,stem_before:milletvekili
+1.203,+1:word.lower():yakın

Weight?,Feature
+1.749,+1:word.lower():hakan
+0.984,Punct_before
+0.920,stem_after:.
+0.920,+1:word.lower():.
+0.841,stem_before:.
+0.841,-1:word.lower():.
+0.801,stem_before:b
+0.800,-1:word.lower():b
+0.797,Punct_after
+0.771,stem:.

Weight?,Feature
+2.119,word.lower():clinton
+2.057,word.lower():hakan
+2.001,+1:word.lower():'in
+1.632,EOS
+1.505,+1:word.lower():'le
+1.472,word.lower():çiller
+1.365,+1:word.lower():'ı
+1.354,stem:Clinton
+1.346,+1:word.lower():'la
+1.305,+1:word.lower():'ün

Weight?,Feature
+4.475,word.lower():lippi
+4.339,word.lower():demirel
+4.271,word.lower():clinton
+4.163,word.lower():yılmaz
+4.106,word.lower():erbakan
+4.092,word.lower():balic
+4.056,word.lower():kutan
+4.035,+1:word.lower():bey
+3.979,word.lower():starr
+3.936,word.lower():hakan


## Cross Validation
Model tends to consume large amount of memory and time. Running in a local runtime with higher RAM can be a better solution.

In [31]:
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

f1_scorer = make_scorer(metrics.flat_f1_score, average='macro') 
precision_scorer = make_scorer(metrics.flat_precision_score, average='macro') 
recall_scorer = make_scorer(metrics.flat_recall_score, average='macro') 
scoring_={
    "f1":f1_scorer,
    "precision":precision_scorer,
    "recall":recall_scorer
}
scores = cross_validate(crf, X_train, y_train, scoring=scoring_, cv=5)



In [38]:
from statistics import mean
print(f"Average F-1 Score:{mean(scores['test_f1'])}")
print(f"Average Precision Score:{mean(scores['test_precision'])}")
print(f"Average Recall Score:{mean(scores['test_recall'])}")

Average F-1 Score:0.7999979707545355
Average Precision Score:0.83801794691663
Average Recall Score:0.7739647456810598
