In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn_crfsuite import CRF
# ref for first module https://scikit-learn.org/stable/modules/cross_validation.html
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn_crfsuite.metrics import flat_classification_report

In [2]:
def read_all_line( directory , file ):
    file = open( directory + "/" + file , "r" , encoding="utf-8" )
    contents = file.read()
    file.close()
    return contents

In [3]:
def word2features(sent, i):
    word = sent[i][0]
    dict_type = sent[i][2]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'dictionary' : dict_type
    }
    if i > 0:
        word1 = sent[i-1][0]
        dict_type1 = sent[i-1][2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:dictionay' : dict_type1
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        dict_type1 = sent[i+1][2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '-1:dictionay' : dict_type1
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label, dict_type in sent]

def sent2tokens(sent):
    return [token for token, label, dict_type in sent]

def sent2dict_type( sent ):
    return [dict_type for token, label, dict_type in sent]

In [4]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 0
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t, d) for w, t, d in zip( s["Word"].values.tolist(),
                                                            s["Tag"].values.tolist(),
                                                            s["Dictionary"].values.tolist() 
                                                          )]
        self.grouped = self.grouped_sentence()
        self.sentences = [s for s in self.grouped]
    
    def grouped_sentence( self ):
        save_sentence = 0
        major_index = 0
        temp_grouped = []
        for run in range( 0 , self.data.shape[0] ):
            temp_sentence = num_of_sentence( self.data["Sentence #"][ run ] )
            if save_sentence != temp_sentence :
                major_index = temp_sentence - 1
                save_sentence = temp_sentence
                temp_grouped.append( [] )
            temp_grouped[ major_index ].append( (self.data["Word"][run], 
                                                 self.data["Tag"][run],
                                                 self.data["Dictionary"][run] ) )
        return temp_grouped
    
    def get_next(self):
        try:
            s = self.grouped[ self.n_sent ]
            self.n_sent += 1
            return s  
        except:
            self.empty = True
            return None, None

In [5]:
def get_tag_group_sentence( data ):
    old_tag = []
    save_sentence = ""
    run_old = -1
    for run in range(  0 , data.shape[0] ):
        if( save_sentence != data["Sentence #"][ run ] ):
            old_tag.append( [] )
            run_old += 1
        else:
            None
        if data.Tag[run] in ( '' ):
            old_tag[ run_old ].append( 'O' )
        else:
            old_tag[ run_old ].append( data.Tag[ run ] )
    return old_tag

def merge_all_tag( data ):
    old_tag = []
    save_sentence = ""
    run_old = -1
    for run in range(  0 , data.shape[0] ):
        if( save_sentence != data["Sentence #"][ run ] ):
            old_tag.append( [] )
            run_old += 1
        else:
            None
        if data.Tag[run] in ( '' ):
            old_tag[ run_old ].append( 'O' )
            data.Tag[run] = 'O'
        elif data.Tag[run] in ('loc_cont','loc_end','loc_start'):
            old_tag[ run_old ].append( data.Tag[ run ] )
            data.Tag[run] = 'loc'
        elif data.Tag[run] in ('org_cont','org_end','org_start'):
            old_tag[ run_old ].append( data.Tag[ run ] )
            data.Tag[run] = 'org'
        elif data.Tag[run] in ('per_cont','per_end','per_start'):
            old_tag[ run_old ].append( data.Tag[ run ] )
            data.Tag[run] = 'per'
        else:
            old_tag[ run_old ].append( data.Tag[ run ] )
            None
    return old_tag

# number is current index
# target is -1 -2 +1 +2 what do you want to file
def helper_get_tag( data , number , target , limit ):
    if( number + target == limit ):
        return 'O'
    elif( data["Sentence #"][ number+target ] != data["Sentence #"][ number ] ):
        return 'O'
    else:
        return data.Tag[number + target ]
            
def split_all_tag( data ):
    limit_run = data.shape[0]
    for run in range( 0 , limit_run ):
        if data.Tag[ run ] == "loc" :
            if( helper_get_tag( data , run , -1, limit_run  ) in ( "loc_start" , "loc_cont" ) ):
                if( helper_get_tag( data , run , +1 , limit_run ) == "loc" ):
                    data.Tag[ run ] = "loc_cont"
                else:
                    data.Tag[ run ] = "loc_end"
            elif( helper_get_tag( data , run , +1 , limit_run ) == "loc" ):
                data.Tag[ run ] = "loc_start"
        elif data.Tag[ run ] == "org" :
            if( helper_get_tag( data , run , -1 , limit_run ) in ( "org_start" , "org_cont" ) ):
                if( helper_get_tag( data , run , +1 , limit_run ) == "org" ):
                    data.Tag[ run ] = "org_cont"
                else:
                    data.Tag[ run ] = "org_end"
            elif( helper_get_tag( data , run , +1 , limit_run ) == "org" ):
                data.Tag[ run ] = "org_start"
        elif data.Tag[ run ] == "per" :
            if( helper_get_tag( data , run , -1 , limit_run ) in ( "per_start" , "per_cont" ) ):
                if( helper_get_tag( data , run , +1, limit_run ) == "per" ):
                    data.Tag[ run ] = "per_cont"
                else:
                    data.Tag[ run ] = "per_end"
            elif( helper_get_tag( data , run , +1 , limit_run ) == "per" ):
                data.Tag[ run ] = "per_start"
            else:
                None
        else:
            None

def convert_list_to_data_frame( data_frame , data_list ):
    save_sentence = 0
    major_index_list = 0
    minor_index_list = 0
    for run in range( 0 , data_frame.shape[0] ):
        temp_sentence = num_of_sentence( data_frame["Sentence #"][ run ] )
        if save_sentence != temp_sentence :
            major_index_list = temp_sentence - 1
            save_sentence = temp_sentence
            minor_index_list = 0
        data_frame["Tag"][run] = data_list[ major_index_list][ minor_index_list ]
        minor_index_list += 1

In [6]:
def prepare_dictionary( directory , file_dict ):
    dictionary = {}
    for key in file_dict.keys():
        dictionary[ key ] = []
        list_file = file_dict[ key ]
        if type( list_file ) == type( "test_string" ):
            dictionary[ key ] = read_all_line( directory , list_file ).split('\n')
        else:
            for file in list_file:
                dictionary[ key ] += read_all_line( directory , file ).split('\n')
    return dictionary

# If found in dict will return key other return NO
def search_in_dictionary( word , dictionary ):
    answer = "NO"
    for key in dictionary.keys():
        if word in dictionary[key]:
            answer = key
            break
    return answer

In [7]:
def idividual_read_file( raw_data , split_sentence , count ,remove = [] ):
    pre_data_frame = { "Sentence #" : [] , "Word" : [] , "Tag" : [] , "Dictionary" : [] }
    for word in raw_data:
        if word in remove :
            None
        elif word in split_sentence :
            count += 1
        else: 
            check = word.find('(')
            if check > 0:
                temp_word = word[ 0 : word.find('(')]
                pre_data_frame["Sentence #"].append( "Sentence: " + str(count) )
                pre_data_frame["Word"].append( temp_word )
                pre_data_frame["Tag"].append( word[ word.find('(') + 1 : word.find(')')] )
                pre_data_frame["Dictionary"].append( search_in_dictionary( temp_word , dictionary ) )
            else:
                pre_data_frame["Sentence #"].append( "Sentence: " + str(count) )
                pre_data_frame["Word"].append( word )
                pre_data_frame["Tag"].append( "O" )
                pre_data_frame["Dictionary"].append( search_in_dictionary( word , dictionary ) )
    return pre_data_frame , count

def read_file( directory , list_file , count = 1 , individual = False ):
    data_frame = pd.DataFrame( { "Sentence #" : [] , "Word" : [] , "Tag" : [] , "Dictionary" : []} )
    if( individual ):
        word = read_all_line( directory , list_file ).split('|')
        pre_data_frame , count = idividual_read_file( word , # raw_data
                                              ["\n"] , # word show split sentence
                                              count , # order of sentence
                                              [ " " , '' , '\0'] ) # word to delete or prevent
        data_frame  = data_frame.append( pd.DataFrame( pre_data_frame ), ignore_index=True )
    else:
        for file in list_file :
            word = read_all_line( directory , file ).split('|')
            pre_data_frame , count = idividual_read_file( word ,
                                              ["\n"] ,
                                              count ,
                                              [ " " , '' , '\0'] )
            data_frame  = data_frame.append( pd.DataFrame( pre_data_frame ), ignore_index=True )
    return data_frame
def num_of_sentence( sentence ):
    words = sentence.split( " ")
    return int( words[1] )

In [8]:
dictionary_directory = "dictionary_directory"
dictionary_files = {
    "front_person" : ("clue front.txt" , "clue word person.txt" , "clue_royal.txt")
    ,"front_country" : ("คำนำหน้าชื่อประเทศ.txt")
    ,"front_org" : ("คำนำหน้าองค์กรจาก dict.txt")
    ,"location_name" : ("ชื่อกิ่งอำเภอ.txt" , "ชื่อคลอง.txt" , "ชื่อจังหวัด.txt" , "ชื่อตำบล.txt" , "ชื่อมลรัฐ.txt" , "ชื่อสถานที่.txt")
}
dictionary = prepare_dictionary( dictionary_directory , dictionary_files )

In [9]:
test_directory = "TestwithTag"
test_files = ( "POL1108.CUT" , "POL1109.CUT" , "POL1111.CUT" , "POL1112.CUT" , 
               "POL1113.CUT" , "POL1803.CUT" , "POL1804.CUT" , "POL1805.CUT" ,
               "POL1806.CUT" , "POL1807.CUT" , "POL1808.CUT" , "POL1809.CUT" ,
               "POL1810.CUT" , "POL1811.CUT" , "POL1812.CUT" , "POL1813.CUT" ,
               "POL1814.CUT" , "POL1815.CUT" , "POL1816.CUT" , "POL1817.CUT" ,
               "POL1818.CUT" , "POL1819.CUT" , "POL2001.CUT" , "POL2002.CUT" ,
               "POL2004.CUT" , "POL2005.CUT" , "POL2006.CUT" , "POL2007.CUT" ,
               "POL2008.CUT" , "POL2009.CUT" , "POL2010.CUT" , "POL2011.CUT" ,
               "POL2012.CUT" , "POL2014.CUT" , "POL2015.CUT" , "POL2017.CUT" ,
               "POL2018.CUT" , "POL2019.CUT" , "POL2020.CUT"
             )

In [10]:
model_directory = "model_directory"
model_name = "CRF_dictionary.sav"
model_file = model_directory + "/" + model_name

In [11]:
load_model = pickle.load( open( model_file , 'rb') )

In [12]:
data_test = read_file( test_directory , test_files , 1 )
data_test.to_csv( 
        path_or_buf = test_directory + "/original.csv" ,
        index=True );
data_test.fillna( method="ffill")
old_tag_test = merge_all_tag( data_test )
getter_test = SentenceGetter( data_test )
sentences_test = getter_test.sentences
test_set = [sent2features(s) for s in sentences_test]
test_predict = load_model.predict( test_set )
convert_list_to_data_frame( data_test , test_predict )
split_all_tag( data_test )
test_predict = get_tag_group_sentence( data_test )

In [13]:
report = flat_classification_report(y_pred=test_predict, y_true=old_tag_test)
print(report)

              precision    recall  f1-score   support

           O       0.97      0.99      0.98     11788
         loc       0.90      0.73      0.81       306
    loc_cont       0.33      0.04      0.08        23
     loc_end       0.14      0.05      0.08        19
   loc_start       0.14      0.05      0.08        19
         org       0.91      0.84      0.88       359
    org_cont       0.74      0.53      0.62       219
     org_end       0.68      0.57      0.62       189
   org_start       0.71      0.59      0.65       189
         per       0.96      0.66      0.78       226
    per_cont       0.96      0.97      0.96       256
     per_end       0.97      0.96      0.96       242
   per_start       0.97      0.97      0.97       242

    accuracy                           0.95     14077
   macro avg       0.72      0.61      0.65     14077
weighted avg       0.95      0.95      0.95     14077



In [14]:
data_test.to_csv( 
        path_or_buf = test_directory + "/result.csv" ,
        index=True );