In [1]:
import glob
import pandas as pd
import json
import numpy as np

## Set the path to load the data from

In [16]:
# Number of entity-level columns expected after the token on every data line.
max_levels   = 8
# path_load    = '/home/module/data/vistec_corpus/nested_ner/maintags/'
path_load     = '/home/module/data/vistec_newmm4L/nested_ner/subtags/'

# glob returns matches in arbitrary (filesystem) order. Sorting keeps the flat
# token / sentence numbers computed from these files stable between runs.
# `recursive=True` was dropped: it only has an effect on patterns containing `**`.
filenames     = sorted(glob.glob(path_load + '/*.data'))

# List of files name
# A plain loop instead of a list comprehension, so the cell does not echo a
# list of None values as its result.
for f in filenames:
    print(f)

/home/module/data/vistec_newmm4L/nested_ner/subtags/datalot1_nested.data
/home/module/data/vistec_newmm4L/nested_ner/subtags/datalot2_nested.data
/home/module/data/vistec_newmm4L/nested_ner/subtags/datalot3_nested.data
/home/module/data/vistec_newmm4L/nested_ner/subtags/datalot4_nested.data


[None, None, None, None]

## Utils

In [17]:
def check_start_en(en):
    """Tell whether a tag opens an entity mention.

    Only the part of ``en`` before the first '-' is inspected.

    Parameters
    ----------
    en : str
        A tag such as 'O', 'B-person' or 'S-month'.

    Returns
    -------
    int
        1 when the prefix is 'B' or 'S' (start tag),
        0 when the prefix is 'O', 'I' or 'E'.

    Raises
    ------
    AssertionError
        If the prefix is none of the above.
    """
    prefix = en.split('-')[0]

    # Start tag
    if prefix in ('B', 'S'):
        return 1

    # Outside / inside / end tag
    if prefix in ('O', 'I', 'E'):
        return 0

    # Raised explicitly rather than via `assert False`: assert statements are
    # removed under `python -O`, which would make this silently return None.
    raise AssertionError("Entity doesn't match: {!r}".format(en))

## Load data and extract entities.

In [18]:
# Accumulators filled from every file in `filenames`.
sents       = []   # one entry per sentence: the list of its token rows
tokens      = []   # every token row, in reading order
check_begin = []   # per token row: one 0/1 start flag per entity level

for fn in filenames:
    ## Load data.
    temp_token = []
    # `with` closes the file even if a malformed line raises below.
    # encoding is set explicitly so decoding does not depend on the machine's
    # locale -- assumes the corpus files are UTF-8; TODO confirm.
    with open(fn, 'r', encoding='utf-8') as data:
        for line_no, line in enumerate(data, start=1):
            # NOTE(review): strip() also removes leading whitespace, so a token
            # that is itself a space is stored as an empty string.
            ttoken = line.strip().split('|')
            ltoken = len(ttoken)

            if ltoken == 1:
                # End of sentences (the line has no '|' separator)
                sents.append(temp_token)
                temp_token = []

            # Max entity
            elif ltoken == max_levels + 1:
                temp_token.append(ttoken)
                tokens.append(ttoken)

                # Check start mentions
                check_begin.append([check_start_en(m) for m in ttoken[1:]])

            else:
                # Explicit raise: `assert False` would vanish under `python -O`.
                raise AssertionError(
                    'Error!!! Entity level miss match ({} line {}: {} fields)'.format(
                        fn, line_no, ltoken))

    # A file that does not end with a separator line would otherwise lose its
    # last sentence, because `temp_token` is reset for the next file.
    if temp_token:
        sents.append(temp_token)

## Check all sentences and tokens

In [19]:
## Check all sentences and tokens
# Sentences have different lengths. NumPy >= 1.24 refuses to build an array from
# such ragged nested lists unless dtype=object is requested explicitly
# (older releases only emitted a deprecation warning).
sents = np.array(sents, dtype=object)
print('Sentences : ', sents.shape[0])
print('Tokens    : ', len(tokens))

# Counts noted by the author for other tokenisers (presumably from earlier runs):
#   attacut
#       Sentences :  2000
#       Tokens    :  652426
#
#   newmm
#       Sentences :  2000
#       Tokens    :  590316

Sentences :  2865
Tokens    :  875338


### Check all tokens in sentences

In [20]:
# Column labels for a token row: the token text first, then 'l1' .. 'l<max_levels>'.
columns = ['tokens'] + ['l{}'.format(level_no) for level_no in range(1, max_levels + 1)]

In [21]:
## Check all tokens in sentences
# One row per token, one column per entry of `columns`.
df_tokens = pd.DataFrame(data=tokens, columns=columns)
print('Tokens        : ', len(df_tokens))
print('Show details  :')
# Last expression of the cell: per-column count / unique / top / freq summary.
df_tokens.describe()

Tokens        :  875338
Show details  :


Unnamed: 0,tokens,l1,l2,l3,l4,l5,l6,l7,l8
count,875338.0,875338,875338,875338,875338,875338,875338,875338,875338
unique,27482.0,401,370,244,118,21,2,1,1
top,,O,O,O,O,O,O,O,O
freq,164368.0,677762,783300,866588,874684,875296,875337,875338,875338


## Counting mentions at each level.

In [22]:
## Counting mentions each level.
check_begin = np.array(check_begin)
print('Counting mentions each level')

## Count only B- and S-
# The start flags are counted directly per column. The previous version read
# the second entry of np.unique() inside a bare `except:`, which hid any real
# error and reported 0 for a level where every flag is 1.
n_levels = check_begin.shape[1] if check_begin.ndim == 2 else 0
mentions = [int(np.count_nonzero(check_begin[:, lx] == 1)) for lx in range(n_levels)]

for lx, n_mentions in enumerate(mentions):
    print("Mention level{} = {}".format(lx+1, n_mentions))

print('All mention entity level :', sum(mentions))

# Counts noted by the author (presumably from an earlier run on other data):
#   Counting mentions each level
#   Mention level1 = 42264
#   Mention level2 = 37864
#   Mention level3 = 4768
#   Mention level4 = 545
#   Mention level5 = 78
#   Mention level6 = 16
#   Mention level7 = 3
#   Mention level8 = 1
#   All mention entity level : 85539

Counting mentions each level
Mention level1 = 65234
Mention level2 = 59092
Mention level3 = 6457
Mention level4 = 502
Mention level5 = 37
Mention level6 = 1
Mention level7 = 0
Mention level8 = 0
All mention entity level : 131323


## Counting the frequency of words.

In [23]:
# Counting the frequency of words.
check_entity = 5   # position in df_tokens.columns to report: 0 is the token column, 1.. are the entity levels
top_freq     = 50

# For every column: (value, count) pairs in descending frequency, skipping the
# single most frequent value and stopping before position `top_freq`.
check_fre = [
    df_tokens[colum_name].value_counts().iloc[1:top_freq].items()
    for colum_name in df_tokens.columns
]

row_fmt = '{:<20} \t{:<1}'
print('Counting the frequency of :',df_tokens.columns[check_entity])
print(row_fmt.format('Words', 'Frequency'))
for word, count in check_fre[check_entity]:
    print(row_fmt.format(word, count))

Counting the frequency of : l5
Words                	Frequency
S-country            	11
S-province           	5
S-cardinal           	3
S-month              	3
S-unit               	2
S-day                	2
I-firstname          	2
S-role               	2
E-month              	1
E-firstname          	1
S-loc:others         	1
S-org:edu            	1
B-firstname          	1
S-nationality        	1
B-month              	1
S-continent          	1
B-mountian           	1
S-firstname          	1
S-religion           	1
E-mountian           	1


## Double check entities from tokens and sentences by level

In [24]:
level       = 6     # entity level to inspect; also the column index of that level in a token row
max_shows   = 10    # maximum number of matching rows printed by each check

# Double 1 check entities from tokens
check_tokens= tokens.copy()
count_no = 0
print('Check entities from the list of tokens')
print('{:<10}'.format('Token n.'))
for idx, ckl in enumerate(check_begin[:,level-1]):
    if ckl != 1:
        continue
    count_no += 1
    # `<=` so that exactly `max_shows` rows can be shown; the counter is
    # incremented before the test, so `max_shows > count_no` stopped one short.
    if count_no <= max_shows:
        cells = ''.join('{:<15}\t|'.format(t) for t in check_tokens[idx])
        print('{:<10}{:<10}::{}'.format(idx, ckl, cells))
print('All entities :',count_no)

# Double 2 check entities from sentence.
print('\n\nCheck entities from the list of sentences')
print('{:<10}{:<10}'.format('Token n.','Sent n.'))
count_no    = 0
accu_tokens = 0   # number of tokens in all sentences before the current one
for no, st in enumerate(sents):
    for idx, token in enumerate(st):
        # A start tag has the prefix 'B' or 'S'; 'O' never matches, so it needs
        # no separate test.
        if token[level].split('-')[0] in ('B', 'S'):
            count_no += 1
            if count_no <= max_shows:
                cells = ''.join('{:<15}\t|'.format(t) for t in token)
                print('{:<10}{:<10}::{}'.format(accu_tokens+idx, no, cells))
    # len(st) rather than idx+1: for an empty sentence `idx` would still hold
    # the value left over from the previous sentence.
    accu_tokens += len(st)
print('All entities :',count_no)

Check entities from the list of tokens
Token n.  
821513    1         ::ตุลาฯ          	|E-role         	|E-org:other    	|E-org:other    	|E-org:other    	|E-month        	|S-month        	|O              	|O              	|
All entities : 1


Check entities from the list of sentences
Token n.  Sent n.   
821513    2700      ::ตุลาฯ          	|E-role         	|E-org:other    	|E-org:other    	|E-org:other    	|E-month        	|S-month        	|O              	|O              	|
All entities : 1


In [25]:
# Spot check: display the row stored at flat index 19 (token text followed by its level tags).
check_tokens[19]

['จะ', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [26]:
# Show nested entities by sentence number.
idx_sent_no = 2700# lot1=800, lot2=600, lot3=600

print("\nSentence no.",idx_sent_no)

# One line per token: its position in the sentence, then the token and every
# level tag in fixed-width cells. The header string that used to be built into
# `temp_text` before the loop was never used and has been removed.
for idx, token in enumerate(sents[idx_sent_no]):
    cells = ''.join('{:<15} \t|'.format(t) for t in token)
    print("{:<5}".format(idx) + cells)


Sentence no. 2700
0    นา              	|B-person        	|S-title         	|O               	|O               	|O               	|O               	|O               	|O               	|
1    ย               	|I-person        	|O               	|O               	|O               	|O               	|O               	|O               	|O               	|
2    จัตุ            	|I-person        	|B-firstname     	|O               	|O               	|O               	|O               	|O               	|O               	|
3    รน              	|I-person        	|I-firstname     	|O               	|O               	|O               	|O               	|O               	|O               	|
4    ต์              	|I-person        	|E-firstname     	|O               	|O               	|O               	|O               	|O               	|O               	|
5                    	|I-person        	|O               	|O               	|O               	|O               	|O               	|O         

## Random check of sentences

In [13]:
## random sentences
# uarray = sorted(np.random.choice(np.arange(0, 1956), replace=False, size=(1, 100))[0])
# print(', '.join([str(x) for x in sorted(uarray)]))
# print('\n',uarray)

# randoms_check = open('test_data/randoms_check_100_sentences.txt', 'w')
# randoms_check.writelines(', '.join([str(x) for x in sorted(uarray)]))

# for sno in uarray:
#     # Show nested entities by sentence number.
#     idx_sent_no = sno# lot1=800, lot2=600, lot3=600
# #     print("\nSentence no.",idx_sent_no)
#     temp_text = "\nSentence no.{}\n".format(idx_sent_no)
#     randoms_check.writelines(temp_text)
#     for idx, token in enumerate(sents[idx_sent_no]):
#         temp_text  = "{:<5}".format(idx)
#         temp_text += ''.join(['{:<25} |'.format(t) for t in token])
#         randoms_check.writelines(temp_text+'\n')
# #         print(temp_text)
# randoms_check.close()