In [1]:
import glob
import pandas as pd
import json
import numpy as np

In [2]:
def check_start_en(en):
    en = en.split('-')[0]

    # Inside tag
    if(en == 'O' or en in ['I', 'E']):
        return 0
    
    # Start tag
    elif( en.split('-')[0] in ['B', 'S'] ):
        return 1
    
    # entity doesn't match
    else:
        assert False, "Entity doesn't match"

## Set path to download the data

In [3]:
max_levels   = 8
# path_load    = '/home/module/data/vistec_corpus/nested_ner/maintags/'
path_load    = '/home/module/data/vistec_corpus/nested_ner/subtags/'
filenames     = [f for f in glob.glob(path_load+'/*.data', recursive=True)]

# List of files name
[print(f) for f in filenames]

/home/module/data/vistec_corpus/nested_ner/subtags/dataLot1_nested.data
/home/module/data/vistec_corpus/nested_ner/subtags/datalot2_nested.data


[None, None]

## Load data and extract entities.

In [4]:
sents       = []
tokens      = []
check_begin = []

for fn in filenames:
    ## Load data.
    data = open(fn, 'r')
    temp_token = []
    for idx, token in enumerate(data):
        ttoken = token.strip().split('|')
        ltoken = len(ttoken)
        if( ltoken == 1):  
            # End of sentences
            sents.append(temp_token)
            temp_token = []
        
        # Max entity
        elif( ltoken == max_levels+1): 
            temp_token.append(ttoken)
            tokens.append(ttoken)
            
            # Check start mentions
            check_begin.append([ check_start_en(m) for m in ttoken[1:]])
        else: assert False, 'Error!!! Entity level miss match'

## Check all sentences and tokens

In [5]:
## Check all sentences and tokens
sents = np.array(sents)
print('Sentences : ', sents.shape[0])
print('Tokens    : ', len(tokens))

Sentences :  1190
Tokens    :  352554


### Check all tokens in sentences

In [6]:
columns = ['l'+str(x+1) for x in range(max_levels)]
columns.insert(0,'tokens')

In [7]:
## Check all tokens in sentences
df_tokens = pd.DataFrame(tokens, columns=columns)
print('Tokens        : ', df_tokens.shape[0])
print('Show details  :')
df_tokens.describe()

Tokens        :  352554
Show details  :


Unnamed: 0,tokens,l1,l2,l3,l4,l5,l6,l7,l8
count,352554.0,352554,352554,352554,352554,352554,352554,352554,352554
unique,17021.0,376,313,200,75,18,6,3,2
top,,O,O,O,O,O,O,O,O
freq,60230.0,291841,326366,349675,352265,352517,352543,352552,352553


## Counting mentions each level.

In [8]:
## Counting mentions each level.
check_begin = np.array(check_begin)
mentions = []
print('Counting mentions each level')

## Count only B- and S-
for lx in range(len(check_begin[0])):
    try:
        temp_mt = np.unique(check_begin[:,lx], return_counts=True)
        assert temp_mt[0][1] == 1 , "Counting error"
        temp_mt = temp_mt[1][1]
        mentions.append(temp_mt)
        
    except:
        temp_mt = 0
        
    print("Mention level{} = {}".format(lx+1, temp_mt))

print('All mention entity level :', sum(mentions))

Counting mentions each level
Mention level1 = 21851
Mention level2 = 18505
Mention level3 = 2117
Mention level4 = 241
Mention level5 = 31
Mention level6 = 8
Mention level7 = 2
Mention level8 = 1
All mention entity level : 42756


## Counting the frequency of words.

In [9]:
# Counting the frequency of words.
check_entity = 5   # 0:tokens, 1-5: entities level 
top_freq     = 50
check_fre    = []

for colum_name in df_tokens.columns:
    check_fre.append(df_tokens[colum_name].value_counts()[1:top_freq].items())

Format       = '{:<20} \t{:<1}'
print('Counting the frequency of :',df_tokens.columns[check_entity])
print(Format.format('Words', 'Frequency'))
[print(Format.format(freq[0], freq[1])) for freq in check_fre[check_entity]]
pass

Counting the frequency of : l5
Words                	Frequency
S-country            	6
S-date               	5
S-month              	5
S-role               	4
S-cardinal           	3
B-goverment          	2
E-goverment          	2
I-goverment          	1
B-role               	1
S-year               	1
S-title              	1
B-district           	1
E-sub_district       	1
B-sub_district       	1
E-district           	1
E-role               	1
S-mult               	1


## Double check entities from tokens and sentences by level

In [10]:
level       = 7
max_shows   = 10

# Double 1 check entities from tokens
check_tokens= tokens.copy()
count_no = 0
print('Check entities from the list of tokens')
print('{:<10}'.format('Token n.'))
for idx, ckl in enumerate(check_begin[:,level-1]):
    if(ckl==1):
        count_no+=1
        if(max_shows>count_no):
            print('{:<10}{:<10}::'.format(idx, ckl), end='')
            [print('{:<15}\t'.format(t), end='|') for t in check_tokens[idx]]
            print()
print('All entities :',count_no)

# Double 2 check entities from sentence.
print('\n\nCheck entities from the list of sentences')
print('{:<10}{:<10}'.format('Token n.','Sent n.'))
count_no    = 0
accu_tokens = 0
for no, st in enumerate(sents):
    for idx, token in enumerate(st):
        if(token[level] != 'O' and token[level].split('-')[0] in ['B', 'S']):
            count_no += 1
            if(max_shows>count_no): 
                print('{:<10}{:<10}::'.format(accu_tokens+idx, no), end='')
                [print('{:<15}\t'.format(t), end='|') for t in token]
                print()
    accu_tokens += idx+1
print('All entities :',count_no)

Check entities from the list of tokens
Token n.  
121289    1         ::นายก           	|I-role         	|E-role         	|E-role         	|E-role         	|E-goverment    	|E-goverment    	|S-role         	|S-role         	|
122146    1         ::ศิลปากร        	|I-role         	|I-role         	|E-role         	|E-goverment    	|E-goverment    	|E-goverment    	|S-org:edu      	|O              	|
All entities : 2


Check entities from the list of sentences
Token n.  Sent n.   
121289    561       ::นายก           	|I-role         	|E-role         	|E-role         	|E-role         	|E-goverment    	|E-goverment    	|S-role         	|S-role         	|
122146    563       ::ศิลปากร        	|I-role         	|I-role         	|E-role         	|E-goverment    	|E-goverment    	|E-goverment    	|S-org:edu      	|O              	|
All entities : 2


In [11]:
check_tokens[19]

['จะ', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [12]:
# Show nested entities by sentence number.
idx_sent_no = 0# lot1=800, lot2=600, lot3=600

temp_text = "\nSentence no.{}\n".format(idx_sent_no)
print("\nSentence no.",idx_sent_no)

for idx, token in enumerate(sents[idx_sent_no]):
    temp_text  = "{:<5}".format(idx)
    temp_text += ''.join(['{:<15} \t|'.format(t) for t in token])
    print(temp_text)


Sentence no. 0
0    Sukishi         	|S-restaurant    	|O               	|O               	|O               	|O               	|O               	|O               	|O               	|
1                    	|O               	|O               	|O               	|O               	|O               	|O               	|O               	|O               	|
2    สาขา            	|O               	|O               	|O               	|O               	|O               	|O               	|O               	|O               	|
3                    	|O               	|O               	|O               	|O               	|O               	|O               	|O               	|O               	|
4    Mega            	|B-facility:other 	|S-loc:others    	|O               	|O               	|O               	|O               	|O               	|O               	|
5                    	|I-facility:other 	|O               	|O               	|O               	|O               	|O               	|O          

## Show nested entities by sentence number.

In [13]:
uarray = sorted(np.random.choice(np.arange(0, 1956), replace=False, size=(1, 100))[0])
print(', '.join([str(x) for x in sorted(uarray)]))
print('\n',uarray)

17, 19, 22, 29, 35, 64, 77, 97, 115, 131, 144, 151, 185, 189, 221, 246, 252, 270, 338, 342, 367, 379, 405, 409, 425, 434, 438, 496, 508, 525, 553, 563, 591, 594, 611, 664, 711, 724, 786, 796, 813, 818, 864, 881, 884, 926, 941, 943, 956, 959, 1001, 1019, 1022, 1035, 1063, 1071, 1072, 1087, 1112, 1124, 1128, 1156, 1178, 1234, 1246, 1249, 1273, 1281, 1299, 1307, 1310, 1342, 1375, 1413, 1415, 1433, 1454, 1463, 1501, 1582, 1653, 1655, 1661, 1677, 1684, 1694, 1697, 1706, 1711, 1735, 1748, 1767, 1781, 1811, 1813, 1856, 1892, 1926, 1932, 1949

 [17, 19, 22, 29, 35, 64, 77, 97, 115, 131, 144, 151, 185, 189, 221, 246, 252, 270, 338, 342, 367, 379, 405, 409, 425, 434, 438, 496, 508, 525, 553, 563, 591, 594, 611, 664, 711, 724, 786, 796, 813, 818, 864, 881, 884, 926, 941, 943, 956, 959, 1001, 1019, 1022, 1035, 1063, 1071, 1072, 1087, 1112, 1124, 1128, 1156, 1178, 1234, 1246, 1249, 1273, 1281, 1299, 1307, 1310, 1342, 1375, 1413, 1415, 1433, 1454, 1463, 1501, 1582, 1653, 1655, 1661, 1677, 1684, 1694

## Randoms check sentences

In [14]:
# randoms_check = open('test_data/randoms_check_100_sentences.txt', 'w')
# randoms_check.writelines(', '.join([str(x) for x in sorted(uarray)]))

# for sno in uarray:
#     # Show nested entities by sentence number.
#     idx_sent_no = sno# lot1=800, lot2=600, lot3=600
# #     print("\nSentence no.",idx_sent_no)
#     temp_text = "\nSentence no.{}\n".format(idx_sent_no)
#     randoms_check.writelines(temp_text)
#     for idx, token in enumerate(sents[idx_sent_no]):
#         temp_text  = "{:<5}".format(idx)
#         temp_text += ''.join(['{:<25} |'.format(t) for t in token])
#         randoms_check.writelines(temp_text+'\n')
# #         print(temp_text)
# randoms_check.close()