In [1]:
__author__      = 'Roy Gardner'
__copyright__   = 'Copyright 2021, Ontonomic'

import json
import os
import csv
from termcolor import colored 

import textract
import spacy
from spacy.lang.en import English
        
def process(file_dir,nlp):
    """
    Extract the text of every file in a directory and split it into sentences.

    Only the top level of ``file_dir`` is scanned and file names starting
    with '.' are skipped. Files are handled in sorted path order and a file's
    position in that order becomes its document ID, so a file that fails
    still consumes its number.

    Parameters
    ----------
    file_dir : str
        Directory holding the source documents, with or without a trailing
        path separator.
    nlp : callable
        Called with the extracted text; the returned object must expose
        ``.sents``, an iterable of objects with a ``.text`` attribute.

    Returns
    -------
    (document_dict, sentence_dict)
        document_dict maps doc_id -> {'name': file name without extension}.
        sentence_dict maps '<doc_id>/<sentence index>' -> sentence text.

    A file that cannot be processed is reported on stdout and skipped.
    """
    document_dict = {}
    sentence_dict = {}

    _, _, files = next(os.walk(file_dir))

    # os.path.join copes with file_dir given with or without a trailing separator
    file_list = sorted(os.path.join(file_dir, f) for f in files if not f.startswith('.'))
    for n,file in enumerate(file_list):
        # Resolved before the try block so the error report below always
        # names the file that actually failed
        doc_id = str(n)
        doc_title = os.path.splitext(os.path.basename(file))[0]
        try:
            # Get the text from the file
            text = textract.process(file).decode('utf-8')
            document_dict[doc_id] = {'name': doc_title}

            # Create a spaCy doc object and record each sentence
            doc = nlp(text)
            for i,sent in enumerate(doc.sents):
                sentence_id = doc_id + '/' + str(i)
                sentence_dict[sentence_id] = sent.text
        except Exception as error:
            # Best effort: one unreadable document must not stop the batch.
            # Exception (not a bare except) lets Ctrl-C interrupt a long run.
            print('Error processing:',doc_title,'-',repr(error))
    return document_dict,sentence_dict

        
# Using basic spaCy model for sentence segmentation
# English() is instantiated directly rather than loaded from a trained
# package, so the 'sentencizer' component added here is the only pipe this
# cell puts on nlp_en.
nlp_en = English()  
sentencizer = nlp_en.add_pipe('sentencizer')
# ';' is added to the sentencizer's punctuation set - presumably so that
# semicolon-separated clauses become separate sentences (TODO confirm).
sentencizer.punct_chars.add(';')

In [2]:
doc_directory = './docs/'

# NOTE(review): regex is not referenced by any code visible in this notebook.
# Written as a raw string so '\s' is not parsed as an (invalid) string escape;
# the pattern text itself is unchanged.
regex = r'[,]\s+(?=[A-Z])' # For COP docs
#regex = '' # For other docs

document_dict,sentence_dict = process(doc_directory,nlp_en)

# Create the output folders on demand so the cell also runs on a fresh checkout
os.makedirs('./serialised', exist_ok=True)
os.makedirs('./segmented_documents', exist_ok=True)

# Serialise the dictionaries for later use
# (the with-blocks close the files; no explicit close() is needed)
filename = './serialised/document_dict.json'
with open(filename, 'w') as f:
    json.dump(document_dict, f)
filename = './serialised/sentence_dict.json'
with open(filename, 'w') as f:
    json.dump(sentence_dict, f)

# Generate CSVs containing the sentences - one per doc
# Group the rows by document in a single pass over sentence_dict instead of
# rescanning every sentence for every document. Keys are '<doc_id>/<index>'.
rows_by_doc = {}
for k,v in sentence_dict.items():
    d_id,s_id = k.split('/', 1)
    rows_by_doc.setdefault(d_id, []).append([s_id, v])

selected_docs = list(document_dict.keys())
for doc_id in selected_docs:
    # A document without sentences still gets a header-only CSV
    csv_data = [['Sentence number','Sentence text']]
    csv_data.extend(rows_by_doc.get(doc_id, []))

    with open('./segmented_documents/' + document_dict[doc_id]['name'] + '.csv', 'w', encoding="utf-8", newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(csv_data)

print('Finished')

Finished


In [3]:
# Headline counts for the two dictionaries built by process()
for label,collection in (('Number of documents',document_dict),('Number of sentences',sentence_dict)):
    print(colored(label,'blue'),len(collection))
print()

# Accessing data

# First document: its ID and its name
doc_id = list(document_dict)[0]
print(doc_id,document_dict[doc_id]['name'])
print()

# Eleventh sentence overall: its ID and its whitespace-trimmed text
sentence_id = list(sentence_dict)[10]
print(sentence_id,sentence_dict[sentence_id].strip())


[34mNumber of documents[0m 36
[34mNumber of sentences[0m 1435

0 1321

0/10 05) Upon consultation between the Chin State Government and the Chin National Front, it is agreed to form an independent Chin human rights committee.


In [5]:
from spacy.lang.en.stop_words import STOP_WORDS

# Trained English pipeline; the cells below use it for entity recognition
# and for stop-word lookups.
nlp_sm = spacy.load("en_core_web_sm")
# Clear the stop-word flag on the vocabulary entry "‘s".
# NOTE(review): this literal starts with a LEFT single quote (U+2018), while
# sanitize_entity compares tokens against '’s' (RIGHT single quote, U+2019).
# Confirm that both spellings are intended.
nlp_sm.vocab["‘s"].is_stop = False

def sanitize_entity(text,nlp):
    """
    Tokenise an entity string and drop its stop words.

    Parameters
    ----------
    text : str
        Entity text to clean.
    nlp : callable
        Pipeline called as ``nlp(text, disable=['ner', 'parser'])``. It must
        return an iterable of tokens with a ``.text`` attribute and expose
        ``nlp.vocab[<str>].is_stop``.

    Returns
    -------
    list of str
        Whitespace-trimmed token texts in their original order. Stop words
        are removed, except that a '’s' token is always kept. Tokens that are
        empty after trimming are dropped, so joining the list with ' ' never
        produces doubled or trailing spaces.
    """
    clean_list = []
    doc = nlp(text, disable=['ner', 'parser'])
    for token in doc:
        # The possessive marker is kept even when the vocabulary flags it
        if token.text != '’s' and nlp.vocab[token.text].is_stop:
            continue
        cleaned = token.text.strip()
        # Whitespace-only tokens strip down to '' and carry no content
        if cleaned:
            clean_list.append(cleaned)
    return clean_list


# Entity labels that are reported; every other label is ignored
org_list = ['ORG', 'GPE', 'LOC', 'PERSON']

print(colored('doc_id','blue'),'\t',colored('sentence_number','blue'),'\t','entity type','\t','entity')
print()
for sentence_id,sentence_text in sentence_dict.items():
    sentence_doc = nlp_sm(sentence_text)
    entities = [(span.text,span.label_) for span in sentence_doc.ents if span.label_ in org_list]
    for entity_text,entity_label in entities:
        entity = ' '.join(sanitize_entity(entity_text,nlp_sm))

        # Only the IDs are printed here. They could be translated into document
        # names via document_dict, and sentence_id could be used to look up the
        # sentence text in sentence_dict.
        doc_id,sentence_number = sentence_id.split('/')[:2]

        print(colored(doc_id,'blue'),'\t',colored(sentence_number,'blue'),'\t',entity_label,'\t',entity)

        print()

[34mdoc_id[0m 	 [34msentence_number[0m 	 entity type 	 entity

[34m0[0m 	 [34m0[0m 	 ORG 	 CNF - Govt Agreements

[34m0[0m 	 [34m0[0m 	 ORG 	 Chinland Guardian

[34m0[0m 	 [34m0[0m 	 ORG 	 Chin National ( CNF

[34m0[0m 	 [34m0[0m 	 ORG 	 Peace Committee Burma

[34m0[0m 	 [34m0[0m 	 GPE 	 Rangoon

[34m0[0m 	 [34m1[0m 	 ORG 	 Chin National Union

[34m0[0m 	 [34m1[0m 	 ORG 	 Peace Working Committee

[34m0[0m 	 [34m1[0m 	 ORG 	 Peace Talks 

[34m0[0m 	 [34m1[0m 	 ORG 	 Union Burma

[34m0[0m 	 [34m1[0m 	 ORG 	 Chin National

[34m0[0m 	 [34m1[0m 	 ORG 	 Union Peace Working Committee

[34m0[0m 	 [34m1[0m 	 ORG 	 Union

[34m0[0m 	 [34m1[0m 	 ORG 	 Panglong

[34m0[0m 	 [34m1[0m 	 ORG 	 Chin National Peace Negotiating Team

[34m0[0m 	 [34m1[0m 	 ORG 	 Chin State Government

[34m0[0m 	 [34m1[0m 	 GPE 	 Hakha

[34m0[0m 	 [34m1[0m 	 GPE 	 Chin State

[34m0[0m 	 [34m1[0m 	 ORG 	 Chin National

[34m0[0m 	 [34m1[0m 	 ORG 	

In [7]:
import pandas as pd

In [9]:
# document_dict is {doc_id: {'name': ...}}; with the default column
# orientation each doc_id becomes a column and 'name' the only row.
df = pd.DataFrame.from_dict(document_dict, orient='columns')

In [11]:
# Flip the frame so that each document is a row instead of a column
df1 = df.T

In [13]:
# Move the index into an ordinary column (named 'index' on the first run).
# NOTE: this reassigns df1 from itself, so the cell is not idempotent.
df1 = df1.reset_index(drop=False)

In [15]:
# Give the two columns the names used for the merge further down
df2 = df1.rename({'index': 'doc_id', 'name': 'agreement_id'}, axis='columns')

In [17]:
# Tab-separated entity table.
# NOTE(review): no cell visible in this notebook writes this file - it looks
# like the printed output of the entity cell saved by hand; confirm provenance.
results = pd.read_csv('myanmar_results_ents.csv', delimiter='\t')

In [24]:
# Show the column labels of df2
list(df2)

['doc_id', 'agreement_id']

In [25]:
# Show the column labels of results; the recorded output shows that they
# carry leading/trailing spaces from the source file
list(results)

['doc_id ', ' sentence_number ', ' entity type ', ' entity']

In [27]:
# The header cells of the source file are padded with spaces ('doc_id ',
# ' sentence_number ', ...). Strip the padding from every column label, not
# just the merge key, so the padded names do not leak into the merged output.
res = results.rename(columns=str.strip)

In [33]:
# Check column dtypes ahead of the merge (doc_id is int64 in the recorded output)
res.dtypes

doc_id                int64
 sentence_number      int64
 entity type         object
 entity              object
dtype: object

In [34]:
# Check column dtypes ahead of the merge (doc_id is object in the recorded
# output, i.e. it does not yet match res)
df2.dtypes

doc_id          object
agreement_id    object
dtype: object

In [36]:
# doc_id originates from string dictionary keys; cast it to int64 so it has
# the same dtype as res['doc_id'] for the merge
df2 = df2.astype({'doc_id': 'int64'})

In [37]:
# Attach agreement_id to every entity row; the inner join keeps only rows
# whose doc_id occurs in both frames
df3 = res.merge(df2, on='doc_id', how='inner')

In [38]:
# Display the merged entity table
df3

Unnamed: 0,doc_id,sentence_number,entity type,entity,agreement_id
0,0,0,ORG,CNF - Govt Agreements,1321
1,0,0,ORG,Chinland Guardian,1321
2,0,0,ORG,Chin National ( CNF,1321
3,0,0,ORG,Peace Committee Burma,1321
4,0,0,GPE,Rangoon,1321
...,...,...,...,...,...
1372,35,4,ORG,Rakhine State Peace Discussion Group,776
1373,35,4,ORG,State Security,776
1374,35,4,PERSON,Htein Lin,776
1375,35,5,ORG,Rakhine State Liberation Party,776


In [39]:
# Write the merged table to disk. The row index is written as well
# (pandas default), which appears as an unnamed first column in the file.
df3.to_csv('full_myanmar_entities.csv', index=True)