In [1]:
import os
import json
import time
import pickle

from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# One-time setup, intentionally left commented out: presumably fetches the
# 'punkt' tokenizer data needed by sent_tokenize / word_tokenize -- TODO confirm.
# Uncomment and run once on a machine where that data is not installed yet.
# import nltk
# nltk.download('punkt')

In [3]:
# root folder that holds the KDWD data files
kdwd_path = '../../data/kdwd/'

# lazily build the full path of every file found anywhere below the root ...
all_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(kdwd_path)
    for name in names
)

# ... and show them one per line
for path in all_paths:
    print(path)

../../data/kdwd/item_aliases.csv
../../data/kdwd/page.csv
../../data/kdwd/statements.csv
../../data/kdwd/link_annotated_text_short.jsonl
../../data/kdwd/property.csv
../../data/kdwd/item.csv
../../data/kdwd/processed_kdwd.pickle
../../data/kdwd/property_aliases.csv
../../data/kdwd/processed_kdwd.txt
../../data/kdwd/link_annotated_text.jsonl


To check which article a Wikipedia page ID refers to, open `https://en.wikipedia.org/?curid={wikipedia_ID}`, replacing `{wikipedia_ID}` with the numeric ID.

---

In [4]:
# path of the "_short" link-annotated text file inside the KDWD data folder
input_file_path = os.path.join(kdwd_path, "link_annotated_text_short.jsonl")

In [5]:
# read the whole input file into memory as a list of raw lines
# (each line is later parsed on its own with json.loads);
# the encoding is given explicitly so decoding does not depend on the
# platform's default locale
with open(input_file_path, encoding='utf-8') as file:
    input_data = file.readlines()

In [6]:
# number of lines read from the input file (displayed as the cell output)
len(input_data)

301

In [7]:
# Convert every page of `input_data` into one record per token and write the
# records to a text file, one `str(dict)` per line.
#
# Each record carries the page id, the section name, the token, the sentence
# it came from, a running sentence counter (`unit_number`) and, when the token
# lies inside an annotated link, the link text and its target page id.

# create new file to save processed data
output_file_path = os.path.join(kdwd_path, "processed_kdwd.txt")

# initialize counters for pages and units (sentences)
page_number = 0
unit_number = 0

# start the clock
start_time = time.time()

# the context manager guarantees that the output is flushed and closed,
# even if a malformed page raises part-way through the loop
with open(output_file_path, 'w', encoding='utf-8') as output_file:

    # loop over each page in input
    for page in input_data:

        # extract page-level info
        page_json = json.loads(page)
        page_id = page_json['page_id']

        # loop over each section in the current page
        for section in page_json['sections']:

            # extract section-level info
            section_name = section['name']
            section_text = section['text']
            section_link_lengths = section['link_lengths']
            section_link_offsets = section['link_offsets']
            section_target_page_ids = section['target_page_ids']

            # map each link offset to its position in the parallel link lists;
            # setdefault keeps the first position for a repeated offset, which
            # is what list.index() would return, but lookups are O(1)
            link_position_by_offset = {}
            for position, offset in enumerate(section_link_offsets):
                link_position_by_offset.setdefault(offset, position)

            # no entity is active at the start of a section; this also covers
            # sections without any links and prevents an entity from a
            # previous section leaking into this one
            entity_start = -1
            entity_end = -1
            entity_name = None
            entity_id = None

            # running_index is the cursor into section_text; it helps determine
            # whether the current word is identified as an entity
            running_index = 0

            # split section into sentences
            sentences = sent_tokenize(section_text)

            # loop over each sentence in the current section
            for sentence in sentences:

                # update running index to beginning of sentence; str.find
                # returns -1 when there is no match, in which case the cursor
                # is left where it is instead of rewinding the scan
                sentence_index = section_text.find(sentence, running_index)
                if sentence_index != -1:
                    running_index = sentence_index

                # split sentence into words
                words = word_tokenize(sentence)

                # loop over each word in the current sentence
                for word in words:

                    # update running index to beginning of word
                    word_index = section_text.find(word, running_index)
                    if word_index != -1:
                        running_index = word_index
                        consumed_length = len(word)
                    else:
                        # the token does not occur verbatim in the text (the
                        # tokenizer may rewrite some characters, e.g. quotes);
                        # treat it as sitting at the cursor and do not advance
                        consumed_length = 0

                    # check whether this word starts an entity in the given list
                    if running_index in link_position_by_offset:

                        # position of entity in the given list
                        entity_pos = link_position_by_offset[running_index]

                        # entity start and end indices
                        entity_start = section_link_offsets[entity_pos]
                        entity_end = entity_start + section_link_lengths[entity_pos]

                        # extract entity name and id
                        entity_name = section_text[entity_start:entity_end]
                        entity_id = section_target_page_ids[entity_pos]

                    # check whether this word is a later part of the entity name
                    elif entity_start < running_index < entity_end:

                        # entity name and id still true
                        pass

                    else:

                        # erase entity name and id
                        entity_name = None
                        entity_id = None

                    # write the current token as one record
                    observation = {
                        'page_id': page_id,
                        'section_name': section_name,
                        'token': word,
                        'full_mention': entity_name,
                        'wikipedia_ID': entity_id,
                        'unit_number': unit_number,
                        'full_sentence': sentence
                    }
                    output_file.write(str(observation)+'\n')

                    # update running index to end of word
                    running_index += consumed_length

                # increment unit number
                unit_number += 1

        # increment page number
        page_number += 1

# stop the clock
end_time = time.time()

print(f'time taken: {end_time-start_time} seconds')

# the per-page average is undefined when no page was processed
if page_number:
    print(f'time taken per page: {(end_time-start_time)/page_number} seconds')

time taken: 37.53458499908447 seconds
time taken per page: 0.12469961793715771 seconds


---

In [16]:
# path of the complete (non-"_short") link-annotated text file
full_file = os.path.join(kdwd_path, "link_annotated_text.jsonl")

In [17]:
# count the lines of the full file by streaming it, so the file is never
# held in memory as a whole; the encoding is given explicitly so decoding
# does not depend on the platform's default locale
with open(full_file, encoding='utf-8') as file:
    total_pages = sum(1 for _ in file)

In [18]:
# number of lines counted in the full file (displayed as the cell output)
total_pages

5343565