In [46]:
import os
import json
import time

import pandas as pd
import numpy as np

from nltk.tokenize import sent_tokenize, word_tokenize

pd.set_option("display.max_rows", 101)

In [47]:
try: 
    text = 'Hello world. My name is NLTK.'
    token_text = sent_tokenize(text)
except LookupError:
    # 'punkt' required for sentence and word tokenization
    import nltk
    nltk.download('punkt')

In [48]:
kdwd_path = '../../data/kdwd/'
for dirname, _, filenames in os.walk(kdwd_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../../data/kdwd/item_aliases.csv
../../data/kdwd/page.csv
../../data/kdwd/statements.csv
../../data/kdwd/link_annotated_text_short.jsonl
../../data/kdwd/property.csv
../../data/kdwd/item.csv
../../data/kdwd/property_aliases.csv
../../data/kdwd/link_annotated_text.jsonl


Link to check Wikipedia ID: `https://en.wikipedia.org/?curid={wikipedia_ID}`

---

In [5]:
short_file = os.path.join(kdwd_path, "link_annotated_text_short.jsonl")

In [6]:
with open(short_file) as file:
    short_data = file.readlines()

In [7]:
len(short_data)

301

In [9]:
page_data = short_data[0:3]

In [14]:
# create dataframe with similar columns as ACY
df = pd.DataFrame(columns=['page_id', 'section_name', 
                           'token', 'full_mention', 'wikipedia_ID', 'unit_number', 'full_sentence'])

# initialize counters for pages and units (sentences)
page_number = 0
unit_number = 0

# start the clock
start_time = time.time()

# loop over each page in input
for page in page_data:
    
    # extract page-level info
    page_json = json.loads(page)
    page_id = page_json['page_id']
    
    # loop over each section in the current page
    for section in page_json['sections']:
        
        # extract section-level info
        section_name = section['name']
        section_text = section['text']
        section_link_lengths = section['link_lengths']
        section_link_offsets = section['link_offsets']
        section_target_page_ids = section['target_page_ids']
        
        # running_index helps determine whether current word is identified as an entity
        running_index = 0
        
        # initialize section-specific params / first 
        entity_start = section_link_offsets[0]
        entity_end = entity_start + section_link_lengths[0]
        
        # split section into sentences
        sentences = sent_tokenize(section_text)
        
        # loop over each sentence in the current section
        for sentence in sentences:
            
            # unpdate running index to beginning of sentence
            running_index = section_text.find(sentence, running_index)
            
            # split sentence into words
            words = word_tokenize(sentence)
            
            # loop over each word in the current sentence
            for word in words:
                
                # unpdate running index to beginning of word
                running_index = section_text.find(word, running_index)
                
                # check whether this word corresponds to an entity in the given list
                if running_index in section_link_offsets:
                    
                    # position of entity in the given list
                    entity_pos = section_link_offsets.index(running_index)
                    
                    # entity start and end indices
                    entity_start = section_link_offsets[entity_pos]
                    entity_end = entity_start + section_link_lengths[entity_pos]
                    
                    # extract entity name and id
                    entity_name = section_text[entity_start:entity_end]
                    entity_id = section_target_page_ids[entity_pos]
                
                # check whether this word is latter part of the entity name
                elif running_index > entity_start and running_index < entity_end:
                    
                    # entity name and id still true
                    pass
                        
                else:
                    
                    # erase entity name and id
                    entity_name = None
                    entity_id = None
                
                # append current token to dataframe 
                observation = {
                    'page_id': page_id,
                    'section_name': section_name,
                    'token': word,
                    'full_mention': entity_name,
                    'wikipedia_ID': entity_id,
                    'unit_number': unit_number,
                    'full_sentence': sentence
                }
                df = df.append(observation, ignore_index=True)
                
                # update running index to end of word
                running_index += len(word)
            
            # increment unit number
            unit_number += 1
            
    # increment page number
    page_number += 1
    
# stop the clock
end_time = time.time()

print(f'time taken: {end_time-start_time} seconds')
print(f'time taken per page: {(end_time-start_time)/page_number} seconds')

time taken: 99.85042309761047 seconds
time taken per page: 33.283474365870156 seconds


In [15]:
df.head(50)

Unnamed: 0,page_id,section_name,token,full_mention,wikipedia_ID,unit_number,full_sentence
0,12,Introduction,Anarchism,,,0,Anarchism is an anti-authoritarian political a...
1,12,Introduction,is,,,0,Anarchism is an anti-authoritarian political a...
2,12,Introduction,an,,,0,Anarchism is an anti-authoritarian political a...
3,12,Introduction,anti-authoritarian,anti-authoritarian,867979.0,0,Anarchism is an anti-authoritarian political a...
4,12,Introduction,political,political,23040.0,0,Anarchism is an anti-authoritarian political a...
5,12,Introduction,and,,,0,Anarchism is an anti-authoritarian political a...
6,12,Introduction,social,social philosophy,586276.0,0,Anarchism is an anti-authoritarian political a...
7,12,Introduction,philosophy,social philosophy,586276.0,0,Anarchism is an anti-authoritarian political a...
8,12,Introduction,that,,,0,Anarchism is an anti-authoritarian political a...
9,12,Introduction,rejects,,,0,Anarchism is an anti-authoritarian political a...


---

In [16]:
full_file = os.path.join(kdwd_path, "link_annotated_text.jsonl")

In [17]:
with open(full_file) as file:
    total_pages = sum(1 for line in file)

In [18]:
total_pages

5343565