import os
import re
import glob
import nltk
import json
import calendar
import argparse
import datetime
import dateutil.parser
import lxml.html as html
from collections import defaultdict
"""
Some setup is required to run this script.

Mirror the site with wget (warning: this will take hours)::

    # Remove any files that already exist
    rm -r data/
    time wget -m "" -X sites -P data --restrict-file-names=nocontrol --adjust-extension --accept-regex='^[^/]*[^\?]*$' 2> data/wget.log

-X sites
    Exclude anything under /sites/ - this avoids downloading images/CSS/JS
-P data
    Place the downloaded files in the data/ directory
--restrict-file-names=nocontrol
    We need this for wget to save UTF-8 filenames properly
--adjust-extension
    Add a file extension so that file and directory names don't clash
--accept-regex
    This has a regex to allow plain article URLs but not URLs with a query string
Then run ``python``.
Running for the Islington Tribune site::

    # Remove any files that already exist
    rm -r data/
    time wget -m "" -X sites -P data --restrict-file-names=nocontrol --adjust-extension --accept-regex='^[^/]*[^\?]*$' 2> data/wget_islingtontribune.log

Then run ``python --inputdir ./data/ --output data/saved_islingtontribune_entities.json``.
"""
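# The JSON written by write_json() below maps each entity name to a list of
# occurrences. A minimal sketch of the shape (the values here are made up;
# only the keys come from the code further down):
#
#     {
#         "Camden Council": [
#             {"link": "http://<site>/<article>", "_recency": "2017-03-02T00:00:00"}
#         ]
#     }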
def json_set_default(obj):
    """
    We pass this function to the `default` keyword argument of json.dump to
    ensure sets can be serialized.
    """
    if isinstance(obj, set):
        return list(obj)
    # Anything else is unsupported, as the json module expects of a default hook
    raise TypeError('{!r} is not JSON serializable'.format(obj))
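# A minimal sketch of how the hook behaves (the values are chosen only for
# illustration): json.dumps calls json_set_default for objects it cannot
# serialize natively, so a set comes out as a JSON list.
#
#     >>> json.dumps({'names': {'Camden'}}, default=json_set_default)
#     '{"names": ["Camden"]}'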
def extract_entity_names(tree):
    """
    Recursively yield the names of the 'NE' (named entity) chunks in an nltk tree.
    """
    if hasattr(tree, 'label'):
        if tree.label() == 'NE':
            # The leaves of an NE chunk are (word, tag) pairs; join the words
            yield ' '.join([child[0] for child in tree])
        else:
            for child in tree:
                yield from extract_entity_names(child)
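# For illustration: with binary=True the chunker produces trees roughly like
# the one below (the sentence and tags are made up), from which
# extract_entity_names yields 'London'.
#
#     Tree('S', [('I', 'PRP'), ('live', 'VBP'), ('in', 'IN'),
#                Tree('NE', [('London', 'NNP')])])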
def extract_entity_names_from_text(text):
    """
    Return the set of entity names nltk finds across all sentences in `text`.
    """
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = set()
    for tree in chunked_sentences:
        entity_names.update(extract_entity_names(tree))
    return entity_names
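# A small usage sketch (the sentence is invented, and the exact chunks depend
# on the nltk models downloaded below):
#
#     >>> extract_entity_names_from_text('Sadiq Khan visited Camden Town yesterday.')
#     {'Sadiq Khan', 'Camden Town'}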
def write_json(entity_names):
    with open(args.output, 'w') as fp:
        json.dump(entity_names, fp, default=json_set_default, indent=4, sort_keys=True)
# Parse commandline arguments
parser = argparse.ArgumentParser()
parser.add_argument('--inputdir', default='./data/')
parser.add_argument('--output', default='data/saved_camdennewjournal_entities.json')
args = parser.parse_args()
# Download required nltk data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
entity_names = defaultdict(list)
i = 0
for fname in glob.glob(args.inputdir+'/**', recursive=True):
    if os.path.isdir(fname):
        continue
    try:
        doc = html.parse(fname)
    except UnicodeEncodeError:
        # Some saved filenames can't be handled; report and skip them
        print('Bad filename ' + repr(fname))
        continue
    content = doc.find('//div[@class="node"]/div[@class="content"]')
    if content is not None:
        text = content.text_content()
        date = None
        # Looks like the Camden New Journal manually adds these dates to the text
        # of the article. A text match seems to be the most reliable way to extract these.
        # Check for the word 'Published: ' followed by 3 more words.
        m = re.search(r'Published:\s+(\S+\s+\S+\s+\S+)', text)
        if m:
            date_text = m.group(1)
            try:
                date = dateutil.parser.parse(date_text, fuzzy=True)
            except (calendar.IllegalMonthError, ValueError):
                print('No date found on {}'.format(fname))
        # Ignore dates in the future
        if date and date.date() > datetime.date.today():
            date = None
        i += 1
        for name in extract_entity_names_from_text(text):
            entity_names[name].append({
                'link': fname.replace('.html', '').replace('./data/', 'http://'),
                '_recency': date.isoformat() if date else None,
            })
        if i % 1000 == 0:
            print('writing intermediate json')
            write_json(entity_names)

print('writing final json')
write_json(entity_names)
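# To inspect the results afterwards, something like this works (the path shown
# is just the default --output value):
#
#     >>> with open('data/saved_camdennewjournal_entities.json') as fp:
#     ...     entities = json.load(fp)
#     >>> sorted(entities)[:3]   # a few entity names, each mapping to its occurrences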