# Caption Parser
Parses wikitext from each article and extracts available image captions

In [1]:
import mwparserfromhell as mwp
import re
import json
import os
import html

from pathlib import Path
from os import listdir, mkdir
from os.path import isfile, isdir, join, exists, abspath

In [31]:
def _extract_all_files(text, prefix):
    pattern_str = r'\[\[\s*{}\s*:'.format(prefix)
    pattern = re.compile(pattern_str)

    files = []
    i = 0
    while i < len(text):
        match = pattern.search(text, i)
        if match is None: break

        start = i = match.span()[0]           
        counter = 1
        while counter > 0:
            i += 1
            if text[i] == '[':
                counter += 1
            elif text[i] == ']':
                counter -= 1
        
        i += 1
        files.append(text[start:i])
    
    return files

def _extract_images(text):
    res = []
    prefixes = ["File", "Image"]
    for p in prefixes:
        res += _extract_all_files(text, prefix=p)
        
    return res

def _get_filename(text):
    s = text.find(':')
    e = text.find('|')
    return text[s+1:e]

def _whitespace_count(text):
    count = 0
    for x in text:
        count += x.isspace()
    return count

def _clean(wiki_text):
    wikicode = mwp.parse(wiki_text)
    return wikicode.strip_code()

def _get_next_pos(text, start):
    counter = 0
    for i in range(start, len(text)):
        if text[i] == '[':
            counter += 1
        elif text[i] == ']':
            counter -= 1
        elif text[i] == '|' and counter == 0:
            return i
        
    return -1

def _split_image_data(text):
    # replasing consecutive spaces with a one
    text = re.sub(' +', ' ', text)
    
    chunks = []
    offset = len('[[') 
    i = offset
    while True:
        end = _get_next_pos(text, i)
        if end == -1:
            chunks.append(text[i:-offset])
            break
        
        chunks.append(text[i:end])
        i = end + 1
    
    return chunks


def _get_caption(text):
    # onyshchak TODO: beware of "alt" keyword included here. We might reuse it later
    keywords = [
        "upright", "left", "right", "center", "none", "baseline", "link", "alt",
        "sub", "super", "top", "text-top", "middle", "bottom", "text-bottom", " px",
        "page", "class", "lang", "border", "frameless", "frame", "thumb", "thumbnail"
    ]
    caption = ""
    chunks = _split_image_data(text)    
    for i in range(1, len(chunks)):
        chunk = chunks[i].strip()        
        count = _whitespace_count(chunk)
        if count > 1 and not chunk.startswith('alt='): # more than 2 words
            caption = chunk
            break
        elif count == 1: # 2 words
            is_caption = True
            chunk_lowered = chunk.lower()
            for k in keywords:
                if k in chunk_lowered:
                    is_caption = False
                    break
                    
            if is_caption:
                caption = chunk
                
    return _clean(caption)

In [3]:
# onyshchak TODO: don't forget to properly handle NOT image FilePage
def get_image_captions(wikitext):
    res = []
    images_wikitext = _extract_images(wikitext)
    for img in images_wikitext:
        caption = _get_caption(img)
        if not caption: continue
            
        filename = _get_filename(img)
        res.append((filename, caption))
    
    return res

In [36]:
def _getJSON(path):
    with open(path) as json_file:
        return json.loads(json.load(json_file))

def _dump(path, data):
    with open(path, 'w', encoding='utf8') as outfile:
        json.dump(data, outfile, indent=2, ensure_ascii=False)
        
def _valid_img_type(img_name):
    # onyshchak: exclude .svg since most of it is icons. Althouh, should do better filtering
    valid_types = [
        '.tif', '.tiff', '.jpg', '.jpeg', '.jpe', '.jif,', '.jfif', '.jfi',  '.gif', '.png'
    ]
    for t in valid_types:
        if img_name.lower().endswith(t):
            return True
    return False

In [45]:
def enrich_with_captions(data_path, offset=0, limit=None):
    article_paths = [join(data_path, f) for f in listdir(data_path) if isdir(join(data_path, f))]
    limit = limit if limit else len(article_paths) - offset
    limit = min(limit, len(article_paths) - offset)
    
    for i in range(offset, offset + limit):
        path = article_paths[i]
        print()
        print(i, path)
    
        meta_path = join(path, 'img', 'meta.json')
        meta_arr = _getJSON(meta_path)['img_meta']
        
        text_path = join(path, 'text.json')
        text = _getJSON(text_path)['text']
        
        image_captions = get_image_captions(text)
        for title, caption in image_captions:
            title = html.unescape(title)
            if not _valid_img_type(title): continue
            print(title, " | ", caption)
            
            res = [i for i, x in enumerate(meta_arr) if x['title'] == title]
            if len(res) != 1:
                print('ERROR: found {} matches with title = {}'.format(len(res), title))
                return False
            
            i = res[0]
            meta_arr[i]['caption'] = caption
                
        _dump(meta_path, json.dumps({"img_meta": meta_arr}))
        
    return True

In [47]:
enrich_with_captions('../article_reader/test/', 0)


0 ../article_reader/test/The_Lucy_poems
William Wordsworth at 28 by William Shuter2.jpg  |  William Shuter, Portrait of William Wordsworth, 1798. Earliest known portrait of Wordsworth, painted in the year he wrote the first drafts of "The Lucy poems""The Cornell Wordsworth Collection". Cornell University. Retrieved on 13 February 2009.
Dorothy Wordsworth 2.jpg  |  W. Crowbent, 1907, Portrait of Dorothy Wordsworth, depicting her later in life, (drawing from a photograph).
Lyrical Ballads.jpg  |  Title page for the first edition of Lyrical Ballads
SamuelTaylorColeridge.jpg  |  Samuel Taylor Coleridge, by Peter Van Dyke, 1795. A major poet and one of the foremost critics of the day, Coleridge collaborated on Lyrical Ballads with Wordsworth and remained a close friend and confidant for many years.Ford 1957, 186–206
Margaret Oliphant Wilson Oliphant.jpg  |  Frederick Augustus Sandys (1829–1904), Margaret Oliphant, chalk, 1881. In 1875, she was one of the first anthologists to group togethe

True

In [49]:
import json
import pprint

text_path = "../article_reader/test/Kylfings/img/meta.json"
pp = pprint.PrettyPrinter(indent=2)
data = None
with open(text_path) as json_file:
    data = json.loads(json.load(json_file))['img_meta']

print_data = data
for i in range(len(print_data)):
    print_data[i] = {k:v for k,v in print_data[i].items() if k != "features"}
pp.pprint(print_data)

[ { 'caption': 'The Norslunda Runestone, bearing runic inscription U 419, '
               'which mentions the personal name Kylfingr',
    'description': 'Norslundastenen (Upplands runinskrifter 419). Rune stone '
                   'from Uppland. The inscription reads: Kylving and Stenfrid '
                   'and Sigfast erected this stone in memory of Östen, son of '
                   'Gunnar ... May God preserve his soul.',
    'filename': 'c9a24cd93065ea35a97aad5b8eb7f4e1.jpg',
    'title': 'Norslundastenen.jpg',
    'url': 'https://en.wikipedia.org/wiki/File%3ANorslundastenen.jpg'},
  { 'caption': 'The eastern shore of the Gulf of Bothnia, proposed by '
               "proponents of a Finnic origin for the Kylfings as that group's "
               'homeland',
    'description': 'English: Sunset across the Gulf of Bothnia in Finland, '
                   'June 2006.Latina: Solis Occasus trans Sinum Bothnicum, '
                   'Iunio in Finnia, Iunius 2006.Svenska: Solnedgån