# Caption Parser
Parses the wikitext of each article and extracts the available image captions.

In [1]:
import mwparserfromhell as mwp
import re

def _extract_all_files(text, prefix):
    pattern_str = r'\[\[\s*{}\s*:'.format(prefix)
    pattern = re.compile(pattern_str)

    files = []
    i = 0
    while i < len(text):
        match = pattern.search(text, i)
        if match is None: break

        start = i = match.span()[0]           
        counter = 1
        while counter > 0:
            i += 1
            if text[i] == '[':
                counter += 1
            elif text[i] == ']':
                counter -= 1
        
        i += 1
        files.append(text[start:i])
    
    return files

def _extract_images(text):
    """Collect all ``File:`` links followed by all ``Image:`` links in *text*.

    The result is grouped by prefix (every "File" link first, then every
    "Image" link), not interleaved in document order.
    """
    found = []
    for namespace in ("File", "Image"):
        found.extend(_extract_all_files(text, prefix=namespace))
    return found

def _get_filename(text):
    s = text.find(':')
    e = text.find('|')
    return text[s+1:e]

def _whitespace_count(text):
    count = 0
    for x in text:
        count += x.isspace()
    return count

def _clean(wiki_text):
    """Parse *wiki_text* with mwparserfromhell and return its ``strip_code()`` result."""
    return mwp.parse(wiki_text).strip_code()

def _get_next_pos(text, start):
    counter = 0
    for i in range(start, len(text)):
        if text[i] == '[':
            counter += 1
        elif text[i] == ']':
            counter -= 1
        elif text[i] == '|' and counter == 0:
            return i
        
    return -1

def _split_image_data(text):
    """Split an image link into its top-level, pipe-separated parts.

    Runs of spaces are first collapsed into a single space. The leading
    ``[[`` is skipped and the last two characters of the text (the closing
    ``]]``) are dropped from the final part.
    """
    # Replace consecutive spaces with a single one.
    collapsed = re.sub(' +', ' ', text)

    wrapper_len = len('[[')
    parts = []
    cursor = wrapper_len
    pipe_at = _get_next_pos(collapsed, cursor)
    while pipe_at != -1:
        parts.append(collapsed[cursor:pipe_at])
        cursor = pipe_at + 1
        pipe_at = _get_next_pos(collapsed, cursor)

    # Whatever remains is the last part, minus the closing brackets.
    parts.append(collapsed[cursor:-wrapper_len])
    return parts


def _get_caption(text):
    """Return the cleaned caption of an image link, or the cleaned empty string.

    The link is split into its top-level parts; the first part (the file
    target) is ignored. The caption is the first remaining part that, once
    stripped, contains at least two whitespace characters (typically three
    or more words). Shorter parts are never treated as captions.

    The previous keyword-based filter for short parts was unreachable (the
    ``count >= 2`` and ``count <= 1`` branches covered every case) and has
    been removed; the selection rule itself is unchanged.

    Args:
        text: A single image link such as ``[[File:a.jpg|thumb|Some caption text]]``.

    Returns:
        The result of ``_clean`` applied to the chosen part, or to ``""`` when
        no part qualifies.
    """
    # onyshchak TODO: beware that a long `alt=...` value also passes the
    # whitespace test below and is returned as the caption. We might reuse it later
    chunks = _split_image_data(text)

    caption = ""
    for raw_chunk in chunks[1:]:
        chunk = raw_chunk.strip()
        if _whitespace_count(chunk) >= 2:
            caption = chunk
            break

    return _clean(caption)

In [2]:
# onyshchak TODO: don't forget to properly handle File pages that are NOT images
def get_image_captions(wikitext):
    """Return ``(filename, caption)`` tuples for the image links in *wikitext*.

    Links for which no caption is found are left out of the result.
    """
    pairs = []
    for link in _extract_images(wikitext):
        caption = _get_caption(link)
        if caption:
            pairs.append((_get_filename(link), caption))
    return pairs