# Wikipedia Articles Preprocessing

`#!/bin/python3`

In [2]:
import pandas as pd
import numpy as np
import string
import json
import shutil

from pathlib import Path
from os import listdir, mkdir
from os.path import isfile, isdir, join, exists, abspath
from keras.preprocessing import image
from keras.applications.resnet import ResNet152, preprocess_input

Using TensorFlow backend.


## Loading & Preprocessing Dataset

In [2]:
def _globalMaxPool1D(tensor):
    """Collapse a 4-D feature tensor to one maximum per last-axis channel.

    Args:
        tensor: 4-D array exposing ``.shape`` and a NumPy-style
            ``.max(axis=...)``; the last axis indexes the channels.

    Returns:
        list: one scalar per channel, the maximum over the three leading axes.

    Raises:
        ValueError: if ``tensor`` is not 4-D, or a leading axis is empty.
    """
    if len(tensor.shape) != 4:
        raise ValueError(
            "expected a 4-D tensor, got shape {}".format(tuple(tensor.shape))
        )
    # One vectorised reduction instead of one Python-level slice per channel.
    return list(tensor.max(axis=(0, 1, 2)))

def _getImagePixels(model, img_path):
    """Run one image through ``model`` and return its pooled feature vector.

    Args:
        model: object exposing ``predict(batch)``; its output is handed to
            ``_globalMaxPool1D``.
        img_path: path of the image file to load.

    Returns:
        dict: ``{"id": <file name up to its first dot>, "features": <list>}``.
    """
    img = image.load_img(img_path, target_size=None)

    img_data = image.img_to_array(img)
    # Add a leading axis (presumably the batch dimension the model expects).
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)

    feature_tensor = model.predict(img_data)
    return {
        # Path(...).name follows the platform's path separator, unlike a
        # hard-coded split('/'); the id is still cut at the first dot.
        "id": Path(img_path).name.split('.')[0],
        "features": _globalMaxPool1D(feature_tensor),
    }

def _getJSON(path):
    """Load the JSON document stored at ``path``.

    The file is expected to be double-encoded: it holds a JSON string whose
    content is itself a JSON document, so the payload is decoded twice. A file
    that holds the document directly is returned after the first decode
    instead of failing.

    Args:
        path: path of the JSON file.

    Returns:
        The decoded document.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if either decoding step fails.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(path, encoding="utf-8") as json_file:
        payload = json.load(json_file)
    return json.loads(payload) if isinstance(payload, str) else payload

def _getTextFeatures(text_path):
    """Read an article's JSON file and return its id with a cleaned text snippet.

    Newlines become spaces, ASCII punctuation is removed, and the result is
    cut to its first 1000 characters.

    Args:
        text_path: path of the article's JSON file (needs 'text' and 'id').

    Returns:
        dict: ``{'id': ..., 'text': ...}``.
    """
    article = _getJSON(text_path)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    flattened = article['text'].replace("\n", " ")
    # onyshchak: only the first 1000 characters are kept for now; the summary
    # still needs to be extracted properly.
    snippet = flattened.translate(strip_punctuation)[:1000]
    return {'id': article['id'], 'text': snippet}

def _getImagesMeta(path):
    """Return the ``'img_meta'`` entry of the JSON file at ``path``."""
    document = _getJSON(path)
    return document['img_meta']

def _getValidImagePaths(article_path):
    """List the ``.jpg`` files (any letter case) in the article's ``img/`` folder.

    Args:
        article_path: article directory that contains an ``img/`` sub-folder.

    Returns:
        list: paths of the regular files whose name ends in ``.jpg``, in the
        order ``listdir`` yields them.
    """
    img_dir = join(article_path, 'img/')
    jpg_paths = []
    for entry in listdir(img_dir):
        candidate = join(img_dir, entry)
        # Directories are skipped even when their name ends in .jpg.
        if isfile(candidate) and entry[-4:].lower() == ".jpg":
            jpg_paths.append(candidate)
    return jpg_paths

def _getMetaForImage(path, meta_arr):
    """Return the first metadata entry whose 'filename' matches the image.

    Args:
        path: image path; only its final component is compared.
        meta_arr: iterable of dicts that each carry a 'filename' key.

    Returns:
        The first entry of ``meta_arr`` with a matching 'filename'.

    Raises:
        StopIteration: if no entry matches.
    """
    wanted = Path(path).name
    for entry in meta_arr:
        if entry['filename'] == wanted:
            return entry
    # Same exception a bare next() raises on an exhausted iterator.
    raise StopIteration

def GetArticleData(model, article_path):
    """Assemble the text and image features of one article folder.

    Args:
        model: passed through to ``_getImagePixels`` for every image.
        article_path: article directory containing ``text.json`` and ``img/``.

    Returns:
        dict: the ``_getTextFeatures`` result plus an ``"img"`` list holding
        one ``_getImagePixels`` result per valid image.
    """
    article_data = _getTextFeatures(join(article_path, 'text.json'))

    # The image metadata is still read, so a missing or malformed meta.json
    # fails here, but it is not merged into the per-image records yet.
    _getImagesMeta(join(article_path, 'img/', 'meta.json'))

    article_data["img"] = [
        _getImagePixels(model, img_path)
        for img_path in _getValidImagePaths(article_path)
    ]
    return article_data

def PreprocessArticles(data_path, offset=0, limit=None):
    """Extract text and image features for the article folders in ``data_path``.

    Args:
        data_path: directory whose immediate sub-directories are articles.
        offset: index, in ``listdir`` order, of the first article to process.
        limit: maximum number of articles to process; any falsy value
            (``None`` or ``0``) means "all articles from ``offset`` on".

    Returns:
        list: one ``GetArticleData`` result per processed article.
    """
    article_paths = [join(data_path, f) for f in listdir(data_path) if isdir(join(data_path, f))]

    # Slicing clamps to the articles that exist, so an offset combined with
    # the default limit, or a limit running past the end, does not raise
    # IndexError as indexing over range(offset, offset + limit) would.
    stop = offset + limit if limit else None
    selected = article_paths[offset:stop]

    model = ResNet152(weights='imagenet', include_top=False)

    articles = []
    for i, path in enumerate(selected, start=offset):
        print(i, path)
        articles.append(GetArticleData(model, path))

    return articles

In [3]:
%%time
# Build the feature records for up to 30 article folders under ./data/,
# starting at index 0; the result is reused by the export cells below.
articles = PreprocessArticles('./data/', offset=0, limit=30)

0 ./data/William_Henry_Bury
1 ./data/Crucifix_(Cimabue,_Santa_Croce)
2 ./data/Maurice_Richard
3 ./data/Betelgeuse
4 ./data/William_Calcraft
5 ./data/Macedonia_(terminology)
6 ./data/Saturn
7 ./data/Jupiter
8 ./data/Doc_Adams
9 ./data/Take_Ichi_convoy
10 ./data/John_Treloar_(museum_administrator)
11 ./data/Istiodactylus
12 ./data/Pinkerton_(album)
13 ./data/S-50_(Manhattan_Project)
14 ./data/Saxaul_sparrow
15 ./data/Euryoryzomys_emmonsae
16 ./data/Kirsten_Dunst
17 ./data/Cyclone_Raja
18 ./data/Emmeline_Pankhurst
19 ./data/Liber_Eliensis
20 ./data/Jesse_L._Brown
21 ./data/Climate_of_India
22 ./data/Attachment_theory
23 ./data/U.S._Route_2_in_Michigan
24 ./data/The_Four_Stages_of_Cruelty
25 ./data/Cetiosauriscus
26 ./data/Battle_of_Svolder
27 ./data/John_Le_Mesurier
28 ./data/Madonna_in_the_Church
29 ./data/Mells_War_Memorial
CPU times: user 47min 25s, sys: 29 s, total: 47min 54s
Wall time: 7min 49s


In [4]:
import json
import pprint

# Inspect the raw image metadata of one article. The file is double-encoded
# JSON, hence json.load followed by json.loads.
text_path = 'data/Emmeline_Pankhurst/img/meta.json'
with open(text_path) as json_file:
    data = json.loads(json.load(json_file))

pp = pprint.PrettyPrinter(indent=2)
pp.pprint(data)

{ 'img_meta': [ { 'description': 'English: Mareea Bothckareva, Mrs. Emmeline '
                                 'Pankhurst and women of the Battalion of '
                                 'Death, 1917.Español: Maria Bothckareva y la '
                                 'señora Emmeline Pankhurst flanquedadas por '
                                 'mujeres del "batallón de la muerte", '
                                 'defensoras del gobierno provisional, 1917.',
                  'filename': '0615b7721f03eb514cb875b6d6015117.jpg',
                  'title': 'Batallón-muerte-rusia--insiderussianrev00dorrrich.png',
                  'url': 'https://en.wikipedia.org/wiki/File%3ABatall%C3%B3n-muerte-rusia--insiderussianrev00dorrrich.png'},
                { 'description': 'English: en:Christabel Pankhurst '
                                 '(1880-1958)Esperanto: eo:Christabel '
                                 'Pankhurst (1880-1958)Español: es:Christabel '
                                 '

## Mapping Preprocessed Dataset into W2VV format

In [4]:
subset_name = 'test_subset2'
subset_path = abspath(join('./', subset_name))

# Start from a clean slate: any previous export of this subset is deleted.
if exists(subset_path):
    shutil.rmtree(subset_path)

feature_data_path = join(subset_path, 'FeatureData')
image_sets_path = join(subset_path, 'ImageSets')
text_data_path = join(subset_path, 'TextData')

# The subset root must be created before its three sub-folders.
for _new_dir in (subset_path, feature_data_path, image_sets_path, text_data_path):
    mkdir(_new_dir)

In [6]:
def to_file(arr, filepath):
    """Write every item of ``arr`` to ``filepath``, one item per line.

    Args:
        arr: iterable of items; each one is rendered with ``str.format``.
        filepath: destination path; an existing file is overwritten.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform locale.
    with open(filepath, 'w', encoding='utf-8') as f:
        # str.format renders tuple items as-is, where "%s" % x would try to
        # unpack them as formatting arguments.
        f.writelines("{}\n".format(x) for x in arr)

In [7]:
# One image id per line, covering every image of every processed article.
ids = [img['id'] for article in articles for img in article['img']]
to_file(ids, join(image_sets_path, subset_name + ".txt"))

In [8]:
# onyshchak: originally the ID also contained the file extension, e.g. ".jpg"
# Each image gets one caption line that carries the text of its article.
text_data = [
    '{}#enc#0 {}'.format(img['id'], article['text'])
    for article in articles
    for img in article['img']
]
to_file(text_data, join(text_data_path, subset_name + ".caption.txt"))

In [9]:
def list2str(l):
    """Join the items of ``l`` into one space-separated string."""
    return " ".join(map(str, l))


# One line per image: the image id followed by its feature values.
img_features = [
    '{} {}'.format(img['id'], list2str(img['features']))
    for article in articles
    for img in article['img']
]

raw_features_file_path = join(feature_data_path, subset_name + ".features.txt")
to_file(img_features, raw_features_file_path)

In [6]:
# Arguments for w2vv's txt2bin.py, whose usage line reads:
#   txt2bin.py [options] nDims inputTextFile isFileList resultDir
IS_FILE_LIST = 0
# Passed as nDims. NOTE(review): presumably the length of each feature vector
# in the features file -- confirm.
FEATURE_DIMENTION = 2048
bin_features_path = join(feature_data_path, "pyresnet152-pool5os/")

# ! ./w2vv/do_gene_vocab.sh $subset_name # problems with relative path
# The $names are filled in from the notebook's Python variables.
# NOTE(review): raw_features_file_path is assigned in a different cell, so it
# presumably has to be defined in the running kernel first -- confirm.
! python2 w2vv/simpleknn/txt2bin.py $FEATURE_DIMENTION $raw_features_file_path $IS_FILE_LIST $bin_features_path --overwrite 1

python2: can't open file 'get_word_vob.py': [Errno 2] No such file or directory
python2: can't open file 'get_word_vob.py': [Errno 2] No such file or directory
Usage: txt2bin.py [options] nDims inputTextFile isFileList resultDir

Options:
  -h, --help            show this help message and exit
  --overwrite=OVERWRITE
                        overwrite existing file (default=0)
