# Wikipedia Articles Preprocessing

`#!/bin/python3`

In [None]:
import pandas as pd
import numpy as np
import string
import json
import shutil

from pathlib import Path
from os import listdir, mkdir
from os.path import isfile, isdir, join, exists, abspath
from keras.preprocessing import image
from keras.applications.resnet import ResNet152, preprocess_input

## Loading & Preprocessing Dataset

In [None]:
def _globalMaxPool1D(tensor):
    """Reduce a 4-D feature tensor to one maximum per index of its last axis.

    Args:
        tensor: 4-D array-like (anything ``np.asarray`` accepts); in this
            file it is the output of ``model.predict``.

    Returns:
        A list with one entry per index ``i`` of the last axis, holding the
        maximum of ``tensor[:, :, :, i]``. Entries stay NumPy scalars (not
        Python floats) so that ``str()`` of a value is unchanged.

    Raises:
        ValueError: if the input does not have exactly four dimensions, or
            if one of the three leading axes is empty.
    """
    data = np.asarray(tensor)
    # The unpacking doubles as a rank check: anything but 4-D fails here.
    _, _, _, _ = data.shape
    # One vectorized reduction over the three leading axes instead of a
    # Python-level ``.max()`` call per channel.
    return list(data.max(axis=(0, 1, 2)))

def _getImageFeatures(model, img_path):
    """Run a single image through ``model`` and return its pooled features.

    Args:
        model: object exposing ``predict``; it receives a one-image batch.
        img_path: '/'-separated path of the image file to load.

    Returns:
        dict with two keys: "id" (the file name up to its first '.') and
        "features" (the result of ``_globalMaxPool1D`` on the prediction).
    """
    # target_size=None: no explicit resize is requested when loading.
    pixels = image.img_to_array(image.load_img(img_path, target_size=None))
    # Add a leading batch axis, then apply the ResNet input preprocessing.
    batch = preprocess_input(np.expand_dims(pixels, axis=0))

    prediction = model.predict(batch)

    # "dir/sub/name.ext" -> "name" (everything before the first '.').
    file_name = img_path.split('/')[-1]
    image_id = file_name.split('.')[0]
    pooled = _globalMaxPool1D(prediction)
    return {
        "id": image_id,
        "features": pooled,
    }

def _getJSON(path):
    """Load a JSON file whose content may be JSON-encoded twice.

    The files handled here are written as a JSON *string* that itself holds
    a JSON document, so the outer layer is decoded first and the inner
    layer second. A file that holds a plain JSON document is returned as
    decoded instead of failing on the second pass.

    Args:
        path: path of the JSON file to read.

    Returns:
        The decoded Python object.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if either layer is not valid JSON.
    """
    # Explicit UTF-8: the files are written with encoding='utf8' and
    # ensure_ascii=False, so the platform default must not be relied on.
    with open(path, encoding='utf8') as json_file:
        payload = json.load(json_file)
    # Only a string outer layer needs the second decoding pass.
    if isinstance(payload, str):
        return json.loads(payload)
    return payload

def _getTextFeatures(text_path):
    """Load an article's text file and return its id with a cleaned excerpt.

    Args:
        text_path: path of a JSON file readable by ``_getJSON`` that has
            'text' and 'id' entries.

    Returns:
        dict with 'id' (copied from the file) and 'text' (newlines replaced
        by spaces, ``string.punctuation`` characters removed, then cut to
        the first 1000 characters).
    """
    article = _getJSON(text_path)
    flattened = article['text'].replace("\n", " ")
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # onyshchak: only the first 1000 characters are kept for now; a proper
    # summary extraction is still needed.
    excerpt = flattened.translate(strip_punctuation)[:1000]
    return {'id': article['id'], 'text': excerpt}

def _getImagesMeta(path):
    """Return the value stored under 'img_meta' in the JSON file at ``path``."""
    document = _getJSON(path)
    return document['img_meta']

def _getValidImagePaths(article_path):
    """List the .jpg files found directly inside ``<article_path>/img/``.

    Args:
        article_path: directory of one article; must contain an 'img' folder.

    Returns:
        list of paths (each built by joining the 'img/' folder and the file
        name) for regular files whose last four characters are ".jpg" in
        any letter case. Order follows ``os.listdir``.
    """
    img_dir = join(article_path, 'img/')
    found = []
    for entry in listdir(img_dir):
        candidate = join(img_dir, entry)
        # Directories (even ones named like images) are not wanted.
        if not isfile(candidate):
            continue
        if entry[-4:].lower() != ".jpg":
            continue
        found.append(candidate)
    return found

def _dump(path, data):
    """Write ``data`` to ``path`` as indented, non-ASCII-preserving JSON.

    The document is serialized in memory *before* the target is opened, so
    a serialization error leaves an already existing file untouched instead
    of truncating it.

    Args:
        path: destination file; it is overwritten.
        data: JSON-serializable object.

    Raises:
        TypeError: if ``data`` holds something ``json`` cannot serialize.
        OSError: if the file cannot be written.
    """
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with open(path, 'w', encoding='utf8') as outfile:
        outfile.write(serialized)

def GetArticleData(model, article_path):
    """Collect the text features and image metadata of one article.

    Args:
        model: accepted for interface compatibility; not used here.
        article_path: article directory holding 'text.json' and
            'img/meta.json'.

    Returns:
        The dict from ``_getTextFeatures`` with an extra "img" entry that
        holds the result of ``_getImagesMeta``.
    """
    text_file = join(article_path, 'text.json')
    meta_file = join(article_path, 'img/', 'meta.json')

    record = _getTextFeatures(text_file)
    record["img"] = _getImagesMeta(meta_file)
    return record

def ReadArticles(data_path, offset=0, limit=None, model=None):
    """Load text features and image metadata for articles under ``data_path``.

    Args:
        data_path: directory whose immediate sub-directories are articles.
        offset: index of the first article to read, counted in sorted path
            order.
        limit: maximum number of articles to read; ``None`` (or ``0``)
            means everything from ``offset`` onward. A window that extends
            past the last article is cut short rather than raising.
        model: forwarded unchanged to ``GetArticleData``. It is an optional
            parameter so that this function no longer depends on a global
            variable named ``model`` being defined.

    Returns:
        list with one ``GetArticleData`` result per selected article.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # offset/limit window reproducible between runs.
    article_paths = sorted(
        join(data_path, f) for f in listdir(data_path) if isdir(join(data_path, f))
    )
    # Slicing clamps to the available range, so offset > 0 combined with
    # limit=None (or an oversized limit) cannot index past the end.
    stop = offset + limit if limit else None
    selected = article_paths[offset:stop]

    articles = []
    for i, path in enumerate(selected, start=offset):
        print(i, path)
        articles.append(GetArticleData(model, path))

    return articles

def GenerateVisualFeatures(data_path, offset=0, limit=None, model=None):
    """Compute image features and store them in each article's meta.json.

    For every selected article, each entry of 'img/meta.json' that has no
    'features' yet and whose 'filename' ends in ".jpg" (any letter case) is
    run through ``_getImageFeatures``; the resulting values are stored as
    strings under 'features'. The metadata file is then rewritten.

    Args:
        data_path: directory whose immediate sub-directories are articles.
        offset: index of the first article to process, counted in sorted
            path order.
        limit: maximum number of articles to process; ``None`` (or ``0``)
            means everything from ``offset`` onward. A window that extends
            past the last article is cut short rather than raising.
        model: feature extractor passed to ``_getImageFeatures``; when
            ``None`` a ``ResNet152(weights='imagenet', include_top=False)``
            is created.

    Returns:
        None. Results are written to the meta.json files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # offset/limit window reproducible between runs.
    article_paths = sorted(
        join(data_path, f) for f in listdir(data_path) if isdir(join(data_path, f))
    )
    stop = offset + limit if limit else None
    selected = article_paths[offset:stop]

    if model is None:
        model = ResNet152(weights='imagenet', include_top=False)

    for i, path in enumerate(selected, start=offset):
        print(i, path)

        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getImagesMeta(meta_path)
        for meta in meta_arr:
            # Already processed in an earlier run.
            if 'features' in meta:
                continue
            if meta['filename'][-4:].lower() != ".jpg":
                continue

            img_path = join(path, 'img/', meta['filename'])
            try:
                features = _getImageFeatures(model, img_path)['features']
            except Exception as e:
                # Best effort: report the failing image and move on; the
                # entry simply stays without 'features'.
                print("exception", str(e))
                print(img_path)
                continue
            meta['features'] = [str(f) for f in features]

        # The payload is deliberately JSON-encoded twice (json.dumps here,
        # then again inside _dump): _getJSON decodes two layers on read.
        _dump(meta_path, json.dumps({"img_meta": meta_arr}))

In [None]:
# Shared feature extractor: ResNet-152 with 'imagenet' weights and without
# the top layers (include_top=False).
model = ResNet152(
    weights='imagenet',
    include_top=False,
)

In [None]:
%%time
# Compute features for the articles under ./data/ with the shared model and
# write them back into each article's img/meta.json.
GenerateVisualFeatures('./data/', offset=0, limit=None, model=model)

In [None]:
%%time
# Load text features and image metadata of the articles under ./data/ into
# memory; the cells below build the output files from `articles`.
articles = ReadArticles('./data/', offset=0, limit=None)

## Mapping Preprocessed Dataset into W2VV format

In [None]:
# Name of the generated subset; also used as the output folder name and as
# the prefix of the files written into it.
subset_name = 'test_subset3'
subset_path = abspath(join('./', subset_name))

# Output layout: <subset>/FeatureData, <subset>/ImageSets, <subset>/TextData.
feature_data_path = join(subset_path, 'FeatureData')
image_sets_path = join(subset_path, 'ImageSets')
text_data_path = join(subset_path, 'TextData')

# Start from a clean slate: any previous output of this subset is deleted.
if exists(subset_path):
    shutil.rmtree(subset_path)

mkdir(subset_path)
mkdir(feature_data_path)
mkdir(image_sets_path)
mkdir(text_data_path)

In [None]:
def to_file(arr, filepath):
    """Write every element of ``arr`` to ``filepath``, one per line.

    Args:
        arr: iterable of values; each is rendered with ``%s`` formatting.
        filepath: destination file; it is overwritten.
    """
    # Explicit UTF-8 so non-ASCII text (e.g. image titles) is written the
    # same way on every platform instead of depending on the locale.
    with open(filepath, 'w', encoding='utf8') as f:
        # (x,) keeps a tuple element from being unpacked by the % operator.
        f.writelines("%s\n" % (x,) for x in arr)

In [None]:
# Image set file: the 'id' of every image of every loaded article.
# NOTE(review): this includes images that may have no 'features' entry
# (non-.jpg files or failed extractions) — confirm that is intended.
ids = [
    img['id']
    for article in articles
    for img in article['img']
]
to_file(ids, join(image_sets_path, subset_name + ".txt"))

In [None]:
# Caption file: one "<image id>#enc#0 <image title>" line per image.
# onyshchak: originally the ID also contained the file extension, e.g. ".jpg"
text_data = [
    '{}#enc#0 {}'.format(img['id'], img['title'])
    for article in articles
    for img in article['img']
]
to_file(text_data, join(text_data_path, subset_name + ".caption.txt"))

In [None]:
def list2str(l):
    """Join the ``str()`` of every element of ``l`` with single spaces."""
    return " ".join(str(x) for x in l)


# Raw feature file: one "<image id> <v1> <v2> ..." line per image.
# Images skipped during feature extraction (non-.jpg files, or files whose
# extraction raised) carry no 'features' key; they are left out here
# instead of aborting the whole cell with a KeyError.
img_features = [
    '{} {}'.format(img['id'], list2str(img['features']))
    for article in articles
    for img in article['img']
    if 'features' in img
]

raw_features_file_path = join(feature_data_path, subset_name + ".features.txt")
to_file(img_features, raw_features_file_path)

In [None]:
# Arguments for w2vv/simpleknn/txt2bin.py, invoked by the shell command below.
# NOTE(review): presumably 0 means the input argument is a single feature
# file rather than a list of files — confirm against txt2bin.py.
IS_FILE_LIST = 0
# Passed as the first argument (feature dimension).
# NOTE(review): presumably the number of values per line of the raw feature
# file — confirm it matches the model output.
FEATURE_DIMENTION = 2048
# Output location for the binary features, inside the FeatureData folder.
bin_features_path = join(feature_data_path, "pyresnet152-pool5os/")

# ! ./w2vv/do_gene_vocab.sh $subset_name # problems with relative path
! python2 w2vv/simpleknn/txt2bin.py $FEATURE_DIMENTION $raw_features_file_path $IS_FILE_LIST $bin_features_path --overwrite 1