# Wikipedia Articles Preprocessing

`#!/bin/python3`

In [1]:
import pandas as pd
import numpy as np
import string
import json
import shutil

from os import listdir, mkdir
from os.path import isfile, isdir, join, exists, abspath
from keras.preprocessing import image
from keras.applications.resnet import ResNet152, preprocess_input

Using TensorFlow backend.


## Loading & Preprocessing Dataset

In [2]:
def _globalMaxPool1D(tensor):
    """Reduce a rank-4 array to one maximum per index of its last axis.

    Args:
        tensor: Array whose ``shape`` has exactly four entries.

    Returns:
        A list holding, for every index along the last axis, the maximum
        taken over the three leading axes.
    """
    # Unpacking into four names rejects any input that is not rank 4.
    _dim0, _dim1, _dim2, _depth = tensor.shape
    # Bring the last axis to the front so each iteration yields one slice.
    planes = np.moveaxis(tensor, -1, 0)
    return [plane.max() for plane in planes]

def _getImageFeatures(model, img_path):
    """Compute a pooled feature vector for a single image file.

    Args:
        model: Object exposing ``predict``; it receives the preprocessed
            image as a batch of one.
        img_path: Path to the image, with ``/`` as the separator.

    Returns:
        dict with
        ``"id"``: the file name (last ``/``-separated component) cut at its
        first ``.``, and
        ``"features"``: the result of ``_globalMaxPool1D`` on the model
        output.
    """
    # target_size=None: no resize is requested when loading.
    pixels = image.img_to_array(image.load_img(img_path, target_size=None))
    # Add a leading batch axis before applying the model's preprocessing.
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    activations = model.predict(batch)

    file_name = img_path.split('/')[-1]
    return {
        "id": file_name.split('.')[0],
        "features": _globalMaxPool1D(activations),
    }

def _getTextFeatures(text_path):
    """Read an article's JSON file and return its id and cleaned text.

    The file is expected to contain a JSON string that itself holds a JSON
    document (double-encoded). A file that directly contains the JSON
    object is accepted as well.

    Args:
        text_path: Path to the article's JSON file. The decoded document
            must provide the keys ``'id'`` and ``'text'``.

    Returns:
        dict with
        ``'id'``: the article id, unchanged, and
        ``'text'``: the article text with newlines replaced by spaces and
        every character of ``string.punctuation`` removed.
    """
    # Read as UTF-8 explicitly so non-ASCII article text does not depend
    # on the platform's locale encoding.
    with open(text_path, encoding='utf-8') as json_file:
        data = json.load(json_file)

    # First decoding pass yields a string for double-encoded files;
    # decode that string to get the actual document.
    if isinstance(data, str):
        data = json.loads(data)

    text = data['text'].replace("\n", " ")
    return {
        'id': data['id'],
        'text': text.translate(str.maketrans('', '', string.punctuation)),
    }
    
def _getValidImagePaths(article_path):
    """List the ``.jpg`` files inside an article's ``img/`` directory.

    Args:
        article_path: Directory of one article; it must contain ``img/``.

    Returns:
        Paths (``article_path``/img/<name>) of every regular file whose
        name ends in ``.jpg``, compared case-insensitively. Other
        extensions, including ``.jpeg``, and sub-directories are skipped.
    """
    img_dir = join(article_path, 'img/')
    jpg_paths = []
    for entry in listdir(img_dir):
        candidate = join(img_dir, entry)
        if isfile(candidate) and entry.lower().endswith(".jpg"):
            jpg_paths.append(candidate)
    return jpg_paths

def GetArticleData(model, article_path):
    """Assemble the text and image features of one article directory.

    Args:
        model: Passed through to ``_getImageFeatures`` for every image.
        article_path: Directory holding ``text.json`` and an ``img/``
            sub-directory.

    Returns:
        The dict produced by ``_getTextFeatures`` with an extra ``"img"``
        key: a list with one ``_getImageFeatures`` result per path returned
        by ``_getValidImagePaths``.
    """
    record = _getTextFeatures(join(article_path, 'text.json'))
    record["img"] = [
        _getImageFeatures(model, picture_path)
        for picture_path in _getValidImagePaths(article_path)
    ]
    return record

def PreprocessArticles(data_path, limit=None):
    """Extract text and image features for the articles under *data_path*.

    Every sub-directory of *data_path* is treated as one article and is
    handed to ``GetArticleData``.

    Args:
        data_path: Directory that contains one sub-directory per article.
        limit: Maximum number of articles to process. ``None`` or ``0``
            means "no limit".

    Returns:
        List of article dicts in the order the articles were processed
        (sorted by path). Empty when *data_path* has no sub-directories.
    """
    # os.listdir returns entries in arbitrary order; sorting makes both the
    # subset selected by `limit` and the output order reproducible.
    article_paths = sorted(
        join(data_path, entry)
        for entry in listdir(data_path)
        if isdir(join(data_path, entry))
    )
    if not article_paths:
        # Nothing to process: skip building the network altogether.
        return []

    model = ResNet152(weights='imagenet', include_top=False)

    articles = []
    for path in article_paths:
        articles.append(GetArticleData(model, path))
        # A falsy limit never stops the loop (same as the original default).
        if limit and len(articles) >= limit:
            break

    return articles

In [3]:
%%time
# Trial run: build features for at most three article directories under ./data/.
articles = PreprocessArticles('./data/', limit=3)

CPU times: user 4min 29s, sys: 2min 38s, total: 7min 7s
Wall time: 1min 14s


## Mapping Preprocessed Dataset into W2VV format

In [4]:
subset_name = 'test_subset'
subset_path = abspath(join('./', subset_name))

# Sub-directories of the subset, one per kind of output file.
feature_data_path = join(subset_path, 'FeatureData')
image_sets_path = join(subset_path, 'ImageSets')
text_data_path = join(subset_path, 'TextData')

# Always start from an empty tree: drop whatever a previous run left behind.
if exists(subset_path):
    shutil.rmtree(subset_path)

# The root must be created first, then its children.
for directory in (subset_path, feature_data_path, image_sets_path, text_data_path):
    mkdir(directory)

In [5]:
def to_file(arr, filepath):
    """Write every item of *arr* to *filepath*, one item per line.

    Args:
        arr: Iterable of items; each one is rendered with ``str.format``.
        filepath: Destination path. An existing file is overwritten.
    """
    # UTF-8 is set explicitly so non-ASCII text is written the same way on
    # every platform. str.format is used instead of "%s" % item so that a
    # tuple item is written as a single line rather than being unpacked as
    # formatting arguments.
    with open(filepath, 'w', encoding='utf-8') as out:
        out.writelines('{}\n'.format(item) for item in arr)

In [6]:
# One image id per line, across all articles, in article order.
ids = [img['id'] for article in articles for img in article['img']]
to_file(ids, join(image_sets_path, '{}.txt'.format(subset_name)))

In [7]:
# One caption line per image: "<image id>#enc#0 <text of the owning article>".
text_data = [
    '{}#enc#0 {}'.format(img['id'], article['text'])
    for article in articles
    for img in article['img']
]
to_file(text_data, join(text_data_path, '{}.caption.txt'.format(subset_name)))

In [8]:
def list2str(values):
    """Return the items of *values*, each converted with str(), joined by single spaces."""
    return " ".join(map(str, values))


# One line per image: "<image id> <space-separated feature values>".
img_features = [
    '{} {}'.format(img['id'], list2str(img['features']))
    for article in articles
    for img in article['img']
]

raw_features_file_path = join(feature_data_path, '{}.features.txt'.format(subset_name))
to_file(img_features, raw_features_file_path)

In [9]:
# Third positional argument of txt2bin.py. Presumably 0 means the input path is
# the feature file itself rather than a list of files -- confirm in txt2bin.py.
IS_FILE_LIST = 0
# First positional argument of txt2bin.py: number of values per feature line.
# Assumes the lists built by _globalMaxPool1D have 2048 entries -- TODO confirm.
FEATURE_DIMENTION = 2048
# Fourth positional argument of txt2bin.py; presumably where the binary
# features are written -- confirm in txt2bin.py.
bin_features_path = join(feature_data_path, "pyresnet152-pool5os/")

# IPython shell escape: each $name below is replaced by the value of the
# Python variable of that name before the command runs under python2.
! python2 w2vv/simpleknn/txt2bin.py $FEATURE_DIMENTION $raw_features_file_path $IS_FILE_LIST $bin_features_path --overwrite 1

>>> Processing /home/oleh/projects/WikiImageRecommendation/test_subset/FeatureData/test_subset.features.txt
17 lines parsed, 17 ids,  0 failed ->  17 unique ids
