# Wikipedia Articles Preprocessing

**NOTE:** You can reproduce these results on Kaggle without downloading anything: https://www.kaggle.com/jacksoncrow/dataset-preprocessing

`#!/bin/python3`

In [1]:
import numpy as np
import string
import json
import shutil

from pathlib import Path
import os
from os import listdir, mkdir
from os.path import isfile, isdir, join, exists, abspath
from keras.preprocessing import image
from keras.applications.resnet import ResNet152, preprocess_input
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


## Loading & Preprocessing the Dataset

In [2]:
def _globalMaxPool1D(tensor):
    """Reduce a 4-D tensor to one maximum per index of its last axis.

    Args:
        tensor: object exposing a 4-element ``shape`` and numpy-style
            indexing (e.g. a numpy array).

    Returns:
        A list whose i-th item is the maximum of ``tensor[:, :, :, i]``.
    """
    _, _, _, channel_count = tensor.shape
    maxima = []
    for channel in range(channel_count):
        # Max over every position of the three leading axes.
        maxima.append(tensor[..., channel].max())
    return maxima

def _getImageFeatures(model, img_path):
    """Run one image through ``model`` and max-pool the predicted tensor.

    Args:
        model: object with a ``predict`` method that accepts the
            preprocessed image batch and returns a 4-D tensor.
        img_path: path of the image file to load.

    Returns:
        dict with two keys:
            "id": file name of ``img_path`` without its directory and
                without its final extension;
            "features": list with one pooled value per index of the last
                axis of the predicted tensor.
    """
    img = image.load_img(img_path, target_size=None)

    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)  # prepend a leading axis (batch of one)
    img_data = preprocess_input(img_data)

    feature_tensor = model.predict(img_data)

    # Strip only the directory and the final extension, so names that contain
    # dots (e.g. "St._Peter.jpg") keep their full stem. This is the same rule
    # map_data applies to meta file names.
    img_id = os.path.splitext(os.path.basename(img_path))[0]
    return {
        "id": img_id,
        "features": _globalMaxPool1D(feature_tensor),
    }

def _getJSON(path):
    """Load the JSON document stored at ``path``.

    The files written by ``_dump`` in this notebook hold a JSON *string*
    that itself contains the serialized document, so a string payload is
    decoded a second time. A file holding a plain JSON document is returned
    as parsed.

    Args:
        path: location of the JSON file.

    Returns:
        The decoded document.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file (or the embedded string) is not
            valid JSON.
    """
    # Read as UTF-8 explicitly: _dump writes UTF-8, and the platform default
    # encoding is not guaranteed to match.
    with open(path, encoding='utf8') as json_file:
        payload = json.load(json_file)

    if isinstance(payload, str):
        return json.loads(payload)
    return payload

def _getTextFeatures(text_path):
    """Read an article JSON file and return its id with a cleaned text prefix.

    Newlines are replaced by spaces, every character of
    ``string.punctuation`` is removed, and the result is cut to 1000
    characters.

    Args:
        text_path: path of the article's JSON file; the document must have
            ``'text'`` and ``'id'`` keys.

    Returns:
        dict with keys ``'id'`` and ``'text'``.
    """
    article = _getJSON(text_path)
    punctuation_remover = str.maketrans('', '', string.punctuation)

    single_line = article['text'].replace("\n", " ")
    # onyshchak: only the first 1000 characters are kept; a proper summary extraction is still needed
    cleaned = single_line.translate(punctuation_remover)
    return {
        'id': article['id'],
        'text': cleaned[:1000],
    }

def _getImagesMeta(path):
    """Return the ``'img_meta'`` entry of the JSON document stored at ``path``."""
    meta_document = _getJSON(path)
    return meta_document['img_meta']

def _getValidImagePaths(article_path):
    """List the ``.jpg`` files (case-insensitive) in ``<article_path>/img/``.

    Args:
        article_path: directory of a single article.

    Returns:
        List of paths to regular files whose last four characters,
        lower-cased, are ``".jpg"``. Order follows ``listdir``.
    """
    img_dir = join(article_path, 'img/')
    jpg_paths = []
    for entry in listdir(img_dir):
        candidate = join(img_dir, entry)
        if not isfile(candidate):
            continue
        if entry[-4:].lower() != ".jpg":
            continue
        jpg_paths.append(candidate)
    return jpg_paths

def _dump(path, data):
    """Serialize ``data`` as JSON into the file at ``path``.

    The file is overwritten, encoded as UTF-8, indented by two spaces, and
    non-ASCII characters are written as-is rather than escaped.
    """
    with open(path, mode='w', encoding='utf8') as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)

def GetArticleData(article_path):
    """Collect the text features and image metadata of one article.

    Args:
        article_path: directory holding ``text.json`` and ``img/meta.json``.

    Returns:
        The dict returned by ``_getTextFeatures`` with an extra ``"img"``
        key holding the result of ``_getImagesMeta``.
    """
    text_file = join(article_path, 'text.json')
    meta_file = join(article_path, 'img/', 'meta.json')

    article = _getTextFeatures(text_file)
    article["img"] = _getImagesMeta(meta_file)
    return article

def ReadArticles(data_path, offset=0, limit=None):
    """Load the data of the article directories found under ``data_path``.

    Article directories are processed in sorted order so that a given
    ``offset``/``limit`` window is the same on every run (``listdir`` alone
    returns entries in arbitrary order).

    Args:
        data_path: directory whose sub-directories are articles.
        offset: number of articles to skip; negative values are treated as 0.
        limit: maximum number of articles to read. ``None`` or ``0`` means
            "all remaining". A window reaching past the last article is
            truncated instead of raising ``IndexError``.

    Returns:
        List with one ``GetArticleData`` result per article read.
    """
    print("Reading in progress...")
    article_paths = sorted(
        join(data_path, f) for f in listdir(data_path) if isdir(join(data_path, f))
    )

    start = max(offset, 0)
    if not limit:
        selected = article_paths[start:]
    else:
        # max(..., 0): a negative limit selects nothing rather than slicing from the end.
        selected = article_paths[start:start + max(limit, 0)]

    articles = []
    for count, path in enumerate(selected):
        # Progress report; `count` articles are complete at this point.
        if (count + 1) % 251 == 0:
            print(count, "articles have been read")
        articles.append(GetArticleData(path))

    # Report what was actually read, which may be fewer than `limit`.
    print(len(articles), "articles have been read")
    return articles

def GenerateVisualFeatures(data_path, offset=0, limit=None, model=None):
    """Compute image features and store them in each article's ``img/meta.json``.

    For every selected article, each metadata entry that has no
    ``'features'`` key yet and whose file name ends in ``.jpg``
    (case-insensitive) is passed to ``_getImageFeatures``; the resulting
    values are stored as strings under ``'features'``. Images that fail are
    reported on stdout and skipped. A ``meta.json`` is rewritten only when
    at least one entry gained features, so repeated runs do not touch
    articles that are already complete.

    Article directories are processed in sorted order so that a given
    ``offset``/``limit`` window is the same on every run.

    Args:
        data_path: directory whose sub-directories are articles.
        offset: number of articles to skip; negative values are treated as 0.
        limit: maximum number of articles to process. ``None`` or ``0``
            means "all remaining". A window reaching past the last article
            is truncated instead of raising ``IndexError``.
        model: feature extractor exposing ``predict``; when ``None`` a
            ``ResNet152(weights='imagenet', include_top=False)`` is created.

    Returns:
        None.
    """
    article_paths = sorted(
        join(data_path, f) for f in listdir(data_path) if isdir(join(data_path, f))
    )

    start = max(offset, 0)
    if not limit:
        selected = article_paths[start:]
    else:
        # max(..., 0): a negative limit selects nothing rather than slicing from the end.
        selected = article_paths[start:start + max(limit, 0)]

    if model is None:
        model = ResNet152(weights='imagenet', include_top=False)

    for i, path in enumerate(selected, start=start):
        print(i, path)

        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getImagesMeta(meta_path)
        updated = False
        for meta in meta_arr:
            if 'features' in meta:
                continue
            if meta['filename'][-4:].lower() != ".jpg":
                continue

            img_path = join(path, 'img/', meta['filename'])
            try:
                features = _getImageFeatures(model, img_path)['features']
            except Exception as e:
                # Best effort: one unreadable image must not abort the whole run.
                print("exception", str(e))
                print(img_path)
                continue

            meta['features'] = [str(f) for f in features]
            updated = True

        if updated:
            # The document is stored as a JSON string inside JSON; _getJSON
            # decodes it twice when reading it back.
            _dump(meta_path, json.dumps({"img_meta": meta_arr}))

In [None]:
%%time
# Compute features for the images of every article under ./data/ (default
# ResNet152 model) and store them in each article's img/meta.json.
GenerateVisualFeatures('./data/', offset=0, limit=None)

In [None]:
%%time
# Load text and image metadata of every article under ./data/ into memory;
# `articles` is consumed by map_data() further down.
articles = ReadArticles('./data/', offset=0, limit=None)

## Mapping Preprocessed Dataset into W2VV format

In [4]:
dataset_name = 'data_w2vv'
dataset_path = join('./', dataset_name)

# Always start from an empty output directory.
if exists(dataset_path):
    shutil.rmtree(dataset_path)
mkdir(dataset_path)

# One record per split; "train" must come first because the other splits
# link to its FeatureData directory.
subsets = {split: {} for split in ("train", "val", "test")}

for split, subset in subsets.items():
    subset['name'] = dataset_name + split
    subset['path'] = join(dataset_path, subset['name'])
    subset['feature_data_path'] = join(subset['path'], 'FeatureData')
    subset["image_sets_path"] = join(subset['path'], 'ImageSets')
    subset["text_data_path"] = join(subset['path'], 'TextData')

    mkdir(subset['path'])

    feature_dir = subset['feature_data_path']
    if split == 'train':
        mkdir(feature_dir)
    else:
        # val/test share the train features through a relative symlink.
        link_target = os.path.relpath(
            subsets['train']['feature_data_path'], Path(feature_dir).parent
        )
        os.symlink(link_target, feature_dir)

    mkdir(subset["image_sets_path"])
    mkdir(subset["text_data_path"])

In [5]:
def to_file(arr, filepath):
    """Write every item of ``arr`` to ``filepath``, one item per line.

    Args:
        arr: iterable of values; each is formatted with ``"%s"``.
        filepath: destination file; overwritten if it exists.
    """
    # Explicit UTF-8: the article titles and texts are not ASCII-only, and
    # the platform default encoding may be unable to represent them.
    with open(filepath, 'w', encoding='utf8') as f:
        f.writelines("%s\n" % x for x in arr)
            
# map_data = lambda func: [func(a, i) for a in articles for i in a['img'] if 'features' in i]
def map_data():
    """Flatten the global ``articles`` into one record per unique image.

    Only image entries that carry ``'features'`` are used. The image id is
    the file name without its extension.

    Returns:
        List of dicts with keys ``"filename"``, ``"title"``, ``"text"``
        (first 1000 characters of the article text) and ``"features"``, in
        order of first appearance.
    """
    records_by_id = {}
    for article in articles:
        for img in article['img']:
            if 'features' not in img:
                continue

            img_id = os.path.splitext(img['filename'])[0]  # drop the file extension
            if img_id in records_by_id:
                # onyshchak: if an image is used in 2 articles, we only take the first one for simplicity
                # TODO: use all the information without breaking the model
                continue

            records_by_id[img_id] = {
                "filename": img_id,
                "title": img['title'],
                "text": article['text'][:1000],
                "features": img['features'],
            }

    # Dicts keep insertion order, so this is the order of first appearance.
    return list(records_by_id.values())

# Build the flat per-image records, then drop the per-article list, which is
# no longer referenced by the cells below.
data = map_data()
del articles

In [7]:
def list2str(l):
    """Join the string forms of the items of ``l`` with single spaces."""
    return " ".join(str(x) for x in l)


# One line per image: "<image id> <space-separated feature values>".
img_features = [
    '%s %s' % (record['filename'], list2str(record['features']))
    for record in data
]

# The raw feature file lives in the train subset's FeatureData directory.
train_subset = subsets['train']
raw_features_file_path = join(
    train_subset["feature_data_path"],
    train_subset['name'] + ".features.txt",
)
to_file(img_features, raw_features_file_path)

In [8]:
# Both splits use the same hold-out size and seed.
split_options = {"test_size": 1000, "random_state": 1234}

# First carve the test set out of everything ...
subsets['train']['data'], subsets['test']['data'] = train_test_split(
    data, **split_options
)

# ... then carve the validation set out of what is left for training.
subsets['train']['data'], subsets['val']['data'] = train_test_split(
    subsets['train']['data'], **split_options
)

del data

In [9]:
# Write the image ids of every subset to <ImageSets>/<subset name>.txt.
for subset in subsets.values():
    image_ids = []
    for sample in subset['data']:
        image_ids.append(sample['filename'])

    ids_file = join(subset["image_sets_path"], subset['name'] + ".txt")
    to_file(image_ids, ids_file)

In [10]:
# onyshchak: originally the ID also contained the file extension (e.g. *.jpg), but not in image_sets_path
for subset in subsets.values():
    captions = []
    for sample in subset['data']:
        # Caption 0 is the image title, caption 1 the article text prefix.
        captions.append('{}#enc#0 {}'.format(sample['filename'], sample['title']))
        captions.append('{}#enc#1 {}'.format(sample['filename'], sample['text'][:1000]))
    captions.sort()

    captions_file = join(subset["text_data_path"], subset['name'] + ".caption.txt")
    to_file(captions, captions_file)

In [11]:
# Drop the in-memory samples of every subset; only the path entries remain.
for subset in subsets.values():
    del subset['data']

In [12]:
# Arguments for w2vv's txt2bin.py, which converts the text feature file
# written above into a binary feature directory.
# NOTE(review): 0 presumably means the input path is the feature file itself
# rather than a list of files - confirm against txt2bin.py.
IS_FILE_LIST = 0
# NOTE(review): presumably the number of feature values per image in the
# feature file - confirm it matches the model's output width.
FEATURE_DIMENTION = 2048
feature_data_path = subsets['train']["feature_data_path"]
bin_features_path = join(feature_data_path, "pyresnet152-pool5os/")

! python2 w2vv/simpleknn/txt2bin.py $FEATURE_DIMENTION $raw_features_file_path $IS_FILE_LIST $bin_features_path --overwrite 1

>>> Processing ./data_w2vv/data_w2vvtrain/FeatureData/data_w2vvtrain.features.txt
45643 lines parsed, 45643 ids,  0 failed ->  45643 unique ids
