In [1]:
from collections import Counter
from sklearn.model_selection import KFold
import numpy as np
import re
import os
import shutil

from urllib.parse import urlparse
from urllib.parse import urldefrag
from urllib.request import urlopen
from file_storage import FileStorage
from urllib.parse import urljoin
from IPython.display import clear_output
from collections import defaultdict

from inscriptis import get_text

# Set the CUDA device-selection environment variables in one step.
# NOTE(review): presumably these must be in place before the Keras/TensorFlow
# import that follows so that only device "3" is visible — confirm.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "3",
})

from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Bidirectional, Dense, Input, Masking, Lambda
import keras.backend as K
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback, LearningRateScheduler, Callback

Using TensorFlow backend.


In [2]:
# Storage handle opened under the name "clear_storage"; get_words_from_url
# checks it with .contains(key) and fetches page content with .read(key).
# NOTE(review): presumably holds the cleaned HTML of crawled pages — confirm.
CLEAR_STORAGE = FileStorage("clear_storage")
# Prefix prepended to a training url to form the storage key.
PREFIX = "https://simple.wikipedia.org"

In [3]:
def read_train(tarin_filename):
    """Read tab-separated (query, url) training pairs from a file.

    Every non-blank line is stripped of surrounding whitespace and split on
    its first tab: the text before the tab is the query, everything after it
    is the url.

    Args:
        tarin_filename: path of the TSV file to read. (The misspelled
            parameter name is kept so keyword callers keep working.)

    Returns:
        A ``(queries, urls)`` tuple of two equally long lists, in file order.

    Raises:
        ValueError: if a non-blank line contains no tab.
    """
    queries, urls = [], []
    with open(tarin_filename) as handler:
        for line in handler:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no pair.
                continue
            # maxsplit=1 yields at most two fields, which is what the
            # two-name unpacking needs; maxsplit=2 could yield three.
            query, url = line.split("\t", 1)
            queries.append(query)
            urls.append(url)
    return queries, urls

def get_words_from_url(url):
    """Return the lowercase word tokens of the stored page for ``url``.

    The storage key is ``PREFIX + url``. When the key is present in
    ``CLEAR_STORAGE`` the stored content is passed through ``get_text``,
    lowercased, every character that is neither a word character nor
    whitespace is replaced by a space, and the result is split on whitespace.

    Args:
        url: url suffix that is appended to ``PREFIX`` to form the key.

    Returns:
        A list of tokens in page order, or an empty list when the key is not
        in the storage.
    """
    key = PREFIX + url
    if not CLEAR_STORAGE.contains(key):
        return []
    text = get_text(CLEAR_STORAGE.read(key)).lower()
    # Raw string: "\w" and "\s" inside a plain literal are invalid escape
    # sequences and trigger a Deprecation/SyntaxWarning on modern Python.
    return re.sub(r"[^\w\s]", " ", text).split()

def get_words_set_from_urls(urls):
    """Collect the distinct words returned by ``get_words_from_url`` for all urls.

    Progress is reported on every 500th url: the cell output is cleared and
    an ``index / total`` line is printed.

    Args:
        urls: sequence of urls to process.

    Returns:
        A set with every word seen for any of the urls.
    """
    vocabulary = set()
    for position, address in enumerate(urls):
        if position % 500 == 0:
            clear_output()
            print("{} / {}".format(position, len(urls)))
        vocabulary |= set(get_words_from_url(address))
    return vocabulary

def get_words_from_query(query):
    """Split a query into lowercase word tokens.

    The query is lowercased, every character that is neither a word character
    nor whitespace is replaced by a space, and the result is split on
    whitespace.

    Args:
        query: the query text.

    Returns:
        A list of tokens in query order (empty for an empty query).
    """
    # Raw string: "\w" and "\s" inside a plain literal are invalid escape
    # sequences and trigger a Deprecation/SyntaxWarning on modern Python.
    return re.sub(r"[^\w\s]", " ", query.lower()).split()
    
def get_words_set_from_queries(queries):
    """Collect the distinct words returned by ``get_words_from_query`` for all queries.

    Progress is reported on every 500th query: the cell output is cleared and
    an ``index / total`` line is printed.

    Args:
        queries: sequence of query strings.

    Returns:
        A set with every word seen in any of the queries.
    """
    words = set()
    for i, query in enumerate(queries):
        if i % 500 == 0:
            clear_output()
            # The total is the number of queries being processed; it used to
            # read the unrelated global `urls`.
            print("{} / {}".format(i, len(queries)))
        words.update(get_words_from_query(query))
    return words

def make_words_to_labels_from_words_set(words_set):
    """Number the words of an iterable in iteration order.

    Args:
        words_set: iterable of words; labels follow its iteration order.

    Returns:
        A dict mapping each word to its 0-based position. If a word occurs
        more than once, its last position wins.
    """
    labels = {}
    for position, word in enumerate(words_set):
        labels[word] = position
    return labels

def load_words_to_labels(filename):
    """Load a word -> integer label mapping from a text file.

    Each line is stripped and split on whitespace into exactly two fields,
    the word and its label; the label is converted with ``int``.

    Args:
        filename: path of the file to read.

    Returns:
        A dict from word to integer label (a later line overrides an earlier
        one for the same word).

    Raises:
        ValueError: if a line does not split into exactly two fields or the
            label is not an integer.
    """
    with open(filename) as handler:
        rows = (raw.strip().split() for raw in handler)
        return {word: int(label) for word, label in rows}

def save_words(words_to_labels, filename):
    """Write a word -> label mapping to a file, one ``word<TAB>label`` line per entry.

    Args:
        words_to_labels: dict mapping words to labels; entries are written in
            the dict's iteration order.
        filename: path of the file to (over)write.
    """
    with open(filename, "w") as handler:
        for word, label in words_to_labels.items():
            print("{}\t{}".format(word, label), file=handler)
            
def make_procesed_train_file(
        queries,
        urls,
        filename,
        url_words_to_labels,
        query_words_to_labels,
        query_shuffle,
        url_shuffle):
    """Write label-encoded (query, url) rows to ``filename``.

    Row ``i`` pairs ``queries[query_shuffle[i]]`` with ``urls[url_shuffle[i]]``;
    ``len(urls)`` rows are written. Each side is tokenised
    (``get_words_from_query`` / ``get_words_from_url``), every token is
    replaced by its label, and the two space-joined label strings are written
    separated by a tab. Progress is printed on every 500th row.

    Args:
        queries: list of query strings.
        urls: list of urls.
        filename: path of the output file (overwritten).
        url_words_to_labels: dict mapping page words to labels.
        query_words_to_labels: dict mapping query words to labels.
        query_shuffle: index sequence selecting the query for each row.
        url_shuffle: index sequence selecting the url for each row.

    Raises:
        KeyError: if a token is missing from its vocabulary dict.
    """
    def encode(tokens, vocabulary):
        # Space-joined labels of the tokens, in token order.
        return " ".join(str(vocabulary[token]) for token in tokens)

    with open(filename, "w") as handler:
        for row in range(len(urls)):
            if row % 500 == 0:
                clear_output()
                print("{} / {}".format(row, len(urls)))
            query = queries[query_shuffle[row]]
            url = urls[url_shuffle[row]]
            query_tokens = get_words_from_query(query)
            url_tokens = get_words_from_url(url)
            query_str = encode(query_tokens, query_words_to_labels)
            url_str = encode(url_tokens, url_words_to_labels)
            print("{}\t{}".format(query_str, url_str), file=handler)

def make_procesed_train_files(
        queries,
        urls,
        positive_filename,
        negative_filename,
        url_words_to_labels,
        query_words_to_labels,
        positive_shuffle,
        negative_shuffle):
    """Write label-encoded positive and negative training rows to two files.

    Row ``i`` uses the query ``queries[positive_shuffle[i]]``. The positive
    file pairs it with ``urls[positive_shuffle[i]]`` and the negative file
    pairs the same query with ``urls[negative_shuffle[i]]``. ``len(urls)``
    rows are written to each file. Each side is tokenised, every token is
    replaced by its label, and a row is the space-joined query labels, a tab,
    and the space-joined url labels. Progress is printed on every 500th row.

    Args:
        queries: list of query strings.
        urls: list of urls, parallel to ``queries``.
        positive_filename: output path for the positive rows (overwritten).
        negative_filename: output path for the negative rows (overwritten).
        url_words_to_labels: dict mapping page words to labels.
        query_words_to_labels: dict mapping query words to labels.
        positive_shuffle: index sequence selecting the pair for each row.
        negative_shuffle: index sequence selecting the negative url for each row.

    Raises:
        KeyError: if a token is missing from its vocabulary dict.
    """
    def encode(tokens, vocabulary):
        # Space-joined labels of the tokens, in token order.
        return " ".join(str(vocabulary[token]) for token in tokens)

    with open(positive_filename, "w") as positive_handler, open(negative_filename, "w") as negative_handler:
        for row in range(len(urls)):
            if row % 500 == 0:
                clear_output()
                print("{} / {}".format(row, len(urls)))
            query = queries[positive_shuffle[row]]
            positive_url = urls[positive_shuffle[row]]
            negative_url = urls[negative_shuffle[row]]
            query_tokens = get_words_from_query(query)
            positive_tokens = get_words_from_url(positive_url)
            negative_tokens = get_words_from_url(negative_url)
            query_str = encode(query_tokens, query_words_to_labels)
            positive_url_str = encode(positive_tokens, url_words_to_labels)
            negative_url_str = encode(negative_tokens, url_words_to_labels)
            print("{}\t{}".format(query_str, positive_url_str), file=positive_handler)
            print("{}\t{}".format(query_str, negative_url_str), file=negative_handler)

In [4]:
# Load the training pairs: queries[i] and urls[i] come from the same line of the TSV file.
queries, urls = read_train("req_ans_learn.tsv")

In [5]:
# The url vocabulary is loaded from a previously saved file; the commented-out
# line below is the alternative that rebuilds it by reading every stored page.
# Note: load_words_to_labels returns a dict (word -> label), not a set.
#url_words = get_words_set_from_urls(urls)
url_words = load_words_to_labels("url_words_to_labels.tsv")

In [6]:
# Build the set of distinct words over all training queries (progress is printed every 500 queries).
query_words = get_words_set_from_queries(queries)

499500 / 500000


In [10]:
# Assign an integer label to every word of each vocabulary.
# url_words is the dict loaded from file, so enumeration runs over its keys and
# labels are reassigned in iteration order.
# TODO confirm these match the labels stored in url_words_to_labels.tsv.
url_words_to_labels = make_words_to_labels_from_words_set(url_words)
query_words_to_labels = make_words_to_labels_from_words_set(query_words)

In [8]:
# Persist only the query vocabulary. Saving the url vocabulary is disabled
# (commented out): its target is the same file url_words was loaded from.
# NOTE(review): the cell defining query_words_to_labels has a later recorded
# execution count (10) than this cell (8); re-check with Restart & Run All.
#save_words(url_words_to_labels, "url_words_to_labels.tsv")
save_words(query_words_to_labels, "query_words_to_labels.tsv")

In [9]:
# Fixed seed so the shuffles, and therefore the generated train files, are
# reproducible across kernel restarts.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)

# positive_shuffle[i] selects the (query, url) pair written on row i;
# negative_shuffle[i] selects the url used as that row's negative example.
positive_shuffle = shuffle_rng.permutation(len(queries))
negative_shuffle = shuffle_rng.permutation(len(queries))
# Redraw until no row uses the same index for both, otherwise that row's
# "negative" url would be its own positive url. With fewer than two items a
# distinct index is impossible, so the loop is skipped.
# Note: this does not detect duplicate urls stored at different indices.
while len(queries) > 1 and np.any(negative_shuffle == positive_shuffle):
    negative_shuffle = shuffle_rng.permutation(len(queries))

In [11]:
# Sanity check: vocabulary sizes, then the number of queries and urls
# (read_train appends to both lists per line, so the last two are equal).
len(query_words_to_labels), len(url_words_to_labels), len(queries), len(urls)

(247074, 583954, 500000, 500000)

In [None]:
# Write the label-encoded positive and negative training files; this reads a
# stored page for every row and prints progress every 500 rows.
make_procesed_train_files(
    queries=queries,
    urls=urls,
    positive_filename="positive_train_data.tsv",
    negative_filename="negative_train_data.tsv",
    url_words_to_labels=url_words_to_labels,
    query_words_to_labels=query_words_to_labels,
    positive_shuffle=positive_shuffle,
    negative_shuffle=negative_shuffle,
)

206000 / 500000
