In [1]:
from collections import Counter
from sklearn.model_selection import KFold
import numpy as np
import re
import os
import shutil

from urllib.parse import urlparse
from urllib.parse import urldefrag
from urllib.request import urlopen
from file_storage import FileStorage
from urllib.parse import urljoin
from IPython.display import clear_output
from collections import defaultdict

from inscriptis import get_text

# GPU selection via environment variables. They are assigned before the keras
# imports below, presumably so the backend picks them up when it initialises.
# NOTE(review): device index "3" is specific to one machine — confirm before reuse.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Bidirectional, Dense, Input, Masking, Lambda
import keras.backend as K
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback, LearningRateScheduler, Callback

Using TensorFlow backend.


In [2]:
# Storage that get_words_from_url queries for page content (key: PREFIX + url).
CLEAR_STORAGE = FileStorage("clear_storage")
# Site prefix that get_words_from_url prepends to a URL to build the storage key.
PREFIX = "https://simple.wikipedia.org"
# Frequency threshold. Not referenced in the cells shown here; presumably meant as
# the min_freq argument of make_words_to_labels_from_words_counter — TODO confirm.
MIN_FREQ = 10

In [3]:
def read_train(tarin_filename):
    """Read tab-separated (query, url) pairs from a training file.

    Every non-blank line must contain a query and a URL separated by a tab.
    Only the first tab acts as the separator, so further tabs remain part of
    the URL field instead of breaking the two-way unpacking.

    Args:
        tarin_filename: path to the TSV file (the misspelled parameter name is
            kept so existing keyword callers continue to work).

    Returns:
        A pair of parallel lists ``(queries, urls)``.

    Raises:
        ValueError: if a non-blank line has no tab separator.
    """
    queries, urls = [], []
    with open(tarin_filename) as handler:
        for line in handler:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no pair.
                continue
            # maxsplit=1 yields at most two fields; maxsplit=2 could yield
            # three and make the unpacking below raise.
            query, url = line.split("\t", 1)
            queries.append(query)
            urls.append(url)
    return queries, urls

def get_words_from_url(url):
    """Return the lowercase word tokens of the stored page for ``url``.

    The storage key is ``PREFIX + url``. The stored content is converted to
    text with ``get_text``, lowercased, every character that is neither a
    word character nor whitespace is replaced by a space, and the result is
    split on whitespace.

    Args:
        url: URL suffix that is appended to ``PREFIX``.

    Returns:
        A list of tokens, or an empty list when the key is not in
        ``CLEAR_STORAGE``.
    """
    key = PREFIX + url
    if not CLEAR_STORAGE.contains(key):
        return []
    text = get_text(CLEAR_STORAGE.read(key)).lower()
    # Raw string: "\w" and "\s" are regex classes, not Python string escapes.
    return re.sub(r"[^\w\s]", " ", text).split()

def get_words_counter_from_urls(urls):
    """Count how often each word occurs across the pages of ``urls``.

    Progress is printed every 500 URLs after clearing the cell output.

    Args:
        urls: sequence of URLs passed one by one to ``get_words_from_url``.

    Returns:
        A ``Counter`` mapping word -> total number of occurrences.
    """
    counts = Counter()
    for position, url in enumerate(urls):
        if position % 500 == 0:
            clear_output()
            print("{} / {}".format(position, len(urls)))
        # Counter.update adds one per occurrence of each word in the list.
        counts.update(get_words_from_url(url))
    return counts

def get_words_from_query(query):
    """Split a query into lowercase word tokens.

    The query is lowercased, every character that is neither a word
    character nor whitespace is replaced by a space, and the result is split
    on whitespace.

    Args:
        query: query text.

    Returns:
        A list of tokens (empty when the query contains no word characters).
    """
    # Raw string: "\w" and "\s" are regex classes, not Python string escapes.
    return re.sub(r"[^\w\s]", " ", query.lower()).split()
    
def get_words_counter_from_queries(queries):
    """Count how often each word occurs across all ``queries``.

    Progress is printed every 500 queries after clearing the cell output.

    Args:
        queries: sequence of query strings, tokenised with
            ``get_words_from_query``.

    Returns:
        A ``Counter`` mapping word -> total number of occurrences.
    """
    words = Counter()
    for i, query in enumerate(queries):
        if i % 500 == 0:
            clear_output()
            # Report against the argument's own length; the previous code read
            # the unrelated global ``urls`` here.
            print("{} / {}".format(i, len(queries)))
        for word in get_words_from_query(query):
            words[word] += 1
    return words

def make_words_to_labels_from_words_counter(words_counter, min_freq):
    """Assign consecutive integer labels to sufficiently frequent words.

    A word is kept only when its count is strictly greater than ``min_freq``.
    Labels start at 0 and follow the iteration order of ``words_counter``.

    Args:
        words_counter: mapping word -> count.
        min_freq: exclusive lower bound on the count.

    Returns:
        A dict mapping each kept word to its integer label.
    """
    frequent_words = [
        word
        for word, count in words_counter.items()
        if count > min_freq
    ]
    return dict(zip(frequent_words, range(len(frequent_words))))

def load_words_to_labels(filename):
    """Load a word -> integer label mapping from a text file.

    Each line must consist of exactly two whitespace-separated fields: the
    word and its integer label.

    Args:
        filename: path of the file to read.

    Returns:
        A dict mapping word -> int label.

    Raises:
        ValueError: if a line does not have exactly two fields or the label
            is not an integer.
    """
    mapping = {}
    with open(filename) as source:
        for raw_line in source:
            # split() without arguments already ignores surrounding whitespace.
            fields = raw_line.split()
            word, label = fields
            mapping[word] = int(label)
    return mapping

def save_words(words_to_labels, filename):
    """Write a word -> label mapping to ``filename``, one pair per line.

    Each line has the form ``word<TAB>label``; the file is overwritten.

    Args:
        words_to_labels: mapping word -> label.
        filename: path of the file to write.
    """
    with open(filename, "w") as sink:
        for word, label in words_to_labels.items():
            sink.write("{}\t{}\n".format(word, label))

def word_to_label(word, words_to_labels):
    """Return the label of ``word`` as a string, or "" when it is unknown.

    Args:
        word: the word to look up.
        words_to_labels: mapping word -> label.

    Returns:
        ``str(label)`` for a known word, otherwise the empty string.
    """
    return str(words_to_labels[word]) if word in words_to_labels else ""

def make_procesed_train_file(
        queries,
        urls,
        filename,
        url_words_to_labels,
        query_words_to_labels,
        query_shuffle,
        url_shuffle):
    """Write label-encoded (query, url) pairs to ``filename``.

    For every position ``i`` in ``range(len(urls))`` the query
    ``queries[query_shuffle[i]]`` and the URL ``urls[url_shuffle[i]]`` are
    tokenised and each token is replaced by its label. A tab-separated line
    ``query labels<TAB>url labels`` is written only when both encoded strings
    are non-empty after stripping. Progress is printed every 500 positions.

    Args:
        queries: sequence of query strings.
        urls: sequence of URLs.
        filename: output path (overwritten).
        url_words_to_labels: vocabulary for page words.
        query_words_to_labels: vocabulary for query words.
        query_shuffle: index sequence selecting the query for each position.
        url_shuffle: index sequence selecting the URL for each position.
    """
    def encode(tokens, vocabulary):
        # Unknown words map to "", so inner runs of spaces can remain; only
        # the outer whitespace is stripped.
        return " ".join(word_to_label(token, vocabulary) for token in tokens).strip()

    with open(filename, "w") as out:
        for row in range(len(urls)):
            if row % 500 == 0:
                clear_output()
                print("{} / {}".format(row, len(urls)))
            picked_query = queries[query_shuffle[row]]
            picked_url = urls[url_shuffle[row]]
            query_tokens = get_words_from_query(picked_query)
            url_tokens = get_words_from_url(picked_url)
            query_str = encode(query_tokens, query_words_to_labels)
            url_str = encode(url_tokens, url_words_to_labels)
            if not query_str or not url_str:
                # Skip pairs in which either side has no known word.
                continue
            out.write("{}\t{}\n".format(query_str, url_str))

def make_procesed_train_files(
        queries,
        urls,
        positive_filename,
        negative_filename,
        url_words_to_labels,
        query_words_to_labels,
        positive_shuffle,
        negative_shuffle):
    """Write aligned positive and negative label-encoded training files.

    For every position ``i`` in ``range(len(urls))`` the query and the
    positive URL are both taken at index ``positive_shuffle[i]``, while the
    negative URL is taken at index ``negative_shuffle[i]``. One line
    ``query labels<TAB>url labels`` is written to each file per position, so
    line ``i`` of both files shares the same query. Nothing is filtered out.
    Progress is printed every 500 positions.

    Args:
        queries: sequence of query strings.
        urls: sequence of URLs.
        positive_filename: output path for (query, matching URL) lines.
        negative_filename: output path for (query, other URL) lines.
        url_words_to_labels: vocabulary for page words.
        query_words_to_labels: vocabulary for query words.
        positive_shuffle: index sequence selecting query and positive URL.
        negative_shuffle: index sequence selecting the negative URL.
    """
    def encode(tokens, vocabulary):
        # Unknown words map to "", which leaves extra spaces in the output.
        return " ".join(word_to_label(token, vocabulary) for token in tokens)

    with open(positive_filename, "w") as positive_out, \
            open(negative_filename, "w") as negative_out:
        for row in range(len(urls)):
            if row % 500 == 0:
                clear_output()
                print("{} / {}".format(row, len(urls)))
            positive_index = positive_shuffle[row]
            picked_query = queries[positive_index]
            positive_url = urls[positive_index]
            negative_url = urls[negative_shuffle[row]]

            query_tokens = get_words_from_query(picked_query)
            positive_tokens = get_words_from_url(positive_url)
            negative_tokens = get_words_from_url(negative_url)

            query_str = encode(query_tokens, query_words_to_labels)
            positive_str = encode(positive_tokens, url_words_to_labels)
            negative_str = encode(negative_tokens, url_words_to_labels)

            positive_out.write("{}\t{}\n".format(query_str, positive_str))
            negative_out.write("{}\t{}\n".format(query_str, negative_str))

In [4]:
# Parallel lists: queries[i] is the query paired with urls[i] in the train file.
queries, urls = read_train("req_ans_learn.tsv")

In [None]:
# Load the word -> integer label vocabularies from disk.
# NOTE(review): the code that originally built these mappings is not visible in
# the cells shown here, so both TSV files must already exist.
url_words_to_labels = load_words_to_labels("url_freq_words_to_labels.tsv")
query_words_to_labels = load_words_to_labels("query_freq_words_to_labels.tsv")

In [None]:
# Seeded generator so that a fresh kernel reproduces the same orderings; the
# previous version drew both permutations from the unseeded global RNG.
SHUFFLE_SEED = 0
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
# positive_shuffle picks the (query, url) pair for each output row;
# negative_shuffle picks the URL used as that row's negative example.
positive_shuffle = shuffle_rng.permutation(len(queries))
negative_shuffle = shuffle_rng.permutation(len(queries))

In [None]:
# Persist the vocabularies and both permutations so they can be reloaded later
# instead of being regenerated. The arrays are read back further down from
# "positive_shuffle.npy" / "negative_shuffle.npy".
save_words(url_words_to_labels, "url_freq_words_to_labels.tsv")
save_words(query_words_to_labels, "query_freq_words_to_labels.tsv")
np.save("positive_shuffle", positive_shuffle)
np.save("negative_shuffle", negative_shuffle)

In [None]:
# Quick size check: vocabulary sizes, then the number of queries and URLs.
len(query_words_to_labels), len(url_words_to_labels), len(queries), len(urls)

In [None]:
# Write the label-encoded positive and negative training files; row i of both
# files shares the same query.
make_procesed_train_files(
    queries,
    urls,
    "freq_positive_train_data.tsv",
    "freq_negative_train_data.tsv",
    url_words_to_labels,
    query_words_to_labels,
    positive_shuffle,
    negative_shuffle
)

In [5]:
import cityhash

In [16]:
def hash_words(words, dict_size=100000):
    """Hash words, word bigrams and character trigrams into bucket ids.

    Every feature is hashed with ``cityhash.CityHash32`` and reduced modulo
    ``dict_size``; the bucket id is stored as a decimal string. All three
    feature kinds share the same ``dict_size`` bucket range.

    Args:
        words: sequence of word strings (must support slicing).
        dict_size: number of hash buckets.

    Returns:
        A 3-tuple of sets of strings: bucket ids of the single words, of
        adjacent word pairs joined with "$", and of every 3-character window
        inside each word (words shorter than 3 characters contribute none).
    """
    def bucket(token):
        return str(cityhash.CityHash32(token) % dict_size)

    unigram_ids = set(map(bucket, words))
    bigram_ids = {
        bucket(left + "$" + right)
        for left, right in zip(words[:-1], words[1:])
    }
    trigram_ids = {
        bucket(word[start:start + 3])
        for word in words
        for start in range(len(word) - 2)
    }
    return unigram_ids, bigram_ids, trigram_ids

def make_string_from_words(words):
    """Serialise the hashed features of ``words`` into one tab-separated string.

    The three fields are, in order: word ids, word-bigram ids and character
    trigram ids as returned by ``hash_words``. Ids inside a field are joined
    by single spaces in set iteration order (not sorted).

    Args:
        words: sequence of word strings.

    Returns:
        A string with exactly three tab-separated fields; a field is empty
        when the corresponding id set is empty.
    """
    unigram_ids, bigram_ids, trigram_ids = hash_words(words)
    fields = (" ".join(ids) for ids in (unigram_ids, bigram_ids, trigram_ids))
    return "\t".join(fields)

In [17]:
# Reload the saved permutations from disk, replacing any in-memory arrays of
# the same name.
positive_shuffle = np.load("positive_shuffle.npy")
negative_shuffle = np.load("negative_shuffle.npy")

In [18]:
# Sanity check: lengths of the two permutations (indexed in parallel below).
len(positive_shuffle), len(negative_shuffle)

(500000, 500000)

In [19]:
# Write the hashed-feature training files. Row i of both files starts with the
# same query features; the positive file continues with the features of the
# matching URL and the negative file with those of the URL chosen by
# negative_shuffle. Cell-level variable names are kept because they remain in
# the kernel namespace after the cell has run.
with open("positive_hash_data_dict_100K.tsv", "w") as positive_handler, \
        open("negative_hash_data_dict_100K.tsv", "w") as negative_handler:
    for i in range(len(urls)):
        if not i % 500:
            clear_output()
            print("{} / {}".format(i, len(urls)))

        # Pick the raw inputs for this row.
        query = queries[positive_shuffle[i]]
        positive_url = urls[positive_shuffle[i]]
        negative_url = urls[negative_shuffle[i]]

        # Tokenise.
        query_words = get_words_from_query(query)
        positive_url_words = get_words_from_url(positive_url)
        negative_url_words = get_words_from_url(negative_url)

        # Hash each token list into its three tab-separated feature fields.
        query_line = make_string_from_words(query_words)
        positive_url_line = make_string_from_words(positive_url_words)
        negative_url_line = make_string_from_words(negative_url_words)

        positive_handler.write("\t".join((query_line, positive_url_line)) + "\n")
        negative_handler.write("\t".join((query_line, negative_url_line)) + "\n")

39500 / 500000


KeyboardInterrupt: 