In [1]:
import sys
import spacy
import csv
from pathlib import Path
import re
import numpy as np
import time
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [2]:
# Training configuration: loss = WARP, optimizer = SGD, aggregation = average, constraint = unit norm

In [3]:
# Positions of the two text fields inside every [title, description] pair.
title = 0
descr = 1


def load_data(file: Path, delimiter=','):
    """Lazily yield ``[title, description]`` pairs from a CSV file.

    Args:
        file: path of the CSV file to read.
        delimiter: field separator handed to ``csv.reader``.

    Yields:
        A two-element list taken from columns ``title + 1`` and ``descr + 1``
        of each row (column 0 is not used).

    Raises:
        IndexError: if a row has fewer than three columns.

    NOTE(review): every row is yielded, so a header row (if the file has one)
    comes out as data too - confirm whether that is intended.
    """
    # newline='' lets the csv module handle line endings inside quoted fields itself.
    with file.open('r', newline='') as fr:
        for row in csv.reader(fr, delimiter=delimiter):
            yield [row[title + 1], row[descr + 1]]

In [4]:
# load_data is a generator, so the file is not read until `data` is iterated.
data = load_data(Path('../2019_03_05_19_10_02_332991.csv'))

In [5]:
# NLTK's English stop words plus a handful of extra tokens to discard.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

def tokenize(data, stops=None):
    """Split every text field of every record into lower-cased word tokens.

    Each character that is neither a word character nor whitespace is replaced
    by a space, the text is lower-cased and split on whitespace, and stop
    words are dropped.

    Args:
        data: iterable of records, each an iterable of strings.
        stops: optional collection of words to drop; defaults to the
            module-level ``stop_words``.

    Returns:
        A list with one entry per record; each entry is a list holding one
        token list per text field.
    """
    # A set gives O(1) membership tests; a list would be rescanned for every token.
    stop_set = frozenset(stop_words if stops is None else stops)
    punctuation = re.compile(r'[^\w\s]')
    bag_of_words = []
    for text_data in data:
        bag_of_words.append([
            [word for word in punctuation.sub(' ', el).lower().split() if word not in stop_set]
            for el in text_data
        ])
    return bag_of_words

In [6]:
# Tokenize every pair and keep only those whose title AND description still
# contain at least one token. A new list is built on purpose: calling
# remove() on a list while iterating over it skips the element that follows
# each removed one, so some empty pairs would survive.
data = [pair for pair in tokenize(data) if pair[title] and pair[descr]]

In [7]:
# Spot-check one tokenized [title, description] pair.
print(data[56])

[['australia', 'pursue', 'tpp', 'minus', 'one'], ['nations', 'hope', 'move', 'forward', 'transpacific', 'partnership', 'trade', 'deal', 'america', 'withdraws']]


In [8]:
# Flatten all pairs into one token list; within a pair the title tokens come first.
words = [token for pair in data for field in (pair[title], pair[descr]) for token in field]

In [9]:
# The vectorizer is fitted on the flat token list, so each token is its own document.
# NOTE(review): max_features=len(words) cannot be smaller than the number of
# distinct tokens, so it presumably prunes nothing - confirm.
Tfidf_vect = TfidfVectorizer(max_features=len(words)).fit(words)
# vocabulary_ maps every retained term to its integer feature index.
vocab = Tfidf_vect.vocabulary_

In [10]:
# Look up the integer id the fitted vocabulary assigned to a sample token.
print(vocab['hope'])

21797


In [11]:
def text_to_id(text, vocab):
    """Convert tokenized [title, description] pairs into vocabulary ids.

    Args:
        text: iterable of pairs, each indexable by ``title`` and ``descr`` and
            holding token lists.
        vocab: mapping from token to integer id.

    Returns:
        A list of ``[title_ids, descr_ids]`` pairs. Tokens missing from
        ``vocab`` are dropped.
    """
    def ids_(tokens):
        # Test membership explicitly: id 0 is valid, and a truthiness check on
        # the looked-up id would silently drop that token.
        return [vocab[token] for token in tokens if token in vocab]

    return [[ids_(t[title]), ids_(t[descr])] for t in text]

In [12]:
# Map tokens to vocabulary ids, then carve out the working subsets:
#   train_data - 5% of all pairs
#   test_data  - 10% of the remaining 95%
#   test_data2 - the rest of that remainder
# A fixed random_state keeps both splits identical across kernel restarts.
SPLIT_SEED = 42
data_id = text_to_id(data, vocab)
train_data, test_data1 = train_test_split(data_id, test_size=0.95, random_state=SPLIT_SEED)
test_data, test_data2 = train_test_split(test_data1, test_size=0.9, random_state=SPLIT_SEED)

In [13]:
def create_embedding(emb_size, vocab_size):
    """Build a randomly initialised embedding table.

    Returns an array of shape ``(vocab_size, emb_size)`` drawn from a standard
    normal distribution and scaled by 0.001.
    """
    scale = 0.001
    table = np.random.randn(vocab_size, emb_size)
    return scale * table

In [14]:
def unitnorm():
    """Rescale the rows of the global ``embs`` matrix to unit L2 norm, in place.

    Rows whose norm already rounds to 1 at three decimals are left untouched.
    All-zero rows are skipped as well, because dividing them by their norm
    would fill them with NaN.
    """
    # One vectorised pass instead of a Python loop over every row.
    norms = np.linalg.norm(embs, axis=1)
    needs_scaling = (np.round(norms, 3) != 1) & (norms > 0)
    # Item assignment writes into the existing array, so `embs` is not rebound.
    embs[needs_scaling] /= norms[needs_scaling, np.newaxis]

In [15]:
# Width of every embedding vector.
feature_size = 1024
# One embedding row per vocabulary entry.
embs = create_embedding(feature_size, len(vocab))
# Normalise the freshly initialised rows in place.
unitnorm()

In [16]:
def get_emb(text_words):
    """Return a list with the ``embs`` row of every id in ``text_words``, in order."""
    vectors = []
    for word_id in text_words:
        vectors.append(embs[word_id])
    return vectors

def ids_to_vec(text_tuple):
    """Turn each id list in ``text_tuple`` into the average of its embeddings.

    Returns a list with one averaged vector per id list, in the same order.
    """
    averaged = []
    for ids in text_tuple:
        averaged.append(np.average(get_emb(ids), axis=0))
    return averaged

In [17]:
def update_sgd(idx, gradient, alpha):
    """Apply one SGD step to the rows of the global ``embs`` selected by ``idx``.

    ``alpha * gradient`` is subtracted from every selected row, in place.
    """
    step = alpha * gradient
    embs[idx, :] -= step

In [18]:
def backward_warp(data, tuple_emb, sample_idx, gamma = 1.0):
    """Sample a margin-violating negative description and return its gradients.

    Other pairs of ``data`` are drawn uniformly at random, without repetition,
    until one is found whose description scores within ``gamma`` of the true
    description, i.e. ``gamma - score(title, descr) + score(title, negative) > 0``.

    Args:
        data: list of ``[title_ids, descr_ids]`` pairs to sample negatives from.
        tuple_emb: averaged embeddings of the current pair, indexed by
            ``title`` and ``descr``.
        sample_idx: position of the current pair in ``data``; never drawn.
        gamma: margin of the ranking loss.

    Returns:
        ``(negative_ids, gradients)`` where ``gradients`` is a 3-tuple ordered
        as (title, positive description, negative description), or ``None``
        when no candidate violates the margin.

    NOTE(review): the gradients carry no rank-dependent WARP weight (the
    harmonic sum over the estimated rank); confirm whether that is intended.
    """
    title_emb, descr_emb = tuple_emb[title], tuple_emb[descr]
    positive_score = np.dot(title_emb, descr_emb)

    # A set keeps the "already tried" lookup O(1).
    seen = {sample_idx}
    n_total = len(data)
    while len(seen) < n_total:
        candidate = np.random.randint(n_total)
        if candidate in seen:
            continue
        seen.add(candidate)

        negative_ids = data[candidate][descr]
        negative_emb = ids_to_vec([negative_ids])[0]
        # The same margin decides both when to stop sampling and whether to
        # update; a hard-coded 1 here would ignore a non-default gamma.
        margin = gamma - positive_score + np.dot(title_emb, negative_emb)
        if margin > 0:
            return negative_ids, (negative_emb - descr_emb, -title_emb, title_emb)
    return None

In [19]:
def top_k(vec, k):
    """Return the indices of the ``k`` largest entries of ``vec``, best first.

    If ``vec`` has fewer than ``k`` entries, all indices are returned.
    """
    # argsort is ascending, so sort the negated scores to get descending order.
    # NaN entries still end up last and therefore never displace real scores.
    return np.argsort(-np.asarray(vec), axis=-1, kind='quicksort')[:max(k, 0)]

In [20]:
def recall_at_k(data, k = 10):
    """Fraction of titles whose own description ranks among the top ``k``.

    Every pair with a non-empty description is a candidate. Each candidate
    with a non-empty title is a query: all candidate descriptions are scored
    by the dot product of averaged embeddings, and the query counts as a hit
    when its own description is among the ``k`` indices returned by ``top_k``.

    Args:
        data: list of ``[title_ids, descr_ids]`` pairs.
        k: size of the ranked list handed to ``top_k``.

    Returns:
        ``hits / number_of_queries`` as a float, or ``0.0`` if there are no
        queries.
    """
    # Pairs with an empty description cannot be embedded, so leave them out.
    candidates = [pair for pair in data if pair[descr]]
    if not candidates:
        return 0.0

    # One row per candidate description. Scoring multiplies this matrix by the
    # title vector directly, so no reshape (which would scramble the rows
    # rather than transpose them) is needed.
    descr_matrix = np.array([ids_to_vec([pair[descr]])[0] for pair in candidates])

    hits = 0
    n_queries = 0
    for row, pair in enumerate(candidates):
        if not pair[title]:
            continue
        title_vec = ids_to_vec([pair[title]])[0]
        n_queries += 1
        if row in top_k(np.matmul(descr_matrix, title_vec), k):
            hits += 1

    return hits / n_queries if n_queries else 0.0

In [21]:
def train2(data, tuple_title_descr, sample_idx, alpha):
    """Run one training step for a single [title_ids, descr_ids] pair.

    The pair is embedded, ``backward_warp`` looks for a margin-violating
    negative description, and if one is found the title, description and
    negative ids are each updated with their gradient via ``update_sgd``.

    Args:
        data: full list of pairs, used to sample negatives.
        tuple_title_descr: ``[title_ids, descr_ids]`` of the current pair;
            left unmodified.
        sample_idx: index of the current pair in ``data``.
        alpha: learning rate passed to ``update_sgd``.
    """
    embs_tuple = ids_to_vec(tuple_title_descr)
    output_warp = backward_warp(data, embs_tuple, sample_idx)
    if not output_warp:
        return

    rand_descr, gradients = output_warp
    # Build a fresh list rather than appending to the caller's argument.
    id_groups = list(tuple_title_descr) + [rand_descr]

    for ids, gradient in zip(id_groups, gradients):
        update_sgd(ids, gradient, alpha)

In [22]:
def train1(data, alpha, check_param):
    """Run one pass over ``data`` in random order.

    For each pair, the ids shared by title and description are removed from
    one randomly chosen side. If both sides are still non-empty the pair is
    handed to ``train2``. After every ``check_param`` such pairs ``unitnorm``
    is called.
    """
    def without(source, banned):
        # Keep the items of `source` that do not occur in `banned`.
        return [item for item in source if item not in banned]

    processed = 0
    for sample_idx in np.random.permutation(len(data)):
        title_ids, descr_ids = data[sample_idx]
        if np.random.choice([True, False]):
            descr_ids = without(descr_ids, title_ids)
        else:
            title_ids = without(title_ids, descr_ids)

        if not (title_ids and descr_ids):
            continue

        train2(data, [title_ids, descr_ids], sample_idx, alpha)
        processed += 1
        if processed % check_param == 0:
            unitnorm()

In [23]:
def train(train_data, test_data, n_epochs, alpha, check_param):
    """Train for ``n_epochs`` epochs and print recall and timing after each one.

    Args:
        train_data: pairs passed to ``train1`` every epoch.
        test_data: pairs passed to ``recall_at_k`` after every epoch.
        n_epochs: number of passes over ``train_data``.
        alpha: learning rate forwarded to ``train1``.
        check_param: forwarded to ``train1``.
    """
    for epoch in range(n_epochs):
        t1 = time.process_time()
        train1(train_data, alpha, check_param)
        t = time.process_time() - t1
        recall = recall_at_k(test_data)
        # recall_at_k returns a fraction; scale it so the printed '%' is correct.
        print("Epoch {:>2} : recall = {:>2}% time = {:>12}s".
              format(epoch, round(100 * recall, 2), round(t, 3)))

In [24]:
# Arguments after the data sets: n_epochs=100, alpha=0.05, check_param=30.
train(train_data, test_data, 100, 0.05, 30)

KeyboardInterrupt: 

In [None]:
def knn(query, k = 10):
    """Return the ``k`` vocabulary words most similar to ``query``.

    Similarity is the dot product between rows of the global ``embs`` matrix.
    The query word itself is never part of the result.

    Args:
        query: word to look up in the global ``vocab``.
        k: number of neighbours to return.

    Returns:
        A list of words, most similar first.

    Raises:
        KeyError: if ``query`` is not in the vocabulary.
    """
    query_id = vocab.get(query)
    if query_id is None:
        raise KeyError("'{}' is not in the vocabulary".format(query))

    # Score every word against the query row. Multiplying the (V, F) matrix by
    # the (F,) vector avoids reshaping the table, which would scramble it.
    scores = np.matmul(embs, embs[query_id])

    # Descending order; k + 1 entries are enough to hold k words plus the query.
    k = max(k, 0)
    best_first = np.argsort(-scores)[:k + 1]
    neighbours = [idx for idx in best_first if idx != query_id][:k]

    # Invert the vocabulary once instead of once per neighbour.
    id_to_word = {idx: word for word, idx in vocab.items()}
    return [id_to_word[idx] for idx in neighbours]

In [None]:
# Example query: list the nearest vocabulary words to "nation".
knn("nation")