## News analysis
### loss=warp, opt=sgd, agg=avg, constr=unitnorm

In [1]:
import sys
import spacy
import csv
from pathlib import Path
import re
import numpy as np
import time
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

### Loading data

In [2]:
title = 0
descr = 1
def load_data(file: Path, delimiter=','):
    with file.open('r') as fr:
        for row in csv.reader(fr):
            yield [row[title + 1], row[descr + 1]]

In [3]:
data = load_data(Path('2019_03_05_19_10_02_332991.csv'))

### Data preprocessing

In [4]:
# import nltk
# nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def tokenize(data):
    bag_of_words = list()
    for text_data in data:
        res_tokenize = []
        for el in text_data:
            el_re = re.sub(r'[^\w\s]',' ',el)
            el_words = el_re.lower().split()                  
            res_tokenize.append([word for word in el_words if word not in stop_words])
        bag_of_words.append(res_tokenize)
    return bag_of_words

In [5]:
data = tokenize(data)
for text_data in data:
    if not text_data[title] or not text_data[descr]:
        data.remove(text_data)

In [6]:
print(data[56])

[['australia', 'pursue', 'tpp', 'minus', 'one'], ['nations', 'hope', 'move', 'forward', 'transpacific', 'partnership', 'trade', 'deal', 'america', 'withdraws']]


### Creating vocabluary

In [7]:
words = []
for el in data:
    words += el[title] + el[descr]

In [8]:
Tfidf_vect = TfidfVectorizer(max_features=len(words))
Tfidf_vect.fit(words)
vocab = Tfidf_vect.vocabulary_

In [9]:
print(vocab['hope'])

21797


### Data vectorizing

In [10]:
def text_to_id(text, vocab):
    news_ids = []
    ids_= lambda t: [vocab.get(x) for x in t if vocab.get(x)]

    return [[ids_(t[title]), ids_(t[descr])] for t in text]

In [11]:
def create_embedding(emb_size, vocab_size):
    return np.random.normal(0, 1, size=(vocab_size, emb_size)).astype(np.float32)

In [12]:
def unitnorm():
    for i in range(len(embs)):
        norm = np.sqrt(sum(np.power(embs[i], 2)))
        if (round(norm, 3) != 1):
            #print("WARNING: l2 norm != 1! Let's norm")
            embs[i] = embs[i] / norm

In [13]:
def get_emb(text_words):
    return [embs[word] for word in text_words]

def ids_to_vec(text_tuple):
    return [np.average(get_emb(el), axis=0) for el in text_tuple]        

### Model updating

In [14]:
def update_sgd(idx, gradient, alpha):
    embs[idx] -= alpha * gradient

### Loss

In [15]:
def backward_warp(data, tuple_emb, sample_idx, gamma=1.0):
    scalar = np.dot(tuple_emb[title], tuple_emb[descr])
    scalar_random = 0
    cache = {sample_idx}
    N_max = 100
    
    N = 0
    while (N < N_max):
        num_rand = np.random.randint(len(data))
        if num_rand not in cache:
            sample_descr = data[num_rand][descr]
            sample_descr_emb = ids_to_vec([sample_descr])[0]
            cache.add(num_rand)
            N += 1
            scalar_random = np.dot(tuple_emb[title], sample_descr_emb)
            res = 1 - scalar + scalar_random
            if (res > 0):
                break
    if (N != 0):
        k = int((N_max - 1) / N)
        search_complex = sum([1/j for j in range(1, k)])
        loss = search_complex * (gamma - scalar + scalar_random)
        if (loss > 0):
            return sample_descr, (search_complex*(sample_descr_emb - tuple_emb[descr]), search_complex*(-tuple_emb[title]), search_complex*tuple_emb[title])
    return

### Recall

In [16]:
def get_dist(v1, v2):
    return 1 - np.dot(v1, np.transpose(v2)) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [17]:
def recall_at_k(data, k = 10):
    n_test = len(data)
    recall = 0
    descr_emb = [ids_to_vec([data[i][descr]]) for i in range(n_test)]

    for i in range(n_test):
        if not data[i][title]:
            continue

        title_emb = ids_to_vec([data[i][title]])       
        dist_arr = []
        for i_1 in range(n_test):
            dist_arr.append(get_dist(title_emb, descr_emb[i_1]))

        sort_dist_arr = np.sort(dist_arr)
        if dist_arr[i] <= sort_dist_arr[k-1]:
            recall += 1
    
    return recall / n_test

### Model training

In [18]:
def train2(data, tuple_title_descr, sample_idx, alpha, gamma):
    embs_tuple = ids_to_vec(tuple_title_descr)
    output_warp = backward_warp(data, embs_tuple, sample_idx, gamma)
    if not output_warp:
        return
    
    rand_descr, gradients = output_warp  
    tuple_title_descr.append(rand_descr)   
    
    for i, g in zip(tuple_title_descr, gradients):
        update_sgd(i, g, alpha)

In [19]:
def train1(data, alpha, check_param, gamma):
    first = np.random.permutation(len(data))
    check_update = 0
    for f in first:
        u, v = data[f]
        if u and v:
            train2(data, [u, v], f, alpha, gamma)
            check_update += 1
            if check_update % check_param == 0:
                unitnorm()

In [20]:
def train(train_data, test_data, n_epochs, alpha, check_param, gamma):
    for epoch in range(n_epochs):
        t1 = time.process_time()
        train1(train_data, alpha, check_param, gamma)
        t = time.process_time() - t1
        recall = recall_at_k(test_data)
        print("Epoch {:>2} : recall = {:>2} time = {:>12}s".
              format(epoch, round(recall, 2), round(t, 3)))

### Creating embeddings

In [21]:
feature_size = 256
embs = create_embedding(feature_size, len(vocab))
unitnorm()

### Train and test data

In [22]:
data_id = text_to_id(data, vocab)  
train_data, test_data1 = train_test_split(data_id, test_size=0.8)
test_data, test_2 = train_test_split(data_id, test_size=0.9)

### Fit and predict

In [25]:
train(train_data, test_data, n_epochs=10, alpha=0.05, check_param=30, gamma=1)

Epoch  0 : recall = 0.92 time =     2375.047s
Epoch  1 : recall = 0.89 time =     2314.047s
Epoch  2 : recall = 0.9 time =     2364.641s
Epoch  3 : recall = 0.88 time =     2257.812s
Epoch  4 : recall = 0.88 time =     2298.656s
Epoch  5 : recall = 0.89 time =     2284.359s
Epoch  6 : recall = 0.89 time =     2334.016s
Epoch  7 : recall = 0.91 time =     2262.406s
Epoch  8 : recall = 0.9 time =     2331.062s
Epoch  9 : recall = 0.89 time =     2319.781s


### Finding neighbor words

In [23]:
def knn(query, k=10):
    query_emb = embs[vocab.get(query)]
    ind = vocab.get(query)
    dist_arr = {}
    
    for i in range(0, len(embs)):
        dist_arr[i] = get_dist(query_emb, embs[i])
    
    sorted_embs = sorted(dist_arr.items(), key=lambda kv: kv[1])[:k]
    for emb in sorted_embs:
        if emb[0] != ind:
             print("\t" + dict(zip(vocab.values(), vocab.keys()))[emb[0]])

In [27]:
knn("weapon")

	killed
	killings
	crying
	thirty
	freed
	woman
	raid
	kill
	eyewitnesses


In [30]:
knn("putin")

	vladimir
	kurdish
	russias
	sa
	barbarism
	turkey
	fighters
	vigilante
	syria


#### Full data

In [31]:
data_id = text_to_id(data, vocab)  
train_data, test_data = train_test_split(data_id, test_size=0.2)

In [None]:
train(train_data, test_data, n_epochs=10, alpha=0.05, check_param=30, gamma=1)

Epoch  0 : recall = 0.91 time =     9368.938s
Epoch  1 : recall = 0.91 time =     9089.359s
Epoch  2 : recall = 0.92 time =     9544.047s
Epoch  3 : recall = 0.92 time =     9229.828s
Epoch  4 : recall = 0.94 time =     9591.156s
Epoch  5 : recall = 0.93 time =     9337.562s
Epoch  6 : recall = 0.91 time =      9062.75s


In [None]:
knn("weapon")

### Parameters selection

#### alpha=0.01, gamma=1

In [24]:
embs = create_embedding(feature_size, len(vocab))
unitnorm()

In [25]:
train(train_data, test_data, n_epochs=10, alpha=0.01, check_param=30, gamma=1)

Epoch  0 : recall = 0.95 time =     4300.823s
Epoch  1 : recall = 0.95 time =     2556.451s
Epoch  2 : recall = 0.93 time =     2555.889s
Epoch  3 : recall = 0.92 time =     2554.719s
Epoch  4 : recall = 0.91 time =     2555.078s
Epoch  5 : recall = 0.91 time =      2564.61s
Epoch  6 : recall = 0.92 time =     2567.059s
Epoch  7 : recall = 0.91 time =      2569.29s
Epoch  8 : recall = 0.91 time =     2568.151s
Epoch  9 : recall = 0.91 time =     2582.222s


In [26]:
knn("weapon")

	deadly
	others
	involving
	kill
	raids
	jailed
	killer
	province
	investigating


#### alpha=0.05, gamma=0.9

In [27]:
embs = create_embedding(feature_size, len(vocab))
unitnorm()

In [None]:
train(train_data, test_data, n_epochs=10, alpha=0.05, check_param=30, gamma=0.9)

Epoch  0 : recall = 0.94 time =     6472.871s
Epoch  1 : recall = 0.9 time =     2556.357s
Epoch  2 : recall = 0.92 time =     2700.253s
Epoch  3 : recall = 0.9 time =     2743.746s
Epoch  4 : recall = 0.92 time =     2700.908s
Epoch  5 : recall = 0.9 time =     2664.325s
Epoch  6 : recall = 0.92 time =     2666.416s


In [None]:
knn("weapon")

#### alpha=0.01, gamma=0.9

In [None]:
embs = create_embedding(feature_size, len(vocab))
unitnorm()

In [None]:
train(train_data, test_data, n_epochs=10, alpha=0.01, check_param=30, gamma=0.9)

In [None]:
knn("weapon")

In [None]:
a = np.random.normal(0, 1, size=(5, 4))