In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import pandas as pd
import random

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the raw viewing log and the anchor table from the working directory.
view_data = pd.read_csv("view_data.csv")
# reset_index() exposes the row position as an 'index' column; adding one
# makes the anchor ids start at 1 instead of 0.
anchor_list = pd.read_csv("anchor_list.csv").reset_index()
anchor_list['index'] += 1

In [4]:
# Keep only anchors whose 'count' is at least 2.
frequent_anchor = anchor_list.loc[anchor_list['count'] >= 2]

In [5]:
# Display the filtered anchor table as the cell output.
frequent_anchor

Unnamed: 0,index,live_uid,count
0,1,730183576,3087296
1,2,124929108,629108
2,3,671116,539961
3,4,16673072,465327
4,5,4994793,460860
...,...,...,...
62677,62678,37386642,2
62678,62679,164433417,2
62679,62680,623317939,2
62680,62681,736857787,2


In [7]:
# Lookup tables in both directions between the anchor 'index' and 'live_uid'.
index2uid = dict(zip(frequent_anchor['index'], frequent_anchor['live_uid']))
uid2index = dict(zip(frequent_anchor['live_uid'], frequent_anchor['index']))

Timing note: execution of the next cell starts from 15:38.

In [8]:
# Translate each viewed live_uid into its anchor index; anchors missing from
# uid2index map to NaN and are removed by dropna() below.
view_data['index'] = view_data['live_uid'].map(uid2index)

# Group the anchor indices by viewer: data[uid] -> list of int anchor indices,
# in the row order of view_data.
# NOTE(review): `.values` on a frame mixing integer and float columns presumably
# upcasts to float, so numeric uid keys may end up as floats — confirm if the
# keys are ever used for lookups.
data = {}
for uid, index in view_data[['uid', 'index']].dropna().values:
    data.setdefault(uid, []).append(int(index))

In [11]:
# Shuffle every viewer's anchor list in place using the global `random` state.
for anchors in data.values():
    random.shuffle(anchors)

In [12]:
# V is the number of anchors that received an index.
V = len(index2uid)
# The table gets V + 1 entries so that it can be indexed directly by an anchor
# index in 0..V.
sampling_table = keras.preprocessing.sequence.make_sampling_table(size=V + 1)

In [13]:
# Sanity check: number of entries in the sampling table.
len(sampling_table)

38932

In [14]:
# Number of table entries equal to 1 (indices that are never sub-sampled away).
int(np.count_nonzero(sampling_table == 1))

28735

In [19]:
def my_skipgrams(data, vocabulary_size,
                 window_size=2, negative_samples=1., shuffle=True,
                 categorical=False, sampling_table=None, seed=None):
    """Generate forward-window skip-gram pairs with random negative samples.

    For every sequence, each non-zero element ``wi`` is paired with the
    non-zero elements that follow it within ``window_size`` positions
    (only look-ahead, no look-behind). Negative pairs are then added by
    pairing shuffled target elements with uniformly drawn random indices.

    Args:
        data: iterable of sequences (lists of int indices). Index 0 is
            treated as "no item" and never appears in a positive pair.
        vocabulary_size: exclusive upper bound for negative sampling;
            negatives are drawn uniformly from ``[1, vocabulary_size - 1]``.
        window_size: how many following positions are paired with a target.
        negative_samples: ratio of negative pairs to positive pairs
            (``0`` disables negative sampling).
        shuffle: whether to shuffle the returned pairs and labels
            (both with the same permutation).
        categorical: if True labels are ``[0, 1]`` (positive) / ``[1, 0]``
            (negative); otherwise ``1`` / ``0``.
        sampling_table: optional indexable of keep-probabilities; a target
            ``wi`` is skipped when ``sampling_table[wi] < random.random()``.
        seed: seed used for the final shuffle only. It does not make the
            sub-sampling or the negative sampling deterministic. When
            shuffling, the global ``random`` state is re-seeded.

    Returns:
        ``(couples, labels)``: a list of ``[target, context]`` pairs and the
        list of matching labels.
    """
    couples = []
    labels = []
    for sequence in data:
        for i, wi in enumerate(sequence):
            # Index 0 is reserved and never used as a target.
            if not wi:
                continue
            # Sub-sample targets according to their keep-probability.
            if sampling_table is not None:
                if sampling_table[wi] < random.random():
                    continue

            window_end = min(len(sequence), i + window_size + 1)
            for j in range(i + 1, window_end):
                wj = sequence[j]
                if not wj:
                    continue
                couples.append([wi, wj])
                if categorical:
                    labels.append([0, 1])
                else:
                    labels.append(1)

    if negative_samples > 0:
        num_negative_samples = int(len(labels) * negative_samples)
        # Reuse the observed targets (in random order) as negative targets.
        words = [c[0] for c in couples]
        random.shuffle(words)

        couples += [[words[i % len(words)],
                     random.randint(1, vocabulary_size - 1)]
                    for i in range(num_negative_samples)]
        if categorical:
            labels += [[1, 0]] * num_negative_samples
        else:
            labels += [0] * num_negative_samples

    if shuffle:
        if seed is None:
            # The bound must be an int: random.randint rejects float bounds
            # such as 10e6 on recent Python versions.
            seed = random.randint(0, 10 ** 7)
        # Re-seeding before each shuffle applies the same permutation to both
        # lists, which keeps couples and labels aligned.
        random.seed(seed)
        random.shuffle(couples)
        random.seed(seed)
        random.shuffle(labels)

    return couples, labels

Timing note: execution of the next cell starts from 16:33.

In [20]:
# Build the skip-gram training pairs from every viewer's anchor sequence.
# my_skipgrams draws negatives from [1, vocabulary_size - 1], so V + 1 must be
# passed (the same size as the sampling table) for the anchor with the largest
# index to be eligible as a negative sample. This assumes anchor indices run
# from 1 to V, as produced by the reset_index() + 1 step earlier.
couples, labels = my_skipgrams(data.values(), V + 1, sampling_table=sampling_table)

In [24]:
# Flatten the [target, context] pairs into columns next to their labels and
# write them to train.csv with 'target' as the index column.
to_save = pd.DataFrame({
    'target': [pair[0] for pair in couples],
    'context': [pair[1] for pair in couples],
    'label': labels,
})
to_save.set_index("target").to_csv('train.csv')