In [2]:
import tensorflow as tf

In [3]:
from tensorflow import keras
from tensorflow.keras import backend
from tensorflow.keras import layers
import pandas as pd

In [4]:
data = {'user': ['Jason', 'Molly', 'Amy', 'Jake', 'Amy'],
        'doc':['q', 'M', 'T', 'q', 'Ay'],
        'author': ['Jason', 'Jason', 'Tina', 'Jason', 'Amy'],
        'views': [4, 24, 3100, 2, 3],
        'y': [1,1,3,1,1]}
df = pd.DataFrame(data)

In [63]:
def model(embedding_size, field_vocab_size=[], hidden_units=[4,4,4], dropout=0.5):
    F = len(field_vocab_size)
    
    # prepare embeddings
    inputs = []
    embed_list = [] 
    for i, vocab_size in enumerate(field_vocab_size):
        in_ = keras.Input(shape=(1,))
        inputs.append(in_)
        embed_list.append(layers.Embedding(vocab_size, embedding_size)(in_))
    embed_list = layers.concatenate(embed_list, axis=1) # none, F, K
    
    fm_one_inputs = []
    embed_one_list = [] # none, F, 1
    for i, vocab_size in enumerate(field_vocab_size):
        in_ = keras.Input(shape=(1,))
        inputs.append(in_)
        embed_one_list.append(layers.Embedding(vocab_size, 1)(in_))
    fm_first_in = layers.concatenate(embed_one_list, axis=1)
    fm_first_in = backend.squeeze(fm_first_in, axis=2) # none, F
    
    # dense layer
    dropouts = [dropout] * len(hidden_units)
    weight_init = keras.initializers.glorot_uniform()
    
    deep_in = layers.Reshape((F*embedding_size,))(embed_list)
    for i, (h, d) in enumerate(zip(hidden_units, dropouts)):
        z = layers.Dense(units=h, kernel_initializer=weight_init)(deep_in)
        z = layers.BatchNormalization(axis=-1)(z)
        z = keras.activations.relu(z)
        z = layers.Dropout(d,seed=d * i)(z) if d > 0 else z
    deep_out = layers.Dense(units=1, activation=tf.nn.softmax, kernel_initializer=weight_init)(z)
    # deep_out: None, 1
    
    # fm layer
    fm_first_order = backend.sum(fm_first_in, axis=1) #None, 1
    
    emb_sum_squared = backend.square(backend.sum(embed_list, axis=1)) #none, K
    emb_squared_sum = backend.sum(backend.square(embed_list), axis=1) #none, K
    fm_second_order = layers.Subtract()([emb_sum_squared, emb_squared_sum])
    fm_second_order = backend.sum(fm_second_order, axis=1) #none, 1
    fm_out = layers.Add()([fm_first_order, fm_second_order])
    
    out = layers.Add()([deep_out, fm_out])
    out = layers.Activation(activation='sigmoid')(out)
    model = keras.Model(inputs=inputs, outputs=out)
    return model


In [64]:
m = model(4, [1,2,3])
m.summary()

Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_175 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_176 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_177 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_174 (Embedding)       (None, 1, 4)         4           input_175[0][0]                  
___________________________________________________________________________________________

In [66]:
m.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Data Pre-processing

Features:
- user: over threshold, otherwise etc group
- Documents: over threshold, otherwise eliminate from data
- author: document author
- doc_age:
- tag
- magazine_id
- pop: how many times all other users read the doc
- is_followed: is the auhtor followed by user

Y: will like or not

In [91]:
data_dir = '../data'
import os
import json

In [141]:
def get_valid_documents(thresh=100):
    doc_read = {}
    path = "{}/read/".format(data_dir)
    for dirpath, subdirs, files in os.walk(path):
        for f in files:
            filename = dirpath+f
            file = open(filename, 'r')
            for line in file.readlines():
                words = line.strip().split(' ')
                for word in words[1:]:
                    if word not in doc_read:
                        doc_read[word]  = 1
                    else:
                        doc_read[word] += 1
    doc_read_thresh = {key:doc_read[key] for key in doc_read if doc_read[key] > thresh}
    """
    total doc: 505840
    doc over thresh=100: 36340
    """
    return doc_read_thresh

In [142]:
get_valid_documents()

KeyboardInterrupt: 

In [202]:
def get_user_list(thresh=100, etc_user_num=200):
    valid_doc = get_valid_documents()
    users = {}
    path = "{}/read/".format(data_dir)
    """
    files length: 3624
    """
    for dirpath, subdirs, files in os.walk(path):
        for i, f in enumerate(files):
            filename = dirpath+f
            file = open(filename, 'r')
            for line in file.readlines():
                words = line.strip().split(' ')
                if words[0] not in users:
                    users[words[0]] = [doc for doc in words[1:] if doc in valid_doc]
                else:
                    users[words[0]] +=[doc for doc in words[1:] if doc in valid_doc]
            if i% 10 == 0:
                print(i)
    
    etc_users = {key:users[key] for key in users if len(users[key]) < thresh}
    users_after_removal = {key:users[key] for key in users if len(users[key]) >= thresh}
    
    keys = random.sample(list(etc_users.keys()), 200)
    values = [d[k] for k in keys]z
    etc_users = {keys[k]:values[k] for k in range(len(keys))}
    
    >>> d = dict.fromkeys(range(100))
>>> keys = random.sample(list(d), 10)
>>> keys
[52, 3, 10, 92, 86, 42, 99, 73, 56, 23]
>>> values = [d[k] for k in keys]
    print(len(users))
    print(len(etc_users))
    print(len(users_after_removal))
    #return etc_users, users_after_removal

In [203]:
get_user_list()

3624
0


KeyboardInterrupt: 

In [210]:
def get_doc_meta_dic():
    """
    {document(id): { tags(keyword_list),age(unix timestamp) ,magazine_id,}}  
    """
    valid_doc = get_valid_documents().keys()

    data = open('../data/metadata.json', 'r')
    meta={}
    for line in data.readlines():
        if line['id'] in valid_doc:
            line = json.loads(line)
            tmp_dict={}
            tmp_dict[line['id']]= {'keyword_list':line['keyword_list'],
            'mag_id':line['magazine_id'],
            'reg_ts':line['reg_ts']
            }
            meta.update(tmp_dict)

    return meta

In [194]:
def categorize_value(target, cat_num=100):
    new_target = {}
    target_list = list(target.values())
    max_ = max(target_list)
    min_ = min(target_list)
    division = int((max_ - min_ +1) / cat_num)
    for key in pop:
        for i in range(cat_num):
            if target[key] >= (min_ + division*i) and target[key] < (min_+division*(i+1)):
                new_target[key] = i+1
        if target[key] >= min_ + division*cat_num:
            new_target[key] = cat_num
    return new_target

In [195]:
pop = {'a': 3, 'b': 4, 'c': 3, 'd':6, 'e': 7, 'f': 8, 'g':9}
categorize_pop(pop, 3)

3
4
3
6
7
8
9


{'a': 1, 'b': 1, 'c': 1, 'd': 2, 'e': 3, 'f': 3, 'g': 3}

In [97]:
def comibine_data_to_df(user_thresh=100, doc_thresh=100, pop_cat_num=100):
    valid_doc = get_valid_documents(thresh=doc_thresh) # doc: read num over thresh
    pop = categorize_value(valid_doc, cat_num=pop_cat_num)
    

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
valid_user=get_user_list(100,200)[0]
etc_user=get_user_list(100,200)[2]

user_list=valid_user.update(etc_user)
def is_followed(user_list=user_list,author):
    data = open('../data/users.json', 'r')
    for line in data.readlines():
        line=json.loads(line)
        if line['id'] in user_list.keys():
            if author in line['following_list']:
                return 1
            else: return 0