In [2]:
import tensorflow as tf

In [3]:
from tensorflow import keras
from tensorflow.keras import backend
from tensorflow.keras import layers
import pandas as pd

In [4]:
data = {'user': ['Jason', 'Molly', 'Amy', 'Jake', 'Amy'],
        'doc':['q', 'M', 'T', 'q', 'Ay'],
        'author': ['Jason', 'Jason', 'Tina', 'Jason', 'Amy'],
        'views': [4, 24, 3100, 2, 3],
        'y': [1,1,3,1,1]}
df = pd.DataFrame(data)

In [63]:
def model(embedding_size, field_vocab_size=[], hidden_units=[4,4,4], dropout=0.5):
    F = len(field_vocab_size)
    
    # prepare embeddings
    inputs = []
    embed_list = [] 
    for i, vocab_size in enumerate(field_vocab_size):
        in_ = keras.Input(shape=(1,))
        inputs.append(in_)
        embed_list.append(layers.Embedding(vocab_size, embedding_size)(in_))
    embed_list = layers.concatenate(embed_list, axis=1) # none, F, K
    
    fm_one_inputs = []
    embed_one_list = [] # none, F, 1
    for i, vocab_size in enumerate(field_vocab_size):
        in_ = keras.Input(shape=(1,))
        inputs.append(in_)
        embed_one_list.append(layers.Embedding(vocab_size, 1)(in_))
    fm_first_in = layers.concatenate(embed_one_list, axis=1)
    fm_first_in = backend.squeeze(fm_first_in, axis=2) # none, F
    
    # dense layer
    dropouts = [dropout] * len(hidden_units)
    weight_init = keras.initializers.glorot_uniform()
    
    deep_in = layers.Reshape((F*embedding_size,))(embed_list)
    for i, (h, d) in enumerate(zip(hidden_units, dropouts)):
        z = layers.Dense(units=h, kernel_initializer=weight_init)(deep_in)
        z = layers.BatchNormalization(axis=-1)(z)
        z = keras.activations.relu(z)
        z = layers.Dropout(d,seed=d * i)(z) if d > 0 else z
    deep_out = layers.Dense(units=1, activation=tf.nn.softmax, kernel_initializer=weight_init)(z)
    # deep_out: None, 1
    
    # fm layer
    fm_first_order = backend.sum(fm_first_in, axis=1) #None, 1
    
    emb_sum_squared = backend.square(backend.sum(embed_list, axis=1)) #none, K
    emb_squared_sum = backend.sum(backend.square(embed_list), axis=1) #none, K
    fm_second_order = layers.Subtract()([emb_sum_squared, emb_squared_sum])
    fm_second_order = backend.sum(fm_second_order, axis=1) #none, 1
    fm_out = layers.Add()([fm_first_order, fm_second_order])
    
    out = layers.Add()([deep_out, fm_out])
    out = layers.Activation(activation='sigmoid')(out)
    model = keras.Model(inputs=inputs, outputs=out)
    return model


In [64]:
m = model(4, [1,2,3])
m.summary()

Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_175 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_176 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_177 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_174 (Embedding)       (None, 1, 4)         4           input_175[0][0]                  
___________________________________________________________________________________________

In [66]:
m.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Data Pre-processing

Features:
- user: over threshold, otherwise etc group
- Documents: over threshold, otherwise eliminate from data
- author: document author
- doc_age:
- tag
- magazine_id
- pop: how many times all other users read the doc
- is_followed: is the auhtor followed by user

Y: will like or not

In [91]:
data_dir = '../data'
import os
import json

In [217]:
def get_valid_documents(thresh=100):
    doc_read = {}
    path = "{}/read/".format(data_dir)
    for dirpath, subdirs, files in os.walk(path):
        for f in files:
            filename = dirpath+f
            file = open(filename, 'r')
            for line in file.readlines():
                words = line.strip().split(' ')
                user = words[0]
                for doc in words[1:]:
                    if doc not in doc_read:
                        doc_read[doc] = {}
                        doc_read[doc]['num']  = 1
                        doc_read[doc]['reader'] = [user]
                    else:
                        doc_read[doc]['num'] += 1
                        doc_read[doc]['reader'].append(user)
                        
    doc_read_thresh = {key:{'num':doc_read[key]['num'], 'reader':doc_read[key]['reader']} for key in doc_read if doc_read[key]['num'] > thresh}
    """
    total doc: 505840
    doc over thresh=100: 36340
    """
    return doc_read_thresh

In [215]:
get_valid_documents()

{'@charlessay_30': {'num': 1, 'reader': ['#a055d0c3520e1c002531001928217887']}, '@wal8am_27': {'num': 1, 'reader': ['#a055d0c3520e1c002531001928217887']}, '@uglyduckmin_40': {'num': 1, 'reader': ['#a055d0c3520e1c002531001928217887']}, '@anti-essay_133': {'num': 1, 'reader': ['#a055d0c3520e1c002531001928217887']}, '@roysday_125': {'num': 1, 'reader': ['#a055d0c3520e1c002531001928217887']}}


{}

In [227]:
def get_user_list(thresh=100, etc_user_num=200):
    valid_doc = get_valid_documents()
    print(len(valid_doc))
    user_read_num = {}
    user_read_doc = {}
    for doc in valid_doc:
        readers = valid_doc[doc]['reader']
        for reader in readers:
            if reader not in user_read_doc:
                user_read_doc[reader] = [doc]
                user_read_num[reader] = 1
            else:
                user_read_doc[reader].append(doc)
                user_read_num[reader] += 1
    
    user_read_num1 = {key:user_read_num[key] for key in user_read_num if user_read_num[key] >= thresh}
    user_read_doc1 = {key:user_read_doc[key] for key in user_read_num1}
    
    user_read_num2 = {key:user_read_num[key] for key in user_read_num if user_read_num[key] < thresh}
    user_read_num2 = {key:user_read_num2[key] for i, key in enumerate(user_read_num2) if i < etc_user_num}
    user_read_doc2 = {key:user_read_doc[key] for key in user_read_num2}
    return user_read_doc1, user_read_num1, user_read_doc2, user_read_num2
        

In [210]:
def get_doc_meta_dic():
    """
    document(id): author(user_id), tags(keyword_lists), magazine_id,  
    """
    data = open('../data/metadata.json', 'r')
    for line in data.readlines():
        line = json.loads(line)
        print(line)
        break

In [211]:
get_doc_meta_dic()

{'magazine_id': 8982, 'user_id': '@bookdb', 'title': '사진으로 옮기기에도 아까운, 리치필드 국립공원', 'keyword_list': ['여행', '호주', '국립공원'], 'display_url': 'https://brunch.co.kr/@bookdb/782', 'sub_title': '세상 어디에도 없는 호주 Top 10', 'reg_ts': 1474944427000, 'article_id': 782, 'id': '@bookdb_782'}


In [194]:
def categorize_value(target, cat_num=100):
    new_target = {}
    target_list = list(target.values())
    max_ = max(target_list)
    min_ = min(target_list)
    division = int((max_ - min_ +1) / cat_num)
    for key in pop:
        for i in range(cat_num):
            if target[key] >= (min_ + division*i) and target[key] < (min_+division*(i+1)):
                new_target[key] = i+1
        if target[key] >= min_ + division*cat_num:
            new_target[key] = cat_num
    return new_target

In [None]:
pop = {'a': 3, 'b': 4, 'c': 3, 'd':6, 'e': 7, 'f': 8, 'g':9}
categorize_pop(pop, 3)

In [228]:
def doc_id2author_id(doc_id):
    return doc_id.split('_')[0]

In [None]:
def get_user_doc_read_num(user_id, doc_id, user_read_doc):
    cnt = 0
    docs_read = user_read_doc[user_id]
    for doc in docs_read:
        if doc == doc_id:
            cnt += 1
    return cnt

In [97]:
def combine_data_to_df(user_thresh=100, doc_thresh=100, pop_cat_num=100):
    valid_doc = get_valid_documents(thresh=doc_thresh) # doc: read num over thresh
    pop = categorize_value(valid_doc, cat_num=pop_cat_num)
    

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

