In [26]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Layer,Dense,Input
from tensorflow.keras import Model,activations
from tensorflow.keras import backend as K

In [12]:
# Reset Keras' global state before (re)defining the layer in this cell.
tf.keras.backend.clear_session()
class MyLayer(tf.keras.layers.Layer):
    """Second-order (pairwise interaction) term of a factorization machine.

    With input ``x`` of shape ``(batch, input_dim)`` and a trainable kernel
    ``V`` of shape ``(input_dim, output_dim)``, the layer evaluates

        0.5 * mean_over_output_dim( (x . V)^2 - (x^2) . (V^2) )

    and returns one value per sample, i.e. a tensor of shape ``(batch, 1)``.
    The reduction is a mean, so the result is the sum over the ``output_dim``
    axis divided by ``output_dim``.

    Args:
        input_dim: number of input features (rows of the kernel).
        output_dim: number of kernel columns. Defaults to 64.
        **kwargs: forwarded unchanged to ``tf.keras.layers.Layer``.
    """

    def __init__(self, input_dim, output_dim = 64, **kwargs):
        # Initialise the base Layer first (the conventional order for
        # subclassed layers), then store this layer's own hyper-parameters.
        super(MyLayer, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim

    def build(self, input_shape):
        """Create the ``(input_dim, output_dim)`` trainable kernel."""
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.input_dim, self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
        super(MyLayer, self).build(input_shape)

    def call(self, x):
        """Return the interaction term for ``x`` as a ``(batch, 1)`` tensor."""
        # Square of the projection: (x . V)^2, shape (batch, output_dim).
        a = K.pow(K.dot(x,self.kernel), 2)
        # Projection of the squares: (x^2) . (V^2), shape (batch, output_dim).
        b = K.dot(K.pow(x, 2), K.pow(self.kernel, 2))
        # Reduce over axis 1 while keeping it, so the result is (batch, 1).
        return K.mean(a-b, 1, keepdims=True)*0.5

    def compute_output_shape(self, input_shape):
        """Report the shape produced by ``call``.

        ``call`` reduces axis 1 with ``keepdims=True``, so the last dimension
        is always 1 regardless of ``output_dim``.
        """
        return (input_shape[0], 1)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(MyLayer, self).get_config()
        config.update({'input_dim': self.input_dim,
                       'output_dim': self.output_dim})
        return config

In [44]:
def FM(feature_dim, metrics, learning_rate=0.0001):
    """Build and compile a factorization-machine binary classifier.

    The model adds a linear term (a single-unit ``Dense`` layer with L1 kernel
    regularization) to the interaction term produced by ``MyLayer`` and passes
    the sum through a sigmoid.

    Args:
        feature_dim: number of input features per sample.
        metrics: list of Keras metrics handed to ``model.compile``.
        learning_rate: step size for the Adam optimizer. Defaults to 0.0001,
            the value this function previously hard-coded.

    Returns:
        A compiled ``tf.keras.Model`` trained with binary cross-entropy.
    """
    inputs = Input(shape=(feature_dim,))

    # Linear part (w*x + b).
    linear = tf.keras.layers.Dense(
        units=1,
        kernel_regularizer=tf.keras.regularizers.l1(0.02),
    )(inputs)

    # Cross (pairwise interaction) term.
    cross = MyLayer(feature_dim)(inputs)

    logits = tf.keras.layers.Add()([linear, cross])
    predictions = tf.keras.layers.Activation('sigmoid')(logits)

    model = tf.keras.Model(inputs=inputs, outputs=predictions)
    # `learning_rate` is the current keyword; `lr` is a deprecated alias.
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics=metrics)
    return model

In [48]:
def train(feature_dim, train_data=None, validation_data=None,
          batch_size=128, epochs=20):
    """Build an FM model, fit it, and return the fitted model.

    Args:
        feature_dim: number of input features; must equal the number of
            columns in the training features.
        train_data: optional ``(features, labels)`` pair. When omitted, the
            notebook-level ``x_train`` / ``y_train`` are used.
        validation_data: optional ``(features, labels)`` pair. When omitted,
            the notebook-level ``x_test`` / ``y_test`` are used.
        batch_size: mini-batch size passed to ``fit``. Defaults to 128.
        epochs: number of passes over the training data. Defaults to 20.

    Returns:
        The fitted model returned by ``FM``.
    """
    # Fall back to the notebook globals so existing `train(n)` calls still work.
    if train_data is None:
        train_data = (x_train, y_train)
    if validation_data is None:
        validation_data = (x_test, y_test)
    features, labels = train_data

    metrics = [tf.keras.metrics.AUC(name='auc')]

    fm = FM(feature_dim, metrics)
    fm.fit(features, labels,
           batch_size=batch_size,
           validation_data=validation_data,
           epochs=epochs)
    return fm

In [5]:
import pandas as pd
import numpy as np
import math
import random

In [71]:
# Folder holding the input CSVs. Kept in one constant so the notebook can be
# pointed at another location by editing a single line.
DATA_DIR = r'F:\work\Downloads'

# `data` holds the sample rows (joined to embeddings on `itemid` further down);
# `item_embedding` holds one stringified vector per `essay_id`.
data = pd.read_csv(DATA_DIR + '\\sample_data.csv')
item_embedding = pd.read_csv(DATA_DIR + '\\item_embedding.csv')

# Preview only the first rows instead of rendering the whole frame.
data.head()

MemoryError: Unable to allocate 73.2 MiB for an array with shape (9600123,) and data type object

In [7]:
# Width of every item embedding vector; also drives the item_emb_<k> column names.
EMB_DIM = 100

# Parse each stringified vector ("[v0, v1, ...]") into one row of floats.
# Rows are filled by position (enumerate) rather than by index label, so a
# non-default index on item_embedding cannot misplace or overflow rows.
emb_matrix = np.zeros((len(item_embedding), EMB_DIM))
for row, raw in enumerate(item_embedding['embedding']):
    tokens = raw.replace('[', '').replace(']', '').split(',')
    emb_matrix[row] = [float(tok) for tok in tokens]

title_lst = ['item_emb_{}'.format(k) for k in range(EMB_DIM)]

# Reuse item_embedding's index so the column-wise concat lines the rows up
# with their essay_id even when that index is not 0..n-1.
item_emb = pd.DataFrame(emb_matrix, columns=title_lst, index=item_embedding.index)
item_emb = pd.concat([item_embedding[['essay_id']], item_emb], axis=1)

In [8]:
# Attach the embedding columns to every sample: inner join of data.itemid
# against item_emb.essay_id, so samples without a matching embedding are dropped.
data1 = data.merge(item_emb, how='inner', left_on='itemid', right_on='essay_id')
data1

Unnamed: 0,deviceid,itemid,action_type,sample_age,city,essay_id,item_emb_0,item_emb_1,item_emb_2,item_emb_3,...,item_emb_90,item_emb_91,item_emb_92,item_emb_93,item_emb_94,item_emb_95,item_emb_96,item_emb_97,item_emb_98,item_emb_99
0,42ed5334281f00e011ffd638cb034ff5,140622,0,798,341221.0,140622,-0.097702,-0.031839,-0.128320,-0.000005,...,-0.034846,0.015116,0.021911,-0.138677,-0.041625,-0.189666,-0.105785,-0.051870,-0.044057,0.095323
1,3f806e36c8ca5ff063f656cce76f8f6d,199797,0,735,370112.0,199797,-0.095536,0.015120,-0.045096,0.002976,...,-0.227116,0.093131,-0.071846,0.181130,0.085188,0.031361,-0.077380,-0.085540,-0.150565,0.106024
2,9B35DB35-C1ED-4307-AB6E-88CAD7899A59,199797,0,735,140000.0,199797,-0.095536,0.015120,-0.045096,0.002976,...,-0.227116,0.093131,-0.071846,0.181130,0.085188,0.031361,-0.077380,-0.085540,-0.150565,0.106024
3,c1bbcfdeb0e9040c1a7ba09b55f20a6c,199797,0,735,530302.0,199797,-0.095536,0.015120,-0.045096,0.002976,...,-0.227116,0.093131,-0.071846,0.181130,0.085188,0.031361,-0.077380,-0.085540,-0.150565,0.106024
4,339C126B-7116-46E5-9484-E97F8076AFFA,199797,0,735,520123.0,199797,-0.095536,0.015120,-0.045096,0.002976,...,-0.227116,0.093131,-0.071846,0.181130,0.085188,0.031361,-0.077380,-0.085540,-0.150565,0.106024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304748,8E339C04-003A-4E3C-B09D-28275E5E2CC7,4088783,0,1,320600.0,4088783,-0.090727,-0.075477,-0.162998,0.090514,...,-0.152520,0.104949,-0.008674,0.009158,0.028125,-0.058161,-0.004092,0.043969,-0.041623,0.205202
1304749,72ef00bff756d2f4d6d514811d724c72,4088783,0,1,530112.0,4088783,-0.090727,-0.075477,-0.162998,0.090514,...,-0.152520,0.104949,-0.008674,0.009158,0.028125,-0.058161,-0.004092,0.043969,-0.041623,0.205202
1304750,c856f541a0843ba1ffc5ac83df64c475,4088783,0,1,450100.0,4088783,-0.090727,-0.075477,-0.162998,0.090514,...,-0.152520,0.104949,-0.008674,0.009158,0.028125,-0.058161,-0.004092,0.043969,-0.041623,0.205202
1304751,D3FC1908-3D24-48A5-804D-4E616A9DD72F,4088783,0,1,610113.0,4088783,-0.090727,-0.075477,-0.162998,0.090514,...,-0.152520,0.104949,-0.008674,0.009158,0.028125,-0.058161,-0.004092,0.043969,-0.041623,0.205202


In [67]:
# Draw a random subset of at most 100000 rows from the merged frame.
# random_state pins the draw so re-running the notebook selects the same rows;
# min() keeps the cell from raising when data1 has fewer than 100000 rows.
data2 = data1.sample(n=min(100000, len(data1)), random_state=42)
data2

Unnamed: 0,deviceid,itemid,action_type,sample_age,city,essay_id,item_emb_0,item_emb_1,item_emb_2,item_emb_3,...,item_emb_90,item_emb_91,item_emb_92,item_emb_93,item_emb_94,item_emb_95,item_emb_96,item_emb_97,item_emb_98,item_emb_99
1086328,F76016B3-0A3F-4824-96C3-82242F743445,4085216,0,2,320412.0,4085216,-0.100339,-0.145756,-0.045382,0.102055,...,-0.143374,0.009719,-0.048811,0.073906,0.133052,0.006060,0.008659,-0.010577,0.002255,-0.011666
454715,2fc7f06c477a46860a7508ea8fa47e49,3653038,0,87,110100.0,3653038,-0.014935,-0.136102,0.042740,0.082872,...,-0.091213,-0.134640,-0.001000,0.113615,0.180269,0.026853,-0.059100,0.000218,-0.154589,0.018455
1279756,b940e11b34a4bea5ed2480adeb1b6165,4012343,0,17,450300.0,4012343,0.044786,-0.180096,-0.005844,0.198959,...,-0.017006,-0.241916,0.032882,0.122696,0.045711,-0.166417,-0.026686,0.023386,-0.176339,-0.054245
71852,42d222994fba94e9a519b993c14a3c99,3873107,0,45,110105.0,3873107,0.045648,-0.076303,0.046059,0.077486,...,-0.016660,-0.131061,-0.002310,0.104558,0.065640,-0.082423,0.002563,-0.033882,0.057745,-0.052827
141407,e44b0c4fa1e1670e79d9cee898727ff3,4031228,0,14,440000.0,4031228,0.107239,-0.000064,0.033750,0.111759,...,-0.065199,-0.099840,0.048735,0.151326,0.077297,-0.127356,-0.130813,0.008995,0.027691,-0.015286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
686586,9BF816CA-D92A-42A1-99EF-2A0EAF597E47,3071684,0,164,370102.0,3071684,-0.003341,-0.109526,0.007802,0.047070,...,-0.016952,-0.042065,0.064094,0.088042,0.009277,-0.050253,0.000493,-0.114897,0.049556,-0.002912
1191274,5FC68137-E1E4-4AA8-9360-38CDD22A98DB,3532620,0,105,,3532620,0.011676,0.002943,0.006015,0.029773,...,-0.157865,-0.064855,-0.068455,0.168433,0.132707,-0.075172,-0.023601,-0.025112,-0.135567,0.024168
1281069,097c1369ddc45046273a52fef43268c2,4017423,0,16,410300.0,4017423,0.018627,-0.102190,-0.040037,0.101568,...,-0.056513,-0.091771,0.036877,-0.025055,-0.039678,-0.146928,0.022463,-0.020476,0.067234,-0.069918
1102958,3B86A016-82B5-418B-92D8-AB5CACBB7D67,3534738,0,105,370105.0,3534738,-0.152826,-0.073480,0.076968,0.001879,...,-0.046914,-0.054686,-0.026223,0.154608,0.031690,0.002084,-0.063167,-0.031846,-0.068933,-0.081703


In [68]:
# One-hot encode the city column: one indicator column per distinct value,
# each named city_<value>. The concat back onto data2 is left commented out,
# so `cities` is only displayed here.
cities = pd.get_dummies(data2['city'], prefix='city')
#data2=pd.concat([data2,cities],axis=1)
cities

Unnamed: 0,city_110100.0,city_110101.0,city_110102.0,city_110105.0,city_110106.0,city_110107.0,city_110108.0,city_110109.0,city_110111.0,city_110112.0,...,city_654200.0,city_654201.0,city_654300.0,city_659001.0,city_659002.0,city_659003.0,city_659004.0,city_659005.0,city_810000.0,city_820000.0
1086328,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
454715,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1279756,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71852,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
141407,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
686586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1191274,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1281069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1102958,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
from sklearn.model_selection import train_test_split
# Target: the action_type column.
y = data2['action_type']
# Features: whatever remains after removing the target, the id columns, and
# the raw city / sample_age columns.
x = data2.drop(columns=['action_type', 'deviceid', 'itemid', 'essay_id', 'city', 'sample_age'])
# 75/25 split; random_state fixes the shuffle so results are reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [56]:
# Take the input width from the training frame instead of hard-coding it, so
# the model always matches whatever feature columns the split cell produced.
fm = train(x_train.shape[1])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [50]:
# Display the training feature frame produced by train_test_split.
# NOTE(review): the saved output lists a sample_age column, which the split
# cell drops - the output looks stale; re-run the notebook top to bottom.
x_train

Unnamed: 0,sample_age,item_emb_0,item_emb_1,item_emb_2,item_emb_3,item_emb_4,item_emb_5,item_emb_6,item_emb_7,item_emb_8,...,item_emb_90,item_emb_91,item_emb_92,item_emb_93,item_emb_94,item_emb_95,item_emb_96,item_emb_97,item_emb_98,item_emb_99
1108447,79,0.008328,-0.079263,-0.023376,0.013280,0.135763,0.107566,0.023115,0.067806,-0.143687,...,-0.169413,0.159162,-0.116346,0.192683,0.087841,-0.126792,0.013169,-0.023648,-0.118230,-0.051885
117520,99,-0.002626,-0.008396,0.005896,0.022236,0.041666,-0.034064,0.110351,-0.008083,-0.102992,...,-0.029745,-0.075548,-0.026871,0.131150,0.017738,-0.104257,0.011787,0.006814,-0.106435,-0.007507
65059,70,0.107902,0.033448,-0.126566,0.052767,0.040193,-0.115836,0.113729,-0.078294,-0.118118,...,-0.093516,-0.044554,-0.202177,0.091722,0.033915,-0.068678,-0.013299,0.156241,0.093129,0.011822
698477,77,-0.078863,0.007857,0.035883,0.032212,0.138197,0.111984,0.149824,0.118355,-0.170406,...,-0.126557,0.036485,-0.096147,0.121503,0.141237,-0.097531,0.055049,-0.091568,-0.154475,-0.018926
1268908,44,-0.011798,-0.082402,0.005296,0.013091,0.038587,0.144773,0.085261,-0.004018,-0.034452,...,-0.224368,-0.067439,0.080348,0.200189,0.129250,-0.009169,-0.066746,0.008745,-0.016751,0.207391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392894,509,-0.019310,-0.043094,0.119127,-0.050520,0.211596,0.136032,0.249145,-0.053190,-0.145302,...,-0.141592,0.010238,0.042367,0.092268,-0.042671,-0.125907,-0.076259,-0.135872,-0.081116,-0.015279
329851,78,0.063970,-0.043484,0.009745,0.108562,0.046403,-0.022923,0.227667,-0.000992,-0.144714,...,0.006297,-0.155916,0.064188,0.061411,0.037999,-0.112562,0.011363,-0.028563,-0.033086,0.054969
917575,1,0.090393,-0.045449,-0.037996,0.038612,-0.080824,-0.028150,0.102944,-0.013433,0.000726,...,-0.114022,-0.194202,-0.029299,0.078302,0.079997,-0.067338,-0.113193,0.027582,-0.182043,0.038020
838708,8,0.076951,-0.057667,-0.070750,0.010225,-0.025602,-0.042759,0.110929,-0.037254,-0.002122,...,-0.141721,-0.207268,0.009937,0.061968,0.035795,-0.069438,-0.109041,0.034817,-0.150157,-0.004072


In [3]:
from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import FaissAlternatingLeastSquares
import pandas as pd