In [31]:
########## 1개변수 임베딩 ##############################

import numpy as np
import pandas as pd

# Synthetic data set: 32 samples, each a sequence of 10 integer tokens in [0, 1000).
# The target y is a binary label (0 or 1), one per sample.
x = pd.DataFrame(
    np.random.randint(1000, size=(32, 10)),
    columns=['v{:02d}'.format(i) for i in range(10)],
)
y = pd.DataFrame(np.random.randint(2, size=32))

from tensorflow.keras.layers import Input, Embedding, Dense, Reshape, Concatenate, Flatten
from tensorflow.keras.models import Model

# Each sample is a "sentence" of 10 tokens drawn from a 1000-token vocabulary.
input_layer = Input(shape=(10,))

# Map every token id to a 7-dimensional vector. The sequence length is already
# fixed by `input_layer`, so the `input_length` argument (deprecated in current
# Keras releases) is not passed.
emb_layer = Embedding(1000, 7, name='emb_test')(input_layer)

# Collapse the per-token vectors into one flat feature vector per sample so the
# Dense layers can consume it.
emb_layer = Flatten()(emb_layer)

# Small fully connected head ending in a sigmoid for the binary target.
output_layer = Dense(32, activation="relu")(emb_layer)
output_layer = Dense(16, activation="relu")(output_layer)
output_layer = Dense(1, activation="sigmoid")(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
# verbose=0 keeps the per-epoch progress log out of the cell output.
model.fit(x, y, epochs=10, verbose=0)

# Pull the trained embedding matrix out of the 'emb_test' layer and label it:
# one row per vocabulary entry, one 'ee_XX' column per embedding dimension (7).
w = pd.DataFrame(
    data=model.get_layer(name='emb_test').get_weights()[0],
    columns=['ee_{:02d}'.format(dim) for dim in range(7)],
)

In [37]:
# Build the key table: one row per vocabulary index. The size is taken from the
# embedding frame `w` so the keys always line up with its rows, and np.arange
# already yields sorted ids (no separate sort needed).
unique_Status = np.arange(len(w))
mapping = pd.DataFrame(unique_Status, columns=['word'])

# Put each key next to its embedding vector (both frames share the default
# 0..N-1 index) and save the table. A relative path is used, like the
# emb1.csv / emb2.csv exports further down, so the cell does not depend on a
# Windows D: drive being present.
ee_csv_path = 'testEE.csv'
eeTable = pd.concat([mapping, w], axis=1)
eeTable.to_csv(ee_csv_path, index=False)

# Read the embedding table back from disk.
eeTableLoaded = pd.read_csv(ee_csv_path)


In [38]:
# Display the table that was read back from the CSV to check the save/load round trip.
eeTableLoaded

Unnamed: 0,word,ee_00,ee_01,ee_02,ee_03,ee_04,ee_05,ee_06
0,0,0.034359,-0.031802,0.011089,-0.006146,-0.006652,-0.031881,-0.045299
1,1,0.045106,-0.003110,0.043820,-0.007295,0.021692,-0.006928,0.042159
2,2,0.040626,-0.018116,0.003220,0.046908,0.008734,-0.004893,-0.012587
3,3,0.017488,-0.035649,0.044704,-0.027811,0.001796,-0.013830,0.048634
4,4,-0.043958,-0.045626,0.036492,-0.032439,-0.020348,0.002276,-0.026732
...,...,...,...,...,...,...,...,...
995,995,-0.033385,-0.042919,0.000308,-0.041362,-0.039529,0.017479,-0.042797
996,996,0.037930,-0.001746,-0.032050,-0.047751,0.033170,-0.034298,0.036132
997,997,0.034663,0.047013,-0.046163,-0.043144,0.039405,0.019645,0.043322
998,998,0.021262,-0.012744,0.044078,-0.036168,0.001413,-0.012129,0.042502


In [None]:

# 임베딩 결과 붙이기
def append_emb(df, emb, emb_name):
    """Attach embedding vectors to ``df`` with a left join on ``emb_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to enrich; must contain the key column(s) ``emb_name``.
    emb : pandas.DataFrame
        Embedding lookup table; must contain the same key column(s).
    emb_name : str or list of str
        Key column name(s) shared by both frames, passed to ``pd.merge(on=...)``.

    Returns
    -------
    pandas.DataFrame
        ``df`` (row order preserved) with the non-key columns of ``emb`` appended.

    Raises
    ------
    AssertionError
        If at least one record of ``df`` has no matching key in ``emb``.
    """
    # Let pandas record, per output row, whether the key was found in `emb`.
    # This checks the match itself instead of inferring it from NaNs in the
    # last column, which misses unmatched rows when `emb` has no non-key columns.
    match_col = '_append_emb_match'
    df_new = pd.merge(df, emb, on=emb_name, how='left', indicator=match_col)

    unmatched = df_new[match_col] == 'left_only'
    if unmatched.any():
        missing_keys = df_new.loc[unmatched, emb_name].drop_duplicates()
        # Raised explicitly (not via `assert`) so the check survives `python -O`
        # and reports which keys are missing.
        raise AssertionError(
            '{} record(s) in df have no embedding for key {!r}; first missing: {}'.format(
                int(unmatched.sum()), emb_name, missing_keys.head(5).values.tolist()
            )
        )

    # Remove the helper column so the result has the same columns as a plain merge.
    return df_new.drop(columns=match_col)

# `x` has token columns v00..v09 and the embedding table is keyed by 'word', so
# the previous key '0' existed in neither frame and the merge raised KeyError.
# Expose the first token column under the table's key name and join on it.
append_emb(x.rename(columns={'v00': 'word'}), eeTableLoaded, 'word')

In [34]:
# Display the key table.
# NOTE(review): the saved output below shows a `key` column while the visible
# definition names the column `word` -- the output looks stale (execution
# counts are out of order); re-run to confirm.
mapping

Unnamed: 0,key
0,0
1,1
2,2
3,3
4,4
...,...
995,995
996,996
997,997
998,998


In [None]:

# Join the mapping keys with the embedding weights column-wise (aligned on index).
# NOTE(review): `weights` is not defined in the visible cells -- presumably the
# embedding frame `w`; confirm before running this cell.
pd.concat([mapping, weights], axis=1) # brings in the weights



In [4]:

######## New Test : 데이터 2개로 해보기 ################################################

import pandas as pd

# Two categorical inputs with 32 samples each:
# x1 holds ids in [0, 1000), x2 holds ids in [0, 500).
x1 = np.random.randint(low=0, high=1000, size=32)
x2 = np.random.randint(low=0, high=500, size=32)

# Side-by-side view of both inputs (columns 0 and 1).
x_concat = pd.concat([pd.Series(ids) for ids in (x1, x2)], axis='columns')

# Binary target, one label per sample.
y = pd.Series(np.random.randint(low=0, high=2, size=32))


In [5]:
from tensorflow.keras.layers import Input, Embedding, Dense, Reshape, Concatenate, Flatten, Dropout
from tensorflow.keras.models import Model

# One scalar categorical id per input.
input_layer_1 = Input(shape=[1])
input_layer_2 = Input(shape=[1])

# Separate embedding table per input: 1000 ids -> 5 dims, 500 ids -> 3 dims.
# The sequence length (1) is already fixed by the Input layers, so the
# `input_length` argument (deprecated in current Keras releases) is not passed.
emb_layer_1 = Embedding(1000, 5, name='emb_1')(input_layer_1)
emb_layer_1 = Flatten()(emb_layer_1)

emb_layer_2 = Embedding(500, 3, name='emb_2')(input_layer_2)
emb_layer_2 = Flatten()(emb_layer_2)

# Concatenate both flattened embeddings and feed them to a small dense head
# ending in a sigmoid for the binary target.
output_layer = Concatenate(name = 'concate_emb')([emb_layer_1, emb_layer_2])
output_layer = Dense(32, activation='relu')(output_layer)
output_layer = Dense(16, activation='relu')(output_layer)
output_layer = Dense(1, activation='sigmoid')(output_layer)


In [6]:
# Wire both inputs to the shared dense head and compile for binary classification.
model = Model(
    inputs=[input_layer_1, input_layer_2],
    outputs=output_layer,
)
model.compile(loss='binary_crossentropy', optimizer='sgd')
model.summary()

# The arrays are passed in the same order as `inputs=` above. The call is the
# last expression of the cell, so its return value is what the cell displays.
model.fit(x=[x1, x2], y=y, epochs=10)

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
emb_1 (Embedding)               (None, 1, 5)         5000        input_4[0][0]                    
__________________________________________________________________________________________________
emb_2 (Embedding)               (None, 1, 3)         1500        input_5[0][0]                    
_______________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x29026c36d68>

In [8]:
# Pull the trained embedding matrix out of each embedding layer.
w1, w2 = (
    pd.DataFrame(model.get_layer(layer_name).get_weights()[0])
    for layer_name in ('emb_1', 'emb_2')
)

# Save the raw matrices: no header row and no index column.
w1.to_csv('emb1.csv', header=False, index=False)
w2.to_csv('emb2.csv', header=False, index=False)

# Print the shapes, then the matrices themselves.
print(w1.shape, w2.shape, '\n')
print(w1, '\n', w2)

(1000, 5) (500, 3) 

            0         1         2         3         4
0    0.048499  0.041596  0.013411 -0.035943 -0.048730
1   -0.033168  0.030140 -0.011766  0.008337  0.039782
2   -0.034977 -0.010011  0.028479  0.017290  0.003450
3   -0.042601  0.044423 -0.025449 -0.038999  0.046707
4   -0.009436  0.037096 -0.044556 -0.040064 -0.012146
..        ...       ...       ...       ...       ...
995  0.040788  0.029617 -0.040700 -0.013995 -0.025342
996 -0.023819 -0.004727 -0.025862 -0.010402  0.049300
997  0.030392  0.011923 -0.007644 -0.015188  0.020712
998  0.043034  0.039772  0.023720 -0.026079  0.012737
999  0.015894 -0.025313 -0.003690  0.021829 -0.001586

[1000 rows x 5 columns] 
             0         1         2
0    0.008769 -0.049025 -0.022256
1    0.004049 -0.010723  0.018596
2    0.044816  0.019207  0.040492
3    0.005513  0.035920  0.036377
4    0.008368  0.038584 -0.026751
..        ...       ...       ...
495  0.036175  0.000583 -0.036869
496  0.024110  0.002484 -0.02149