# In [None]:
import sys
sys.path.append('../')

import re
import numpy as np
from gensim.models import Word2Vec

from db.db_conn import *

# Train a skip-gram Word2Vec model on the `exp_khaiii` column of the wikidata
# table, then export one tab-separated line per row ("<id>\t<v0>\t<v1>...") for
# every row whose `word` is in the trained vocabulary. Words that are not in
# the vocabulary are collected and written to a separate file.

# DB
config_file_path = './db/config.ini'
pc_option = 'db_name'
select_sql_query = 'select * from db_name.wikidata'

# output
save_path = './myprj/our_data' + '/'
save_fname = 'entity_embedding_ours.vec'

# connect to DB
host_ip, user_id, user_pw, db_name = load_db_info_from_config(config_file_path, pc_option)  # get db information
cursor, mydb = connect_to_DB(host_ip, user_id, user_pw, db_name)

table_df_tmp = get_relation_df_w_columns(cursor, select_sql_query)
table_df = table_df_tmp.iloc[:, :]
# sort_values returns a new frame; the result must be re-bound or the sort
# has no effect.
table_df = table_df.sort_values(by=['id'], axis=0)

print("DB Load")

# NOTE(review): not used anywhere in the visible code; kept in case a later
# notebook cell relies on it.
wikiword_list = list(np.array(table_df['word'].tolist()))

# Replace every character that is not ASCII alphanumeric, Hangul syllable or
# '+' with a space, then split on whitespace to get one token list per row.
token_cleanup = re.compile('[^A-Za-z0-9가-힣+]')
X_train = [
    token_cleanup.sub(' ', str(exp_khaiii)).split()
    for exp_khaiii in table_df['exp_khaiii']
]
print("X_train 길이 : ", len(X_train))

model = Word2Vec(sentences=X_train, vector_size=100, window=3, min_count=3, workers=3, sg=1)  # min_count=3
model.save(save_path + 'word2vec_80000.model')

# Sanity-check print only: skip it when the probe word did not make it into
# the vocabulary instead of aborting before the vectors are exported.
if '코로나' in model.wv.key_to_index:
    print("min_count 3일 때 '코로나'와 관련있는 단어 : ", model.wv.most_similar('코로나'))
else:
    print("'코로나' is not in the trained vocabulary; skipping most_similar check")

error_word_list = []
with open(save_path + save_fname, 'w', encoding='utf-8') as f:
    for wikiid, wikiword in zip(table_df['id'], table_df['word']):
        try:
            vector = model.wv[wikiword]
        except KeyError:
            # Word was dropped by min_count or never appeared in the corpus.
            error_word_list.append(wikiword)
            continue
        except Exception as e:
            # Best effort, as in the original: report and move on to the
            # next row (e.g. a non-string `word` value).
            print(e)
            continue

        # str(wikiid) so that a numeric id column does not raise TypeError
        # when concatenated with the tab-separated vector components.
        f.write('\t'.join([str(wikiid), *map(str, vector)]) + '\n')

# The list contains Korean words, so the encoding is fixed explicitly.
with open(save_path + 'except_error_word_list', 'w', encoding='utf-8') as ef:
    ef.write(str(error_word_list))


#p3_make_input/news_word_entity.py

import sys
sys.path.append('../')

import re, pickle

from db.db_conn import *

# Build two dictionaries keyed by news_index from the news_model_past table:
#   * news_words_dict.pkl    -> list of cleaned word tokens
#   * news_entities_dict.pkl -> list of ([entity], wikidata_id) tuples for the
#                               entities that exist in the wikidata table
# Entities that are not found in the wikidata table are collected and pickled
# to the 'no_in_wikipedia' file.

# DB
config_file_path = './db/config.ini'
pc_option = 'db_name'
select_sql_query = 'select * from db_name.wikidata'
past_sql_query = 'select * from db_name.news_model_past'

# output
save_path = './myprj/our_data' + '/'
save_fname = 'news_words_dict.pkl'
save_h_fname = 'news_entities_dict.pkl'

# connect to DB
host_ip, user_id, user_pw, db_name = load_db_info_from_config(config_file_path, pc_option)  # get db information
cursor, mydb = connect_to_DB(host_ip, user_id, user_pw, db_name)

table_df_tmp = get_relation_df_w_columns(cursor, select_sql_query)
table_df = table_df_tmp.iloc[:, :]  # original author's note: news count here is 14438

# word is the key; the value is a one-entry dict {'id': <wikidata id>}
entity_dict = table_df[['id', 'word']].set_index('word').T.to_dict()

news_model_past_df_tmp = get_relation_df_w_columns(cursor, past_sql_query)
news_model_past_df = news_model_past_df_tmp.iloc[:500000, :]

print("DB Load")

# Replace every character that is not ASCII alphanumeric, Hangul syllable or
# '+' with a space; compiled once because it is applied twice per row.
token_cleanup = re.compile('[^A-Za-z0-9가-힣+]')

all_news_words = {}
all_news_entities = {}
missing_entities = set()
for news_index, word, entity in zip(
    news_model_past_df['news_index'],
    news_model_past_df['word'],
    news_model_past_df['entity'],
):
    ## news_words
    all_news_words[news_index] = token_cleanup.sub(' ', str(word)).split()

    ## news_entities
    entities_list = []
    for e in token_cleanup.sub(' ', str(entity)).split():
        if e not in entity_dict:
            # Record the miss; the original never filled this collection.
            missing_entities.add(e)
            continue
        entities_list.append(([e], entity_dict[e]['id']))

    all_news_entities[news_index] = entities_list

# Use 'wb' to write the pickles and 'rb' to read them back. The `with` block
# closes each file, so no explicit close() is needed.
with open(save_path + save_fname, 'wb') as f:
    pickle.dump(all_news_words, f)

with open(save_path + save_h_fname, 'wb') as ff:
    pickle.dump(all_news_entities, ff)

# Sorted for a deterministic file; the original dumped all_news_entities here
# by mistake instead of the missing-entity list.
no_in_wikipedia = sorted(missing_entities)
with open(save_path + 'no_in_wikipedia', 'wb') as f:
    pickle.dump(no_in_wikipedia, f)
#p3_make_input/session_history.py

import sys
sys.path.append('../')

import random
import re, pickle
import pandas as pd

from db.db_conn import *
from module.entity_embedding_module import *

# Build synthetic click sessions per user from the user_history_dup table:
#   * history_dict.pkl -> {user_id: [news_index, ...]} (full history)
#   * session_list.pkl -> [[user_id, history_without_clicked, clicked,
#                           not_clicked], ...]
# "clicked" items are randomly sampled from the user's own history and
# "not clicked" items are randomly sampled from all known news indices.

# DB
config_file_path = './db/config.ini'
pc_option = 'db_name'
select_sql_query = 'select * from db_name.user_history_dup'
past_sql_query = 'select * from db_name.news_model_past'

# output
save_path = './myprj/our_data' + '/'
save_fname = 'session_list.pkl'
save_h_fname = 'history_dict.pkl'

# connect to DB
host_ip, user_id, user_pw, db_name = load_db_info_from_config(config_file_path, pc_option)  # get db information
cursor, mydb = connect_to_DB(host_ip, user_id, user_pw, db_name)

table_df_tmp = get_relation_df_w_columns(cursor, select_sql_query)
table_df = table_df_tmp.iloc[:, :]  # original author's note: news count here is 14438

news_model_past_df_tmp = get_relation_df_w_columns(cursor, past_sql_query)
news_model_past_df = news_model_past_df_tmp.iloc[:500000, :]

print("DB Load")

# One row per user with all of that user's news indices joined into a single
# "[a, b, c]" string. map(str, ...) keeps the join working when the column is
# not stored as text.
table_groupby = table_df.groupby('user_id')['news_index'].apply(
    lambda x: "[%s]" % ', '.join(map(str, x))
)
table_groupby_df = table_groupby.reset_index(drop=False)

all_news_index = list(set(list(news_model_past_df['news_index'])))

# Candidate sample sizes for each history-length bucket.
rdlist_3 = [1, 2, 3]
rdlist_1 = [0, 1]
rdlist_7 = [7, 8, 9, 10, 11, 12, 13, 14, 15]

# Same cleanup rule as used to tokenize text elsewhere: anything that is not
# ASCII alphanumeric, Hangul syllable or '+' becomes a separator.
token_cleanup = re.compile('[^A-Za-z0-9가-힣+]')

all_session_list = []
all_history_dict = {}
# The loop variable is deliberately NOT called `user_id`, so the DB credential
# of that name unpacked above is not overwritten.
for session_user_id, news_index in zip(
    table_groupby_df['user_id'], table_groupby_df['news_index']
):
    news_index_list = token_cleanup.sub(' ', str(news_index)).split()

    ## history
    all_history_dict[session_user_id] = news_index_list

    ## session
    # 2./3. Pick the "clicked at least once" articles (oneclick_list) out of
    # the user's history; longer histories contribute more clicked items.
    history_len = len(news_index_list)
    if history_len > 25:
        oneclick_list = random.sample(news_index_list, 6)
    elif history_len > 10:
        oneclick_list = random.sample(news_index_list, random.choice(rdlist_3))
    elif history_len > 3:
        oneclick_list = random.sample(news_index_list, random.choice(rdlist_1))
    else:
        oneclick_list = []

    # user_history (uh): every history entry whose value was not sampled as
    # clicked (all duplicates of a sampled value are dropped as well).
    clicked = set(oneclick_list)
    uh = [x for x in news_index_list if x not in clicked]

    # 4. Articles shown to the user but not clicked (noclick_list): sample
    # from all news and drop anything that is in this user's history. The
    # sample size is capped so a small news table cannot raise ValueError.
    rdnum = min(random.choice(rdlist_7), len(all_news_index))
    seen = set(news_index_list)
    noclick_list = [x for x in random.sample(all_news_index, rdnum) if x not in seen]

    # 1. user, 2. history, 3. clicked, 4. not clicked -> append to the overall list
    all_session_list.append([session_user_id, uh, oneclick_list, noclick_list])


# Use 'wb' to write the pickles and 'rb' to read them back. The `with` block
# closes each file, so no explicit close() is needed.
with open(save_path + save_fname, 'wb') as f:
    pickle.dump(all_session_list, f)

with open(save_path + save_h_fname, 'wb') as ff:
    pickle.dump(all_history_dict, ff)