In [54]:
from collections import Counter
from datetime import timedelta, datetime
import glob
from itertools import chain
import json
import os
import re
import time
import pickle
import sys
import numpy as np
import pandas as pd
from pandas.plotting import register_matplotlib_converters
import seaborn as sns

In [55]:
def drawProgressBar(percent, barLen = 20):
    """Redraw a one-line text progress bar on stdout.

    percent: completed fraction, a float from 0 to 1.
    barLen: width of the bar in characters.
    """
    filled = "=" * int(barLen * percent)
    out = sys.stdout
    out.write("\r")  # carriage return so the bar overwrites the previous one
    out.write("[{:<{}}] {:.0f}%".format(filled, barLen, percent * 100))
    out.flush()

In [2]:
# Article metadata (read as line-delimited JSON)
metadata = pd.read_json('metadata.json', lines=True)

In [3]:
# User data (read as line-delimited JSON)
users = pd.read_json('users.json', lines=True)

In [4]:
# Read logs: load every file under res/read/ except the excluded archive
read_file_lst = glob.glob('res/read/*')
exclude_file_lst = ['read.tar']

read_df_lst = []
for f in read_file_lst:
    file_name = os.path.basename(f)
    if file_name in exclude_file_lst:
        print(file_name)  # report the file that is being skipped
        continue
    df_temp = pd.read_csv(f, header=None, names=['raw'])
    tokens = df_temp['raw'].str.split(' ')
    # characters 0-7 and 8-9 of the file name are stored as dt and hr
    df_temp['dt'] = file_name[:8]
    df_temp['hr'] = file_name[8:10]
    # first token is the user id, the remaining tokens are re-joined as the article ids
    df_temp['user_id'] = tokens.str[0]
    df_temp['article_id'] = tokens.str[1:].str.join(' ').str.strip()
    read_df_lst.append(df_temp)

read = pd.concat(read_df_lst)

In [5]:
# Reshape the read logs
def chainer(s):
    """Split each string of Series `s` on single spaces and flatten all pieces into one list."""
    return [token for parts in s.str.split(' ') for token in parts]
# Number of article tokens on each log row
read_cnt_by_user = read['article_id'].str.split(' ').map(len)
# Repeat the per-row fields once per article so every (user, article) pair gets its own row
read_raw_columns = {col: np.repeat(read[col], read_cnt_by_user) for col in ('dt', 'hr', 'user_id')}
read_raw_columns['article_id'] = chainer(read['article_id'])
read_raw = pd.DataFrame(read_raw_columns)

In [6]:
# Read-count statistics per article (rows with an empty article id are ignored)
atc_read_cnt = (
    read_raw[read_raw.article_id != '']
    .groupby('article_id')['user_id']
    .count()
    .reset_index()
)
atc_read_cnt.columns = ['article_id', 'read_cnt']

In [7]:
# Work on a copy so the `metadata` frame itself is not modified
atc = metadata.copy()
# reg_ts is divided by 1000 before conversion (presumably epoch milliseconds — TODO confirm);
# datetime.fromtimestamp returns naive datetimes in the machine's local timezone
atc['reg_datetime'] = atc['reg_ts'].apply(lambda x : datetime.fromtimestamp(x/1000.0))
# Rows holding the earliest reg_datetime are moved to 2090-12-31, a value later cells filter on
# NOTE(review): presumably these are placeholder timestamps — confirm
atc.loc[atc['reg_datetime'] == atc['reg_datetime'].min(), 'reg_datetime'] = datetime(2090, 12, 31)
atc['reg_dt'] = atc['reg_datetime'].dt.date
# magazine_id == 0.0 is labelled as an individual post, anything else as a magazine post
atc['type'] = atc['magazine_id'].apply(lambda x : '개인' if x == 0.0 else '매거진')
# Rename columns (positional: relies on the column order of `metadata` plus the three columns added above)
atc.columns = ['id', 'display_url', 'article_id', 'keyword_list', 'magazine_id', 'reg_ts', 'sub_title', 'title', 'author_id', 'reg_datetime', 'reg_dt', 'type']

In [8]:
# Join the metadata onto the per-article read counts
atc_read_cnt = atc_read_cnt.merge(atc, how='left', left_on='article_id', right_on='article_id')
# Keep only the articles for which a metadata row was found
has_metadata = atc_read_cnt['id'].notna()
atc_read_cnt_nn = atc_read_cnt[has_metadata]

In [9]:
# Most-read articles first
read_cnt_frame = atc_read_cnt_nn.sort_values(by="read_cnt", ascending=False)
# Drop every column that the scoring code below does not use
unused_columns = ['id', 'display_url', 'sub_title', 'magazine_id', 'reg_ts', 'title', 'author_id', 'reg_dt', 'type']
optimize_frame = read_cnt_frame.drop(columns=unused_columns)

In [10]:
# Per-article lookup table built positionally from optimize_frame
# (column 0 is the key; columns 1-3 become read_cnt / keyword / datetime)
article_detail_dic = {
    row[0]: {'read_cnt': row[1], 'keyword': row[2], 'datetime': row[3]}
    for row in optimize_frame.values
}

In [11]:
# Follow list per user (positional: column 1 of `users` is the key, column 0 the follow list)
user_follow_dict = {row[1]: row[0] for row in users.values}

In [12]:
def convertTime (dateTime):
    """Convert a datetime-like value to seconds via time.mktime (local time).

    A value equal to 0 is returned as 0 without conversion.
    """
    if dateTime == 0:
        return 0
    return time.mktime(pd.Timestamp(dateTime).timetuple())

In [13]:
# Articles read by each user, over all users (column 2 = user id, column 3 = article id)
user_read_dic = {}
for row in read_raw.values:
    user_read_dic.setdefault(row[2], []).append(row[3])

In [14]:
# Normalisation constants for the scoring functions
read_cnt_max = max(read_cnt_frame.read_cnt)
# Rows at the 2090-12-31 value are left out when looking for the newest registration time
before_sentinel = read_cnt_frame.reg_datetime < '2090-12-31'
reg_datetime_max = convertTime(max(read_cnt_frame[before_sentinel].reg_datetime))

In [15]:
# Follow list per user
# NOTE(review): this cell rebuilds the same user_follow_dict as an earlier cell — likely redundant
user_follow_dict = {}
for follow_list, user_id, *_ in users.values:
    user_follow_dict[user_id] = follow_list

In [16]:
# Used when computing the follow weight of articles a user has read
def to_timeScore(dt_time):
    """Turn a 'YYYYMMDD...' string into a time score scaled by 0.2 / reg_datetime_max."""
    day, month, year = dt_time[6:8], dt_time[4:6], dt_time[:4]
    parsed = datetime.strptime(day + "/" + month + "/" + year, "%d/%m/%Y")
    return time.mktime(parsed.timetuple()) * (0.2 / reg_datetime_max)

In [17]:
def getKeywordTimeScore(datetime):
    """Return the keyword time score for an article timestamp (newer -> larger).

    datetime: the article's timestamp; converted with convertTime and scaled by
        0.2 / reg_datetime_max. The parameter name shadows the `datetime` class inside
        this function only; it is kept so existing callers are unaffected.

    The original body ignored this argument and read a module-level `data` variable,
    so it only worked while a surrounding loop happened to define `data`.
    """
    return convertTime(datetime) * (0.2 / reg_datetime_max)  # newer articles weigh more

In [18]:
# Articles read more recently receive additional weight
def getArticleTimeScore(article, userId):
    """Time score of `article` for `userId`.

    With metadata available, the article's own datetime is scaled by
    0.3 / reg_datetime_max; otherwise the date stored in user_last_read_dic for this
    user and article is scored with to_timeScore.
    """
    if article in article_detail_dic:
        data = article_detail_dic[article]
        return convertTime(data['datetime']) * (0.3 / reg_datetime_max)  # newer articles weigh more
    return to_timeScore(user_last_read_dic[userId][article])

In [258]:
def getArticleScore(article):
    """Popularity-plus-recency score of an article found in article_detail_dic.

    Popularity contributes up to 0.02 (read_cnt / read_cnt_max); recency contributes
    up to 0.35 and is reset to 0 when it exceeds 0.35.
    """
    data = article_detail_dic[article]
    popularity = data['read_cnt'] * (0.02 / read_cnt_max)  # more readers, more weight
    recency = convertTime(data['datetime']) * (0.35 / reg_datetime_max)  # newer articles weigh more
    if not (recency <= 0.35):
        recency = 0
    return popularity + recency

In [20]:
# Per-user keyword affinity: sum of the time scores of the read articles carrying each keyword
user_keyword_article_read_dic = {}
for id, read_articles in user_read_dic.items():
    article_read_dic = {}
    for article in read_articles:
        if article not in article_detail_dic:
            continue  # no metadata, hence no keywords
        data = article_detail_dic[article]
        for keyword in data['keyword']:
            article_read_dic[keyword] = article_read_dic.get(keyword, 0) + getKeywordTimeScore(data['datetime'])

    user_keyword_article_read_dic[id] = article_read_dic

In [21]:
def getKeywordRanking (id):
    """Return the user's keyword scores rescaled so the best keyword scores 0.3.

    id: user id looked up in user_keyword_article_read_dic.
    Returns a dict ordered from highest to lowest score, or {} when the user is
    unknown or has no keyword scores.

    The original guard combined its two checks with `|`, which evaluates both sides,
    so an unknown id raised KeyError instead of returning {}.
    """
    scores = user_keyword_article_read_dic.get(id)
    if not scores:
        return {}
    rank_dic = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))
    p1 = 0.3 / max(rank_dic.values())  # maximum weight 0.3
    return {keyword: score * p1 for keyword, score in rank_dic.items()}

In [79]:
def getKeywordAvgScore (keyword_dic, article):
    """Average of keyword_dic's scores over the article's keywords that appear in it.

    Returns 0 when the article has no metadata or none of its keywords is scored.
    """
    if article not in article_detail_dic:
        return 0
    matched = [keyword_dic[keyword]
               for keyword in article_detail_dic[article]['keyword']
               if keyword in keyword_dic]
    total = sum(matched)
    if total == 0:
        return total
    return total / len(matched)

In [22]:
def getKeywordMaxScore (keyword_dic, article):
    """Highest keyword_dic score among the article's keywords, or 0 if there is none."""
    # no keyword information for this article
    if article not in article_detail_dic:
        return 0
    matched = [keyword_dic[keyword]
               for keyword in article_detail_dic[article]['keyword']
               if keyword in keyword_dic]
    if matched:
        return max(matched)
    return 0

In [23]:
# user id -> {article id -> dt of the last read_raw row seen for that pair}
user_last_read_dic = {}
for row in read_raw.values:
    user_last_read_dic.setdefault(row[2], {})[row[3]] = row[0]

In [24]:
# For every user: accumulated time score of the read articles, grouped by followed author
user_follow_article_read_dic = {}
for id, read_articles in user_read_dic.items():
    article_read_dic = {}
    for article in read_articles:
        follow_id = article.split('_')[0]
        # users without a follow entry get an empty list, so nothing is accumulated for them
        followed = user_follow_dict.setdefault(id, [])
        if follow_id in followed:
            article_read_dic[follow_id] = article_read_dic.get(follow_id, 0) + getArticleTimeScore(article, id)
    user_follow_article_read_dic[id] = article_read_dic

In [162]:
def getFollowingRanking (id):
    """Return the user's followed-author scores rescaled so the best author scores 0.4.

    id: user id looked up in user_follow_article_read_dic.
    Returns a dict ordered from highest to lowest score, or {} when the user is
    unknown or has no scored authors.

    The original guard combined its two checks with `|`, which evaluates both sides,
    so an unknown id raised KeyError instead of returning {}.
    """
    scores = user_follow_article_read_dic.get(id)
    if not scores:
        return {}
    rank_dic = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))
    p1 = 0.4 / max(rank_dic.values())  # maximum weight 0.4
    return {author: score * p1 for author, score in rank_dic.items()}

In [28]:
# Users to generate predictions for (the file has no header; its single column is named user_id)
predict_users = pd.read_csv('res/predict/dev.users', names=['user_id'])

In [97]:
# Keyword ranking for every user we have to predict for
user_keyword_ranking_dic = {
    user_id: getKeywordRanking(user_id) for user_id in predict_users.user_id
}

In [130]:
# All popular articles registered after 2019-02-01, most-read first; rows at the
# 2090-12-31 value are excluded.
# NOTE(review): the original comment said "after February 15" but the filter uses
# 2019-02-01 — confirm which date is intended.
popular_sorted = atc_read_cnt_nn.sort_values(["read_cnt"], ascending=[False])
# Build the mask from the sorted frame itself: indexing it with a mask taken from the
# unsorted frame makes pandas reindex the mask and emit a UserWarning.
recent_mask = (popular_sorted.reg_datetime > "2019-02-01") & (popular_sorted.reg_datetime < "2090-12-31")
train_data = list(popular_sorted.loc[recent_mask, 'article_id'].values)

  


In [131]:
# Number of article ids selected for the first-stage training set
print(len(train_data))

19862


In [132]:
# One row per prediction user: the user id followed by a 0/1 flag per article in
# train_data telling whether that user read it.
my_data = []
users_arr = predict_users.user_id
for user in users_arr:
    # A set gives constant-time membership tests; the original searched the user's read
    # list once per article, i.e. articles x reads work for every user.
    articles_read = set(user_read_dic[user])
    my_data.append([user] + [1 if ar in articles_read else 0 for ar in train_data])

In [133]:
# Column names for the user x article table: 'user_id' first, then one column per article
article_column_list = ['user_id'] + list(train_data)

In [134]:
# Build the user x article table
row_index = list(range(len(my_data)))
user_article_table = pd.DataFrame(my_data, columns=article_column_list, index=row_index)
user_article_table.head()

Unnamed: 0,user_id,@brunch_151,@brunch_152,@hjl0520_26,@seochogirl_28,@mothertive_66,@roysday_314,@ohmygod_42,@boot0715_115,@hjl0520_28,...,@kayyoon_25,@g702_15,@g702_11,@g702_16,@g702_18,@nightoffice_7,@vridge_5,@vroongprime_64,@gaudi817_6,@geniyang_18
0,#d6866a498157771069fdf15361cb012b,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,#f963fb8c5d9d14d503fc4e80bd8617b4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,#87a6479c91e4276374378f1d28eb307c,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,#677e984e245b344f61dc5d3cc1f352c8,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,#519f45eb14e4807e8714fb7e835463eb,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [135]:
# Drop the large intermediates; the table built from them is all that is needed from here on
del my_data, article_column_list

In [136]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
import collections

In [137]:
# user id -> list of per-article predicted values, appended in table column order
id_preds = collections.defaultdict(list)
# user ids as an array, in the same order as predict_users
ids = predict_users.user_id.values

In [138]:
# Fit one logistic-regression model per article column, using every other article column
# as features, and record each user's predicted probability of having read the article.
# NOTE(review): probabilities are predicted on the same rows the model was fitted on.
for c in user_article_table.columns:
    if c == 'user_id':
        continue

    y_train = user_article_table[c]
    # A column with a single distinct value (e.g. nobody read the article) cannot be
    # fitted, so every user gets 0. Iterating over `ids` instead of a hard-coded 3000
    # keeps the per-user lists aligned whatever the number of users is.
    if len(set(y_train.values)) == 1:
        for id in ids:
            id_preds[id].append(0)
        continue

    # `columns=` replaces the positional axis argument, which pandas 2 no longer accepts
    x_train = user_article_table.drop(columns=[c, 'user_id'])

    clf = LogisticRegression(solver='liblinear')
    clf.fit(x_train, y_train)
    p_train = clf.predict_proba(x_train)[:, 1]
    for id, p in zip(ids, p_train):
        id_preds[id].append(p)

In [171]:
# Add heuristic weights on top of the first-stage model predictions
step = 0
train_preds = {}
columns = list(user_article_table.columns[1:])
total_users = len(predict_users.user_id)  # replaces the hard-coded 3000 in the progress bar
for id in predict_users.user_id:
    p = id_preds[id].copy()
    # These lookups do not depend on the article, so do them once per user rather than
    # once per article (getFollowingRanking sorts the user's scores on every call).
    article_rank = getFollowingRanking(id)
    follows = user_follow_dict[id]
    keyword_rank = user_keyword_ranking_dic[id]

    for idx, article in enumerate(columns):
        follow = article.split('_')[0]
        # only articles by followed authors that have a ranking entry get the bonus
        if follow in follows and follow in article_rank:
            p[idx] += article_rank[follow] + getArticleScore(article) + getKeywordAvgScore(keyword_rank, article)  # follow weight
    step += 1
    drawProgressBar(step / total_users)

    # Keep the 100 best-scoring articles the user has not read yet
    already_read = set(user_read_dic[id])
    unread = [pair for pair in zip(columns, p) if pair[0] not in already_read]
    unread.sort(key=lambda pair: pair[1], reverse=True)
    train_preds[id] = [pair[0] for pair in unread[:100]]  # store the predictions
    



In [286]:
# Second stage: articles registered before 2019-02-01, most-read first
older_sorted = atc_read_cnt_nn.sort_values(["read_cnt"], ascending=[False])
# Build the mask from the sorted frame itself: indexing it with a mask taken from the
# unsorted frame makes pandas reindex the mask and emit a UserWarning.
older_mask = older_sorted.reg_datetime < "2019-02-01"
train_data2 = older_sorted.loc[older_mask, 'article_id']

  


In [287]:
# Candidate article ids for the second stage and a matching all-zero score vector
test_c = list(train_data2)
test_p = [0 for _article in test_c]

In [288]:
# Second-stage weighting: score the older articles with the heuristics only
step = 0
train_preds2 = {}
columns = test_c.copy()
total_users = len(predict_users.user_id)  # replaces the hard-coded 3000 in the progress bar
for id in predict_users.user_id:
    p = test_p.copy()
    # These lookups do not depend on the article, so do them once per user rather than
    # once per article (getFollowingRanking sorts the user's scores on every call).
    article_rank = getFollowingRanking(id)
    follows = user_follow_dict[id]
    keyword_rank = user_keyword_ranking_dic[id]

    for idx, article in enumerate(columns):
        follow = article.split('_')[0]
        if follow in follows and follow in article_rank:
            p[idx] += article_rank[follow]  # follow weight
        p[idx] += getArticleScore(article) + getKeywordAvgScore(keyword_rank, article)
    step += 1
    drawProgressBar(step / total_users)

    # Keep the 100 best-scoring articles the user has not read yet
    already_read = set(user_read_dic[id])
    unread = [pair for pair in zip(columns, p) if pair[0] not in already_read]
    unread.sort(key=lambda pair: pair[1], reverse=True)
    train_preds2[id] = [pair[0] for pair in unread[:100]]



In [289]:
# Pick 100 per user: up to 30 from the first stage followed by up to 70 from the second
result = {}
for id in ids:
    result[id] = train_preds[id][:30] + train_preds2[id][:70]

In [290]:
# Save: one single-cell row per user, "<user_id> <article> <article> ..."
save_data = [
    [' '.join([user_id] + result[user_id])]
    for user_id in predict_users.user_id
]

In [291]:
# Write one line per user to recommend.txt, without header or index column
save = pd.DataFrame(save_data)
save.to_csv("recommend.txt", header=False, index=False)

In [292]:
import six
import math

In [293]:
def _entropy_diversity(recs, topn):
    """Entropy of the distribution of recommended items.

    recs: mapping of user id -> list of recommended item ids.
    topn: nominal list length; the normaliser is len(recs) * topn.
    Returns -sum(p * log(p)) with p = occurrences of an item / normaliser.

    Uses plain dict methods instead of `six`; the notebook already relies on
    Python-3-only features, so the compatibility layer is unnecessary here.
    """
    sz = float(len(recs)) * topn
    # how many users were recommended each item
    freq = Counter(r for rec in recs.values() for r in rec)
    return -sum(v / sz * math.log(v / sz) for v in freq.values())

In [294]:
def evaluate(recs_path, topn=100):
    """Print the entropy diversity of a recommendation file.

    recs_path: text file with one user per line — a user id followed by
        whitespace-separated recommended ids.
    topn: passed to _entropy_diversity and shown in the printed label.
    """
    recs = {}
    # `with` closes the file deterministically; the original never closed the handle
    with open(recs_path) as recs_file:
        for line in recs_file:
            tkns = line.split()
            if not tkns:
                continue  # skip blank lines instead of failing on tkns[0]
            recs[tkns[0]] = tkns[1:]
    print('EntDiv@%s: %s' % (topn, _entropy_diversity(recs, topn)))

In [355]:
# Report the entropy diversity of the recommendations written to recommend.txt
evaluate('recommend.txt')

EntDiv@100: 10.281624147630335


In [273]:
# Second stage: full rows of the articles registered before 2019-02-01, most-read first
# NOTE(review): train_data3 is not referenced anywhere else in the visible notebook
older_sorted3 = atc_read_cnt_nn.sort_values(["read_cnt"], ascending=[False])
# Build the mask from the sorted frame itself: indexing it with a mask taken from the
# unsorted frame makes pandas reindex the mask and emit a UserWarning.
train_data3 = older_sorted3[older_sorted3.reg_datetime < "2019-02-01"]

  


In [338]:
# Load another recommendation file: user id -> list of recommended ids
withJoy = {}
# `with` closes the file deterministically; the original never closed the handle
with open('joy2.txt') as joy_file:
    for line in joy_file:
        tkns = line.split()
        if not tkns:
            continue  # skip blank lines instead of failing on tkns[0]
        userid, rec = tkns[0], tkns[1:]
        withJoy[userid] = rec

In [339]:
# Spot check: length of the first-30 slice of one user's list loaded from joy2.txt
len(withJoy['#d6866a498157771069fdf15361cb012b'][:30])

30

In [356]:
# Pick 100 per user: up to 20 from withJoy followed by up to 80 from the second stage
joy_result = {}
for id in ids:
    combined = withJoy[id][:20] + train_preds2[id][:80]
    joy_result[id] = combined
    # validation: the combined list must hold exactly 100 distinct articles
    if len(set(combined)) != 100:
        print('error')

In [357]:
# Save: one single-cell row per user, "<user_id> <article> <article> ..."
save_data = [
    [' '.join([user_id] + joy_result[user_id])]
    for user_id in predict_users.user_id
]

In [358]:
# Write one line per user to recommend.txt, without header or index column
# NOTE: this overwrites the recommend.txt written by the earlier save cell
save = pd.DataFrame(save_data)
save.to_csv("recommend.txt", header=False, index=False)