# 1. Load the Data

In [19]:
import pandas as pd 

# train
train_click = pd.read_csv("data/train_click_log.csv")
articles = pd.read_csv("data/articles.csv")
articles = articles.rename(columns={'article_id': 'click_article_id'})  #重命名，方便后续match
articles_emb = pd.read_csv("data/articles_emb.csv")

# test
test_click = pd.read_csv("data/testA_click_log.csv")

In [28]:
import torch
import faiss
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# only consider the content of test click
train_click = test_click

In [21]:
articles_emb.head()

Unnamed: 0,article_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_240,emb_241,emb_242,emb_243,emb_244,emb_245,emb_246,emb_247,emb_248,emb_249
0,0,-0.161183,-0.957233,-0.137944,0.050855,0.830055,0.901365,-0.335148,-0.559561,-0.500603,...,0.321248,0.313999,0.636412,0.169179,0.540524,-0.813182,0.28687,-0.231686,0.597416,0.409623
1,1,-0.523216,-0.974058,0.738608,0.155234,0.626294,0.485297,-0.715657,-0.897996,-0.359747,...,-0.487843,0.823124,0.412688,-0.338654,0.320786,0.588643,-0.594137,0.182828,0.39709,-0.834364
2,2,-0.619619,-0.97296,-0.20736,-0.128861,0.044748,-0.387535,-0.730477,-0.066126,-0.754899,...,0.454756,0.473184,0.377866,-0.863887,-0.383365,0.137721,-0.810877,-0.44758,0.805932,-0.285284
3,3,-0.740843,-0.975749,0.391698,0.641738,-0.268645,0.191745,-0.825593,-0.710591,-0.040099,...,0.271535,0.03604,0.480029,-0.763173,0.022627,0.565165,-0.910286,-0.537838,0.243541,-0.885329
4,4,-0.279052,-0.972315,0.685374,0.113056,0.238315,0.271913,-0.568816,0.341194,-0.600554,...,0.238286,0.809268,0.427521,-0.615932,-0.503697,0.61445,-0.91776,-0.424061,0.185484,-0.580292


# 2. Get the Article Embedding Matrix

In [22]:
import numpy as np

item_emb_cols = [x for x in articles_emb.columns if 'emb' in x]
item_emb_np = np.ascontiguousarray(articles_emb[item_emb_cols].values, dtype=np.float32)
item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)  # normalize
item_emb_np.shape

(364047, 250)

# 3. Get the User Profile Embedding Matrix

In [29]:
def make_item_time_pair(df):
    return list(zip(df['click_article_id'], df['click_timestamp']))

train_click = train_click.sort_values('click_timestamp')

user_item_time_df = train_click.groupby('user_id')['click_article_id', 'click_timestamp']\
                    .apply(lambda x: make_item_time_pair(x))\
                    .reset_index().rename(columns={0: 'item_time_list'})
user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))

In [30]:
user_item_time_dict[200000]

[(195839, 1507030363999), (191971, 1507030393999), (194300, 1507651461280)]

In [47]:
user_profile = dict()

for user_id, item_time_list in tqdm(user_item_time_dict.items()): 
    articles_emb_sum = 0
    for i, i_click_time in item_time_list:  # i: article id 
        articles_emb_sum += articles_emb.iloc[i][1:]
    user_profile[user_id] = articles_emb_sum / len(item_time_list)

100%|██████████| 50000/50000 [07:09<00:00, 116.45it/s]


In [34]:
import pandas as pd

user_profile_emb = pd.DataFrame.from_dict(user_profile, orient='index')
 
user_profile_emb.head()

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_240,emb_241,emb_242,emb_243,emb_244,emb_245,emb_246,emb_247,emb_248,emb_249
200000,-0.707095,-0.966142,0.215613,-0.540992,-0.097314,-0.711129,-0.388401,-0.214854,0.070674,-0.683168,...,-0.247624,-0.250399,0.712416,-0.092248,0.391472,0.133109,0.304943,0.722092,0.112895,0.090947
200001,-0.826997,-0.962994,-0.415602,-0.804719,0.35468,-0.163473,-0.524052,-0.354726,0.079817,-0.082757,...,-0.919703,-0.377261,-0.093302,0.559935,0.416835,-0.507327,0.212867,-0.435329,0.732461,0.17106
200002,-0.325285,-0.965357,-0.193873,-0.135098,-0.014686,0.087815,-0.581007,0.027498,-0.156805,0.038896,...,0.141499,-0.14935,0.10044,-0.060986,0.208628,-0.07424,-0.051672,0.07408,0.113742,0.192526
200003,-0.158326,-0.962052,0.07788,0.002835,-0.036167,0.184051,-0.406622,-0.042503,0.101619,-0.047257,...,0.040039,0.223023,-0.288795,-0.150053,0.205294,-0.067531,0.050926,0.128357,-0.062348,0.145443
200004,-0.362147,-0.966331,-0.212863,-0.450467,-0.183054,-0.256189,-0.091871,-0.051382,0.013033,-0.13921,...,-0.342469,-0.128845,-0.185921,-0.216158,0.152122,-0.00182,0.006968,0.133303,0.288489,0.272411


In [36]:
user_profile_emb_np = np.ascontiguousarray(user_profile_emb[item_emb_cols].values, dtype=np.float32)
user_profile_emb_np = user_profile_emb_np / np.linalg.norm(user_profile_emb_np, axis=1, keepdims=True)  # normalize
user_profile_emb_np.shape

(50000, 250)

# 4. Recommend

In [44]:
topk = 10
item_index = faiss.IndexFlatIP(item_emb_np.shape[1])  # Inner Product
item_index.add(item_emb_np)

# 相似度查询，给每个索引位置上的向量返回topk个item以及相似度
sim, idx = item_index.search(user_profile_emb_np, topk) # 返回的是列表
sim.shape  # 注意是排了顺序的

(50000, 10)

In [65]:
sim[0]

array([0.93727946, 0.9213254 , 0.92104375, 0.91865325, 0.9176518 ,
       0.9175933 , 0.91751045, 0.9173319 , 0.9167841 , 0.91651976],
      dtype=float32)

In [48]:
recall_df = pd.DataFrame(idx)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,195839,195035,188845,188395,188018,188125,189021,194300,194920,189030
1,175040,173466,173968,170466,172248,173533,172966,173021,171282,175486
2,299656,300704,205479,295398,299603,295905,300765,298673,301011,161269
3,161408,158039,157039,153006,157808,155685,162385,160231,159342,154567
4,85630,285712,236065,233608,286064,47906,284110,235061,344756,285027
...,...,...,...,...,...,...,...,...,...,...
49995,284110,284705,282761,279733,286064,284220,303288,284096,285372,47906
49996,160974,156560,150616,161100,158828,159764,162765,159034,159299,162172
49997,133022,132913,124638,133342,124065,123953,132119,123372,124090,124634
49998,233492,208218,236121,246238,237264,157252,234681,233420,234356,236899


In [51]:
submit = recall_df.iloc[:,:5]

In [52]:
submit.head()

Unnamed: 0,0,1,2,3,4
0,195839,195035,188845,188395,188018
1,175040,173466,173968,170466,172248
2,299656,300704,205479,295398,299603
3,161408,158039,157039,153006,157808
4,85630,285712,236065,233608,286064


In [61]:
submit["user_id"] = submit.index + 200000
submit = submit.rename(columns={0: 'article_1', 1: 'article_2', 2: 'article_3', 3: 'article_4', 4: 'article_5'})
submit = submit[['user_id', 'article_1', 'article_2', 'article_3', 'article_4', 'article_5']]

In [62]:
submit.head()

Unnamed: 0,user_id,article_1,article_2,article_3,article_4,article_5
0,200000,195839,195035,188845,188395,188018
1,200001,175040,173466,173968,170466,172248
2,200002,299656,300704,205479,295398,299603
3,200003,161408,158039,157039,153006,157808
4,200004,85630,285712,236065,233608,286064


In [63]:
submit.to_csv("content_based_baseline.csv", index=False, header=True)