<a href="https://colab.research.google.com/github/PiKaChu-wcg/tianchi_ex/blob/main/%E6%96%B0%E9%97%BB%E6%8E%A8%E8%8D%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Copy every file from the news_recommender dataset folder into the current working
# directory so later cells can read them via relative paths.
# NOTE(review): the /content/drive/MyDrive prefix presumably requires Google Drive to be
# mounted in Colab first - confirm before running on a fresh runtime.
! cp /content/drive/MyDrive/dataset/tianchi/news_recommender/* ./

In [None]:
# import packages
import time, math, os
from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
from collections import defaultdict
import collections
warnings.filterwarnings('ignore')

## 节省内存的一个函数 

In [None]:
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype covering their range.

    Columns are reassigned on ``df`` itself (the frame is modified in place) and the
    same object is returned for convenience. Columns whose dtype is not one of the
    listed numeric dtypes, or whose min/max is null, are left untouched.

    Args:
        df: pandas DataFrame to shrink.

    Returns:
        The same DataFrame with downcast numeric columns.

    Note:
        Float columns may be cast to float16, which keeps the value range but
        loses precision.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are tried from narrowest to widest; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range (including +/-inf): keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard against a zero-byte starting frame so the report never divides by zero.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           reduction,
                                                                                                           (time.time()-starttime)/60))
    return df

In [None]:
# Debug mode: carve a subset of users out of the training log to test the code quickly.
def get_all_click_sample(data_path, sample_nums=10000, seed=None):
    """Load the training click log restricted to a random sample of users.

    Args:
        data_path: directory that contains ``train_click_log.csv``.
        sample_nums: number of distinct users to sample. Sampling is per user
            (not per row); if this exceeds the number of users in the log, all
            users are kept.
        seed: optional integer seed for a reproducible sample. When ``None``
            the global NumPy random state is used.

    Returns:
        DataFrame with the sampled users' clicks, de-duplicated on
        (user_id, click_article_id, click_timestamp).
    """
    all_click = pd.read_csv(os.path.join(data_path, 'train_click_log.csv'))
    all_user_ids = all_click.user_id.unique()

    # choice(..., replace=False) raises ValueError when asked for more items than exist.
    sample_nums = min(sample_nums, len(all_user_ids))
    rng = np.random if seed is None else np.random.RandomState(seed)
    sample_user_ids = rng.choice(all_user_ids, size=sample_nums, replace=False)
    all_click = all_click[all_click['user_id'].isin(sample_user_ids)]

    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click

# Load the click data. Online mode (building a submission) merges the test-set clicks
# into the training clicks; offline mode (validating a model or feature) uses the
# training set only.
def get_all_click_df(data_path='./data_raw/', offline=True):
    """Load the click log, optionally merged with the test-set click log.

    Args:
        data_path: directory containing ``train_click_log.csv`` and, for online
            mode, ``testA_click_log.csv``.
        offline: when True only the training log is read; when False the test
            log is appended to it.

    Returns:
        DataFrame of clicks de-duplicated on
        (user_id, click_article_id, click_timestamp).
    """
    all_click = pd.read_csv(os.path.join(data_path, 'train_click_log.csv'))
    if not offline:
        tst_click = pd.read_csv(os.path.join(data_path, 'testA_click_log.csv'))
        # DataFrame.append was removed in pandas 2.0; concat keeps the original row labels.
        all_click = pd.concat([all_click, tst_click])

    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click
# Full data set example: all_click_df = get_all_click_df(data_path, offline=False)

In [None]:
# Work on a random sample of users read from the current directory (default sample size).
df = get_all_click_sample(data_path='./')

In [None]:
# Build each user's click sequence ordered by click time:
# {user1: [(item1, time1), (item2, time2), ...], ...}
def get_user_item_time(click_df):
    """Map every user to the time-ordered list of (article, timestamp) clicks.

    Args:
        click_df: DataFrame with ``user_id``, ``click_article_id`` and
            ``click_timestamp`` columns.

    Returns:
        dict keyed by user_id (in ascending user_id order) whose values are
        lists of ``(click_article_id, click_timestamp)`` tuples sorted by
        ascending timestamp.
    """
    # A stable sort keeps the input order of clicks that share a timestamp.
    click_df = click_df.sort_values('click_timestamp', kind='mergesort')

    # Built with a plain loop instead of groupby(...)[a, b].apply(...): selecting
    # several groupby columns with a tuple raises on pandas >= 2.0.
    user_item_time_dict = {}
    rows = zip(click_df['user_id'].tolist(),
               click_df['click_article_id'].tolist(),
               click_df['click_timestamp'].tolist())
    for user, item, click_time in rows:
        user_item_time_dict.setdefault(user, []).append((item, click_time))

    # Ascending user order matches what a sorted groupby would produce.
    return dict(sorted(user_item_time_dict.items(), key=lambda kv: kv[0]))

In [None]:
# Most-clicked articles in the given click log.
def get_item_topk_click(click_df, k):
    """Return the ids of the ``k`` most frequently clicked articles.

    Args:
        click_df: DataFrame with a ``click_article_id`` column.
        k: number of article ids to keep.

    Returns:
        pandas Index of article ids ordered by descending click count.
    """
    click_counts = click_df['click_article_id'].value_counts()
    ranked_article_ids = click_counts.index
    return ranked_article_ids[:k]

Unnamed: 0,user_id,item_time_list
0,11,"[(50644, 1508211074535), (234481, 1508211104535)]"
1,19,"[(70986, 1508210833015), (205824, 1508210863015)]"
2,52,"[(211442, 1508209963658), (211455, 15082099936..."
3,65,"[(50644, 1508209562590), (83549, 1508209592590)]"
4,67,"[(5583, 1508209332189), (5595, 1508209362189)]"
...,...,...
9995,199868,"[(24764, 1507032616256), (162655, 150703271837..."
9996,199899,"[(156624, 1507030186766), (64329, 150703191914..."
9997,199921,"[(208077, 1507029731561), (207540, 15070297615..."
9998,199946,"[(299697, 1507029935719), (272143, 15070301706..."


In [None]:
def itemcf_sim(df, save_path='itemcf_i2i_sim.pkl'):
    """Compute the item-to-item co-click similarity matrix and pickle it.

    For every pair of distinct articles clicked by the same user the pair's weight
    is increased by ``1 / log(len(history) + 1)``, which damps the contribution of
    users with long histories. Each accumulated weight is then divided by
    ``sqrt(count_i * count_j)``, where ``count_x`` is the number of click records
    for article ``x``.

    Args:
        df: click DataFrame accepted by ``get_user_item_time``.
        save_path: file the similarity dict is pickled to. The default matches
            the name the recall cell loads (the previous ``itemcd_...`` spelling
            did not).

    Returns:
        dict ``{item_i: {item_j: similarity}}``.
    """
    user_item_time_dict = get_user_item_time(df)
    i2i_sim = {}
    item_cnt = defaultdict(int)
    for user, item_time_list in tqdm(user_item_time_dict.items()):
        # The weight depends only on the history length, so compute it once per user.
        pair_weight = 1 / math.log(len(item_time_list) + 1)
        for i, i_click_time in item_time_list:
            item_cnt[i] += 1
            related = i2i_sim.setdefault(i, {})
            for j, j_click_time in item_time_list:
                if i == j:
                    continue
                related[j] = related.get(j, 0) + pair_weight
    # Only values are rewritten while iterating (no keys added or removed), which is safe.
    for i, related_items in i2i_sim.items():
        for j, wij in related_items.items():
            related_items[j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(save_path, 'wb') as sim_file:
        pickle.dump(i2i_sim, sim_file)
    return i2i_sim

# Bind the result so the (large) returned dict is not echoed as the cell output.
i2i_sim = itemcf_sim(df)

100%|██████████| 10000/10000 [00:00<00:00, 17236.45it/s]


In [None]:
def item_based_recommend(user_id,user_item_time_dict,i2i_sim,sim_item_topk,recall_item_num,item_topk_click):
    """Recall articles for one user from an item-to-item similarity matrix.

    Args:
        user_id: user to recommend for.
        user_item_time_dict: ``{user_id: [(item, click_time), ...]}``.
        i2i_sim: ``{item_i: {item_j: similarity}}``.
        sim_item_topk: number of most-similar items taken per history item.
        recall_item_num: number of items to return.
        item_topk_click: iterable of popular item ids (most popular first) used
            to pad the result when too few candidates are found.

    Returns:
        List of ``(item, score)`` tuples sorted by descending score, at most
        ``recall_item_num`` long. Padded popular items get negative scores
        (``-position - 100``) so they rank below every similarity-based candidate.
        A user with no history, or history items absent from ``i2i_sim``, falls
        back to popular items instead of raising ``KeyError``.
    """
    user_hist = user_item_time_dict.get(user_id, [])
    # Ids only: the filter below tests bare item ids, so a set of (item, time)
    # tuples would never match and already-clicked items would be recommended.
    clicked_items = {item for item, _ in user_hist}
    item_rank = {}
    for i, _click_time in user_hist:
        neighbours = sorted(i2i_sim.get(i, {}).items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]
        for j, wij in neighbours:
            if j in clicked_items:
                continue
            item_rank[j] = item_rank.get(j, 0) + wij
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Membership is tested on the keys; testing against .items() never
            # matched and let the padding overwrite a real similarity score.
            if item in item_rank:
                continue
            item_rank[item] = -i - 100
            if len(item_rank) == recall_item_num:
                break
    return sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

In [None]:
# Container for the recall result: user_id -> list of (article, score).
user_recall_items_dict = collections.defaultdict(dict)

# user -> time-ordered list of (article, click time)
user_item_time_dict = get_user_item_time(df)

# Load the article similarity matrix. The context manager closes the file handle,
# which the bare pickle.load(open(...)) form never did.
# NOTE: pickle executes arbitrary code on load - only read files produced by this notebook.
with open('itemcf_i2i_sim.pkl', 'rb') as sim_file:
    i2i_sim = pickle.load(sim_file)

# Number of most-similar articles considered per clicked article
sim_item_topk = 10

# Number of articles recalled per user
recall_item_num = 10

# Popular articles used to pad users with too few candidates
item_topk_click = get_item_topk_click(df, k=50)

for user in tqdm(df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim,
                                                        sim_item_topk, recall_item_num, item_topk_click)

100%|██████████| 10000/10000 [00:07<00:00, 1396.16it/s]


In [None]:
# Flatten {user: [(item, score), ...]} into one [user, item, score] row per recalled item.
user_item_score_list = [
    [user, item, score]
    for user, items in tqdm(user_recall_items_dict.items())
    for item, score in items
]
recall_columns = ['user_id','click_article_id','pred_score']
recall_df = pd.DataFrame(user_item_score_list, columns=recall_columns)

100%|██████████| 10000/10000 [00:00<00:00, 62766.81it/s]


Unnamed: 0,user_id,click_article_id,pred_score
0,199921,96346,0.181668
1,199921,207540,0.156348
2,199921,61611,0.144996
3,199921,206735,0.137035
4,199921,159720,0.125740
...,...,...,...
99995,11,49282,0.108794
99996,11,30760,0.089076
99997,11,211442,0.082072
99998,11,156279,0.075689


In [None]:
def submit(recall_df,topk=5,model_name=None):
    """Write the top-``topk`` recalled articles per user to a dated CSV file.

    Args:
        recall_df: DataFrame with ``user_id``, ``click_article_id`` and
            ``pred_score`` columns. It is not modified.
        topk: number of articles kept per user.
        model_name: optional prefix for the output file name; when omitted the
            file is named by date only.

    Returns:
        The name of the CSV file written to the current working directory.
        It holds one row per user with columns ``user_id, article_1 .. article_<topk>``;
        users with fewer than ``topk`` recalled articles get empty trailing cells.
    """
    ranked = recall_df.sort_values(by=['user_id','pred_score'])
    ranked['rank'] = ranked.groupby(['user_id'])['pred_score'].rank(ascending=False,method='first')
    # Rows with a missing score get a NaN rank and are dropped by the comparison.
    top = ranked.loc[ranked['rank'] <= topk, ['user_id', 'rank', 'click_article_id']].copy()
    top['rank'] = top['rank'].astype(int)

    # One row per user, one column per rank. A flat single-row header replaces the
    # two-level (MultiIndex) header that unstack() would otherwise write to the CSV.
    wide = top.pivot(index='user_id', columns='rank', values='click_article_id')
    wide = wide.reindex(columns=range(1, topk + 1))
    # NOTE(review): confirm these column names match what the submission consumer expects.
    wide.columns = ['article_{}'.format(position) for position in wide.columns]
    wide = wide.reset_index()

    prefix = model_name + "_" if model_name else ""
    # '.csv' - the previous '.cvs' extension was a typo.
    save_name = prefix + datetime.today().strftime('%m-%d') + '.csv'
    wide.to_csv(save_name,index=False,header=True)
    return save_name


In [None]:
# Keep only the recall rows of users that appear in the test click log, then write the submission.
tst_click = pd.read_csv('testA_click_log.csv')
tst_users = tst_click['user_id'].unique()
tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]
if tst_recall.empty:
    # recall_df is built from `df`, which is sampled from train_click_log.csv only, so the
    # test users may be absent from it entirely. Say so instead of silently writing an
    # empty submission. print() is used because warnings are globally silenced above.
    print('WARNING: none of the recalled users appear in testA_click_log.csv; '
          'the submission file will contain no rows.')
submit(tst_recall, topk=5, model_name='itemcf_baseline')

In [None]:
# Show the recall table next to the test user ids to compare which users each covers.
(recall_df, tst_users)

(       user_id  click_article_id  pred_score
 0       199921             96346    0.181668
 1       199921            207540    0.156348
 2       199921             61611    0.144996
 3       199921            206735    0.137035
 4       199921            159720    0.125740
 ...        ...               ...         ...
 99995       11             49282    0.108794
 99996       11             30760    0.089076
 99997       11            211442    0.082072
 99998       11            156279    0.075689
 99999       11            209122    0.074870
 
 [100000 rows x 3 columns],
 array([249999, 249998, 249997, ..., 200002, 200001, 200000]))