<a href="https://colab.research.google.com/github/RenqinSS/Rec/blob/main/algo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/dpoqb/wechat_big_data_baseline_pytorch.git

!dir
!mkdir data
!unzip ./drive/MyDrive/wechat_algo_data1.zip -d ./data

!pip install deepctr_torch

import torch
import os

print(torch.cuda.is_available())
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(i))

Cloning into 'wechat_big_data_baseline_pytorch'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 16 (delta 3), reused 5 (delta 0), pack-reused 0[K
Unpacking objects: 100% (16/16), done.
drive  sample_data  wechat_big_data_baseline_pytorch
Archive:  ./drive/MyDrive/wechat_algo_data1.zip
   creating: ./data/wechat_algo_data1/
  inflating: ./data/wechat_algo_data1/test_a.csv  
  inflating: ./data/wechat_algo_data1/feed_info.csv  
  inflating: ./data/wechat_algo_data1/feed_embeddings.csv  
  inflating: ./data/wechat_algo_data1/README.md  
  inflating: ./data/wechat_algo_data1/user_action.csv  
  inflating: ./data/wechat_algo_data1/submit_demo_初赛a.csv  
Collecting deepctr_torch
[?25l  Downloading https://files.pythonhosted.org/packages/d2/17/f392dfbaefdd6371335995c4f84cf3b5166cf907fdfa0aa4edc380fdfc5b/deepctr_torch-0.2.7-py3-none-any.whl (70kB)
[K     |██████████████████████████

In [2]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.decomposition import PCA

import os
# Work from inside the cloned baseline repository; the relative paths below resolve from here.
# NOTE(review): '/content' is presumably the Colab working directory — adjust when running elsewhere.
os.chdir('/content/wechat_big_data_baseline_pytorch')


# Root directory where the data is stored (relative to the working directory set above)
ROOT_PATH = "../data"
# Path to the competition dataset
DATASET_PATH = ROOT_PATH + '/wechat_algo_data1/'
# Training data
USER_ACTION = DATASET_PATH + "user_action.csv"
FEED_INFO = DATASET_PATH + "feed_info.csv"
FEED_EMBEDDINGS = DATASET_PATH + "feed_embeddings.csv"
# Test data
TEST_FILE = DATASET_PATH + "test_a.csv"
# Actions to be predicted in the preliminary round
ACTION_LIST = ["read_comment", "like", "click_avatar", "forward"]
# Action (label) columns read from user_action.csv
FEA_COLUMN_LIST = ["read_comment", "like", "click_avatar", "forward", "comment", "follow", "favorite"]
# Feed-side columns kept from feed_info.csv
FEA_FEED_LIST = ['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']
# Negative down-sampling factor per action: prepare_data keeps a random 1/N fraction of the
# negative rows (the original comment described this as a negative:positive ratio).
ACTION_SAMPLE_RATE = {"read_comment": 5, "like": 5, "click_avatar": 5, "forward": 10, "comment": 10, "follow": 10, "favorite": 10}

def process_embed(train, embed_dim=512):
    """Expand the space-separated ``feed_embedding`` strings into numeric columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing a ``feed_embedding`` column whose entries are strings of
        whitespace-separated floats. Missing (NaN/None) or blank entries are allowed.
    embed_dim : int, default 512
        Number of components expected in every embedding.

    Returns
    -------
    pandas.DataFrame
        ``train`` with ``embed0`` … ``embed{embed_dim-1}`` appended. Rows whose
        embedding is missing or blank get an all-zero vector.

    Raises
    ------
    ValueError
        If an embedding cannot be parsed as floats or cannot be broadcast to
        ``embed_dim`` values.
    """
    feed_embed_array = np.zeros((train.shape[0], embed_dim))
    # Iterate positionally so the function does not depend on a 0..n-1 index.
    raw_values = train['feed_embedding'].tolist()
    for row, raw in enumerate(tqdm(raw_values)):
        # NaN never compares equal to anything (itself included), so `raw != np.nan`
        # is always True; missing values have to be detected with pd.isna.
        if pd.isna(raw):
            continue
        tokens = str(raw).split()
        if not tokens:
            # Blank / whitespace-only string: keep the zero vector.
            continue
        feed_embed_array[row] = [float(token) for token in tokens]
    # Reuse the caller's index so the column-wise concat aligns row for row.
    temp = pd.DataFrame(
        feed_embed_array,
        columns=[f"embed{i}" for i in range(embed_dim)],
        index=train.index,
    )
    return pd.concat((train, temp), axis=1)

def prepare_data():
    """Build the per-action training CSVs and the shared test CSV.

    Reads the feed info, user actions and test file, attaches the feed-side
    columns to both train and test, and writes:

    * ``ROOT_PATH/test_data.csv`` — the test rows with feed features;
    * ``ROOT_PATH/train_data_for_<action>.csv`` for every action in ``ACTION_LIST`` —
      de-duplicated rows with the negatives randomly down-sampled.

    ``videoplayseconds`` is stored as ``log(x + 1)`` in every output file.
    """
    feed_features = pd.read_csv(FEED_INFO)
    action_columns = ["userid", "date_", "feedid"] + FEA_COLUMN_LIST
    actions = pd.read_csv(USER_ACTION)[action_columns]

    feed_features = feed_features[FEA_FEED_LIST]

    test_raw = pd.read_csv(TEST_FILE)

    # Attach the feed-side features to both splits.
    train_full = actions.merge(feed_features, on='feedid', how='left')
    test_full = test_raw.merge(feed_features, on='feedid', how='left')
    test_full["videoplayseconds"] = np.log(test_full["videoplayseconds"] + 1.0)
    test_full.to_csv(ROOT_PATH + '/test_data.csv', index=False)

    for action in tqdm(ACTION_LIST):
        print(f"prepare data for {action}")
        # Keep the last occurrence of every (user, feed, label) triple.
        deduped = train_full.drop_duplicates(['userid', 'feedid', action], keep='last')
        keep_fraction = 1.0 / ACTION_SAMPLE_RATE[action]
        negatives = deduped[deduped[action] == 0].sample(
            frac=keep_fraction, random_state=42, replace=False
        )
        positives = deduped[deduped[action] == 1]
        sampled = pd.concat([negatives, positives])
        sampled["videoplayseconds"] = np.log(sampled["videoplayseconds"] + 1.0)
        sampled.to_csv(ROOT_PATH + f'/train_data_for_{action}.csv', index=False)


# Run the preprocessing when this cell is executed; it writes the CSV files that the
# training cell reads back from ROOT_PATH.
if __name__ == "__main__":
    prepare_data()

  0%|          | 0/4 [00:00<?, ?it/s]

prepare data for read_comment


 25%|██▌       | 1/4 [00:11<00:34, 11.46s/it]

prepare data for like


 50%|█████     | 2/4 [00:22<00:22, 11.22s/it]

prepare data for click_avatar


 75%|███████▌  | 3/4 [00:32<00:10, 10.97s/it]

prepare data for forward


100%|██████████| 4/4 [00:38<00:00,  9.70s/it]


In [10]:
from sklearn.decomposition import PCA

# Target width of the compressed feed embedding.
# Must agree with dense_2_dim['pca_emb'], which tells the model how wide this feature is.
n_dim = 32
feed_embed = pd.read_csv(FEED_EMBEDDINGS)
# Parse each whitespace-separated embedding string into a list of floats.
feed_embed['feed_embedding'] = feed_embed['feed_embedding'].apply(lambda row: [float(x) for x in row.strip().split()])
# A fixed random_state makes the projection reproducible when scikit-learn selects a
# randomized SVD solver; without it repeated runs can yield different components.
pca = PCA(n_components=n_dim, random_state=42)
pca_emb = pca.fit_transform(feed_embed['feed_embedding'].tolist())
# Store one 1-D array per row so the column can be merged onto the samples by feedid.
feed_embed['pca_emb'] = list(pca_emb)
feed_embed = feed_embed[['feedid', 'pca_emb']]
# feed_embed.drop(['feed_embedding'], axis=1).to_csv("/content/drive/MyDrive/pca_emb%d.csv" % n_dim, index=False)

In [4]:
from numba import njit
from scipy.stats import rankdata


@njit
def _auc(actual, pred_ranks):
    # AUC from the rank-sum (Mann-Whitney U) statistic: the rank sum of the positive
    # samples, shifted by its minimum possible value and normalised by the number of
    # positive/negative pairs. `actual` holds 0/1 labels, `pred_ranks` the score ranks.
    positives = np.sum(actual)
    negatives = len(actual) - positives
    positive_rank_sum = np.sum(pred_ranks[actual == 1])
    u_statistic = positive_rank_sum - positives*(positives+1)/2
    return u_statistic / (positives*negatives)


def fast_auc(actual, predicted):
    """Return the AUC of ``predicted`` scores against binary ``actual`` labels.

    The scores are converted to ranks with ``scipy.stats.rankdata`` and the
    result is computed by the compiled helper ``_auc``.
    Source: https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/208031
    """
    return _auc(actual, rankdata(predicted))


def uAUC(labels, preds, user_id_list):
    """Average per-user AUC.

    Samples are grouped by user; users whose labels are all identical (only
    positives or only negatives) are skipped because AUC is undefined for them.
    The AUCs of the remaining users are averaged with equal weight.

    Parameters
    ----------
    labels : sequence
        Ground-truth labels, indexable by position.
    preds : sequence
        Predicted scores, aligned with ``labels``.
    user_id_list : sequence
        User id of every sample, aligned with ``labels``.

    Returns
    -------
    float
        Mean AUC over the users that have both classes.

    Raises
    ------
    ZeroDivisionError
        If no user has both a positive and a negative sample.
    """
    # Plain dicts keep this function self-contained: it no longer depends on
    # `defaultdict`, which is only imported by a later cell of the notebook.
    user_pred = {}
    user_truth = {}
    for idx, truth in enumerate(labels):
        user_id = user_id_list[idx]
        user_truth.setdefault(user_id, []).append(truth)
        user_pred.setdefault(user_id, []).append(preds[idx])

    total_auc = 0.0
    n_users = 0
    # Iterating the dict (insertion order) also makes the summation order deterministic.
    for user_id, truths in user_truth.items():
        first = truths[0]
        # Only users with at least two distinct label values contribute.
        if any(truth != first for truth in truths[1:]):
            total_auc += fast_auc(np.asarray(truths), np.asarray(user_pred[user_id]))
            n_users += 1

    if n_users == 0:
        raise ZeroDivisionError(
            "uAUC is undefined: no user has both positive and negative samples"
        )
    return float(total_auc) / n_users


def compute_weighted_score(score_dict, weight_dict):
    """Weighted average of the per-action scores, rounded to 6 decimals.

    Only the actions present in ``score_dict`` take part; each one is weighted
    by ``float(weight_dict[action])`` and the sum is divided by the total
    weight of those actions.
    """
    weighted_total = 0.0
    total_weight = 0.0
    for action, action_score in score_dict.items():
        action_weight = float(weight_dict[action])
        weighted_total += action_weight*action_score
        total_weight += action_weight
    return round(weighted_total / float(total_weight), 6)

In [14]:
# Embedding size of every categorical (sparse) feature; the value is passed as the
# embedding dimension when the SparseFeat columns are built in the training cell.
sparse_2_dim = {
    'userid': 8,
    'feedid': 8,
    'authorid': 8,
    'bgm_song_id': 8,
    'bgm_singer_id': 8,
}

# Width of every dense feature; the value is passed as the dimension of the
# corresponding DenseFeat column in the training cell.
dense_2_dim = {
    'videoplayseconds': 1,
    # Must equal n_dim, the number of PCA components computed for the feed embedding.
    'pca_emb': 32,
}

In [17]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm import tqdm
from collections import defaultdict
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models.deepfm import *
from deepctr_torch.models.basemodel import *


class MyBaseModel(BaseModel):
    """``BaseModel`` with a custom training loop.

    Differences from the inherited behaviour visible in this class:

    * feature columns may hold one array per row (e.g. the PCA embedding); such
      columns are expanded to several input columns before concatenation;
    * in ``mode='offline'`` every epoch is followed by an evaluation that also
      reports the per-user AUC (``uAUC``).

    ``Data`` and ``DataLoader`` are not imported explicitly in this notebook;
    presumably they are re-exported by the
    ``from deepctr_torch.models.basemodel import *`` star import — TODO confirm.
    """

    @staticmethod
    def _to_model_arrays(columns):
        """Convert feature columns to a new list of 2-D arrays (1-D columns become (n, 1))."""
        arrays = []
        for column in columns:
            values = np.array(column.tolist())
            if len(values.shape) == 1:
                values = np.expand_dims(values, axis=1)
            arrays.append(values)
        return arrays

    def fit(self, x, y, batch_size, val_data=None, epochs=1, verbose=1, mode='offline'):
        """Train the model and return the per-epoch logs.

        Parameters
        ----------
        x : dict
            Maps every feature name in ``self.feature_index`` to its column.
        y : numpy.ndarray
            Training labels.
        batch_size : int
            Mini-batch size for training and for the per-epoch evaluation.
        val_data : tuple, optional
            ``(val_x, val_y)``; required when ``mode == 'offline'``. ``val_x`` must
            contain a ``'userid'`` column, which is used to group samples for uAUC.
        epochs : int, default 1
            Number of passes over the training data.
        verbose : int, default 1
            Accepted for API compatibility; currently ignored.
        mode : str, default 'offline'
            ``'offline'`` evaluates on ``val_data`` after every epoch; any other
            value skips evaluation.

        Returns
        -------
        dict
            ``{epoch_number: logs}`` where ``logs`` holds ``'loss'``, the training
            metrics and, in offline mode, the ``'val_*'`` metrics.

        Raises
        ------
        ValueError
            If ``mode == 'offline'`` and ``val_data`` is ``None``.
        """
        # Imported locally so the timing below does not depend on a star import
        # happening to re-export the `time` module.
        import time

        if mode == 'offline' and val_data is None:
            raise ValueError("val_data=(val_x, val_y) is required when mode == 'offline'")

        inputs = self._to_model_arrays([x[feature] for feature in self.feature_index])

        val_x, val_y = [], []
        if mode == 'offline':
            val_x, val_y = val_data
            val_uids = val_x['userid'].tolist()
            val_x = [val_x[feature] for feature in self.feature_index]

        train_tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(inputs, axis=-1)), torch.from_numpy(y))
        train_loader = DataLoader(dataset=train_tensor_data, shuffle=True, batch_size=batch_size)
        sample_num = len(train_tensor_data)
        steps_per_epoch = (sample_num - 1) // batch_size + 1

        # Train
        print("Train on {0} samples, validate on {1} samples, {2} steps per epoch".format(len(train_tensor_data), len(val_y), steps_per_epoch))
        epoch_logs = defaultdict(dict)
        for epoch in range(epochs):
            # evaluate() -> predict() leaves the module in eval mode, so training mode
            # has to be restored at the start of every epoch, not just once.
            model = self.train()
            start_time = time.time()
            total_loss_epoch = 0
            train_result = defaultdict(list)
            for _, (x_train, y_train) in tqdm(enumerate(train_loader)):
                batch_x = x_train.to(self.device).float()
                batch_y = y_train.to(self.device).float()

                y_pred = model(batch_x).squeeze()

                self.optim.zero_grad()
                loss = self.loss_func(y_pred, batch_y.squeeze(), reduction='sum')
                total_loss = loss + self.get_regularization_loss() + self.aux_loss

                total_loss_epoch += total_loss.item()
                total_loss.backward()
                self.optim.step()

                for name, func in self.metrics.items():
                    train_result[name].append(func(batch_y.cpu().data.numpy(), y_pred.cpu().data.numpy().astype("float64")))

            # Add logs
            logs = {"loss": total_loss_epoch / sample_num}
            for name, result in train_result.items():
                logs[name] = np.sum(result) / steps_per_epoch

            if mode == 'offline':
                eval_result = self.evaluate(val_x, val_y, val_uids, batch_size)
                for name, result in eval_result.items():
                    logs["val_" + name] = result

            print('Epoch {0}/{1}, {2}s'.format(epoch + 1, epochs, int(time.time() - start_time)))
            eval_str = "loss: {0: .4f}".format(logs["loss"])
            for name in logs:
                if name == "loss":
                    # Already printed as the first entry; do not repeat it.
                    continue
                eval_str += " - " + name + ": {0: .4f}".format(logs[name])
            print(eval_str)
            epoch_logs[epoch+1] = logs
        return epoch_logs

    def evaluate(self, x, y, uids, batch_size=256):
        """Predict on ``x`` and score against ``y``.

        Returns a dict with one entry per compiled metric plus ``'uAUC'``, the
        per-user AUC computed with ``uids`` as the grouping key.
        """
        preds = self.predict(x, batch_size)
        eval_result = {name: metric_fun(y, preds) for name, metric_fun in self.metrics.items()}
        eval_result['uAUC'] = uAUC(y.squeeze(), preds.squeeze(), uids)

        return eval_result

    def predict(self, x, batch_size=256):
        """Return the model outputs for ``x`` as a float64 array.

        ``x`` is either a dict keyed by feature name or a list of columns already
        ordered like ``self.feature_index``. The input is not modified. The module
        is switched to eval mode and stays in it after the call.
        """
        model = self.eval()
        if isinstance(x, dict):
            x = [x[feature] for feature in self.feature_index]
        # Build a fresh list instead of overwriting the caller's list in place.
        inputs = self._to_model_arrays(x)

        tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(inputs, axis=-1)))
        test_loader = DataLoader(dataset=tensor_data, shuffle=False, batch_size=batch_size)

        pred_ans = []
        with torch.no_grad():
            for _, x_test in enumerate(test_loader):
                batch_x = x_test[0].to(self.device).float()
                y_pred = model(batch_x).cpu().data.numpy()
                pred_ans.append(y_pred)

        return np.concatenate(pred_ans).astype("float64")

class MyDeepFM(MyBaseModel):
    """DeepFM built on ``MyBaseModel``.

    The output logit is the sum of a linear term, a pairwise (FM) interaction term
    computed inline in ``forward`` and a DNN term.
    """

    def __init__(self,
                 linear_feature_columns, dnn_feature_columns,
                 dnn_hidden_units=(256, 128, 64),
                 l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024,
                 dnn_dropout=0, dnn_activation='relu', dnn_use_bn=False, task='binary', device='cpu'):
        """Create the sub-modules and register the DNN weights for L2 regularisation."""
        super().__init__(
            linear_feature_columns,
            dnn_feature_columns,
            l2_reg_linear=l2_reg_linear,
            l2_reg_embedding=l2_reg_embedding,
            init_std=init_std,
            seed=seed,
            task=task,
            device=device,
        )

        self.fm = FM()
        dnn_input_dim = self.compute_input_dim(dnn_feature_columns)
        self.dnn = DNN(
            dnn_input_dim,
            dnn_hidden_units,
            activation=dnn_activation,
            l2_reg=l2_reg_dnn,
            dropout_rate=dnn_dropout,
            use_bn=dnn_use_bn,
            init_std=init_std,
            device=device,
        )
        self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False).to(device)

        def _is_regularised(named_param):
            # Weight tensors only, excluding those whose name contains 'bn'.
            param_name = named_param[0]
            return 'weight' in param_name and 'bn' not in param_name

        self.add_regularization_weight(filter(_is_regularised, self.dnn.named_parameters()), l2=l2_reg_dnn)
        self.add_regularization_weight(self.dnn_linear.weight, l2=l2_reg_dnn)

        self.to(device)

    def forward(self, X):
        """Return the model output for the concatenated input tensor ``X``."""
        # NOTE(review): the cat on dim=1 and the reduction over dim=2 below presumably
        # rely on every sparse embedding being (batch, 1, emb_dim) — confirm.
        sparse_embeddings, dense_values = self.input_from_feature_columns(
            X, self.dnn_feature_columns, self.embedding_dict
        )

        # linear term
        logit = self.linear_model(X)

        # FM term: 0.5 * ((sum of embeddings)^2 - sum of (embeddings^2)), summed over dim 2
        stacked = torch.cat(sparse_embeddings, dim=1)
        field_sum = torch.sum(stacked, dim=1, keepdim=True)
        square_of_sum = torch.pow(field_sum, 2)
        sum_of_square = torch.sum(stacked * stacked, dim=1, keepdim=True)
        cross_term = torch.sum(square_of_sum - sum_of_square, dim=2, keepdim=False)
        logit += 0.5 * cross_term

        # DNN term: flattened sparse embeddings followed by the dense values
        sparse_flat = torch.flatten(torch.cat(sparse_embeddings, dim=-1), start_dim=1)
        dense_flat = torch.flatten(torch.cat(dense_values, dim=-1), start_dim=1)
        deep_input = torch.cat([sparse_flat, dense_flat], dim=-1)
        logit += self.dnn_linear(self.dnn(deep_input))

        return self.out(logit)


# 'offline': hold out the rows with date_ == 14 for validation and report the weighted uAUC.
# 'online' : train on every row; validation is skipped.
# In both modes the test predictions are written to ./submit.csv.
mode = 'online'
if __name__ == "__main__":
    if mode not in ('offline', 'online'):
        raise ValueError("mode must be 'offline' or 'online', got %r" % (mode,))

    # Width of the PCA-compressed feed embedding. Taken from the feature config so the
    # zero-vector fallback below cannot drift from the width declared to the model.
    pca_dim = dense_2_dim['pca_emb']

    submit = pd.read_csv(ROOT_PATH + '/test_data.csv')[['userid', 'feedid']]
    logs = {}
    for action in ACTION_LIST:
        print('*** train for %s ***' % action)

        USE_FEAT = ['userid', 'feedid', action] + FEA_FEED_LIST[1:]
        train = pd.read_csv(ROOT_PATH + f'/train_data_for_{action}.csv')[['date_'] + USE_FEAT]

        # TODO: sampling
        # train = train.sample(frac=0.1, random_state=42).reset_index(drop=True)
        print("positive ratio:", (train[action] == 1).mean())

        # The test rows get a dummy label and a date after the training period so that
        # train and test can be encoded together.
        test = pd.read_csv(ROOT_PATH + '/test_data.csv')[[i for i in USE_FEAT if i != action]]
        test[action] = 0
        test['date_'] = 15
        test = test[['date_'] + USE_FEAT]
        n_train = train.shape[0]
        data = pd.concat((train, test)).reset_index(drop=True)

        # universal embedding: feeds without a PCA embedding fall back to a zero vector
        data = pd.merge(data, feed_embed, on='feedid', how='left')
        data['pca_emb'] = [e if isinstance(e, np.ndarray) else np.zeros(pca_dim) for e in data['pca_emb']]

        # features
        sparse_features = list(sparse_2_dim.keys())
        dense_features = list(dense_2_dim.keys())
        print('sparse_features: ', sparse_features)
        print('dense_features: ', dense_features)

        data[sparse_features] = data[sparse_features].fillna(0)
        data[dense_features] = data[dense_features].fillna(0)

        # 1. Label-encode the sparse features over train + test together.
        #    Min-max scaling of the dense features is intentionally not applied
        #    (see the experiment log at the end of the notebook).
        for feat in sparse_features:
            lbe = LabelEncoder()
            data[feat] = lbe.fit_transform(data[feat])

        # 2. Count the unique values of each sparse field and record the dense field widths.
        fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), sparse_2_dim[feat]) for feat in sparse_features] + [DenseFeat(feat, dense_2_dim[feat]) for feat in dense_features]
        dnn_feature_columns = fixlen_feature_columns
        linear_feature_columns = fixlen_feature_columns

        feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

        # 3. Split back into train / test and build the model inputs.
        train = data.iloc[:n_train].reset_index(drop=True)
        test = data.iloc[n_train:].reset_index(drop=True)
        if mode == 'offline':
            train_idxes, valid_idxes = train['date_'] != 14, train['date_'] == 14
            train, valid = train[train_idxes].drop(['date_'], axis=1), train[valid_idxes].drop(['date_'], axis=1)
        else:
            train = train.drop(['date_'], axis=1)
            # Placeholder only: fit() does not evaluate in online mode.
            valid = train.head()
        test = test.drop(['date_'], axis=1)

        train_x = {name: train[name] for name in feature_names}
        # Named `valid` rather than `eval` so the builtin eval() is not shadowed
        # for the rest of the notebook session.
        valid_x = {name: valid[name] for name in feature_names}
        test_x = {name: test[name] for name in feature_names}

        # 4. Define the model, train, predict and evaluate.
        model = MyDeepFM(
            linear_feature_columns=linear_feature_columns,
            dnn_feature_columns=dnn_feature_columns,
            task='binary', l2_reg_embedding=1e-1, device='cuda:0' if torch.cuda.is_available() else 'cpu')
        model.compile("adagrad", "binary_crossentropy", metrics=["binary_crossentropy", "auc"])

        act_logs = model.fit(train_x, train[[action]].values, val_data=(valid_x, valid[[action]].values), batch_size=512, epochs=2, mode=mode)
        logs[action] = act_logs

        # online: predict() returns one column per row; flatten it before storing
        # it as a DataFrame column.
        submit[action] = model.predict(test_x, 128).ravel()
        torch.cuda.empty_cache()

    # weighted uAUC over the validation results of the last epoch
    if mode == 'offline':
        score_dict = {}
        for act in logs:
            act_logs = logs[act]
            score_dict[act] = act_logs[max(act_logs.keys())]['val_uAUC']
        weight_dict = {"read_comment": 4.0, "like": 3.0, "click_avatar": 2.0, "forward": 1.0, "favorite": 1.0, "comment": 1.0, "follow": 1.0}
        weighted_uAUC = compute_weighted_score(score_dict, weight_dict)
        print(score_dict)
        print('weighted_uAUC: ', weighted_uAUC)

    # online
    submit.to_csv("./submit.csv", index=False)


*** train for read_comment ***
positive ratio: 0.15565121499983292
sparse_features:  ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
dense_features:  ['videoplayseconds', 'pca_emb']


1it [00:00,  7.20it/s]

Train on 1645885 samples, validate on 0 samples, 3215 steps per epoch


3215it [00:37, 84.75it/s]
1it [00:00,  7.47it/s]

Epoch 1/2, 37s
loss:  0.2470 - loss:  0.2470 - binary_crossentropy:  0.2307 - auc:  0.9266


3215it [00:37, 86.00it/s]


Epoch 2/2, 37s
loss:  0.2222 - loss:  0.2222 - binary_crossentropy:  0.2096 - auc:  0.9443
*** train for like ***
positive ratio: 0.11860030306914048
sparse_features:  ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
dense_features:  ['videoplayseconds', 'pca_emb']


1it [00:00,  7.31it/s]

Train on 1591716 samples, validate on 0 samples, 3109 steps per epoch


3109it [00:36, 85.34it/s]
1it [00:00,  7.96it/s]

Epoch 1/2, 36s
loss:  0.2769 - loss:  0.2769 - binary_crossentropy:  0.2605 - auc:  0.8493


3109it [00:35, 87.18it/s]


Epoch 2/2, 35s
loss:  0.2558 - loss:  0.2558 - binary_crossentropy:  0.2428 - auc:  0.8764
*** train for click_avatar ***
positive ratio: 0.0371053151111731
sparse_features:  ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
dense_features:  ['videoplayseconds', 'pca_emb']


1it [00:00,  7.87it/s]

Train on 1484127 samples, validate on 0 samples, 2899 steps per epoch


2899it [00:34, 84.64it/s]
1it [00:00,  7.96it/s]

Epoch 1/2, 34s
loss:  0.1354 - loss:  0.1354 - binary_crossentropy:  0.1241 - auc:  0.8371


2899it [00:34, 84.91it/s]


Epoch 2/2, 34s
loss:  0.1195 - loss:  0.1195 - binary_crossentropy:  0.1106 - auc:  0.8924
*** train for forward ***
positive ratio: 0.03752907526887493
sparse_features:  ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
dense_features:  ['videoplayseconds', 'pca_emb']


4it [00:00, 38.33it/s]

Train on 745049 samples, validate on 0 samples, 1456 steps per epoch


1456it [00:16, 86.17it/s]
1it [00:00,  7.59it/s]

Epoch 1/2, 16s
loss:  0.1301 - loss:  0.1301 - binary_crossentropy:  0.1186 - auc:  0.8630


1456it [00:17, 84.21it/s]


Epoch 2/2, 17s
loss:  0.1076 - loss:  0.1076 - binary_crossentropy:  0.0970 - auc:  0.9321


In [None]:
# baseline
{'read_comment': 0.6102415130979689, 'like': 0.6055234369612766, 'click_avatar': 0.7059927976309249, 'forward': 0.6832353813536607}
weighted_uAUC:  0.635276

# dnn_dropout = 0.1
{'read_comment': 0.6094100217906185, 'like': 0.6052801328988395, 'click_avatar': 0.7059140934189055, 'forward': 0.6846734262464789}
weighted_uAUC:  0.634998

# 256, 128, 128
{'read_comment': 0.613116787160124, 'like': 0.6062583852548347, 'click_avatar': 0.7058735217580193, 'forward': 0.6769030704770939}
weighted_uAUC:  0.635989

# epoch = 2
{'read_comment': 0.6117841889858322, 'like': 0.6089919743022709, 'click_avatar': 0.7138421964649098, 'forward': 0.6829949302549756}
weighted_uAUC:  0.638479

# sparse dim = 8, epoch = 2 (new baseline)
{'read_comment': 0.6126884118803656, 'like': 0.6078158393185238, 'click_avatar': 0.7141126528216767, 'forward': 0.6923154125787877}
weighted_uAUC:  0.639474

# removed the normalization of videoplayseconds (new baseline)
{'read_comment': 0.6150373746448982, 'like': 0.6087792274162345, 'click_avatar': 0.7137088800810096, 'forward': 0.6919173648006157}
weighted_uAUC:  0.640582

# add feed embedding 32(new baseline)
{'read_comment': 0.6231230935993682, 'like': 0.6162679088683002, 'click_avatar': 0.7128391281987229, 'forward': 0.6951917541544708}
weighted_uAUC:  0.646217

# add feed embedding 64
{'read_comment': 0.6179610910963779, 'like': 0.617180918593666, 'click_avatar': 0.7121687727167492, 'forward': 0.6969728833664359}
weighted_uAUC:  0.64447

# sparse dim = 12
{'read_comment': 0.6152862366363533, 'like': 0.6172504324924313, 'click_avatar': 0.7100718453099804, 'forward': 0.701999472669805}
weighted_uAUC:  0.643504

# repeated run of the baseline
{'read_comment': 0.6220072250372239, 'like': 0.6181791275945606, 'click_avatar': 0.7129768375663601, 'forward': 0.6987107057431032}
weighted_uAUC:  0.646723