<a href="https://colab.research.google.com/github/SeohuiPark/MLDLstudy/blob/main/deepfm_avazudata_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

참고 link code: 

https://deepctr-torch.readthedocs.io/en/latest/Examples.html

https://www.kaggle.com/sagu123/ctr-analysis-ipynb

https://github.com/shenweichen/DeepCTR-Torch



* 전체 데이터 40,428,967건(약 4,000만 건) - Colab에서는 전체를 불러올 수 없음
* 10만개만 샘플링한 후 load

In [1]:
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Stretch the notebook's `.container` element to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Pin the version that the recorded run installed (0.2.7) so a fresh runtime
# reproduces it; %pip targets the environment of the running kernel.
%pip install -q deepctr-torch==0.2.7

Collecting deepctr-torch
  Downloading deepctr_torch-0.2.7-py3-none-any.whl (70 kB)
[?25l[K     |████▊                           | 10 kB 24.6 MB/s eta 0:00:01[K     |█████████▍                      | 20 kB 26.0 MB/s eta 0:00:01[K     |██████████████                  | 30 kB 27.9 MB/s eta 0:00:01[K     |██████████████████▊             | 40 kB 30.9 MB/s eta 0:00:01[K     |███████████████████████▍        | 51 kB 23.6 MB/s eta 0:00:01[K     |████████████████████████████    | 61 kB 26.3 MB/s eta 0:00:01[K     |████████████████████████████████| 70 kB 5.0 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 48.2 MB/s 
Installing collected packages: tf-estimator-nightly, deepctr-torch
Successfully installed deepctr-torch-0.2.7 tf-estimator-nightly-2.8.0.dev2021122109


In [3]:
import os
import gzip
import shutil
import glob

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [4]:
def data_load(data_path="/content/drive/MyDrive/Colab Notebooks/2022_recom_study/ctr_sample_dataset/abazu_dataset/",
              file_name="avazu_sample_10.csv"):
    """Read the sampled CSV into a DataFrame and print a short preview.

    Parameters
    ----------
    data_path : str
        Directory that contains the CSV. The default is the location the
        notebook was originally run from (presumably a mounted Google Drive).
    file_name : str
        Name of the CSV file inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The frame exactly as returned by ``pd.read_csv``.
    """
    print("\n\n1. data load ")
    # os.path.join works whether or not data_path ends with a separator.
    data = pd.read_csv(os.path.join(data_path, file_name))
    display(data.head(3))
    print(data.columns)
    print(data.shape)
    return data

In [5]:
def feature_selection(data, target='click', dense_features=('hour',), exclude=()):
    """Split the columns of ``data`` into sparse and dense feature lists.

    Every column that is not the target, not a dense feature and not listed
    in ``exclude`` is treated as a sparse (categorical) feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is returned unchanged.
    target : str
        Name of the label column.
    dense_features : iterable of str
        Columns to treat as dense (numeric) features.
    exclude : iterable of str
        Extra columns to leave out of the sparse list. Empty by default, so
        every remaining column (identifier-like ones included) is kept.

    Returns
    -------
    tuple
        ``(data, sparse_features, dense_features)`` with both feature
        collections as lists.

    Raises
    ------
    ValueError
        If the target or any dense feature is not a column of ``data``.
    """
    print("\n\n2. feature selection ")

    dense_features = list(dense_features)
    required = [target] + dense_features
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise ValueError(f"columns missing from data: {missing}")

    not_sparse = set(required) | set(exclude)
    sparse_features = [col for col in data.columns.tolist() if col not in not_sparse]

    print("sparse feature :", sparse_features)
    print("dense feature :", dense_features)
    print("target :", target)

    return data, sparse_features, dense_features

In [6]:
def feature_encoding(data, sparse_features, dense_features):
    """Label-encode the sparse columns and min-max scale the dense columns.

    The columns are overwritten in ``data`` itself (the caller's frame is
    modified) and the same frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding all listed columns.
    sparse_features : list of str
        Columns replaced by the integer codes of a per-column ``LabelEncoder``.
    dense_features : list of str
        Columns rescaled together to the range [0, 1]. May be empty, in which
        case no scaler is fitted.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the listed columns replaced.
    """
    print("\n\n3-1. feature encoding ")
    print("categorical value to numeric label")
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    print("numeric value Minmax scaling ")
    # Skip scaling when there is nothing to scale: fitting the scaler on a
    # zero-column selection is rejected by scikit-learn's input validation.
    if len(dense_features) > 0:
        # Scaling is monotonic, so for a date-like column a more recent date
        # still maps to a larger value.
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])

    return data

In [7]:
def feature_format_deepfm(data, sparse_features, dense_features, embedding_dim):
    """Describe the model inputs as DeepCTR feature columns.

    One ``SparseFeat`` is created per sparse column, with a vocabulary size of
    that column's maximum value plus one (assumes the column already holds
    integer codes starting at 0 -- TODO confirm against the encoding step),
    followed by one single-dimension ``DenseFeat`` per dense column.

    Returns ``(dnn_feature_columns, linear_feature_columns, feature_names)``;
    the first two are the same list object.
    """
    print(f"\n\n3-2. feature embedding - embedding size {embedding_dim}")

    feature_columns = []
    for name in sparse_features:
        n_categories = data[name].max() + 1
        feature_columns.append(
            SparseFeat(name, vocabulary_size=n_categories, embedding_dim=embedding_dim))
    for name in dense_features:
        feature_columns.append(DenseFeat(name, 1))

    # The linear part and the deep part share the same column definitions.
    names = get_feature_names(feature_columns + feature_columns)
    return feature_columns, feature_columns, names

In [8]:
def data_split(data, test_rato, feature_names, random_seed):
    """Split ``data`` into train and test parts and build the model input dicts.

    ``test_rato`` is handed to ``train_test_split`` as ``test_size`` and
    ``random_seed`` as ``random_state``. Returns
    ``(train, test, train_model_input, test_model_input)`` where each model
    input maps every name in ``feature_names`` to that column of the
    corresponding part.
    """
    print(f"\n\n4. data split (test ratio - {test_rato})")

    def _as_model_input(frame):
        # one entry per feature name, holding that column of `frame`
        return dict((col, frame[col]) for col in feature_names)

    train_part, test_part = train_test_split(
        data, random_state=random_seed, test_size=test_rato)
    return train_part, test_part, _as_model_input(train_part), _as_model_input(test_part)

In [9]:
def modeling(linear_feature_columns, dnn_feature_columns,
             batch_size, num_epoch, val_ratio, test_rato, l2_decay_val, random_seed,
             device=None):
    """Build and compile a DeepFM model for binary classification.

    Parameters
    ----------
    linear_feature_columns, dnn_feature_columns
        Feature column definitions handed to ``DeepFM``.
    batch_size, num_epoch, val_ratio, test_rato
        Accepted for call compatibility; not used inside this function.
    l2_decay_val : float
        Used for ``l2_reg_linear``, ``l2_reg_embedding`` and ``l2_reg_dnn``.
    random_seed : int
        Passed to ``DeepFM`` as ``seed``.
    device : str, optional
        Device string passed to ``DeepFM``. When omitted, the module-level
        ``device`` variable is used if it exists (the previous implicit
        behaviour), otherwise ``'cpu'``.

    Returns
    -------
    The compiled model (Adam optimizer, binary cross-entropy loss, metrics
    ``binary_crossentropy`` and ``auc``).
    """
    print(f"\n\n5. Modeling")
    if device is None:
        # Keep working for callers that only set a notebook-level `device`.
        device = globals().get("device", "cpu")

    model = DeepFM(linear_feature_columns=linear_feature_columns,
               dnn_feature_columns=dnn_feature_columns,
               l2_reg_linear=l2_decay_val, l2_reg_embedding=l2_decay_val, l2_reg_dnn=l2_decay_val,
               dnn_dropout=0.5,
               dnn_use_bn = True,
               dnn_hidden_units=(32, 16),
               task='binary',
               seed=random_seed, device=device)

    model.compile("adam", "binary_crossentropy",
                metrics=["binary_crossentropy", "auc"], )

    return model


In [10]:
def eval_test(model, test_model_input, batch_size, y_true=None):
    """Predict on the test inputs and print log loss and AUC.

    Parameters
    ----------
    model
        Object with a ``predict(inputs, batch_size)`` method.
    test_model_input
        Inputs passed to ``model.predict``.
    batch_size : int
        Batch size passed to ``model.predict``.
    y_true : array-like, optional
        Ground-truth labels. When omitted, the module-level ``test`` frame and
        ``target`` column list are used (the previous implicit behaviour).

    Returns
    -------
    dict
        ``{"logloss": ..., "auc": ...}``, both rounded to 4 decimals.
    """
    print(f"\n\n6. Evaluation testset")
    pred_ans = model.predict(test_model_input, batch_size) #batch_size default : 256
    if y_true is None:
        # Legacy path: labels come from notebook-level variables.
        y_true = test[target].values
    print("")
    test_logloss = round(log_loss(y_true, pred_ans), 4)
    print("test LogLoss", test_logloss)
    test_auc = round(roc_auc_score(y_true, pred_ans), 4)
    print("test AUC", test_auc)
    return {"logloss": test_logloss, "auc": test_auc}

## 4. modeling

In [11]:
# Entry point: set the hyperparameters, then run the pipeline end to end
# (load -> select features -> encode -> feature columns -> split -> train -> evaluate).
if __name__ == "__main__":
    # Hyperparameters. `val_ratio` is used as validation_split in model.fit,
    # `test_rato` as the test share in data_split, and `l2_decay_val` is passed
    # on to modeling().
    batch_size = 1000
    num_epoch = 20
    val_ratio = 0.1
    test_rato = 0.1
    random_seed = 2022
    l2_decay_val = 1e-01
    embedding_dim = 5

    # Use the first CUDA device when requested and available, otherwise the CPU.
    # NOTE: `device` is a module-level name here and modeling() is called below
    # without a device argument, so it relies on this variable.
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'


    data = data_load()
    # Label column(s), kept as a list so train[target].values is 2-D.
    target = ['click']

    data, sparse_features, dense_features = feature_selection(data)
    # Encoders/scaler are fitted on the full frame, i.e. before the split below.
    data = feature_encoding(data, sparse_features, dense_features)

    dnn_feature_columns, linear_feature_columns, feature_names = feature_format_deepfm(data, sparse_features, dense_features, embedding_dim)

    # NOTE: `test` and `target` are module-level names; eval_test() is called
    # below without labels, so it relies on them.
    train, test, train_model_input, test_model_input = data_split(data, test_rato, 
                                                                  feature_names, random_seed)

    model = modeling(linear_feature_columns, dnn_feature_columns,
             batch_size, num_epoch, val_ratio, test_rato, l2_decay_val, random_seed)
    
    # A `val_ratio` share of the training part is held out for validation by fit.
    model.fit(train_model_input, train[target].values,
            batch_size=batch_size, epochs=num_epoch, verbose=2, validation_split=val_ratio)
    
    eval_test(model, test_model_input, batch_size)



cuda ready...


1. data load 


Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,3.572791e+18,0,14102518,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,...,1,0,18854,320,50,1882,3,35,-1,13
1,3.299518e+18,0,14102404,1005,0,d9750ee7,98572c79,f028772b,ecad2386,7801e8d9,...,1,0,21153,320,50,2420,2,35,-1,69
2,3.990806e+18,0,14102907,1005,0,517b8671,ac5abf20,f028772b,ecad2386,7801e8d9,...,1,0,23642,320,50,2709,3,35,-1,23


Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')
(100000, 24)


2. feature selection 
sparse feature : ['id', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
dense feature : ['hour']
target : click


3-1. feature encoding 
categorical value to numeric label
numeric value Minmax scaling 


3-2. feature embedding - embedding size 5


4. data split (test ratio - 0.1)


5. Modeling
cuda:0
Train on 81000 samples, validate on 9000 samples, 81 steps per epoch
Epoch 1/20
2s - loss:  0.4807 - binary_crossentropy:  0.4794 - auc:  0.6223 - val_binary_cross

## Reverse Engineering - for studying

In [None]:
class FM(torch.nn.Module):
    """Factorization Machine layer: pairwise (order-2) feature interactions
    without the linear term and bias.

    For every sample it computes, summed over the embedding dimension ``k``::

        0.5 * sum_k [ (sum_i v_ik)^2 - sum_i v_ik^2 ]

    which equals the sum of dot products over all pairs of field embeddings.

    The base class is spelled ``torch.nn.Module`` so this cell only needs
    ``import torch`` and does not depend on an ``nn`` alias being in scope.

      Input shape
        - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``.
      Output shape
        - 2D tensor with shape: ``(batch_size, 1)``.
      References
        - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
    """

    def __init__(self):
        super(FM, self).__init__()

    def forward(self, inputs):
        """Return the second-order interaction term for ``inputs``."""
        fm_input = inputs

        # (sum over fields)^2, kept as (batch, 1, embedding)
        square_of_sum = torch.pow(torch.sum(fm_input, dim=1, keepdim=True), 2)
        # sum over fields of squared entries, same shape
        sum_of_square = torch.sum(fm_input * fm_input, dim=1, keepdim=True)
        cross_term = square_of_sum - sum_of_square
        # reduce the embedding axis; the kept field axis of size 1 gives (batch, 1)
        cross_term = 0.5 * torch.sum(cross_term, dim=2, keepdim=False)

        return cross_term


In [None]:
import torch
import torch.nn as nn
from deepctr_torch.models.basemodel import BaseModel
from deepctr_torch.inputs import combined_dnn_input
from deepctr_torch.layers import FM, DNN

class DeepFM(BaseModel):
    """Instantiates the DeepFM Network architecture.

    The prediction logit is the sum of up to three parts: the linear model of
    the base class, the FM second-order term over the sparse embeddings, and a
    DNN followed by a bias-free linear layer.

    NOTE(review): defining this class rebinds the name ``DeepFM`` in the
    notebook namespace -- presumably intended for study only; confirm before
    relying on which implementation is in use.

    :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
        (Passed to the base class; in this notebook the full feature list is used.)

    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
        (Also the source of the sparse embeddings fed to the FM term; in this notebook the full feature list is used.)
    :param use_fm: bool,use FM part or not

    :param dnn_hidden_units: list,list of positive integer or empty list,
       the layer number and units in each layer of DNN (default ``(256, 128)``)
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate (default 0).
    :param dnn_activation: Activation function to use in DNN (default ``'relu'``)
    :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN (default False)


    :param l2_reg_linear: float. L2 regularizer strength applied to linear part (default 1e-5)
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector (default 1e-5)
    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN (default 0)

    :param init_std: float,to use as the initialize std of embedding vector (default 1e-4)
    :param seed: integer ,to use as random seed (default 1024).

    :param task: str, ``"binary"`` for  binary logloss or  ``"regression"`` for regression loss
    :param device: str, ``"cpu"`` or ``"cuda:0"``
    :param gpus: list of int or torch.device for multiple gpus. If None, run on `device`. `gpus[0]` should be the same gpu with `device`.
    :return: A PyTorch model instance.
    """

    def __init__(self,
                 linear_feature_columns, dnn_feature_columns, use_fm=True,
                 dnn_hidden_units=(256, 128),
                 l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024,
                 dnn_dropout=0,
                 dnn_activation='relu', dnn_use_bn=False, task='binary', device='cpu', gpus=None):

        super(DeepFM, self).__init__(linear_feature_columns, dnn_feature_columns, l2_reg_linear=l2_reg_linear,
                                     l2_reg_embedding=l2_reg_embedding, init_std=init_std, seed=seed, task=task,
                                     device=device, gpus=gpus)

        self.use_fm = use_fm
        # The DNN part is only built when there are DNN features and at least one hidden layer.
        self.use_dnn = len(dnn_feature_columns) > 0 and len(
            dnn_hidden_units) > 0
        
        if use_fm: ### instantiate the FM layer
            self.fm = FM()

        if self.use_dnn: ### declare the modules used by the DNN part
            self.dnn = DNN(self.compute_input_dim(dnn_feature_columns), dnn_hidden_units,
                           activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, use_bn=dnn_use_bn,
                           init_std=init_std, device=device)
            # Projects the last hidden layer to a single logit (no bias).
            self.dnn_linear = nn.Linear(
                dnn_hidden_units[-1], 1, bias=False).to(device)

            # Register DNN weights for L2 regularization, skipping parameters whose
            # name contains 'bn' and anything that is not a 'weight'.
            self.add_regularization_weight(
                filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2=l2_reg_dnn)
            self.add_regularization_weight(self.dnn_linear.weight, l2=l2_reg_dnn)
        self.to(device)

    def forward(self, X): ### forward pass (used during training and prediction)

        # Embeddings of the sparse features and raw values of the dense features,
        # both taken from the DNN feature columns.
        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
                                                                                  self.embedding_dict)
        ## 1) FM computation
        ### 1.1) pass through the linear model
        logit = self.linear_model(X) ## start the logit with the linear part
        
        ### 1.2) FM term (pairwise products between feature embeddings)
        if self.use_fm and len(sparse_embedding_list) > 0: 
            # presumably each embedding is (batch, 1, embedding_dim), so the concat
            # gives (batch, field, embedding_dim) -- TODO confirm against BaseModel
            fm_input = torch.cat(sparse_embedding_list, dim=1)
            logit += self.fm(fm_input) ## add the FM result

        ## 2) DNN part
        if self.use_dnn: 
            dnn_input = combined_dnn_input(
                sparse_embedding_list, dense_value_list) 
            dnn_output = self.dnn(dnn_input)
            dnn_logit = self.dnn_linear(dnn_output)
            logit += dnn_logit ## add the DNN result

        # Output layer of the base class, applied to the summed logit.
        y_pred = self.out(logit)

        return y_pred