In [None]:
# Get Kaggle token and place a kaggle.json file and run this to connect
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list 

In [None]:
# Download the data from kaggle
! kaggle datasets list -s criteo
# ! kaggle datasets download -d mrkmakr/criteo-dataset 
! kaggle datasets download -d benediktschifferer/criteo-dataset-parquet
! mv criteo-dataset.zip data/
! mv criteo-dataset-parquet.zip data/

In [None]:
# Copy the test.txt.zip to the gcs (one time)
# I had to download the test.txt.zip from the 
# Note a big training set is here: https://ailab.criteo.com/download-criteo-1tb-click-logs-dataset/
# https://www.kaggle.com/code/rikdifos/criteo-ctr-baseline/input?select=dac
from google.cloud import storage

def upload_file_to_gcs(FILENAME):
    LOCAL_PATH ="./data"

    PROJECT_ID = !(gcloud config get-value core/project)
    PROJECT_ID = PROJECT_ID[0]
    REGION = 'us-west1'
    GCS_BUCKET = f"{PROJECT_ID}-bucket"
    client = storage.Client()
    bucket = client.get_bucket(GCS_BUCKET)
    blob = bucket.blob(f"criteo/{FILENAME}")
    logging.info('Uploading local csv file to GCS...')
    blob.upload_from_filename(f"{LOCAL_PATH}/{FILENAME}")

# Disabled uploads, kept for reference.
# upload_file_to_gcs("test.txt.zip")
# upload_file_to_gcs("dac_sample.tar.gz")
# Push both Kaggle archives from ./data to the bucket, in this order.
for archive_name in ("criteo-dataset.zip", "criteo-dataset-parquet.zip"):
    upload_file_to_gcs(archive_name)

In [None]:
!pip install deepctr_torch

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import gc

import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

# Use the ggplot look for every matplotlib figure in this notebook.
plt.style.use('ggplot')

import warnings as w
# Silence all warnings for the rest of the session.
w.filterwarnings('ignore')
# None removes the cap on how many DataFrame columns are displayed.
pd.options.display.max_columns = None

### Paper link: https://arxiv.org/pdf/1703.04247.pdf

!wget -P data -c https://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz

### Feature Explanation
   1. dense_feature : I1 ~ I13 (numerical)
   2. sparse_feature : C1 ~ C26 (categorical)

In [None]:
# Column layout of the file: the label, then I1..I13, then C1..C26.
columns = ['label']
columns += [f'I{idx}' for idx in range(1, 14)]
columns += [f'C{idx}' for idx in range(1, 27)]

# Tab-separated file read with the explicit column names above; every missing
# entry (in any column) is then replaced by 0.
data = pd.read_csv('data/dac_sample.txt', sep='\t', names=columns)
data = data.fillna(0)
data

In [None]:
# Total number of missing cells remaining in the frame (summed over all columns).
data.isnull().sum(axis=0).sum()

In [None]:
# Distribution of the target label: bar chart plus the raw counts.
label_counts = data['label'].value_counts()
label_counts.plot(kind='bar', figsize=(10, 8))
print(label_counts)

### Scale numerical features with MinMaxScaler and label-encode categorical features

In [None]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

# Preprocessing: min-max scale the numeric columns and label-encode the
# categorical ones, writing both back into `data`.
scaler = MinMaxScaler(feature_range=(0,1))
labeling = LabelEncoder()

# Positions 1-13 hold I1..I13 (numeric), positions 14-39 hold C1..C26 (categorical).
numeric_cols = list(data.columns[1:14])
categorical_cols = list(data.columns[14:40])

# Scale the numeric block once. Assigning by column name replaces the columns,
# so the result is stored as float whatever dtype the column had before.
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# NOTE: despite its name, `dense_feature` holds the categorical (sparse) columns;
# the name is kept so that any other cell referring to it still works.
# .copy() gives an independent frame, so the assignments below are not chained
# assignments into a slice of `data`.
dense_feature = data[categorical_cols].copy()
for feature in categorical_cols:
    values = dense_feature[feature].astype(object)
    # Missing entries were filled with the integer 0 when the file was loaded;
    # swap them for the placeholder string 'a' so the column holds only strings,
    # which LabelEncoder needs in order to sort the categories.
    values = values.mask(values == 0, 'a').astype(str)
    dense_feature[feature] = labeling.fit_transform(values)

# Column-wise assignment replaces the object columns with the integer codes
# (an iloc slice assignment may keep the old object dtype on recent pandas).
data[categorical_cols] = dense_feature


In [None]:
if __name__ == "__main__":
    # 1. Column groups: C1..C26 are fed as sparse features, I1..I13 as dense ones.
    sparse_features = [f'C{idx}' for idx in range(1, 27)]
    dense_features = [f'I{idx}' for idx in range(1, 14)]

    target = ['label']

    # 2. One SparseFeat per categorical column (sized by its number of distinct
    #    values), followed by one 1-dimensional DenseFeat per numeric column.
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    fixlen_feature_columns += [DenseFeat(feat, 1) for feat in dense_features]

    # The linear part and the DNN part receive the same feature columns.
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns
    )

    # 3. Build the model inputs: an 80/20 train/test split with a fixed seed,
    #    then one entry per feature name for each split.
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, train it, predict on the test split and evaluate.
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'
    else:
        device = 'cpu'

    model = DeepFM(
        linear_feature_columns=linear_feature_columns,
        dnn_feature_columns=dnn_feature_columns,
        task='binary',
        l2_reg_embedding=1e-5,
        device=device,
    )

    model.compile(
        "adagrad",
        "binary_crossentropy",
        metrics=["binary_crossentropy", "auc"],
    )

    # `history` is read by the plotting cells further down.
    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=512,
        epochs=50,
        verbose=1,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, 256)
    print("")
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

In [None]:
# Turn the metrics recorded by model.fit into a DataFrame, one column per metric.
loss = pd.DataFrame(history.history)
display(loss.head())

# Training vs. validation binary cross-entropy on one set of axes.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(loss['binary_crossentropy'], label='Train Loss')
ax.plot(loss['val_binary_crossentropy'], label='Validation Loss')
ax.legend()
plt.show()

In [None]:
# Training vs. validation AUC, read from the `loss` history frame.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(loss['auc'], label='Train AUC')
ax.plot(loss['val_auc'], label='Validation AUC')
ax.legend()
plt.show()

### Why does the model overfit on this dataset?
### On the MovieLens dataset, however, the same DeepFM performs well
### MovieLens: https://www.kaggle.com/code/leejunseok97/deepfm-movie-len-pytorch

In [None]:
# class FM(nn.Module):
#     def __init__(self):
#         super().__init__()
    
#     def forward(self,x):
#         ix = (torch.sum(x,dim=1) ** 2) - (torch.sum(x ** 2,dim=1))
#         ix = torch.sum(ix,dim=1,keepdim=True)
#         return 0.5 *  ix

In [None]:
# class feat_linear(nn.Module):
#     def __init__(self,field_dim_list):
#         super().__init__()
#         self.fc = nn.Embedding(sum(field_dim_list),1)
#         self.bias = nn.Parameter(torch.zeros((1,)))
#         self.offsets = np.array((0, *np.cumsum(field_dim_list)[:-1]),dtype=np.long)
#     def forward(self,x):
#         x += x.new_tensor(self.offsets).unsqueeze(0)
#         return torch.sum(self.fc(x),dim=1) + self.bias

In [None]:
# class feat_embedding(nn.Module):
#     def __init__(self,field_dim_list,emb_dim):
#         super().__init__()
#         self.embedding = nn.Embedding(sum(field_dim_list),emb_dim)
#         self.offsets = np.array((0, *np.cumsum(field_dim_list)[:-1]),dtype=np.long)
#         nn.init.xavier_uniform_(self.embedding.weight.data)
#     def forward(self,x):
#         x += x.new_tensor(self.offsets).unsqueeze(0)
#         return self.embedding(x)

In [None]:
# class MLP(nn.Module):
#     def __init__(self,input_dim,embed_dim):
#         super().__init__()
#         self.seq = nn.Sequential(
#             nn.Linear(input_dim,embed_dim),
#             nn.BatchNorm1d(embed_dim),
#             nn.Hardswish(),
#             nn.Dropout(0.5),
#             nn.Linear(embed_dim,1)
#         )
#     def forward(self,x):
#         return self.seq(x)

In [None]:
# class DeepFM(nn.Module):
#     def __init__(self,field_dim_list,embed_dim,mlp_dims):
#         super().__init__()
#         self.linear = feat_linear(field_dim_list)
#         self.fm = FM()
#         self.embedding = feat_embedding(field_dim_list,emb_dim=embed_dim)
#         self.embed_output_dim = len(field_dim_list) * embed_dim
#         self.mlp = MLP(self.embed_output_dim , mlp_dims)
#     def forward(self,x):
#         embed_x = self.embedding(x)
#         output = self.linear(x) + self.fm(embed_x) + self.mlp(embed_x.view(-1,self.embed_output_dim))
#         return torch.sigmoid(output.squeeze(1))