In [2]:
import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
import utils
from sklearn.utils import shuffle
from metrics import gini_norm
from DeepFM import DeepFM
from DataReader import FeatureDictionary, DataParser
# Wrap the project's gini_norm metric as a scikit-learn scorer (higher is better,
# computed on predicted probabilities).
# NOTE(review): `needs_proba` appears to be deprecated in recent scikit-learn
# releases in favour of `response_method` — confirm against the installed version.
gini_scorer = make_scorer(gini_norm, greater_is_better=True, needs_proba=True)
from sklearn.metrics import accuracy_score

In [75]:
# Hyper-parameters forwarded verbatim to the DeepFM constructor.
# "feature_size" and "field_size" are intentionally absent here:
# _run_base_model_dfm derives them from the parsed data.
dfm_params = dict(
    # model structure
    use_fm=True,
    use_deep=True,
    embedding_size=8,
    dropout_fm=[1.0, 1.0],
    deep_layers=[32, 32],
    dropout_deep=[0.5, 0.5, 0.5],
    deep_layers_activation=tf.nn.relu,
    # optimisation
    epoch=10,
    batch_size=1024,
    learning_rate=0.001,
    optimizer_type="adam",
    # regularisation
    batch_norm=1,
    batch_norm_decay=0.995,
    l2_reg=0.01,
    # reporting
    verbose=True,
    eval_metric=gini_norm,
)


def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params,numeric_cols):
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,numeric_cols=numeric_cols)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest,has_label=True)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    dfm = DeepFM(**dfm_params)
    
    
    
    dfm.fit(Xi_train, Xv_train, y_train,early_stopping=True)
    pred = dfm.predict(Xi_test,Xv_test)
    print(len(y_test))
    
    y_pred = []
    for item in pred:
        if(item>=0.5):y_pred.append(1)
        else:y_pred.append(0)
    return y_test,y_pred

In [87]:

# Load the samples and rename the label / user-id columns to the names used
# by the cells below ('target' and 'id').
data = utils.generate_training_data()
data = data.rename(columns={'label': 'target', 'user_id': 'id'})
print('load data success')

# Shuffle with a fixed seed so the hold-out split is the same on every run
# (the same seed is used for the StratifiedKFold below).
data = shuffle(data, random_state=42)
print(data[:2])

# Positional 70/30 train/test split of the shuffled rows.
split_at = int(0.7 * data.shape[0])
train = data.iloc[:split_at]
test = data.iloc[split_at:]

y_train = train['target']
X_train = train.drop(['id', 'target'], axis=1)
# Every column other than the label, the id and click_article_id is passed
# to the model as a numeric feature.
numerical_line = train.drop(['target', 'id', 'click_article_id'], axis=1).columns.tolist()
folds = list(StratifiedKFold(n_splits=3, shuffle=True, random_state=42).split(X_train, y_train))

load data success
           id  click_article_id  target         0         1         2  \
11874  137616            289003       1 -0.866647 -0.977833  0.133551   
23165   75233            283009       0 -0.397132 -0.956699 -0.781852   

              3         4         5         6    ...          240       241  \
11874 -0.891685 -0.816009 -0.258980  0.375519    ...    -0.216694  0.613482   
23165 -0.662407 -0.906118 -0.844351  0.766803    ...    -0.384559 -0.411390   

            242       243       244       245       246       247       248  \
11874 -0.786207  0.271031  0.069902 -0.303642  0.458376  0.562723 -0.022085   
23165 -0.200674 -0.443566  0.227839  0.471119 -0.337462  0.723471  0.027905   

            249  
11874  0.775234  
23165  0.841124  

[2 rows x 253 columns]


In [88]:
# Distinct label values present in the training split.
train.loc[:, 'target'].unique()

array([1, 0])

In [93]:
# Fit DeepFM on the training split and collect labels / thresholded
# predictions for the held-out split.
y_true, y_pred = _run_base_model_dfm(
    dfTrain=train,
    dfTest=test,
    folds=folds,
    dfm_params=dfm_params,
    numeric_cols=numerical_line,
)

#params: 149120
[1] train-result=-0.0524 [14.1 s]
[2] train-result=-0.0394 [10.1 s]
[3] train-result=-0.0028 [9.8 s]
[4] train-result=0.0195 [10.4 s]
[5] train-result=0.0486 [10.3 s]
[6] train-result=0.0793 [10.1 s]
[7] train-result=0.1059 [10.4 s]
[8] train-result=0.1302 [10.3 s]
[9] train-result=0.1603 [9.8 s]
[10] train-result=0.1949 [10.3 s]
7800


In [94]:
# Share of held-out rows whose thresholded prediction equals the label.
accuracy_score(y_true=y_true, y_pred=y_pred)

0.767051282051282