In [7]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

In [8]:
import tensorflow as tf

In [9]:
df = pd.read_csv("features_63_label_2.csv").drop(["Unnamed: 0"], axis=1)

In [10]:
df_user_id = df["user_id"]
df_label = df["award_id"].factorize()[0]
df.drop(["user_id", "award_id"], axis=1, inplace=True)

In [11]:
dense_mapping = {
    "专业选修课成绩": "po_grade",
    "专业选修课学分": "po_credit",
    "专业必修课成绩": "pm_grade",
    "专业必修课学分": "pm_credit",
    "选修课成绩": "op_grade",
    "选修课学分": "op_credit",
    "公共必修课成绩": "cm_grade",
    "公共必修课学分": "cm_credit",
    "未分类课成绩": "uc_grade",
    "未分类课学分": "uc_credit",
    "总成绩": "tl_grade", 
    "总学分": "tl_credit",
}

In [12]:
df.rename(columns=dense_mapping, inplace=True)

In [13]:
df.columns

Index(['paper_count', 'paper_authOrder_0', 'paper_conferenceChoice_0',
       'paper_paperState_0', 'paper_labels_0', 'paper_authOrder_1',
       'paper_conferenceChoice_1', 'paper_paperState_1', 'paper_labels_1',
       'paper_authOrder_2', 'paper_conferenceChoice_2', 'paper_paperState_2',
       'paper_labels_2', 'patent_record_count', 'patent_record_classify_0',
       'patent_record_classify_1', 'patent_record_classify_2',
       'research_award_count', 'research_award_competeRange_0',
       'research_award_awardLevel_0', 'research_award_labels_0',
       'research_award_competeRange_1', 'research_award_awardLevel_1',
       'research_award_labels_1', 'research_award_competeRange_2',
       'research_award_awardLevel_2', 'research_award_labels_2',
       'research_project_count', 'research_project_projectType_0',
       'research_project_projectType_1', 'research_project_projectType_2',
       'competition_award_count', 'competition_award_competeRange_0',
       'competition_award

In [14]:
col = list(df.columns)

In [15]:
sparse_features = col[: -12]

In [16]:
sparse_features

['paper_count',
 'paper_authOrder_0',
 'paper_conferenceChoice_0',
 'paper_paperState_0',
 'paper_labels_0',
 'paper_authOrder_1',
 'paper_conferenceChoice_1',
 'paper_paperState_1',
 'paper_labels_1',
 'paper_authOrder_2',
 'paper_conferenceChoice_2',
 'paper_paperState_2',
 'paper_labels_2',
 'patent_record_count',
 'patent_record_classify_0',
 'patent_record_classify_1',
 'patent_record_classify_2',
 'research_award_count',
 'research_award_competeRange_0',
 'research_award_awardLevel_0',
 'research_award_labels_0',
 'research_award_competeRange_1',
 'research_award_awardLevel_1',
 'research_award_labels_1',
 'research_award_competeRange_2',
 'research_award_awardLevel_2',
 'research_award_labels_2',
 'research_project_count',
 'research_project_projectType_0',
 'research_project_projectType_1',
 'research_project_projectType_2',
 'competition_award_count',
 'competition_award_competeRange_0',
 'competition_award_awardLevel_0',
 'competition_award_labels_0',
 'competition_award_comp

In [17]:
dense_features = col[-12: ]

In [18]:
dense_features

['po_grade',
 'po_credit',
 'pm_grade',
 'pm_credit',
 'op_grade',
 'op_credit',
 'cm_grade',
 'cm_credit',
 'uc_grade',
 'uc_credit',
 'tl_grade',
 'tl_credit']

In [19]:
new_sparse = []
for k in sparse_features:
    if "count" in k:
        dense_features.append(k)
    else:
        new_sparse.append(k)

In [20]:
dense_features

['po_grade',
 'po_credit',
 'pm_grade',
 'pm_credit',
 'op_grade',
 'op_credit',
 'cm_grade',
 'cm_credit',
 'uc_grade',
 'uc_credit',
 'tl_grade',
 'tl_credit',
 'paper_count',
 'patent_record_count',
 'research_award_count',
 'research_project_count',
 'competition_award_count',
 'honor_title_count',
 'other_achievements_count']

In [21]:
sparse_features = new_sparse

In [22]:
for feat in sparse_features:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat])
mms = MinMaxScaler(feature_range=(0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

In [23]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique(),embedding_dim=4 )
                           for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                          for feat in dense_features]

In [24]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [25]:
train_x, test_x, train_y, test_y = train_test_split(df, df_label, test_size=0.2, random_state=2021)

In [26]:
train_model_input = {name:train_x[name] for name in feature_names}
test_model_input = {name:test_x[name] for name in feature_names}

In [53]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [63]:
history = model.fit(train_model_input, train_y,
                        batch_size=256, epochs=40, verbose=2, validation_split=0.2, )

Train on 80 samples, validate on 20 samples
Epoch 1/40
 - 0s - loss: 0.5975 - binary_crossentropy: 0.5974 - val_loss: 0.5178 - val_binary_crossentropy: 0.5178
Epoch 2/40
 - 0s - loss: 0.5920 - binary_crossentropy: 0.5920 - val_loss: 0.5045 - val_binary_crossentropy: 0.5045
Epoch 3/40
 - 0s - loss: 0.5881 - binary_crossentropy: 0.5881 - val_loss: 0.4937 - val_binary_crossentropy: 0.4937
Epoch 4/40
 - 0s - loss: 0.5853 - binary_crossentropy: 0.5853 - val_loss: 0.4851 - val_binary_crossentropy: 0.4851
Epoch 5/40
 - 0s - loss: 0.5831 - binary_crossentropy: 0.5831 - val_loss: 0.4786 - val_binary_crossentropy: 0.4786
Epoch 6/40
 - 0s - loss: 0.5810 - binary_crossentropy: 0.5810 - val_loss: 0.4738 - val_binary_crossentropy: 0.4738
Epoch 7/40
 - 0s - loss: 0.5785 - binary_crossentropy: 0.5785 - val_loss: 0.4705 - val_binary_crossentropy: 0.4705
Epoch 8/40
 - 0s - loss: 0.5753 - binary_crossentropy: 0.5753 - val_loss: 0.4685 - val_binary_crossentropy: 0.4685
Epoch 9/40
 - 0s - loss: 0.5712 - bi

In [64]:
pred_ans = model.predict(test_model_input, batch_size=256)

In [66]:
print("test LogLoss", round(log_loss(test_y, pred_ans), 4))
print("test AUC", round(roc_auc_score(test_y, pred_ans), 4))

test LogLoss 0.2527
test AUC 0.9875


In [67]:
pred_ans

array([[0.18658972],
       [0.95354056],
       [0.7153368 ],
       [0.2928978 ],
       [0.97949106],
       [0.7172941 ],
       [0.6912255 ],
       [0.2926982 ],
       [0.9644394 ],
       [0.07822895],
       [0.9675057 ],
       [0.31998906],
       [0.08443311],
       [0.06782943],
       [0.37820196],
       [0.16298962],
       [0.09874278],
       [0.1546762 ],
       [0.06636116],
       [0.14010528],
       [0.19755483],
       [0.21082819],
       [0.19293353],
       [0.0907647 ],
       [0.8614922 ],
       [0.16081329]], dtype=float32)

In [75]:
len([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

26

In [77]:
len(test_y)

26

In [78]:
from sklearn.metrics import accuracy_score

accuracy_score([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], test_y)

0.9230769230769231

In [153]:
from tensorflow.keras.utils import plot_model

plot_model(model, "DeepFM_model.png")

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.755888 to fit



In [2]:
from tensorflow.python.keras.models import  save_model,load_model

In [88]:
save_model(model, 'DeepFM.h5')

In [4]:
from deepctr.layers import custom_objects

In [5]:
model12 = load_model('DeepFM.h5',custom_objects)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [27]:
model12.predict(test_model_input)



array([[0.18658975],
       [0.95354056],
       [0.7153368 ],
       [0.29289782],
       [0.9794911 ],
       [0.7172941 ],
       [0.6912255 ],
       [0.29269814],
       [0.9644394 ],
       [0.07822895],
       [0.9675057 ],
       [0.31998906],
       [0.08443311],
       [0.06782943],
       [0.37820196],
       [0.16298962],
       [0.09874278],
       [0.1546762 ],
       [0.06636116],
       [0.1401053 ],
       [0.19755483],
       [0.21082821],
       [0.19293362],
       [0.0907647 ],
       [0.8614922 ],
       [0.16081326]], dtype=float32)

In [28]:
predict = []
for l in model12.predict(test_model_input):
    if l >= 0.5:
        predict.append(1)
    else:
        predict.append(0)



In [29]:
from sklearn.metrics import accuracy_score

accuracy_score(predict, test_y)

0.9230769230769231

In [30]:
predicted = model12.predict(train_model_input)



In [31]:
predict = []
for l in model12.predict(train_model_input):
    if l >= 0.5:
        predict.append(1)
    else:
        predict.append(0)



In [32]:
from sklearn.metrics import accuracy_score

accuracy_score(predict, train_y)

0.82

### model_nulti

In [143]:
model_multi = DeepFM(linear_feature_columns, dnn_feature_columns, task='multiclass')

In [104]:
import tensorflow as tf

In [None]:
tf.keras.losses.sparse_categorical_crossentropy

In [None]:
tf.keras.optimizers.SGD

In [None]:
tf.keras.metrics.categorical_crossentropy

In [144]:
model_multi.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )

In [113]:
df_3 = pd.read_csv("features_63.csv").drop(["Unnamed: 0"], axis=1)

In [116]:
df_3.groupby("user_id")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe923701350>

In [131]:
labels_grt_10 = [i[0] for i in list(zip([i for i in range(0, 9)], list(df_3.groupby("award_id")['award_id'].count() > 20))) if i[1]]

In [132]:
labels_grt_10

[1, 2, 3]

In [147]:
df_4 = pd.DataFrame(columns=df_3.columns)
for index, row in df_3.iterrows():
    if row["award_id"] in labels_grt_10:
        df_4 = df_4.append(row, ignore_index=True)

In [148]:
df_user_id_4 = df_4["user_id"]
df_label_4 = df_4["award_id"].factorize()[0]
df_4.drop(["user_id", "award_id"], axis=1, inplace=True)
df_4.rename(columns=dense_mapping, inplace=True)
for feat in sparse_features:
        lbe = LabelEncoder()
        df_4[feat] = lbe.fit_transform(df_4[feat])
mms = MinMaxScaler(feature_range=(0, 1))
df_4[dense_features] = mms.fit_transform(df_4[dense_features])
fixlen_feature_columns_4 = [SparseFeat(feat, vocabulary_size=df_4[feat].nunique(),embedding_dim=4 )
                           for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                          for feat in dense_features]
dnn_feature_columns_4 = fixlen_feature_columns_4
linear_feature_columns_4 = fixlen_feature_columns_4
feature_names_4 = get_feature_names(linear_feature_columns_4 + dnn_feature_columns_4)
train_x_4, test_x_4, train_y_4, test_y_4 = train_test_split(df_4, df_label_4, test_size=0.2, random_state=2021)
train_model_input_4 = {name:train_x_4[name] for name in feature_names}
test_model_input_4 = {name:test_x_4[name] for name in feature_names}

In [136]:
from tensorflow.python.keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='binary_crossentropy')

In [137]:
history

<tensorflow.python.keras.callbacks.History at 0x7fe94154fd50>

In [154]:
history2 = model_multi.fit(train_model_input_4, train_y_4,
                        batch_size=256, epochs=40, verbose=2, validation_split=0.2, )

Train on 97 samples, validate on 25 samples
Epoch 1/40


InvalidArgumentError: indices[18,0] = 4 is not in [0, 4)
	 [[{{node sparse_emb_paper_authOrder_0_4/embedding_lookup}}]]