# Model stage II

In [51]:
import pandas as pd
import dask.dataframe as dd
import xlearn as xl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss

### config

In [52]:
import json
# read the config file
with open('config.json') as f:
    config = json.load(f)

DATA_PATH = config['DATA_PATH']

### data management

In [53]:
tr_FE = dd.read_csv(DATA_PATH+'tr_FE.csv').compute()
features = dd.read_csv('feature.csv').compute()
feature_columns = features.head(30)['feature'].tolist()

X = tr_FE[feature_columns]
y = tr_FE['click']

X = X.astype({col: 'int32' for col in X.select_dtypes('bool').columns})


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## LR

In [42]:
skLR = LogisticRegression(max_iter=100,random_state=42)
skLR.fit(X_train,y_train)
# Predict the probabilities of the test set
y_pred_proba = skLR.predict_proba(X_test)[:, 1]

# Calculate the AUC
auc = roc_auc_score(y_test, y_pred_proba)
loss = log_loss(y_test,y_pred_proba)
print(f'The AUC of the model is {auc}')
print(f'The log loss is {loss}')

## FM

In [43]:
# TODO

In [44]:
import torch
from sklearn.preprocessing import MinMaxScaler
from lightfm import LightFM
from scipy.sparse import coo_matrix
from lightfm.evaluation import auc_score

In [47]:
# 保存为 libsvm 格式
X_train.to_csv('train.txt', sep=' ', index=False, header=False)
X_test.to_csv('test.txt', sep=' ', index=False, header=False)
y_train.to_csv('train_y.txt', sep=' ', index=False, header=False)
y_test.to_csv('test_y.txt', sep=' ', index=False, header=False)

# 创建 FM 模型
fm_model = xl.create_fm()
fm_model.setTrain('train.txt')
fm_model.setValidate('test.txt')
fm_model.setSigmoid()
fm_model.fit(param={'task': 'binary', 'lr': 0.2, 'lambda': 0.002, 'metric': 'auc','epoch': 100}, 
             model_path='model.out')

# 预测
fm_model.setTest('test.txt')
fm_model.predict('model.out', 'output.txt')

# 读取预测结果并计算 AUC 和 Log Loss
y_pred = pd.read_csv('output.txt', header=None)
auc = roc_auc_score(y_test, y_pred)
loss = log_loss(y_test, y_pred)
print("AUC: ", auc)
print("Log Loss: ", loss)

AUC:  0.5576091966467143
Log Loss:  1.2785696185925026


## DeepFM

In [46]:
from deepctr_torch.inputs import get_feature_names, DenseFeat
from deepctr_torch.models import DeepFM
from sklearn.preprocessing import MinMaxScaler

# Convert X and y into a DataFrame
df = pd.DataFrame(X)
df['target'] = y

dense_features = df.columns.tolist()
dense_features.remove('target')

# Preprocessing
mms = MinMaxScaler(feature_range=(0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

# Split the data
train, test = train_test_split(df, test_size=0.2)

# Generate feature columns
feature_columns = [DenseFeat(feat, 1,) for feat in dense_features]
feature_names = get_feature_names(feature_columns)

# Convert the data into model input
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# Create the model
model = DeepFM(feature_columns, feature_columns,task='binary',
                   l2_reg_embedding=1e-5, device=device)
model.compile("adam", "binary_crossentropy", metrics=['auc'])

# Train the model
history = model.fit(train_model_input, train['target'].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2)

# Predict the test data
y_pred = model.predict(test_model_input, batch_size=256)

# Calculate AUC
auc = roc_auc_score(test['target'].values, y_pred)
print("AUC: ", auc)

# Calculate log loss
loss = log_loss(test['target'].values, y_pred)
print("Log Loss: ", loss)


cpu
Train on 1293726 samples, validate on 323432 samples, 5054 steps per epoch
Epoch 1/10
25s - loss:  0.4338 - auc:  0.6538 - val_auc:  0.6725
Epoch 2/10
26s - loss:  0.4250 - auc:  0.6803 - val_auc:  0.6903


KeyboardInterrupt: 

## Wide & Deep

In [None]:
from deepctr_torch.models import WDL

# Convert X and y into a DataFrame
df = pd.DataFrame(X)
df['target'] = y

dense_features = df.columns.tolist()
dense_features.remove('target')

# Preprocessing
mms = MinMaxScaler(feature_range=(0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

# Split the data
train, test = train_test_split(df, test_size=0.2)

# Generate feature columns
feature_columns = [DenseFeat(feat, 1,) for feat in dense_features]
feature_names = get_feature_names(feature_columns)

# Convert the data into model input
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}


# Create the model
model = WDL(feature_columns, feature_columns,task='binary',
                   l2_reg_embedding=1e-5, device=device)
model.compile("adam", "binary_crossentropy", metrics=['auc'])

# Train the model
history = model.fit(train_model_input, train['target'].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2)

# Predict the test data
y_pred = model.predict(test_model_input, batch_size=256)

# Calculate AUC
auc = roc_auc_score(test['target'].values, y_pred)
print("AUC: ", auc)

# Calculate log loss
loss = log_loss(test['target'].values, y_pred)
print("Log Loss: ", loss)

## lightgbm

In [48]:
import lightgbm as lgb

# Train the model
dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(X_test, label=y_test, reference=dtrain)
param = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}
num_round = 100
bst = lgb.train(param, dtrain, num_round, valid_sets=[dtrain, dtest])

# Predict the test data
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# Calculate AUC
auc = roc_auc_score(y_test, y_pred)
print("AUC: ", auc)

# Calculate log loss
loss = log_loss(y_test, y_pred)
print("Log Loss: ", loss)


[LightGBM] [Info] Number of positive: 274523, number of negative: 1342635
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042662 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 989
[LightGBM] [Info] Number of data points in the train set: 1617158, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169756 -> initscore=-1.587354
[LightGBM] [Info] Start training from score -1.587354
AUC:  0.7255826761432671
Log Loss:  0.40886849827620014


## DCN（Deep & Cross Network）

In [49]:
from deepctr_torch.inputs import get_feature_names, SparseFeat, DenseFeat
from deepctr_torch.models import WDL, DCN, AutoInt
from sklearn.preprocessing import MinMaxScaler

# Convert X and y into a DataFrame
df = pd.DataFrame(X)
df['target'] = y

dense_features = df.columns.tolist()
dense_features.remove('target')

# Preprocessing
mms = MinMaxScaler(feature_range=(0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

# Split the data
train, test = train_test_split(df, test_size=0.2)

# Generate feature columns
feature_columns = [DenseFeat(feat, 1,) for feat in dense_features]
feature_names = get_feature_names(feature_columns)

# Convert the data into model input
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# Create the model
# model = WDL(feature_columns, feature_columns, task='binary')
model = DCN(feature_columns, feature_columns, task='binary')
# model = AutoInt(feature_columns, feature_columns, task='binary')

model.compile("adam", "binary_crossentropy", metrics=['auc'])

# Train the model
history = model.fit(train_model_input, train['target'].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2)

# Predict the test data
y_pred = model.predict(test_model_input, batch_size=256)

# Calculate AUC
auc = roc_auc_score(test['target'].values, y_pred)
print("AUC: ", auc)

# Calculate log loss
loss = log_loss(test['target'].values, y_pred)
print("Log Loss: ", loss)


cpu
Train on 1293726 samples, validate on 323432 samples, 5054 steps per epoch
Epoch 1/10
28s - loss:  0.4323 - auc:  0.6598 - val_auc:  0.6704
Epoch 2/10
27s - loss:  0.4258 - auc:  0.6782 - val_auc:  0.6836
Epoch 3/10
27s - loss:  0.4223 - auc:  0.6889 - val_auc:  0.6900
Epoch 4/10
28s - loss:  0.4195 - auc:  0.6972 - val_auc:  0.6969
Epoch 5/10
36s - loss:  0.4177 - auc:  0.7020 - val_auc:  0.7009
Epoch 6/10
39s - loss:  0.4165 - auc:  0.7050 - val_auc:  0.7060
Epoch 7/10
54s - loss:  0.4155 - auc:  0.7077 - val_auc:  0.7088
Epoch 8/10
49s - loss:  0.4146 - auc:  0.7100 - val_auc:  0.7096
Epoch 9/10
32s - loss:  0.4139 - auc:  0.7119 - val_auc:  0.7125
Epoch 10/10
52s - loss:  0.4133 - auc:  0.7134 - val_auc:  0.7133
AUC:  0.7144237708530117
Log Loss:  0.41167094065908016


IndexError: list index out of range