# Model stage II

In [8]:
import pandas as pd
import dask.dataframe as dd
import xlearn as xl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss

### config

In [9]:
import json
# read the config file
with open('config.json') as f:
    config = json.load(f)

DATA_PATH = config['DATA_PATH']

### data management

In [10]:
tr_FE = dd.read_csv(DATA_PATH+'tr_FE.csv').compute()
features = dd.read_csv('feature.csv').compute()
feature_columns = features.head(30)['feature'].tolist()

X = tr_FE[feature_columns]
y = tr_FE['click']

X = X.astype({col: 'int32' for col in X.select_dtypes('bool').columns})


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## LR

In [None]:
skLR = LogisticRegression(max_iter=100,random_state=42)
skLR.fit(X_train,y_train)
# Predict the probabilities of the test set
y_pred_proba = skLR.predict_proba(X_test)[:, 1]

# Calculate the AUC
auc = roc_auc_score(y_test, y_pred_proba)
loss = log_loss(y_test,y_pred_proba)
print(f'The AUC of the model is {auc}')
print(f'The log loss is {loss}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


The AUC of the model is 0.635125340356276
The log loss is 0.43827477415305977


## FM

In [17]:
# TODO

In [5]:
import torch


In [9]:
device = torch.device('cpu')


## DeepFM

In [12]:
from deepctr_torch.inputs import get_feature_names, DenseFeat
from deepctr_torch.models import DeepFM
from sklearn.preprocessing import MinMaxScaler

# Convert X and y into a DataFrame
df = pd.DataFrame(X)
df['target'] = y

dense_features = df.columns.tolist()
dense_features.remove('target')

# Preprocessing
mms = MinMaxScaler(feature_range=(0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

# Split the data
train, test = train_test_split(df, test_size=0.2)

# Generate feature columns
feature_columns = [DenseFeat(feat, 1,) for feat in dense_features]
feature_names = get_feature_names(feature_columns)

# Convert the data into model input
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# Create the model
model = DeepFM(feature_columns, feature_columns,task='binary',
                   l2_reg_embedding=1e-5, device=device)
model.compile("adam", "binary_crossentropy", metrics=['auc'])

# Train the model
history = model.fit(train_model_input, train['target'].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2)

# Predict the test data
y_pred = model.predict(test_model_input, batch_size=256)

# Calculate AUC
auc = roc_auc_score(test['target'].values, y_pred)
print("AUC: ", auc)

# Calculate log loss
loss = log_loss(test['target'].values, y_pred)
print("Log Loss: ", loss)


cpu
Train on 1293726 samples, validate on 323432 samples, 5054 steps per epoch
Epoch 1/10
31s - loss:  0.4325 - auc:  0.6550 - val_auc:  0.6763
Epoch 2/10
31s - loss:  0.4231 - auc:  0.6832 - val_auc:  0.6924
Epoch 3/10
32s - loss:  0.4193 - auc:  0.6949 - val_auc:  0.6997
Epoch 4/10
31s - loss:  0.4172 - auc:  0.7011 - val_auc:  0.7026
Epoch 5/10
31s - loss:  0.4155 - auc:  0.7056 - val_auc:  0.7072
Epoch 6/10
31s - loss:  0.4143 - auc:  0.7085 - val_auc:  0.7040
Epoch 7/10
30s - loss:  0.4135 - auc:  0.7105 - val_auc:  0.7080
Epoch 8/10
33s - loss:  0.4128 - auc:  0.7120 - val_auc:  0.7068
Epoch 9/10
32s - loss:  0.4123 - auc:  0.7132 - val_auc:  0.7118
Epoch 10/10
31s - loss:  0.4117 - auc:  0.7144 - val_auc:  0.7134
AUC:  0.715179730959022
Log Loss:  0.4118379257847679


## Wide & Deep

In [20]:
from deepctr_torch.models import WDL

# Convert X and y into a DataFrame
df = pd.DataFrame(X)
df['target'] = y

dense_features = df.columns.tolist()
dense_features.remove('target')

# Preprocessing
mms = MinMaxScaler(feature_range=(0, 1))
df[dense_features] = mms.fit_transform(df[dense_features])

# Split the data
train, test = train_test_split(df, test_size=0.2)

# Generate feature columns
feature_columns = [DenseFeat(feat, 1,) for feat in dense_features]
feature_names = get_feature_names(feature_columns)

# Convert the data into model input
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}


# Create the model
model = WDL(feature_columns, feature_columns,task='binary',
                   l2_reg_embedding=1e-5, device=device)
model.compile("adam", "binary_crossentropy", metrics=['auc'])

# Train the model
history = model.fit(train_model_input, train['target'].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2)

# Predict the test data
y_pred = model.predict(test_model_input, batch_size=256)

# Calculate AUC
auc = roc_auc_score(test['target'].values, y_pred)
print("AUC: ", auc)

# Calculate log loss
loss = log_loss(test['target'].values, y_pred)
print("Log Loss: ", loss)

cpu
Train on 1293726 samples, validate on 323432 samples, 5054 steps per epoch
Epoch 1/10
32s - loss:  0.4331 - auc:  0.6553 - val_auc:  0.6728
Epoch 2/10
31s - loss:  0.4244 - auc:  0.6805 - val_auc:  0.6853
Epoch 3/10
32s - loss:  0.4206 - auc:  0.6932 - val_auc:  0.6964
Epoch 4/10
32s - loss:  0.4180 - auc:  0.7010 - val_auc:  0.6994
Epoch 5/10
31s - loss:  0.4162 - auc:  0.7060 - val_auc:  0.7044
Epoch 6/10
32s - loss:  0.4147 - auc:  0.7096 - val_auc:  0.7073
Epoch 7/10
32s - loss:  0.4138 - auc:  0.7118 - val_auc:  0.7101
Epoch 8/10
32s - loss:  0.4130 - auc:  0.7138 - val_auc:  0.7083
Epoch 9/10
31s - loss:  0.4124 - auc:  0.7149 - val_auc:  0.7125
Epoch 10/10
31s - loss:  0.4119 - auc:  0.7161 - val_auc:  0.7141
AUC:  0.717321031219362
Log Loss:  0.41103908506793624
