# Model stage I

## LR (Logistic Regression)

In [1]:
import json

import pandas as pd
import dask.dataframe as dd
import xlearn as xl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss

### config

In [2]:
# Load notebook settings from config.json (looked up relative to the working
# directory). The file is decoded explicitly as UTF-8 so that parsing does not
# depend on the platform's default locale encoding.
with open('config.json', encoding='utf-8') as f:
    config = json.load(f)

# Prefix for the input data files. The data-loading cell concatenates it
# directly with a file name, so it presumably has to end with a path
# separator — verify against config.json.
DATA_PATH = config['DATA_PATH']

### data management

In [3]:
# How many rows of feature.csv (taken from the top) supply model input columns.
N_TOP_FEATURES = 30

# Load the feature-engineered training table. dask reads a CSV partition by
# partition and each partition's index restarts at 0, so the materialised
# pandas frame may carry duplicate index labels. Resetting to a unique
# RangeIndex keeps label-based alignment between X, y and anything derived
# from them unambiguous.
tr_FE = dd.read_csv(DATA_PATH+'tr_FE.csv').compute().reset_index(drop=True)

# feature.csv provides the candidate column names in its 'feature' column;
# only the first N_TOP_FEATURES rows are used.
# NOTE(review): presumably the rows are ordered by importance — confirm
# against whatever produces feature.csv.
features = dd.read_csv('feature.csv').compute()
feature_columns = features.head(N_TOP_FEATURES)['feature'].tolist()

# Design matrix and binary target.
X = tr_FE[feature_columns]
y = tr_FE['click']

# Cast boolean columns to int32 (other dtypes are left untouched).
X = X.astype({col: 'int32' for col in X.select_dtypes('bool').columns})


In [37]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Write both splits to disk in libsvm text format, training split first and
# then the held-out split, so that xLearn can read them by file name.
for split_X, split_y, libsvm_path in (
    (X_train, y_train, 'train.libsvm'),
    (X_test, y_test, 'test.libsvm'),
):
    dump_svmlight_file(split_X, split_y, libsvm_path)

### model training

In [6]:
# xLearn linear model, fitted on the libsvm dumps written earlier.
LR = xl.create_linear()
LR.setTrain("train.libsvm")     # file the model is fitted on
LR.setValidate("test.libsvm")   # file scored while training
# NOTE(review): the validation file is the same file that is evaluated below,
# so the reported metrics may be optimistic — confirm this is intended.

# Hyper-parameters: binary classification, SGD optimiser, AUC as the metric.
param = {
    'task': 'binary',
    'lr': 0.2,
    'lambda': 0.002,
    'metric': 'auc',
    'epoch': 100,
    'opt': 'sgd',
}
LR.fit(param, "./model.out")

# Score the held-out file; setSigmoid() makes the written scores probabilities.
LR.setTest("test.libsvm")
LR.setSigmoid()
LR.predict("./model.out", "./output.txt")

# Read the scores back from the prediction file written above.
y_pred = np.loadtxt("./output.txt")

# Ranking quality on the held-out split.
auc = roc_auc_score(y_test, y_pred)
print("AUC: ", auc)

# Probability quality on the held-out split.
loss = log_loss(y_test, y_pred)
print("Log Loss: ", loss)

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 96 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (train.libsvm.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (test.libsvm.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 30
[32m[------------] [0mTime 

## FM (Factorization Machine)

### train fm

In [7]:
# xLearn factorization machine, fitted on the same libsvm dumps as the
# linear model.
fm_model = xl.create_fm()
fm_model.setTrain("train.libsvm")     # file the model is fitted on
fm_model.setValidate("test.libsvm")   # file scored while training
# NOTE(review): the validation file is the same file that is evaluated below,
# so the reported metrics may be optimistic — confirm this is intended.

# Hyper-parameters: binary classification, SGD optimiser, AUC as the metric.
param = {
    'task': 'binary',
    'lr': 0.2,
    'lambda': 0.002,
    'metric': 'auc',
    'epoch': 100,
    'opt': 'sgd',
}
fm_model.fit(param, "./model.out")

# Score the held-out file; setSigmoid() makes the written scores probabilities.
fm_model.setTest("test.libsvm")
fm_model.setSigmoid()
fm_model.predict("./model.out", "./output.txt")

# Read the scores back from the prediction file written above.
y_pred = np.loadtxt("./output.txt")

# Ranking quality on the held-out split.
auc = roc_auc_score(y_test, y_pred)
print("AUC: ", auc)

# Probability quality on the held-out split.
loss = log_loss(y_test, y_pred)
print("Log Loss: ", loss)

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 96 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (train.libsvm.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (test.libsvm.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 30
[32m[------------] [0mTime 

## FFM (Field-aware Factorization Machine)

In [6]:
# xLearn field-aware factorization machine (FFM). The variable keeps the name
# `fm_model`, so it replaces any earlier object bound to that name.
# NOTE(review): xLearn's documentation describes libffm (field:feature:value)
# as the input format for FFM, whereas these files are libsvm dumps — confirm
# the field handling is what is intended here.
fm_model = xl.create_ffm()
fm_model.setTrain("train.libsvm")     # file the model is fitted on
fm_model.setValidate("test.libsvm")   # file scored while training
# NOTE(review): the validation file is the same file that is evaluated below,
# so the reported metrics may be optimistic — confirm this is intended.

# Hyper-parameters: binary classification, SGD optimiser, AUC as the metric.
param = {
    'task': 'binary',
    'lr': 0.2,
    'lambda': 0.002,
    'metric': 'auc',
    'epoch': 100,
    'opt': 'sgd',
}
fm_model.fit(param, "./model.out")

# Score the held-out file; setSigmoid() makes the written scores probabilities.
fm_model.setTest("test.libsvm")
fm_model.setSigmoid()
fm_model.predict("./model.out", "./output.txt")

# Read the scores back from the prediction file written above.
y_pred = np.loadtxt("./output.txt")

# Ranking quality on the held-out split.
auc = roc_auc_score(y_test, y_pred)
print("AUC: ", auc)

# Probability quality on the held-out split.
loss = log_loss(y_test, y_pred)
print("Log Loss: ", loss)

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 96 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (train.libsvm.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (test.libsvm.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 30
[32m[------------] [0mNumbe