# Avazu CTR 的实现

In [5]:
# 导入必要的工具包
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xlearn as xl

## 1、准备数据

### 1.1 读取数据

In [2]:
# Raw CSV inputs, loaded with pd.read_csv in the data-prepare cell.
# NOTE(review): the "_GBDT" suffix suggests these were produced by an earlier
# GBDT feature step - confirm against the upstream notebook.
fp_raw_train = "./data/train_GBDT.csv"
fp_raw_test = "./data/test_GBDT.csv"

# libffm-format text files: written by convert_to_ffm, one file per split,
# and intended as the inputs handed to xlearn.
fp_train = "./data/fm/train_ffm.txt"
fp_valid = "./data/fm/valid_ffm.txt"
fp_test = "./data/fm/test_ffm.txt"

# Model artifacts and raw prediction outputs (separate files for FM and FFM).
fp_model_fm = "./data/fm/model_fm.out"
fp_model_ffm = "./data/fm/model_ffm.out"
fp_pred_fm  = "./data/fm/output_fm.txt"
fp_pred_ffm = "./data/fm/output_ffm.txt"

# Submission CSV destinations (not written anywhere in the cells shown here).
fp_sub_fm = "./data/fm/Submission_FM.csv"
fp_sub_ffm = "./data/fm/Submission_FFM.csv"

### 1.2 数据分类

In [7]:
# ================ data prepare ================== #
# Feature columns are named Class_1 ... Class_42.
cols = ['Class_' + str(idx) for idx in range(1, 43)]

# Full column layouts: train carries the label, test does not.
cols_train = ['id', 'click'] + cols
cols_test = ['id'] + cols

# Load both raw sets from disk.
df_train = pd.read_csv(fp_raw_train)
df_test = pd.read_csv(fp_raw_test)

# The test set has no label; fill in a placeholder so both frames share a schema.
df_test['click'] = -1

## 2、模型训练

In [8]:
# ----- merge train-test set ----- #
# Record the split sizes before merging: the combined frame is later cut back
# into train/valid/test purely by row position.
n_train = len(df_train)
n_test = len(df_test)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat stacks the two frames the same way (train rows first, original
# index labels kept).
df = pd.concat([df_train, df_test])

# Drop the separate frames so only the merged copy stays in memory.
del df_train, df_test
gc.collect()

52

In [9]:
# ----- format data file (format as libffm) for train/valid/test ----- #
def convert_to_ffm(df, numerics, categories, features, Label, n_train, train_size=0.5,
                   train_path=None, valid_path=None, test_path=None):
    """
    Write the rows of ``df`` as libffm-style text lines into train/valid/test files.

    Every output line has the form ``<label> <field>:<index>:<value> ...``:
      * numerical feature   -> ``<field>:<field>:<raw value>``
      * categorical feature -> ``<field>:<int(raw value)>:1``
    The field id is the position of the feature in ``numerics + categories``.

    Rows are routed by position: rows with position < ``n_train * train_size``
    go to the train file, remaining rows with position < ``n_train`` go to the
    validation file, and all rows from ``n_train`` onward go to the test file.

    NOTE(review): categorical indices are the raw integer codes, so equal codes
    in different fields share the same feature index - confirm this is intended.

    :parameters:
        :df, pandas dataframe holding the train rows followed by the test rows.
        :numerics, name list of numerical features.
        :categories, name list of categorical features (values must be int-convertible).
        :features, name list of all features (currently unused; kept for compatibility).
        :Label, name of the label column in df.
        :n_train, number of leading rows in df that belong to the training data.
        :train_size, fraction of those training rows written to the train file.
        :train_path, output path for the train split; defaults to the global fp_train.
        :valid_path, output path for the validation split; defaults to the global fp_valid.
        :test_path, output path for the test split; defaults to the global fp_test.
    """
    # Resolve defaults lazily so callers that pass explicit paths do not depend
    # on the notebook-level path variables.
    if train_path is None:
        train_path = fp_train
    if valid_path is None:
        valid_path = fp_valid
    if test_path is None:
        test_path = fp_test

    # Flag every feature: 0 = numerical, 1 = categorical. A name listed in both
    # ends up categorical because the second loop overwrites the first.
    catdict = {}
    for x in numerics:
        catdict[x] = 0
    for x in categories:
        catdict[x] = 1

    # The field layout is the same for every row, so build it once up front:
    # (field id as text, column name, is_categorical).
    fields = [(str(i), name, flag != 0)
              for i, (name, flag) in enumerate(catdict.items())]

    # Row position below which a sample belongs to the train split (may be fractional).
    n1 = n_train * train_size

    with open(train_path, "w") as file_train, \
            open(valid_path, "w") as file_valid, \
            open(test_path, "w") as file_test:

        # Convert each row to one libffm line and route it by position.
        # NOTE(review): row-wise iloc is slow on large frames; it is kept because
        # it defines how values are stringified - switch iteration with care.
        for r in range(df.shape[0]):
            datarow = df.iloc[r].to_dict()

            tokens = [str(int(datarow[Label]))]
            for field_id, name, is_categorical in fields:
                if is_categorical:
                    tokens.append(field_id + ":" + str(int(datarow[name])) + ":1")
                else:
                    tokens.append(field_id + ":" + field_id + ":" + str(datarow[name]))
            datastring = " ".join(tokens) + "\n"

            if r < n1:
                file_train.write(datastring)
            elif r < n_train:
                file_valid.write(datastring)
            else:
                file_test.write(datastring)


# Treat every Class_* column as categorical; the first 80% of the training
# rows go to the train file and the rest to the validation file.
convert_to_ffm(
    df,
    numerics=[],
    categories=cols,
    features=cols,
    Label='click',
    n_train=n_train,
    train_size=0.8,
)

In [10]:
# ================ FM ================== #
# setting
fm_model = xl.create_fm()  # Use factorization machine
fm_model.setTrain(fp_train)   # Training data
# Validate on the held-out split written by convert_to_ffm. The test file only
# carries the placeholder label -1, so validation metrics computed on it would
# be meaningless.
fm_model.setValidate(fp_valid)  # Validation data
fm_model.setSigmoid()

# Hyper-parameters handed to xlearn's fit().
param = {'task': 'binary',   # binary classification (click / no click)
         'k': 20,            # latent factor dimension
         'lr': 0.02,         # learning rate
         'lambda': 0.002,    # regularization strength
         'epoch': 100,       # maximum number of training epochs
         'opt': 'adagrad'    # optimizer
         }

# training: the fitted model is written to fp_model_fm
fm_model.fit(param, fp_model_fm)

# testing: score the test file with the saved model; setSigmoid() asks xlearn
# to squash the raw scores into (0, 1) before writing them to fp_pred_fm
fm_model.setTest(fp_test)
fm_model.setSigmoid()
fm_model.predict(fp_model_fm, fp_pred_fm)