In [2]:
import xlearn as xl
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
#### Source: https://www.kaggle.com/c/avazu-ctr-prediction
# Load Data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# small functions for converting 'hour'
def convert_hour(s):
    string = str(s)
    h = string[6:8]
    return h 

def convert_weekday(s):
    string = str(s)
    yyyy = int('20'+string[:2])
    mm = int(string[2:4])
    dd = int(string[4:6])
    weekday = datetime.date(yyyy,mm,dd).weekday()
    return weekday



# convert 'hour' in train to weekday and hour
test['weekday'] = test['hour'].apply(convert_weekday)
test['h'] = test['hour'].apply(convert_hour)
train['weekday'] = train['hour'].apply(convert_weekday)
train['h'] = train['hour'].apply(convert_hour)
a = test['weekday'].value_counts()

# use only the weekday in test.csv as training samples , a big and risky assumption 
Sample_train = train[train['weekday'].isin(a.index)]
test_new = test

# save these new table so that I do not need to load the source file everytime
Sample_train.to_csv('Submission_table.csv', sep=',', encoding='utf-8',index = False)
test_new.to_csv('test_new.csv', sep=',', encoding='utf-8',index = False)

In [3]:
# Loading point 1 
Sample_train = pd.read_csv('Submission_table.csv')
test_new = pd.read_csv('test_new.csv')

In [4]:
### Use only a subset of training samples
# Since there is only one day in both training and test data, so discard this column
test_new = test_new.drop(columns=['weekday'])
Sample_train = Sample_train.drop(columns=['weekday'])

In [5]:
#### Filed (feature) selection (based on heuristics and the neeed to simplify the model )
## drop some columns and make the remaining columns categorical
Sample_train = Sample_train.drop(columns=['id','hour','site_id',
                            'site_domain','app_id','app_domain','device_model','device_id','device_ip','C14','C17'])
# drop id, the primary key
# drop hour, it has been converted
# drop site_id, since site_category carries better info, and too many features in it
# drop site_domain, less irrelevant, too many features in it
# drop app_id and app_domain for similar reasons
# device_ip should carry useful information, however I am not very sure how to go about it right now.
# kernel keeps dying, so drop device_model too
# drop some anonymous colmns, C14 and C17, too many features

test_new = test_new.drop(columns=['id','hour','site_id',
                               'site_domain','app_id','app_domain','device_model','device_id','device_ip','C14','C17'])



In [13]:
# Test is very big, its processing killed the kernel, decided to use only 10%

Sample_test = test_new.sample(int(np.round(0.1*test_new.shape[0])))

In [26]:
# separate the label column form the training data
Sample_train_label = Sample_train['click']
Sample_train = Sample_train.drop(columns=['click'])

Sample_train_label.to_csv('Sample_train_label.csv',sep=',', encoding='utf-8',index = False)
Sample_train.to_csv('Sample_train.csv',sep=',', encoding='utf-8',index = False)
#Sample_test.to_csv('Sample_test.csv',sep=',', encoding='utf-8',index = False)

######## Load point 2 ############

In [27]:
suspect = Sample['device_id'] == 'a99f214a'
Sample[suspect]['click'].sum()

580806

In [27]:
# Convert data type to categorical, easier for creating dummies
for i in Sample_train.columns:
    Sample_train[i] = Sample_train[i].astype('category')
    
for j in Sample_test.columns:
    Sample_test[j] = Sample_test[j].astype('category')
    





In [37]:
# Constructing dummy variables for Logistic Regression and Factorization Machine
Sample_train_dummy = pd.DataFrame()
for i in Sample_train.columns:
    dummy = pd.get_dummies(Sample_train[i], prefix = i)
    Sample_train_dummy = pd.concat([Sample_train_dummy,dummy], axis=1)
    #368 dummies
Sample_test_dummy = pd.DataFrame()
for j in Sample_test.columns:
    dummy = pd.get_dummies(Sample_test[j], prefix = j)
    Sample_test_dummy = pd.concat([Sample_test_dummy,dummy], axis=1)
    #352 dummies

In [40]:
# Find out the features that are in test set but not in training set
inTestnotinTrain = Sample_test_dummy.columns[~Sample_test_dummy.columns.isin(Sample_train_dummy.columns)]

In [41]:
# Find out the features that are in training set but not in test set
inTrainnotinTest = Sample_train_dummy.columns[~Sample_train_dummy.columns.isin(Sample_test_dummy.columns)]

In [42]:
# Make sure both Training data and test data have the same predictors
for i in inTrainnotinTest:
    Sample_test_dummy[i] = 0
    
for j in inTestnotinTrain:
    Sample_train_dummy[j] = 0

In [85]:
######### Base Line Model ##########################
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

#log-loss evaluation, becasue the predictions are probabilities
LR1 = LogisticRegression(penalty ='l1',solver = 'saga')
scores = cross_val_score(LR1, Sample_train_dummy, Sample_train_label, cv=5, scoring = 'neg_log_loss')
scoresLR1 = -1*scores 

LR2 = LogisticRegression(penalty ='l2',solver = 'saga')
scores = cross_val_score(LR2, Sample_train_dummy, Sample_train_label, cv=5, scoring = 'neg_log_loss')
scoresLR2 = -1*scores 

##### The other possible models for classification problems #########
# 1. Support Vector Machine: Here the matrix is very sparce(many zero entries), SVM is not ideal
# 2. Tree-Based methods, eg Random Foreset, here the dimentionality is high (due to 100% categorical features), 
#   Random Forest is computationally more expensive, and prone to overfit (complicated models)



In [88]:
print(sum(scoresLR1)/5) # log-loss of logistic regression with L1 penalty
print(sum(scoresLR2)/5) # log-loss of logistic regression with L2 penalty

0.45476720520289576
0.45516997561531103


In [83]:
############# Factorization Machine #####################
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
#Sample_train_dummy.reset_index(drop = True)
#Sample_train_label.reset_index(drop = True)
def FM_CV(KFold, FM, X,y):
    l = []
    for train, test in KFold.split(X, y):
        # Start to train
        FM.fit(X.iloc[train], y.iloc[train]) 
        y_pred = FM.predict(X.iloc[test])
        loss = log_loss(y.iloc[test], y_pred, eps=1e-15)
        l.append(loss)
        score = sum(l)/len(l)
        return score
    
kf = KFold(n_splits=5)
#kf.get_n_splits(Sample_train_dummy)
# param:
#  0. binary classification
#  1. model scale: 0.1
#  2. epoch number: 10 (auto early-stop)
#  3. number of latent factor: 4
#  4. learning rate: 0.1
#  5. regular lambda: 0.01
#  6. use sgd optimization method
#  7. evaluation metric: accuarcy

# hyperparameter k
K_vector = [2,3,4,5,6,7,8,9,10]
FM_score = []
for i in K_vector: 
    fm_model = xl.FMModel(task='binary', init=0.1, 
                      epoch=10, k=i, lr=0.15, 
                      reg_lambda=0.01, opt='sgd', 
                      metric='acc')

    score = FM_CV(kf, fm_model, Sample_train_dummy,Sample_train_label)
    FM_score.append(score)


In [84]:
FM_score # FM model, k =4, average log-loss 0.4499, being the best so far

[0.4912672512911138,
 0.4669381348344845,
 0.4498570274823115,
 0.45737060188033274,
 0.45328016991364184,
 0.4547271786191499,
 0.45139659328669973,
 0.45419716577430996,
 0.45031769235750124]

In [91]:
#### Field-Aware Factorization Machine
# Dictionary of the fields, for converting DataFrame to Libffm format
Fields = {}
j=0
for i in Sample_train.columns:
    Fields[i] = j
    j = j + 1
# Get the union of features
Joint = pd.concat([Sample_train, Sample_test],axis=0)
Joint = Joint.astype('str')

# Dictionaries of the features for each field
for i in Fields:
    dictionary = i
    Features = Joint[i].value_counts().index
    exec('%s = {}' %dictionary)
    c = 0
    for j in Features:
        exec('%s[j] = c' %dictionary)
        c = c + 1

In [94]:
# Write Libfmm format for trianing data

import os
filename = 'Sample_train.txt'
file = open(filename, 'w') #truncate first
for i in range(0,Sample_train.shape[0]):
    Clause = str(Sample_train_label.iloc[i]) + ' ' # first entry should be the label
    
    for j in Fields:
        field = j
        field_index = Fields[j]
        feature = Sample_train[j].iloc[i]
        exec("feature_index = %s['%s']"% (field, feature))
        sub_clause = '%s:%s:1' %(field_index ,feature_index)
        Clause = Clause + sub_clause + ' '
        
    file.write(Clause)
    file.write('\n')
    
file.close()

In [103]:
## ffm - cross validation
# k = 2
ffm_model = xl.create_ffm()
ffm_model.setTrain('Sample_train.txt')
param = {'task':'binary', 'lr':0.15, 'lambda':0.002, 'opt':'sgd','epoch':10,'k':2}

ffm_model.cv(param)
#------------] Average log_loss: 0.550212
#[ ACTION     ] Finish Cross-Validation
#[ ACTION     ] Clear the xLearn environment ...
#[------------] Total time cost: 147.73 (sec)

In [105]:
# k = 3
ffm_model = xl.create_ffm()
ffm_model.setTrain('Sample_train.txt')
param = {'task':'binary', 'lr':0.15, 'lambda':0.002, 'opt':'sgd','epoch':10,'k':3}
ffm_model.cv(param)
#[------------] Average log_loss: 0.500893
#[ ACTION     ] Finish Cross-Validation
#[ ACTION     ] Clear the xLearn environment ...
#[------------] Total time cost: 146.65 (sec)

In [106]:
# k = 4
ffm_model = xl.create_ffm()
ffm_model.setTrain('Sample_train.txt')
param = {'task':'binary', 'lr':0.15, 'lambda':0.002, 'opt':'sgd','epoch':10,'k':4}
ffm_model.cv(param)
#[------------] Average log_loss: 0.493209
#[ ACTION     ] Finish Cross-Validation
#[ ACTION     ] Clear the xLearn environment ...
#[------------] Total time cost: 146.67 (sec)


In [107]:
# k = 5
ffm_model = xl.create_ffm()
ffm_model.setTrain('Sample_train.txt')
param = {'task':'binary', 'lr':0.15, 'lambda':0.002, 'opt':'sgd','epoch':10,'k':5}
ffm_model.cv(param)
#[------------] Average log_loss: 0.486877
#[ ACTION     ] Finish Cross-Validation
#[ ACTION     ] Clear the xLearn environment ...
#[------------] Total time cost: 200.83 (sec)

In [108]:
# k = 6
ffm_model = xl.create_ffm()
ffm_model.setTrain('Sample_train.txt')
param = {'task':'binary', 'lr':0.15, 'lambda':0.002, 'opt':'sgd','epoch':10,'k':6}
ffm_model.cv(param)
#[------------] Average log_loss: 0.485178
#[ ACTION     ] Finish Cross-Validation
#[ ACTION     ] Clear the xLearn environment ...
#[------------] Total time cost: 200.60 (sec)


In [109]:
# k = 7
ffm_model = xl.create_ffm()
ffm_model.setTrain('Sample_train.txt')
param = {'task':'binary', 'lr':0.15, 'lambda':0.002, 'opt':'sgd','epoch':10,'k':7}
ffm_model.cv(param)
#[------------] Average log_loss: 0.466884
#[ ACTION     ] Finish Cross-Validation
#[ ACTION     ] Clear the xLearn environment ...
#[------------] Total time cost: 208.30 (sec)



In [110]:
# k = 8
ffm_model = xl.create_ffm()
ffm_model.setTrain('Sample_train.txt')
param = {'task':'binary', 'lr':0.15, 'lambda':0.002, 'opt':'sgd','epoch':10,'k':8}
ffm_model.cv(param)
#[------------] Average log_loss: 0.523609
#[ ACTION     ] Finish Cross-Validation
#[ ACTION     ] Clear the xLearn environment ...
#[------------] Total time cost: 204.80 (sec)


In [113]:
# Write Libffm format for test data
filename = 'Sample_test.txt'
file = open(filename, 'w') #truncate first
for i in range(0,Sample_test.shape[0]):
    Clause = str() 
    
    for j in Fields:
        field = j
        field_index = Fields[j]
        feature = Sample_train[j].iloc[i]
        exec("feature_index = %s['%s']"% (field, feature))
        sub_clause = '%s:%s:1' %(field_index ,feature_index)
        Clause = Clause + sub_clause + ' '
        
    file.write(Clause)
    file.write('\n')
    
file.close()

In [5]:
#### Summary ######
# FM performs only slightly better than the base line model (LR), and it should remain inconclusive.
# FFM shows no adventage here, possibily because the current model (fields and features) are too simplified.

In [114]:
### Predict with the best model from cross validation
# Factorization Machine with k = 4
fm_model = xl.FMModel(task='binary', init=0.1, 
                     epoch=10, k=4, lr=0.15, 
                      reg_lambda=0.01, opt='sgd', 
                      metric='acc')

fm_model.fit(Sample_train_dummy, Sample_train_label)


# Generate predictions
y_pred = fm_model.predict(Sample_test_dummy)
#[------------] Loss function: cross-entropy
#[------------] Score function: fm
#[------------] Number of Feature: 368
#[------------] Number of K: 4
#[------------] Time cost for loading model: 0.00 (sec)
#[ ACTION     ] Read Problem ...
#[------------] First check if the text file has been already converted to binary format.
#[------------] Binary file (/var/folders/cp/rtwd28t94wz4spwfb2c8bqpm0000gn/T/tmpjlau0e2j.bin) NOT found. Convert text file to binary file.
#[------------] Time cost for reading problem: 1.36 (sec)
#[ ACTION     ] Start to predict ...
#[------------] The test loss is: 0.091477  <----- This doesn't make sense, since there is no label in the test data
#[ ACTION     ] Clear the xLearn environment ...
#[------------] Total time cost: 1.68 (sec)


In [118]:
print(y_pred, y_pred.shape)

[0.0271347 0.0318787 0.0417414 ... 0.274749  0.0438931 0.0448321] (457746,)
