# CTR Prediction Model using Keras

In [1]:
import pickle
import numpy as np
import pandas as pd

from keras import layers, models
from keras import backend as K
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTENC

Using TensorFlow backend.


## 1. Data Load

In [2]:
# Locations of the tab-separated training inputs.
train_file = '../data/train.tsv'
label_file = '../data/train_label.tsv'
article_file = '../data/train_data_article.tsv'

# Read each TSV into its own DataFrame.
train_df, label_df, article_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (train_file, label_file, article_file)
)

In [3]:
# Pickled image features.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
img_file = '../data/train_image_features.pkl'
with open(img_file, mode='rb') as img_fp:
    img_feature_dic = pickle.load(img_fp)

In [4]:
# article_df.head()

In [5]:
# train_df.head()

## 2. Data Preprocessing

### 1) train_data

In [6]:
def create_readlist(row):
    """Split a comma-separated id string into a list of ids.

    Returns ``row.split(',')`` for a non-empty string and an empty list
    when ``row`` is falsy (e.g. the empty string).
    """
    return row.split(',') if row else []

In [7]:
# Replace every missing value in the frame with an empty string.
train_df = train_df.fillna(value='')

In [8]:
# Parse the comma-separated reading history into a list column, then store its length.
train_df['read_article_list'] = train_df['read_article_ids'].map(create_readlist)
train_df['read_len'] = train_df['read_article_list'].map(len)

In [9]:
# Preview the first rows, including the new read_article_list / read_len columns.
train_df.head()

Unnamed: 0,article_id,hh,gender,age_range,read_article_ids,read_article_list,read_len
0,ed173d87cf27,17,m,50-,"d6e3c3f0d131,85270a430a54,4364abac22bf","[d6e3c3f0d131, 85270a430a54, 4364abac22bf]",3
1,2f6139c6b61e,6,m,40-44,"04f098ccc312,c353deabeed8,94599f81d773,09f0324...","[04f098ccc312, c353deabeed8, 94599f81d773, 09f...",15
2,bfde563f2df4,9,m,50-,,[],0
3,a5e218237de4,7,m,40-44,14a7cb6072af,[14a7cb6072af],1
4,01e234fed982,8,f,40-44,"bec684f0ace5,83882e5ba8f4,3460807b3aef,ae197ea...","[bec684f0ace5, 83882e5ba8f4, 3460807b3aef, ae1...",9


### 2) Label Encoding

In [10]:
# Integer codes for the categorical demographics; code 0 is 'unknown' in both tables.
sex = dict(unknown=0, m=1, f=2)
age = {
    bucket: code
    for code, bucket in enumerate(
        ['unknown', '-14', '15-19', '20-24', '25-29',
         '30-34', '35-39', '40-44', '45-49', '50-']
    )
}

In [11]:
# Integer-encode the demographic columns with the `sex` / `age` lookup tables.
# `value or 'unknown'` sends blanks to the 'unknown' code: the frame-wide
# fillna('') turns missing values into '', which is not a key of either table
# and would otherwise raise KeyError. Any other unexpected category still
# fails loudly.
train_df['gender'] = train_df['gender'].apply(lambda value: sex[value or 'unknown'])
train_df['age_range'] = train_df['age_range'].apply(lambda value: age[value or 'unknown'])

In [12]:
# Preview the first rows to check the integer-encoded gender / age_range columns.
train_df.head()

Unnamed: 0,article_id,hh,gender,age_range,read_article_ids,read_article_list,read_len
0,ed173d87cf27,17,1,9,"d6e3c3f0d131,85270a430a54,4364abac22bf","[d6e3c3f0d131, 85270a430a54, 4364abac22bf]",3
1,2f6139c6b61e,6,1,7,"04f098ccc312,c353deabeed8,94599f81d773,09f0324...","[04f098ccc312, c353deabeed8, 94599f81d773, 09f...",15
2,bfde563f2df4,9,1,9,,[],0
3,a5e218237de4,7,1,7,14a7cb6072af,[14a7cb6072af],1
4,01e234fed982,8,2,7,"bec684f0ace5,83882e5ba8f4,3460807b3aef,ae197ea...","[bec684f0ace5, 83882e5ba8f4, 3460807b3aef, ae1...",9


### 3) Standard Scaling of `read_len`

In [13]:
# Standardize read_len (zero mean, unit variance) into a new float column.
# Cast to float64 up front: read_len holds integer counts, and some
# scikit-learn releases emit a DataConversionWarning when StandardScaler has
# to convert integer input itself.
read_len_scaled = StandardScaler().fit_transform(train_df[['read_len']].astype('float64'))
# fit_transform returns an (n_samples, 1) array; flatten it so it can be
# assigned as a single column without pre-creating a placeholder.
train_df['read_len_norm'] = read_len_scaled.ravel()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [14]:
# train_df

### 4) Tokenizing and Padding `read_article_ids`

In [15]:
# Mean reading-history length, rounded up to a whole number of articles.
avg_read_len = int(np.ceil(train_df['read_len'].mean()))

In [16]:
# max_features: vocabulary cap handed to the tokenizer (num_words).
# max_len: padded sequence length, set to the rounded-up mean history length.
max_features, max_len = 30_000, avg_read_len

In [17]:
# Each row is already a list of article ids, so character filtering and
# lower-casing are switched off.
read_lists = train_df['read_article_list'].tolist()
tokenizer = Tokenizer(num_words=max_features, filters='', lower=False)
tokenizer.fit_on_texts(read_lists)
# Map every id to its integer index in the fitted vocabulary.
sequences = tokenizer.texts_to_sequences(read_lists)

In [18]:
# Token-to-index mapping exposed by the fitted tokenizer; its size is reported below.
word_index = tokenizer.word_index
print(f'Found {len(word_index)} tokens.')

Found 1710 tokens.


In [19]:
# Bring every sequence to exactly max_len entries. The pad_sequences defaults
# are spelled out: zeros are added at the front, and over-long sequences lose
# their earliest entries.
X_read_ids = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
    value=0,
)

In [20]:
# Shape of the padded id matrix.
X_read_ids.shape

(100, 20)

### 5) Feature Extraction

In [21]:
# Keep only these columns, in this order.
feature_cols = ['article_id', 'hh', 'gender', 'age_range', 'read_len_norm']
train_df = train_df[feature_cols]

In [22]:
# Feature matrix from the selected columns, targets from the label frame.
# NOTE(review): assumes label_df rows are in the same order as train_df — confirm.
X, y = train_df.values, label_df['label'].values

In [23]:
# Append the padded read-id sequences as extra columns after the flat features.
# Built from train_df.values rather than from X itself so that re-running this
# cell does not stack the sequence columns a second time.
X = np.column_stack((train_df.values, X_read_ids))

In [24]:
# Shape of the combined feature matrix.
X.shape

(100, 25)

In [25]:
# Balance the classes by randomly under-sampling.
# A fixed random_state makes the selected subset reproducible across runs.
sampler = RandomUnderSampler(random_state=42)
# `fit_sample` is the deprecated name of `fit_resample` and was removed from
# later imbalanced-learn releases; prefer the new name and fall back to the
# old one only on installs that predate it.
resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
train_x, train_y = resample(X, y)

In [26]:
# Split the resampled matrix back into per-feature arrays by column position:
# column 0 -> a_ids, columns 1-3 -> hh / gender / age as ints,
# column 4 -> x_rln (left as-is), columns 5+ -> read-id sequences as ints.
a_ids = train_x[:, 0].tolist()
x_hh, x_gender, x_age = (train_x[:, col].astype(int) for col in (1, 2, 3))
x_rln = train_x[:, 4]
x_reads = train_x[:, 5:].astype(int)

In [31]:
# Random ordering of the row indices of train_x (unseeded, so it differs per run).
rnd_idx = np.random.permutation(train_x.shape[0])

## 3. Create Generator

In [29]:
def generator(X, y, batch_size=256, shuffle=True):
    """Unpack the feature matrix ``X`` into per-feature arrays.

    NOTE(review): the body currently stops after unpacking the columns.
    Nothing is yielded or returned, and ``y``, ``batch_size`` and
    ``shuffle`` are not used yet. TODO: implement the batching before
    relying on this function.
    """
    # Column 0 is kept as a plain Python list (presumably article ids — confirm against caller).
    a_ids = X[:, 0].tolist()
    # Columns 1-3 are cast to int.
    x_hh = X[:, 1].astype(int)
    x_gender = X[:, 2].astype(int)
    x_age = X[:, 3].astype(int)
    # Column 4 is taken without any dtype conversion.
    x_rln = X[:, 4]
    # All remaining columns are cast to int (presumably the padded read-id sequence — confirm).
    x_reads = X[:, 5:].astype(int)
    
    

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

## 4. CTR Model

In [84]:
# Model hyper-parameters.
# hh_dim / age_dim: input_dim of the hh and age embeddings.
hh_dim, age_dim = 24, 10
# embedding_dim: width of the read-id embedding; img_dim: length of the image feature vector.
embedding_dim, img_dim = 128, 2048

In [102]:
# Reset the Keras backend session before (re)building the model graph.
K.clear_session()

####
# inputs
####
# hh input (single integer id, embedded below)
input_hh = layers.Input(shape=(1, ), dtype='int32', name='hh')
# gender input (3-wide float vector, fed to the dense block as-is)
input_gender = layers.Input(shape=(3,), dtype='float32', name='gender')
# age input (single integer id, embedded below)
input_age = layers.Input(shape=(1, ), dtype='int32', name='age')
# read_len input (single float)
input_rlen = layers.Input(shape=(1,), dtype='float32', name='rlen')
# read_ids input (padded sequence of token indices, length max_len)
input_reads = layers.Input(shape=(max_len, ), dtype='int32', name='reads')
# image features input (float vector of length img_dim)
input_img = layers.Input(shape=(img_dim,), dtype='float32', name='img')

####
# embeddings
####
# The (1, k) embedding outputs are reshaped to flat (k,) vectors so they can
# be concatenated with the other flat features.
embed_hh = layers.Embedding(hh_dim, 8, input_length=1)(input_hh)
embed_hh = layers.Reshape((8 ,))(embed_hh)
embed_age = layers.Embedding(age_dim, 4, input_length=1)(input_age)
embed_age = layers.Reshape((4 ,))(embed_age)
# The read sequences are zero-padded and the tokenizer never assigns index 0,
# so mask_zero=True lets the LSTM skip padding steps instead of treating them
# as a real article. Only this embedding is masked: 0 is a legitimate value
# for hh and age. The +1 leaves room for that reserved index.
embed_reads = layers.Embedding(len(word_index)+1, embedding_dim,
                               input_length=max_len, mask_zero=True)(input_reads)

# concat flat features, then project them through a dense layer
concat_flat = layers.concatenate([embed_hh, embed_age, input_rlen, input_gender], axis=-1)
output_flat = layers.Dense(128, activation='relu')(concat_flat)

# Bidirectional LSTM summarises the read sequence into one vector
output_reads = layers.Bidirectional(layers.LSTM(128))(embed_reads)

# concat all feature groups
concat_all = layers.concatenate([output_flat, output_reads, input_img], axis=-1)

# dense head
output = layers.Dense(4096, activation='relu')(concat_all)
output = layers.Dense(512, activation='relu')(output)
# NOTE: despite the name, `logits` holds the sigmoid output, i.e. a value in (0, 1).
logits = layers.Dense(1, activation='sigmoid')(output)

In [89]:
# Assemble the functional model from the six input tensors and the sigmoid output.
model_inputs = [
    input_hh, input_gender,
    input_age, input_rlen,
    input_reads, input_img,
]
model = models.Model(inputs=model_inputs, outputs=logits)

In [91]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 8)         192         input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 4)         40          input_3[0][0]                    
__________________________________________________________________________________________________
reshape_1 