# Training script to create a recommendation system

# 1. Prepare necessary things.

## Import libraries

In [1]:
# Data analyze
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Train test split
from sklearn.model_selection import train_test_split

# tensorflow.keras
import tensorflow as tf
import tensorflow.keras.metrics
from tensorflow.keras.layers import Input, Embedding, Concatenate, Dense, Flatten, Dot
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Pickle
import pickle

# Datetime
from datetime import datetime

# Ignore warning
import warnings
warnings.filterwarnings("ignore")

## Define hyperparameters

In [2]:
# Preprocessing thresholds. None of the constants in the next three groups is
# referenced by the cells visible in this notebook section.
POOR_SERVICE_THRESHOLD = 20 # Services with fewer MoMo transactions than this threshold are dropped

MONEY_RANGE_DIVIDER = 100000 # Amounts in VND are bucketed into ranges of 100k
MONEY_RANGE_RICHKID = 5 # Users who spent more than this range are treated as "rich kids" and grouped together

VISIT_COUNT_MAX = 5 # Visiting <max> or more times marks the user as a fan
VISIT_COUNT_MIN = 1 # Visiting <min> or fewer times is not considered and the row is dropped

# Training settings.
TRAIN_TRIGGER = True # Turn this off if you don't want to run the training process
TRAIN_TEST_SIZE = 0.3
TRAIN_EPOCHS = 1
TRAIN_RANDOM_STATE = 1987

# 3. Create the model and train it

In [3]:
# Load the transactions table used for training; the first CSV column becomes the row index.
data = pd.read_csv("./data/transactions2train.csv", index_col=0)

In [4]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,user_id,service_id,amount,visit_count,total_amount,favor,service_group
0,1000121902738565702,6624136956482465650,385000,1,586000,0.656997,fnb
1,1000121902738565702,7241075282237721387,67000,1,586000,0.114334,cvs
2,1000121902738565702,922770840377832605,134000,1,586000,0.228669,food
3,1000125153834388186,4696475929912495191,1133000,1,1725000,0.656812,fnb
4,1000125153834388186,7435641053063427914,75000,1,1725000,0.043478,cvs


In [5]:
# Recompute each user's total spend and the share of it ("favor") that went to
# each service. All service groups are kept; the per-group filter is disabled:
# data_food = data[data['service_group'] == 'food']
data_food = data.drop(columns=['total_amount'])

# Sum only the `amount` column per user. Selecting the column before `.sum()`
# avoids aggregating every other column (string service groups, large integer
# ids) just to throw those results away.
df_total_sum = (
    data_food.groupby('user_id', as_index=False)['amount']
    .sum()
    .rename(columns={'amount': 'total_amount'})
)

# Name the join key explicitly so the merge cannot silently pick up additional
# shared columns if the schema of either frame changes.
data_food = data_food.merge(df_total_sum, how='left', on='user_id')
data_food['favor'] = data_food['amount'] / data_food['total_amount']
data_food

Unnamed: 0,user_id,service_id,amount,visit_count,favor,service_group,total_amount
0,1000121902738565702,6624136956482465650,385000,1,0.656997,fnb,586000.0
1,1000121902738565702,7241075282237721387,67000,1,0.114334,cvs,586000.0
2,1000121902738565702,922770840377832605,134000,1,0.228669,food,586000.0
3,1000125153834388186,4696475929912495191,1133000,1,0.656812,fnb,1725000.0
4,1000125153834388186,7435641053063427914,75000,1,0.043478,cvs,1725000.0
...,...,...,...,...,...,...,...
724864,999967723811113076,6683159578094575932,176000,1,0.095496,cvs,1843000.0
724865,999983852027909885,1161222413519607568,164000,1,0.224044,fnb,732000.0
724866,999983852027909885,1519391066364067495,393000,1,0.536885,fnb,732000.0
724867,999983852027909885,7217455669218222351,46000,1,0.062842,beverage,732000.0


In [6]:
# Count the rows each user has, then drop every user with fewer than 3 rows.
user_freq = data_food['user_id'].value_counts()
new_user = user_freq.loc[user_freq < 3].index
keep_rows = ~data_food['user_id'].isin(new_user)
data_food = data_food[keep_rows]

In [7]:
# Spot-check five random rows of the filtered table (no seed is passed, so the rows differ per run).
data_food.sample(5)

Unnamed: 0,user_id,service_id,amount,visit_count,favor,service_group,total_amount
427113,58745366349304396,5658857270301870068,81000,1,0.018746,beverage,4321000.0
490944,6607650307945901986,4470526277233866905,261000,1,0.229551,fnb,1137000.0
341240,4889764220252469642,9105773273715501159,234000,2,0.087183,other,2684000.0
92170,2048649772533853149,7241075282237721387,311000,1,0.071874,cvs,4327000.0
336554,4836514331076184893,1331731186241134178,1495000,1,0.571483,other,2616000.0


In [8]:
def EmbeddedInput(name, n_in, n_out, reg):
    """Create a one-integer Keras input together with its embedding lookup.

    Args:
        name: Name given to the `Input` layer.
        n_in: Size argument for the embedding table, which is built with
            `n_in + 1` entries.
        n_out: Length of each embedding vector.
        reg: Factor passed to the `l2` regularizer applied to the embedding
            weights.

    Returns:
        A `(input_tensor, embedded_tensor)` tuple.
    """
    index_input = Input(shape=(1,), dtype='int64', name=name)
    embedding_layer = Embedding(
        n_in + 1,
        n_out,
        input_length=1,
        embeddings_regularizer=l2(reg),
    )
    return (index_input, embedding_layer(index_input))

In [9]:
# Build the lookup tables between the original ids and the integer codes fed to
# the embedding layers, and vice versa.
# Each column is converted to categorical exactly once, so the dictionaries and
# the code columns below all come from the same category ordering.
user_cat = data_food['user_id'].astype('category')
service_cat = data_food['service_id'].astype('category')

# integer code -> original id
usercode2id = dict(enumerate(user_cat.cat.categories))
servicecode2id = dict(enumerate(service_cat.cat.categories))
# original id -> integer code
userid2code = {user_id: code for code, user_id in usercode2id.items()}
serviceid2code = {service_id: code for code, service_id in servicecode2id.items()}

# Create the code columns. `assign` returns a new frame, so nothing is written
# into a filtered slice of an earlier frame (which would be a chained
# assignment that pandas warns about).
data_food = data_food.assign(
    user_code=user_cat.cat.codes,
    service_code=service_cat.cat.codes,
)

In [10]:
# Display the table to check the newly added user_code / service_code columns.
data_food

Unnamed: 0,user_id,service_id,amount,visit_count,favor,service_group,total_amount,user_code,service_code
0,1000121902738565702,6624136956482465650,385000,1,0.656997,fnb,586000.0,13290,401
1,1000121902738565702,7241075282237721387,67000,1,0.114334,cvs,586000.0,13290,440
2,1000121902738565702,922770840377832605,134000,1,0.228669,food,586000.0,13290,54
3,1000125153834388186,4696475929912495191,1133000,1,0.656812,fnb,1725000.0,13291,281
4,1000125153834388186,7435641053063427914,75000,1,0.043478,cvs,1725000.0,13291,456
...,...,...,...,...,...,...,...,...,...
724864,999967723811113076,6683159578094575932,176000,1,0.095496,cvs,1843000.0,13288,404
724865,999983852027909885,1161222413519607568,164000,1,0.224044,fnb,732000.0,13289,70
724866,999983852027909885,1519391066364067495,393000,1,0.536885,fnb,732000.0,13289,96
724867,999983852027909885,7217455669218222351,46000,1,0.062842,beverage,732000.0,13289,438


In [11]:
# Largest user code in the table; EmbeddedInput adds 1 to it when sizing the embedding.
n_user = int(data_food['user_code'].max())
print(n_user)
# Largest service code in the table, used the same way.
n_service = int(data_food['service_code'].max())
print(n_service)
# Length of the embedding vectors, shared by the user and service embeddings.
n_dim = 128

123241
546


In [12]:
# Create the user and service inputs with their embeddings. Both use vectors of
# length n_dim and an l2 regularization factor of 1e-4.
user_in, u = EmbeddedInput('user_in', n_user, n_dim, 1e-4)
service_in, s = EmbeddedInput('service_in', n_service, n_dim, 1e-4)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [13]:
# Score a (user, service) pair from the cosine similarity of their embeddings.
#
# `u` and `s` are Embedding outputs for a length-1 input, i.e. shaped
# (batch, 1, n_dim), so the embedding dimension is axis 2. Contracting axis 1
# (the length-1 axis) would instead L2-normalise every component on its own
# (collapsing it to +/-1) and produce a (batch, n_dim, n_dim) outer product,
# which carries no magnitude information and passes no useful gradient back to
# the embeddings.
#
# Alternative kept for reference: feed the concatenated embeddings to the
# dense layers instead of their similarity.
# x = Concatenate()([u, s])
# x = Flatten()(x)
x = Dot(axes=2, normalize=True)([u, s])  # -> (batch, 1, 1)
x = Flatten()(x)                         # -> (batch, 1)
x = Dense(32, activation='relu')(x)
x = Dense(16, activation='relu')(x)
x = Dense(1)(x)  # linear output: regression on `favor`

nn = Model([user_in, service_in], x)
nn.compile(Adam(0.0005), loss='mse')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [14]:
# Hold out TRAIN_TEST_SIZE of the rows for validation. The model inputs are the
# (user_code, service_code) pair and the regression target is `favor`.
# All rows are used: no filter on `visit_count` is applied here. A bare
# `data_food[data_food['visit_count'] == 1]` expression that used to sit in this
# cell was evaluated and discarded, so it never affected the split.
X_train, X_test, y_train, y_test = train_test_split(
    data_food[['user_code', 'service_code']],
    data_food['favor'],
    test_size=TRAIN_TEST_SIZE,
    random_state=TRAIN_RANDOM_STATE,
)

In [15]:
# Fit the model only when training is switched on. A plain truthiness check is
# used so that any truthy setting of the flag enables training, rather than
# only the exact `True` object.
if TRAIN_TRIGGER:
    nn.fit(
        [X_train['user_code'], X_train['service_code']],
        y_train,
        # The held-out split is evaluated at the end of every epoch.
        validation_data=([X_test['user_code'], X_test['service_code']], y_test),
        epochs=TRAIN_EPOCHS,
    )

Train on 477268 samples, validate on 204544 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


**Save model to file**

In [16]:
# Timestamp of this run (not used within this cell).
now = datetime.now()
filename = './data/Model_test.h5'
# NOTE(review): this save is unconditional, so when TRAIN_TRIGGER is off the
# file is written from a model that was never fitted — confirm this is intended.
nn.save(filename)

# Remove the in-memory model; `nn` is undefined after this cell runs.
del nn

In [17]:
# Persist the four lookup dictionaries so they can be reloaded later.
# Each entry maps the destination path to the dictionary pickled into it.
lookup_tables = {
    './data/usercode2id.dict': usercode2id,
    './data/servicecode2id.dict': servicecode2id,
    './data/userid2code.dict': userid2code,
    './data/serviceid2code.dict': serviceid2code,
}
for table_path, table in lookup_tables.items():
    with open(table_path, 'wb') as table_file:
        pickle.dump(table, table_file)