In [None]:
!pip install tensorflow==1.15
!pip install keras==2.2.4 

In [None]:

!pip install implicit==0.4.2
!pip install pandas
!pip install matplotlib

implicit.__version__

In [None]:

!pip install holidays
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn scipy matplotlib


In [None]:
!pip install deepctr[cpu]

## Importing the Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
import warnings
import math
from math import sqrt
import sys
import holidays
import datetime

from sklearn.metrics import roc_curve, auc,roc_auc_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from deepctr.models import WDL,DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense

import scipy.sparse as sparse
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
import implicit

# Reading the Expedia Hotel dataset

In [2]:
df = pd.read_csv('../../train.csv', sep=',', nrows=1500000)
destinations = pd.read_csv('../../destinations.csv', sep=',')
df.shape

(1500000, 24)

In [3]:
#merge only top 10 most correlated columns with rating column
df = pd.merge(df,destinations[['srch_destination_id','d33', 'd64', 'd52', 'd120', 'd72', 'd136', 'd7', 'd59', 'd50', 'd30']],on='srch_destination_id')

## Renaming the Columns

In [4]:
# rename 2 columns
df = df.rename(columns={'hotel_cluster': 'item_id', 'is_booking': 'rating'})
df = df.dropna()

## Feature Engineering

In [5]:
# sort values
from pandas.tseries.offsets import Week
df = df.sort_values("date_time").reset_index()
df.drop('index',axis=1,inplace=True)

In [6]:
df["date_time"] =  pd.to_datetime(df["date_time"], infer_datetime_format=True)
df["date_time"] = df.date_time.dt.strftime('%Y-%m-%d')
#df["date_time_timestamp"] =  pd.to_datetime(df["date_time"], infer_datetime_format=True)

In [7]:
d = datetime.timedelta(days=14)
df['lagged_date_time'] = df["date_time"].apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d") + d)

def extract_week(feature,week,lag):
    df[feature] =  pd.to_datetime(df[feature], infer_datetime_format=True)
    df[feature] = df.date_time.dt.strftime('%Y-%m-%d')
    if lag == True:
        d = datetime.timedelta(days=14)
        df['lag_date_time'] = df[feature].apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d") + d)
        df['week'] = pd.DatetimeIndex(df['lag_date_time']).week
        df['year']=pd.DatetimeIndex(df['lag_date_time']).year
        
        # countinue week numbers for the next year
        df[week] = df['week'].where(df['year'] ==2013 , df['week']+52)
extract_week('date_time','click_week',lag=True)

# extract month from date_time
df['click_month'] = pd.DatetimeIndex(df['date_time']).month

  # Remove the CWD from sys.path while we load stuff.


In [8]:
df['checkin_month'] = pd.DatetimeIndex(df['srch_ci']).month
df['checkout_month'] = pd.DatetimeIndex(df['srch_co']).month

df['checkin_year'] = pd.DatetimeIndex(df['srch_ci']).year
df['checkout_year'] = pd.DatetimeIndex(df['srch_co']).year

In [9]:
# Define holidays in some countries
ca_holidays = holidays.Canada()
us_holidays = holidays.UnitedStates()

# check if checkin or checkout date is in holiday of different countries

df['north_am_ci'] = df['srch_ci'].apply(lambda x: 1 if x in (us_holidays or ca_holidays)  else 0)
df['north_am_co'] = df['srch_co'].apply(lambda x: 1 if x in (us_holidays or ca_holidays)  else 0)

# Define features

In [10]:
# categ_sparse / conti_dense
sparse_features = ["site_name", #ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, …)
"posa_continent", #ID of continent associated with site_name
"user_location_country", #The ID of the country the customer is located
"user_id", #ID of user
"is_mobile", #1 when a user connected from a mobile device, 0 otherwise
"is_package", #1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise
"channel", #ID of a marketing channel
"cnt", #Numer of similar events in the context of the same user session
"srch_destination_id", #ID of the destination where the hotel search was performed'
"srch_destination_type_id", #Type of destination
"hotel_continent", #'Hotel continent',
"hotel_country", #Hotel country
"item_id", #(hotel_cluster)ID of a hotel cluster
"north_am_ci", # 1 if check-in date it's a holiday in north America
"north_am_co",# 1 if check-out date it's a holiday in north America
'hotel_market', #Hotel market

#hotel search latent attributes highly correlated with rating:
'd33', 'd64','d52','d120', 'd72', 'd136', 'd7', 'd59', 'd50', 'd30'] 

dense_features = ["srch_adults_cnt", #The number of adults specified in the hotel room
"srch_children_cnt", #The number of (extra occupancy) children specified in the hotel room
"srch_rm_cnt", #The number of hotel rooms specified in the search
"click_week",
"click_month",
"checkin_month",
"checkout_month",
"checkin_year",
"checkout_year"]
target = ['rating']

### Simple preprocessing

In [11]:
# Label Encoding for sparse features,and normalization for dense numerical features
for feat in sparse_features:
    lbe = LabelEncoder()
    df[feat] = lbe.fit_transform(df[feat])

In [12]:
mms = MinMaxScaler(feature_range=(0,1))
df[dense_features] = mms.fit_transform(df[dense_features])

### Generate feature columns
For sparse features, we transform them into dense vectors by embedding techniques. For dense numerical features, we concatenate them to the input tensors of fully connected layer.

In [13]:
# count #unique features for each sparse field
fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique(),embedding_dim=4)
                          for feat in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


### Generate the training samples and train the model

In [14]:
# generate input data for model
train, test = train_test_split(df, test_size=0.3)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

In [15]:
train.shape, test.shape

((658258, 46), (282111, 46))

# Best DeepFM Model after hyper-parameter tuning

In [16]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(128,128)
            , seed=1024, dnn_dropout=0.5, dnn_activation='relu',task='binary',
               fm_group=['default_group'],dnn_use_bn=False)

model.compile("adam", "mse", metrics=['mse'])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [17]:
history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Train on 526606 samples, validate on 131652 samples
Epoch 1/10
526606/526606 - 46s - loss: 0.0760 - mean_squared_error: 0.0741 - val_loss: 0.0738 - val_mean_squared_error: 0.0713
Epoch 2/10
526606/526606 - 39s - loss: 0.0729 - mean_squared_error: 0.0698 - val_loss: 0.0744 - val_mean_squared_error: 0.0715
Epoch 3/10
526606/526606 - 38s - loss: 0.0721 - mean_squared_error: 0.0689 - val_loss: 0.0749 - val_mean_squared_error: 0.0719
Epoch 4/10
526606/526606 - 37s - loss: 0.0714 - mean_squared_error: 0.0683 - val_loss: 0.0748 - val_mean_squared_error: 0.0718
Epoch 5/10
526606/526606 - 37s - loss: 0.0711 - mean_squared_error: 0.0678 - val_loss: 0.0754 - val_mean_squared_error: 0.0723
Epoch 6/10
526606/526606 - 38s - loss: 0.0708 - mean_squared_error: 0.0675 - val_loss: 0.0753 - val_mean_squared_error: 0.0721
Epoch 7/10
526606/526606 - 38s - loss: 0.0705 - mean_squared_error: 0.0672 - val_loss: 0.0759 - val_mean_squared_error: 0.0727
Epoch 8/10
526606/526606 - 37s - loss: 0.0702 - mean_square

In [18]:
pred_ans = model.predict(test_model_input, batch_size=256)

In [19]:
auc = roc_auc_score(test[target].values, pred_ans)
print("RMSE:\t%f" % np.round(math.sqrt(mean_squared_error(test[target].values, pred_ans)),3),
      "MAE:\t%f" % np.round(mean_absolute_error(test[target].values, pred_ans),3),
      "MSE:\t%f" % np.round(mean_squared_error(test[target].values, pred_ans),3),
      "AUC:\t%f" % np.round(auc,3),
      sep='\n')

RMSE:	0.273000
MAE:	0.138000
MSE:	0.074000
AUC:	0.779000


In [20]:
warnings.filterwarnings("ignore")
new_df = test[['rating','item_id','user_id']]

#replace the rating with algorithm generated output
new_df['rating']=pred_ans

In [21]:
#csr_matrix((data, (row, col))
sparse_item_user = sparse.csr_matrix((new_df['rating'].astype(float),(new_df['item_id'], new_df['user_id'])),shape=(100, 35881))
#sparse_user_item = sparse.csr_matrix((new_df['rating'].astype(float),(new_df['user_id'], new_df['item_id'])),shape=(35881,100))
sparse_user_item=sparse_item_user.T.tocsr()
print(sparse_item_user)

model = implicit.als.AlternatingLeastSquares(factors=20,regularization=0.1,iterations=20)
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')
model.fit(sparse_item_user)



  (0, 10)	0.01240617036819458
  (0, 16)	3.874301910400391e-06
  (0, 50)	0.019399583339691162
  (0, 97)	0.03585764765739441
  (0, 126)	0.1047477126121521
  (0, 132)	0.021856248378753662
  (0, 179)	0.23404031991958618
  (0, 228)	0.04281380772590637
  (0, 243)	0.017525970935821533
  (0, 284)	0.01376265287399292
  (0, 296)	0.1255107820034027
  (0, 309)	2.9593706130981445e-05
  (0, 311)	5.662441253662109e-07
  (0, 315)	0.026033490896224976
  (0, 317)	0.34003037214279175
  (0, 326)	0.036186009645462036
  (0, 356)	0.04201209545135498
  (0, 383)	1.2248754501342773e-05
  (0, 386)	0.07515415549278259
  (0, 387)	0.032805293798446655
  (0, 419)	0.2491685450077057
  (0, 422)	0.019149601459503174
  (0, 453)	0.019590705633163452
  (0, 460)	0.13862496614456177
  (0, 504)	0.008764892816543579
  :	:
  (99, 35600)	0.10637187957763672
  (99, 35603)	0.15611150860786438
  (99, 35611)	0.02877911925315857
  (99, 35613)	0.16640320420265198
  (99, 35624)	0.006576120853424072
  (99, 35640)	3.4868717193603516e-06

  0%|          | 0/20 [00:00<?, ?it/s]

## Recommend 5 hotel clusters to a user

In [22]:
#Get Recommendations

user_id =   800
print(sparse_user_item.shape)
print(np.isscalar(user_id))
recommended = model.recommend(user_id, sparse_user_item)
recommended

(35881, 100)
True


[(47, 0.045618538),
 (23, 0.042194076),
 (40, 0.027297895),
 (76, 0.024190199),
 (51, 0.019864712),
 (64, 0.010259116),
 (82, 0.007913187),
 (46, 0.005700084),
 (49, 0.005463779),
 (14, 0.0050842687)]

In [23]:
recommended_df = pd.DataFrame(columns=['user_id','rec1','rec2','rec3','rec4','rec5'],
                              index=range(len(new_df['user_id'][:5].unique())))


In [24]:
for i,x in enumerate(new_df['user_id'][:5].unique()):
    recommended = model.recommend(x, sparse_user_item)
    recommended_df['user_id'].iloc[i]=x
    recommended_df['rec1'].iloc[i]=recommended[0][0]
    recommended_df['rec2'].iloc[i]=recommended[1][0]
    recommended_df['rec3'].iloc[i]=recommended[2][0]
    recommended_df['rec4'].iloc[i]=recommended[3][0]
    recommended_df['rec5'].iloc[i]=recommended[4][0]

In [25]:
recommended_df

Unnamed: 0,user_id,rec1,rec2,rec3,rec4,rec5
0,13985,32,47,77,13,21
1,15067,21,25,82,13,5
2,17111,47,10,13,94,6
3,19558,32,70,41,40,51
4,14819,21,13,69,37,97
