In [0]:
# preprocessing imports
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence

In [2]:
from google.colab import drive 
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [48]:
raw_data=pd.read_csv('/content/gdrive/My Drive/Final Year Project/DeepConn/Deep_Learning_Recommender_System/unembedded_grouped_cleaned_data.csv')
raw_data.head()

Unnamed: 0.1,Unnamed: 0,user_id,food_id,rating,userReviews,foodReviews
0,0,9974,998,5,"['delicious', 'and', 'easy', 'to', 'make']","['delicious', 'and', 'easy', 'to', 'make']"
1,1,10340,998,5,"['feeling', 'healthy', 'great', 'replication',...","['delicious', 'and', 'easy', 'to', 'make']"
2,2,12047,998,5,"['yum', 'will', 'make', 'again']","['delicious', 'and', 'easy', 'to', 'make']"
3,3,13451,998,5,"['turned', 'out', 'great', 'i', 'added', 'pean...","['delicious', 'and', 'easy', 'to', 'make']"
4,8,18226,998,5,"['fantastic', 'tasting', 'completely', 'simple...","['delicious', 'and', 'easy', 'to', 'make']"


In [0]:
import os.path
import numpy as np


def init_embeddings_map(fname):
    with open(fname) as glove:
        return {l[0]: np.asarray(l[1:], dtype="float32") for l in
                [line.split() for line in glove]}


def get_embed_and_pad_func(i_seq_len, u_seq_len, pad_value, embedding_map):
    def embed(row):
        sentence = row["userReviews"].split()[:u_seq_len]
        reviews = list(map(lambda word: embedding_map.get(word)
            if word in embedding_map else pad_value, sentence))
        row["userReviews"] = reviews +\
                [pad_value] * (u_seq_len - len(reviews))
        sentence = row["foodReviews"].split()[:i_seq_len]
        reviews = list(map(lambda word: embedding_map.get(word)
            if word in embedding_map else pad_value, sentence))
        row["foodReviews"] = reviews +\
                [pad_value] * (i_seq_len - len(reviews))
        return row
    return embed


In [0]:
# functions we implemented
# from custom_functions import init_embeddings_map, get_embed_and_pad_func

In [0]:

# link="https://drive.google.com/open?id=1AaNzc7XvqG6RiIVeYHDg10HuwTcDyutl"
# # emb_size = 50
# # embedding_map = init_embeddings_map("glove.6B." + str(emb_size) + "d.txt")
# downloaded = drive.CreateFile({'id':"1AaNzc7XvqG6RiIVeYHDg10HuwTcDyutl"}) 
# downloaded.GetContentFile('glove.6B.50d.txt')
embedding_map = init_embeddings_map('/content/gdrive/My Drive/DeepConn/Deep_Learning_Recommender_System/glove.6B.50d.txt')

In [0]:
# Train/test split for our model is unique, we need to hold out a
# set of users and movies so that our network never learns those 
test_size = 0.005

# get test_size percentage of users
unique_users = raw_data.loc[:, "user_id"].unique()
users_size = len(unique_users)
test_idx = np.random.choice(users_size,
                              size=int(users_size * test_size),
                              replace=False)

# get test users
test_users = unique_users[test_idx]

# everyone else is a training user
train_users = np.delete(unique_users, test_idx)

test = raw_data[raw_data["user_id"].isin(test_users)]
train = raw_data[raw_data["user_id"].isin(train_users)]

unique_test_movies = test["food_id"].unique()

# drop the movies that also appear in our test set. In order to be
# a true train/test split, we are forced to discard some data entirely
train = train.where(np.logical_not(train["food_id"].isin(unique_test_movies))).dropna()

In [0]:
user_seq_sizes = raw_data.loc[:, "userReviews"].apply(lambda x: x.split()).apply(len)
item_seq_sizes = raw_data.loc[:, "foodReviews"].apply(lambda x: x.split()).apply(len)

In [0]:
u_ptile = 40
i_ptile = 30
u_seq_len = int(np.percentile(user_seq_sizes, u_ptile))
i_seq_len = int(np.percentile(item_seq_sizes, i_ptile))

In [0]:
embedding_fn = get_embed_and_pad_func(i_seq_len, u_seq_len, np.array([0.0] * 50), embedding_map)
    
train_embedded = train.apply(embedding_fn, axis=1)
test_embedded = test.apply(embedding_fn, axis=1)

# DeepCoNN Recommendation Model

In [0]:
# modeling imports
import tensorflow as tf
from keras.models import Model
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Conv1D, MaxPooling1D, Flatten
from keras.layers import Input, Dense
from keras.layers.merge import Add, Dot, Concatenate

In [0]:
class DeepCoNN():
    def __init__(self,
                 embedding_size,
                 hidden_size,
                 u_seq_len,
                 m_seq_len,
                 filters=2,
                 kernel_size=10,
                 strides=6):
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.filters = filters
        self.kernel_size = kernel_size
        self.inputU, self.towerU = self.create_deepconn_tower(u_seq_len)
        self.inputM, self.towerM = self.create_deepconn_tower(m_seq_len)
        self.joined = Concatenate()([self.towerU, self.towerM])
        self.outNeuron = Dense(1)(self.joined)

    def create_deepconn_tower(self, max_seq_len):
        input_layer = Input(shape=(max_seq_len, self.embedding_size))
        tower = Conv1D(filters=self.filters,
                       kernel_size=self.kernel_size,
                       activation="tanh")(input_layer)
        tower = MaxPooling1D()(tower)
        tower = Flatten()(tower)
        tower = Dense(self.hidden_size, activation="relu")(tower)
        return input_layer, tower

    def create_deepconn_dp(self):
        dotproduct = Dot(axes=1)([self.towerU, self.towerM])
        output = Add()([self.outNeuron, dotproduct])
        self.model = Model(inputs=[self.inputU, self.inputM], outputs=[output])
        self.model.compile(optimizer='Adam', loss='mse')
        
    def train(self, train_data, batch_size, epochs=3500):
        tensorboard = TensorBoard(log_dir="tf_logs/{}".format(pd.Timestamp(int(time()), unit="s")))
        self.create_deepconn_dp()
        print(self.model.summary())
        
        user_reviews = np.array(list(train_data.loc[:, "userReviews"]))
        movie_reviews = np.array(list(train_data.loc[:, "foodReviews"]))

        self.train_inputs = [user_reviews, movie_reviews]
        self.train_outputs = train_data.loc[:, "rating"]
        
        self.history = self.model.fit(self.train_inputs,
                                      self.train_outputs,
                                      callbacks=[tensorboard],
                                      validation_split=0.05,
                                      batch_size=batch_size,
                                      epochs=epochs)
        
        

In [57]:
print(hidden_size)
print(u_seq_len)
print(i_seq_len)
print(train_embedded.loc[0])

64
11
11
Unnamed: 0                                                     0
user_id                                                     9974
food_id                                                      998
rating                                                         5
userReviews    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
foodReviews    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
Name: 0, dtype: object


In [58]:
hidden_size = 64
deepconn = DeepCoNN(50, hidden_size, u_seq_len, i_seq_len)

batch_size = 32
deepconn.train(train_embedded, batch_size, epochs=1)

deepconn.model.save("cnn.h5")
# print(train_embedded.loc[0])

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 11, 50)       0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 11, 50)       0                                            
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 2, 2)         1002        input_11[0][0]                   
__________________________________________________________________________________________________
conv1d_12 (Conv1D)              (None, 2, 2)         1002        input_12[0][0]                   
____________________________________________________________________________________________

In [61]:
user_reviews = np.array(list(test_embedded.loc[:, "userReviews"]))
movie_reviews = np.array(list(test_embedded.loc[:, "foodReviews"]))

test_inputs = [user_reviews, movie_reviews]
print(test_inputs)
# dat = pd.DataFrame(test_inputs)
# dat.to_csv("/content/gdrive/My Drive/DeepConn/Deep_Learning_Recommender_System//test_data.csv")

true_rating = np.array(list(test_embedded.loc[:, "rating"])).reshape((-1, 1))

predictions = deepconn.model.predict(test_inputs)

error = np.square(predictions - true_rating)

print("MSE:", np.average(error))

[array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0