In [1]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
import sys

In [25]:
class EmbModel(tf.keras.Model):
    def __init__(self, useridlength, category_length):
        super(EmbModel, self).__init__()
        self.d_steps = 1
        self.useridlength = useridlength
        self.category_length = category_length
        self.model = self.init_model()
        print(self.useridlength)
        
    def call(self, inputs):
        return
    
    def init_model(self):
        poi_latitude_input = keras.layers.Input(shape=(1,), name='poi_latitude')
        poi_longitude_input = keras.layers.Input(shape=(1,), name='poi_longitude')
        poi_concat_input = tf.keras.layers.Concatenate(axis=-1)([poi_latitude_input, poi_longitude_input])
        #input_length:  #This is the length of input sequences, as you would define for any input layer of a Keras model. 
                        #For example, if all of your input documents are comprised of 1000 words, this would be 1000
        #input_dim: 
                        #This is the size of the vocabulary in the text data. 
                        #For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words.
        poi_dense = keras.layers.Dense(128)(poi_concat_input)
        poi_reshape = keras.layers.Reshape((1, 128))(poi_dense)
        
        category_input = keras.layers.Input(shape=(1), name='category_input')
        category_emb = keras.layers.Embedding(self.category_length, 128)(category_input)    
        category_concat = tf.keras.layers.Concatenate(axis=-1)([category_emb, poi_reshape])
    
        user_input = keras.layers.Input(shape=(1,), name='user_id')
        user_emb = keras.layers.Embedding(self.useridlength, 256)(user_input)
        #user_reshape = layers.Reshape((1, 256))(user_emb)
                                    
        dot = keras.layers.Dot(axes=(2))([category_concat, user_emb])
            
        model = Model([category_input, poi_latitude_input, poi_longitude_input, user_input], dot)
        model.summary()
        return model
    
    def compile_model(self, optimizer):
        super(EmbModel, self).compile(run_eagerly=True)
        self.optimizer = optimizer
        
    def train_step(self, data):
        if len(data) == 3:
            real_data, labels, sample_weight = data
        else:
            sample_weight = None
            real_data, labels = data
        cat_data = real_data[0]
        lat_data = real_data[1]
        long_data = real_data[2]
        user_data = real_data[3]

        for i in range(self.d_steps):
            with tf.GradientTape() as tape:
                
                #print(latlong_data[0])
                #print(latlong_data[1])
                #print(user_data)
                
                dotproduct = self.model(real_data)
                print(dotproduct)
                #print(dotproduct)
                # Loss function = ||S-GroundTruth|| 
                loss = tf.math.abs(tf.subtract(tf.cast(dotproduct, tf.float64), labels))
                #print(loss)
            d_gradient = tape.gradient(loss, self.model.trainable_variables)
            self.optimizer.apply_gradients(zip(d_gradient, self.model.trainable_variables))
        return {'loss': loss}
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.out_units)
    
    def predict_step(self, data):
        sample_weight = None
        cat_data = real_data[0]
        lat_data = real_data[1]
        long_data = real_data[2]
        user_data = real_data[3]
        return self.model([cat_data, lat_data, long_data, user_data])

In [3]:
def one_hot_encode(ground_truth, lst):
    result = []
    for category in lst:
        oh_encoding = np.zeros(len(ground_truth))
        if category in ground_truth:
            print(category)
            index = np.where(ground_truth == category)[0][0]
            
            #Get index og category, and insert 1 into the vector.
            result.append(index)
    return result

In [27]:
print("Loading checkins")
checkin_cols = ['user_id', 'poi_id', 'timestamp', 'timezone']
checkins = pd.read_csv(r'C:\Users\lasse\Desktop\RecommenderDL\datasets\Den_checkins.csv', sep=',', names=checkin_cols, encoding='latin-1').dropna(axis=1)

print("Loading POIs")
venue_cols = ['poi_id', 'latitude', 'longitude', 'category', 'country_code']
pois = pd.read_csv(r'C:\Users\lasse\Desktop\RecommenderDL\datasets\Den_pois.csv', sep=',', names=venue_cols, encoding='latin-1')

c = pd.DataFrame(checkins, columns=['user_id', 'poi_id'])
p = pd.DataFrame(pois, columns=['poi_id', 'latitude', 'longitude', 'category'])

cp = p.merge(c, on='poi_id')

#One checkin for each user
users = checkins.copy()
users.drop_duplicates(subset="user_id", keep = 'first', inplace = True)
print("Checkins: ", len(checkins))
print("Users: ", len(users))
len_checkins = len(checkins)
len_users = len(users)

#The rest of the checkins and categories
checkins_rest = users.merge(checkins, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
print("Gotten: ", len(checkins_rest))

#One of each category in checkins_rest
categories1 = pd.DataFrame(pois, columns=['poi_id', 'category'])
categories1 = checkins_rest.merge(categories1, on='poi_id')
users_categories1 = categories1.copy()
users_categories1.drop_duplicates(subset="category", keep = 'first', inplace = True)
print("Unique categories in checkins_rest: ", len(users_categories1))

#FINISHING NOTE: Vi vil have alle colplement af checkins_rest, n책r vi sammenligner med categories_cat_no_cat, s책 har vi de 3 gange checkins vi har brug for. S책 kan vi lave train/test p책 det store af dem.
categories_cat_no_cat = pd.DataFrame(users_categories1, columns=['user_id', 'poi_id', 'timestamp', 'timezone', 'category'])
checkins_rest_no_merge = pd.DataFrame(checkins_rest, columns=['user_id', 'poi_id', 'timestamp', 'timezone'])
poisandcategories = pd.DataFrame(pois, columns=['poi_id', 'category'])
checkins_rest_no_merge = checkins_rest_no_merge.merge(poisandcategories, on='poi_id')
print("Total categories: ", len(categories_cat_no_cat))

test = categories_cat_no_cat.merge(checkins_rest_no_merge, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
#test = pd.DataFrame(checkins_rest, columns=['user_id', 'poi_id', 'timestamp', 'timezone'])
#cattest = pd.DataFrame(pois, columns=['poi_id', 'category'])
#test.merge(cattest, on='poi_id')
#checkins_rest = cattest.merge(test, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']

restset = pd.DataFrame(test, columns=['user_id', 'poi_id', 'timestamp', 'timezone'])
userset = users
categoryset = pd.DataFrame(users_categories1, columns=['user_id', 'poi_id', 'timestamp', 'timezone'])

print("Overview:")
print(len(userset))
print(len(categoryset))
print(len(restset))

encoding = pd.DataFrame(userset, columns=['user_id'])
encoding_array = {}
temp = 0
for user in encoding.iterrows():
    user = user[0]
    value = encoding._get_value(user, 'user_id')
    encoding_array[value] = temp
    temp += 1

print(encoding_array)
    
###################################
###################################
####### Iteration 1: User #########
###################################
###################################

print("Step 1")
checkin_data = categoryset.merge(pois, on='poi_id')
#checkin_data.drop_duplicates(subset="user_id", keep = 'first', inplace = True)

df = checkin_data.set_index('user_id').poi_id.str.get_dummies(',')
df = df.groupby('user_id').max()

print("Step 2")
checkin_data_no_duplicates = checkin_data.copy()
checkin_data_no_duplicates.drop_duplicates(subset ="poi_id", keep = 'first', inplace = True)
checkin_data_no_duplicates = pd.DataFrame(checkin_data_no_duplicates, columns = ['poi_id', 'category'])

#Extract categorical data
print("Step 3")
categories = pd.DataFrame(checkin_data, columns=['category'])
categories.drop_duplicates(subset ="category", keep = 'first', inplace = True)
category_length = len(categories)
categories_numpy = categories.to_numpy()


#Extracting all of the users and the pois

print("Step 3.5")
listofusers = pd.DataFrame(checkin_data, columns= ['user_id']).groupby('user_id').max().sample(frac=1)
listofpois = pd.DataFrame(checkin_data, columns= ['poi_id', 'latitude', 'longitude']).groupby('poi_id').max().sample(frac=1)
userarray = listofusers.index.to_numpy()
poiarray = listofpois.index.to_numpy()
userdataframe = pd.DataFrame(userarray, columns = ['Users'])
poidataframe = pd.DataFrame(poiarray, columns = ['Poi'])
dot = userdataframe.merge(poidataframe, how='cross')

print("Step 4")

rows_list = []
for i in range(len(dot)):
    temp = dot.loc[i, "Poi"]
    latitude = listofpois.loc[temp]['latitude']
    longitude = listofpois.loc[temp]['longitude']
    dict1 = {'latitude':latitude, 'longitude':longitude}
    rows_list.append(dict1)
    #latitude = poiarray[i]
latlong = pd.DataFrame(rows_list)

#Creating dataset
print("Step 5")
userdot = pd.DataFrame(dot, columns= ['Users'])
latlong['latitude'] = pd.to_numeric(latlong['latitude'])
latlong['longitude'] = pd.to_numeric(latlong['longitude'])
dataset = pd.concat([userdot, latlong], axis=1)

print("Step 6")
rows_list = []
category_list = []
groundtruth = 0
for i in range(len(dot)):
    temp = cp.loc[(cp['poi_id'] == dot.loc[i, "Poi"]) & (cp['user_id'] == dot.loc[i, "Users"])]
    if temp.empty:
        groundtruth = 0
    else:
        groundtruth = 1
    #temp = df[dot.loc[i, "Poi"]][dot.loc[i, "Users"]]
    #temp = df[dot.loc[i, "Poi"]][dot.loc[i, "Users"]]
    dict1 = {'ground_truth':float(groundtruth)}
    rows_list.append(dict1)
    #Extract category from the list
    category = checkin_data_no_duplicates.loc[checkin_data_no_duplicates['poi_id'] == dot.loc[i, "Poi"]]
    cat = category['category']
    index = np.where(categories_numpy == [cat])[0][0]
    category_list.append(index)
#category_label = 
groundtruth = pd.DataFrame(rows_list)
#result = pd.concat([dot, groundtruth], axis=1)

print("Step 7")
datasetst = pd.concat([dataset, groundtruth], axis=1)
categories = pd.DataFrame(category_list, columns=['category'])
datasetstst = pd.concat([datasetst, categories], axis=1)

Loading checkins
Loading POIs
Checkins:  10473
Users:  963
Gotten:  9510
Unique categories in checkins_rest:  279
Total categories:  279
Overview:
963
279
9228
{233919: 0, 190585: 1, 24779: 2, 30835: 3, 3884: 4, 18507: 5, 125878: 6, 79872: 7, 112400: 8, 16753: 9, 198380: 10, 3166: 11, 248459: 12, 166003: 13, 65942: 14, 85610: 15, 212753: 16, 11129: 17, 134643: 18, 98834: 19, 228886: 20, 9912: 21, 141345: 22, 36884: 23, 240687: 24, 259108: 25, 80758: 26, 132466: 27, 23885: 28, 81032: 29, 44648: 30, 24890: 31, 174210: 32, 178954: 33, 201854: 34, 75696: 35, 167913: 36, 93131: 37, 196441: 38, 3354: 39, 245399: 40, 139066: 41, 204627: 42, 110075: 43, 259848: 44, 99909: 45, 12829: 46, 162192: 47, 41675: 48, 43277: 49, 79388: 50, 3457: 51, 221413: 52, 145644: 53, 88502: 54, 87745: 55, 123798: 56, 201860: 57, 34500: 58, 64920: 59, 74284: 60, 215891: 61, 76403: 62, 81724: 63, 6085: 64, 234549: 65, 1643: 66, 150809: 67, 113696: 68, 57410: 69, 132704: 70, 183987: 71, 256254: 72, 26478: 73, 164480

In [28]:
print("Step 8")
dataset_numpy = datasetstst.to_numpy()

labels = pd.DataFrame(category_list, columns=['ground_truth'])
labels_numpy = labels.to_numpy()

x_train, x_test, y_train, y_test = train_test_split(dataset_numpy, labels_numpy, test_size=0.05, random_state=0)

print("Step 9")
x_train_df = pd.DataFrame(x_train, columns=['Users', 'latitude', 'longitude', 'ground_truth', 'category'])
x_test_df = pd.DataFrame(x_test, columns=['Users', 'latitude', 'longitude', 'ground_truth', 'category'])

x_test_df.to_csv(r'/user/student.aau.dk/lharde18/Data-output/Den/y_train_df.csv', sep=',', index=['User','Latitude','Longitude', '0'])

for index, row in x_train_df.iterrows():
    usr = x_train_df.loc[index, 'Users']
    x_train_df.xs(index)['Users']=encoding_array.get(usr)

users = pd.DataFrame(x_train_df, columns=['Users'])
lat = pd.DataFrame(x_train_df, columns=['latitude'])
long = pd.DataFrame(x_train_df, columns=['longitude'])
cat = pd.DataFrame(x_train_df, columns=['category'])
labels = pd.DataFrame(x_train_df, columns=['ground_truth'])
    
print("Step 10")
users = tf.convert_to_tensor(
    users, dtype=None, dtype_hint=None, name=None)
lat = tf.convert_to_tensor(
    lat, dtype=None, dtype_hint=None, name=None)
long = tf.convert_to_tensor(
    long, dtype=None, dtype_hint=None, name=None)
cat = tf.convert_to_tensor(
    cat, dtype='int64', dtype_hint=None, name=None)
labels = tf.convert_to_tensor(
    labels, dtype=None, dtype_hint=None, name=None)

Step 8
Step 9
Step 10


In [31]:
print(category_length)
model = EmbModel(len(users), category_length)

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.0, beta_2=0.99, epsilon=1e-8)

model.compile(
    optimizer
)

#Train_data, [dataset.user_id, dataset.poi_id]. Label: ground_truth
model.fit([cat, lat, long, users], labels, epochs = 25, batch_size=27)

279
Model: "functional_29"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
poi_latitude (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
poi_longitude (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
concatenate_28 (Concatenate)    (None, 2)            0           poi_latitude[0][0]               
                                                                 poi_longitude[0][0]              
__________________________________________________________________________________________________
category_input (InputLayer)     [(None, 1)]          0                            

KeyboardInterrupt: 

## 