In [1]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer , scale
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error 
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from sklearn.utils import shuffle
from keras.utils import plot_model

In [2]:
##

In [3]:
checkin_cols = ['user_id', 'poi_id', 'timestamp', 'timezone']
checkins = pd.read_csv(r'C:\Users\lasse\Desktop\RecommenderDL\datasets\DenSweGerNet_checkins.csv', sep=',', names=checkin_cols, encoding='latin-1').dropna(axis=1)
#checkins.iloc[2:]

venue_cols = ['poi_id', 'latitude', 'longitude', 'category', 'country_code']
pois = pd.read_csv(r'C:\Users\lasse\Desktop\RecommenderDL\datasets\DenSweGerNet_pois.csv', sep=',', names=venue_cols, encoding='latin-1')

In [4]:
#Aka Checkins_1_of_each_user
print("Getting users from subset")
users = checkins.copy()
users.drop_duplicates(subset="user_id", keep = 'first', inplace = True)

Getting users from subset


In [24]:
class EmbModel(tf.keras.Model):
    def __init__(self, useridlength, category_length):
        super(EmbModel, self).__init__()
        self.d_steps = 1
        self.useridlength = useridlength
        self.category_length = category_length
        self.model = self.init_model()
        print(self.useridlength)
        
    @tf.function
    def __call__(self, x):
        print('Tracing with', x)
        return x * self.v    
    
    
    def init_model(self):
        poi_latitude_input = keras.Input(shape=(1,), name='poi_latitude')
        poi_longitude_input = keras.Input(shape=(1,), name='poi_longitude')
        poi_concat_input = tf.keras.layers.Concatenate(axis=-1)([poi_latitude_input, poi_longitude_input])
        #input_length:  #This is the length of input sequences, as you would define for any input layer of a Keras model. 
                        #For example, if all of your input documents are comprised of 1000 words, this would be 1000
        #input_dim: 
                        #This is the size of the vocabulary in the text data. 
                        #For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words.
        poi_dense = layers.Dense(128)(poi_concat_input)
        poi_reshape = layers.Reshape((1, 128))(poi_dense)
        
        category_input = keras.Input(shape=(1), name='category_input')
        category_emb = layers.Embedding(self.category_length, 128)(category_input)
        
        category_concat = tf.keras.layers.Concatenate(axis=-1)([category_emb, poi_reshape])
    
        user_input = keras.Input(shape=(1,), name='user_id')
        user_emb = layers.Embedding(self.useridlength, 256)(user_input)
        #user_reshape = layers.Reshape((1, 256))(user_emb)
        
                                    
        dot = layers.Dot(axes=(2))([category_concat, user_emb])
        
        
        model = Model([category_input, [poi_latitude_input, poi_longitude_input], user_input], dot)
        model.summary()
        return model
    
    def compile_model(self, optimizer):
        super(EmbModel, self).compile(run_eagerly=True)
        self.optimizer = optimizer
        
    def train_step(self, data):
        if len(data) == 3:
            real_data, labels, sample_weight = data
        else:
            sample_weight = None
            real_data, labels = data
        user_data = real_data[0]
        latlong_data = real_data[1]
        cat_data = real_data[2]
        print(user_data)
        print(latlong_data)
        print(cat_data)
        
        for i in range(self.d_steps):
            
            with tf.GradientTape() as tape:
                
                #print(latlong_data[0])
                #print(latlong_data[1])
                #print(user_data)
                
                dotproduct = self.model([cat_data, [latlong_data[0], latlong_data[1]], user_data])
                #print(dotproduct)
                # Loss function = ||S-GroundTruth|| 
                loss = tf.math.abs(tf.subtract(tf.cast(dotproduct, tf.float64), labels))
                #print(loss)
            d_gradient = tape.gradient(loss, self.model.trainable_variables)
            self.optimizer.apply_gradients(zip(d_gradient, self.model.trainable_variables))
        return {'loss': loss}
    
    def predict_step(self, data):
        sample_weight = None
        real_data = data[0]
        user_data = real_data[0]
        latlong_data = real_data[1]
        return self.model([[latlong_data[0], latlong_data[1]], user_data])
        

In [56]:
model = EmbModel(13290, 428)

optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.0, beta_2=0.99, epsilon=1e-8)

model.compile(
    optimizer
)

tf.saved_model.save(model, r'C:\Users\lasse\Desktop\RecommenderDL\Kode')
#model.save('/user/student.aau.dk/lharde18/Data/model_%s' % i, save_format='tf')

Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
poi_latitude (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
poi_longitude (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
concatenate_8 (Concatenate)     (None, 2)            0           poi_latitude[0][0]               
                                                                 poi_longitude[0][0]              
__________________________________________________________________________________________________
category_input (InputLayer)     [(None, 1)]          0                                 

## Find the complement

In [26]:
#One checkin for each user
users = checkins.copy()
users.drop_duplicates(subset="user_id", keep = 'first', inplace = True)
print("Checkins: ", len(checkins))
print("Users: ", len(users))
len_checkins = len(checkins)
len_users = len(users)
print("Expected: 410706")

#The rest of the checkins and categories
checkins_rest = users.merge(checkins, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
print("Gotten: ", len(checkins_rest))

#One of each category in checkins_rest
categories1 = pd.DataFrame(pois, columns=['poi_id', 'category'])
categories1 = checkins_rest.merge(categories1, on='poi_id')
users_categories1 = categories1.copy()
users_categories1.drop_duplicates(subset="category", keep = 'first', inplace = True)
print("Unique categories in checkins_rest: ", len(users_categories1))

#FINISHING NOTE: Vi vil have alle colplement af checkins_rest, når vi sammenligner med categories_cat_no_cat, så har vi de 3 gange checkins vi har brug for. Så kan vi lave train/test på det store af dem.
categories_cat_no_cat = pd.DataFrame(users_categories1, columns=['user_id', 'poi_id', 'timestamp', 'timezone', 'category'])
checkins_rest_no_merge = pd.DataFrame(checkins_rest, columns=['user_id', 'poi_id', 'timestamp', 'timezone'])
poisandcategories = pd.DataFrame(pois, columns=['poi_id', 'category'])
checkins_rest_no_merge = checkins_rest_no_merge.merge(poisandcategories, on='poi_id')
print("Total categories: ", len(categories_cat_no_cat))

test = categories_cat_no_cat.merge(checkins_rest_no_merge, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
#test = pd.DataFrame(checkins_rest, columns=['user_id', 'poi_id', 'timestamp', 'timezone'])
#cattest = pd.DataFrame(pois, columns=['poi_id', 'category'])
#test.merge(cattest, on='poi_id')
#checkins_rest = cattest.merge(test, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']

restset = pd.DataFrame(test, columns=['user_id', 'poi_id', 'timestamp', 'timezone'])
userset = users
categoryset = pd.DataFrame(users_categories1, columns=['user_id', 'poi_id', 'timestamp', 'timezone'])

print(categoryset)


Checkins:  423996
Users:  13290
Expected: 410706
Gotten:  410700
Unique categories in checkins_rest:  415
Total categories:  415
        user_id                    poi_id                       timestamp  \
0        190585  4adcdb00f964a520775e21e3  Tue May 07 17:10:37 +0000 2013   
1         16753  4d5c36cce7f7548138c81797  Wed Jun 13 17:16:23 +0000 2012   
2          3166  4ae5cfc4f964a52039a221e3  Tue Apr 03 22:14:46 +0000 2012   
6        240687  4e9b5b84722edf21e2ceb163  Tue Apr 03 19:21:58 +0000 2012   
7         85610  4b1283caf964a5206b8a23e3  Tue Apr 03 19:52:59 +0000 2012   
...         ...                       ...                             ...   
342016    71499  4d52edc77ee1a35da9529f34  Sun May 13 16:36:42 +0000 2012   
382312   116071  4da98e5d1e72c1ab9bdda5bf  Sun Oct 28 14:44:35 +0000 2012   
404308    32187  4c9f262c7c096dcbd1e5dbd1  Thu Apr 11 06:13:10 +0000 2013   
408912    86434  51a259e4498e4b9d85d1dbba  Sat Jul 13 12:05:02 +0000 2013   
409864   179215  51e4138

In [36]:
encoding = pd.DataFrame(userset, columns=['user_id'])
#print(encoding)
encoding_array = {}
temp = 0
for user in encoding.iterrows():
    user = user[0]
    value = encoding._get_value(user, 'user_id')
    encoding_array[value] = temp
    temp += 1

In [52]:
print(encoding_array)

{233919: 0, 190585: 1, 24779: 2, 30835: 3, 3884: 4, 18507: 5, 125878: 6, 79872: 7, 112400: 8, 16753: 9, 198380: 10, 3166: 11, 248459: 12, 166003: 13, 65942: 14, 85610: 15, 212753: 16, 11129: 17, 134643: 18, 98834: 19, 228886: 20, 9912: 21, 141345: 22, 36884: 23, 240687: 24, 259108: 25, 80758: 26, 132466: 27, 23885: 28, 81032: 29, 44648: 30, 24890: 31, 174210: 32, 178954: 33, 201854: 34, 75696: 35, 167913: 36, 93131: 37, 196441: 38, 3354: 39, 245399: 40, 139066: 41, 204627: 42, 110075: 43, 259848: 44, 99909: 45, 12829: 46, 162192: 47, 41675: 48, 43277: 49, 79388: 50, 3457: 51, 221413: 52, 145644: 53, 88502: 54, 87745: 55, 123798: 56, 201860: 57, 34500: 58, 64920: 59, 74284: 60, 215891: 61, 76403: 62, 81724: 63, 6085: 64, 234549: 65, 1643: 66, 150809: 67, 113696: 68, 57410: 69, 132704: 70, 183987: 71, 256254: 72, 26478: 73, 164480: 74, 5523: 75, 148702: 76, 230601: 77, 103351: 78, 129961: 79, 56432: 80, 152507: 81, 54512: 82, 186735: 83, 80978: 84, 50064: 85, 120094: 86, 86918: 87, 82541

In [22]:
index = 0
for user in encoding.iterrows():
    user = user[0]
    encoding.xs(user)['user_id']=encoding_array.get(user)
    #encoding.at[index,'user_id']=encoding_array.get(user)
    index += 1

In [23]:
print(encoding)

            user_id
524329.0          0
731652.0          1
731653.0          2
731654.0          3
731655.0          4
...             ...
33245179.0    13285
33257534.0    13286
33259975.0    13287
33260240.0    13288
33263302.0    13289

[13290 rows x 1 columns]


In [None]:
encoding = pd.DataFrame(userset, columns=['user_id', ])

# Iteration 1: User

In [45]:
print("Step 1")
checkin_data = userset.merge(pois, on='poi_id')
df = checkin_data.set_index('user_id').poi_id.str.get_dummies(',')
df = df.groupby('user_id').max()

print("Step 2")
checkin_data_no_duplicates = checkin_data.copy()
checkin_data_no_duplicates.drop_duplicates(subset ="poi_id", keep = 'first', inplace = True)
checkin_data_no_duplicates = pd.DataFrame(checkin_data_no_duplicates, columns = ['poi_id', 'category'])

#Extract categorical data
print("Step 3")
categories = pd.DataFrame(checkin_data, columns=['category'])
categories.drop_duplicates(subset ="category", keep = 'first', inplace = True)
category_length = len(categories)
categories_numpy = categories.to_numpy()


#Extracting all of the users and the pois
print("Step 3.5")
listofusers = pd.DataFrame(checkin_data, columns= ['user_id']).groupby('user_id').max().sample(frac=1)
listofpois = pd.DataFrame(checkin_data, columns= ['poi_id', 'latitude', 'longitude']).groupby('poi_id').max().sample(frac=1)
userarray = listofusers.index.to_numpy()
poiarray = listofpois.index.to_numpy()
userdataframe = pd.DataFrame(userarray, columns = ['Users'])
poidataframe = pd.DataFrame(poiarray, columns = ['Poi'])
dot = userdataframe.merge(poidataframe, how='cross')

print("Step 4")
rows_list = []
for i in range(len(dot)):
    temp = dot.loc[i, "Poi"]
    latitude = listofpois.loc[temp]['latitude']
    longitude = listofpois.loc[temp]['longitude']
    dict1 = {'latitude':latitude, 'longitude':longitude}
    rows_list.append(dict1)
    #latitude = poiarray[i]
latlong = pd.DataFrame(rows_list)

#Creating dataset
print("Step 5")
userdot = pd.DataFrame(dot, columns= ['Users'])
latlong['latitude'] = pd.to_numeric(latlong['latitude'])
latlong['longitude'] = pd.to_numeric(latlong['longitude'])
dataset = pd.concat([userdot, latlong], axis=1)

#Extracting ground_truth from incidence matrix
print("Step 6")
rows_list = []
category_list = []
for i in range(len(dot)):
    temp = df[dot.loc[i, "Poi"]][dot.loc[i, "Users"]]
    dict1 = {'ground_truth':float(temp)}
    rows_list.append(dict1)
    #Extract category from the list
    category = checkin_data_no_duplicates.loc[checkin_data_no_duplicates['poi_id'] == dot.loc[i, "Poi"]]
    cat = category['category']
    index = np.where(categories_numpy == [cat])[0][0]
    category_list.append(index)
#category_label = 
groundtruth = pd.DataFrame(rows_list)
#result = pd.concat([dot, groundtruth], axis=1)

print("Step 7")
categories = pd.DataFrame(category_list, columns=['category'])
datasetst = pd.concat([dataset, categories], axis=1)

print("Step 8")
dataset_numpy = datasetst.to_numpy()
labels_numpy = groundtruth.to_numpy()
categories_numpy = categories.to_numpy()
#x_train, x_test, y_train, y_test = train_test_split(dataset_numpy, labels_numpy, test_size=0.05, random_state=0)

print("Step 9")
x_train_df = pd.DataFrame(dataset_numpy, columns=['User','Latitude','Longitude', '0'])
#x_test_df = pd.DataFrame(x_test, columns=['User','Latitude','Longitude', '0'])
y_train_df = pd.DataFrame(labels_numpy)
#y_test_df = pd.DataFrame(y_test)

#Dataset with Users
dataset1_df = pd.DataFrame(x_train_df['User'])

index = 0
for user in dataset1_df.iterrows():
    user = user[0]
    value = dataset1_df._get_value(user, 'User')
    dataset1_df.xs(user)['User']=encoding_array.get(value)
    #encoding.at[index,'user_id']=encoding_array.get(user)
    index += 1
    
#Dataset with Poi's
dataset2_df = pd.DataFrame(x_train_df[['Latitude', 'Longitude']])

dataset3_df = pd.DataFrame(x_train_df[['0']])

print("Step 10")
dataset1 = tf.convert_to_tensor(
    dataset1_df, dtype=None, dtype_hint=None, name=None)
dataset2 = tf.convert_to_tensor(
    dataset2_df, dtype=None, dtype_hint=None, name=None)
dataset3 = tf.convert_to_tensor(
    dataset3_df, dtype='int64', dtype_hint=None, name=None)
labels = tf.convert_to_tensor(
    y_train_df, dtype=None, dtype_hint=None, name=None)


#Train_data, [dataset.user_id, dataset.poi_id]. Label: ground_truth
model.fit([dataset1, dataset2, dataset3], labels, epochs = 20, batch_size=2)

#model.save('/user/student.aau.dk/lharde18/Data/model_%s' % i, save_format='tf')

Step 1
Step 2
Step 3
Step 3.5
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9


In [55]:
print(y_train_df)

         0
0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
...    ...
89635  0.0
89636  0.0
89637  0.0
89638  0.0
89639  0.0

[89640 rows x 1 columns]


# Iteration 2: Categories

In [51]:
print("Step 1")
checkin_data = categoryset.merge(pois, on='poi_id')
df = checkin_data.set_index('user_id').poi_id.str.get_dummies(',')
df = df.groupby('user_id').max()

print("Step 2")
checkin_data_no_duplicates = checkin_data.copy()
checkin_data_no_duplicates.drop_duplicates(subset ="poi_id", keep = 'first', inplace = True)
checkin_data_no_duplicates = pd.DataFrame(checkin_data_no_duplicates, columns = ['poi_id', 'category'])

#Extract categorical data
print("Step 3")
categories = pd.DataFrame(checkin_data, columns=['category'])
categories.drop_duplicates(subset ="category", keep = 'first', inplace = True)
category_length = len(categories)
categories_numpy = categories.to_numpy()


#Extracting all of the users and the pois
print("Step 3.5")
listofusers = pd.DataFrame(checkin_data, columns= ['user_id']).groupby('user_id').max().sample(frac=1)
listofpois = pd.DataFrame(checkin_data, columns= ['poi_id', 'latitude', 'longitude']).groupby('poi_id').max().sample(frac=1)
userarray = listofusers.index.to_numpy()
poiarray = listofpois.index.to_numpy()
userdataframe = pd.DataFrame(userarray, columns = ['Users'])
poidataframe = pd.DataFrame(poiarray, columns = ['Poi'])
dot = userdataframe.merge(poidataframe, how='cross')

print("Step 4")
rows_list = []
for i in range(len(dot)):
    temp = dot.loc[i, "Poi"]
    latitude = listofpois.loc[temp]['latitude']
    longitude = listofpois.loc[temp]['longitude']
    dict1 = {'latitude':latitude, 'longitude':longitude}
    rows_list.append(dict1)
    #latitude = poiarray[i]
latlong = pd.DataFrame(rows_list)

#Creating dataset
print("Step 5")
userdot = pd.DataFrame(dot, columns= ['Users'])
latlong['latitude'] = pd.to_numeric(latlong['latitude'])
latlong['longitude'] = pd.to_numeric(latlong['longitude'])
dataset = pd.concat([userdot, latlong], axis=1)

#Extracting ground_truth from incidence matrix
print("Step 6")
rows_list = []
category_list = []
for i in range(len(dot)):
    temp = df[dot.loc[i, "Poi"]][dot.loc[i, "Users"]]
    dict1 = {'ground_truth':float(temp)}
    rows_list.append(dict1)
    #Extract category from the list
    category = checkin_data_no_duplicates.loc[checkin_data_no_duplicates['poi_id'] == dot.loc[i, "Poi"]]
    cat = category['category']
    index = np.where(categories_numpy == [cat])[0][0]
    category_list.append(index)
#category_label = 
groundtruth = pd.DataFrame(rows_list)
#result = pd.concat([dot, groundtruth], axis=1)

print("Step 7")
categories = pd.DataFrame(category_list, columns=['category'])
datasetst = pd.concat([dataset, categories], axis=1)

print("Step 8")
dataset_numpy = datasetst.to_numpy()
labels_numpy = groundtruth.to_numpy()
categories_numpy = categories.to_numpy()
#x_train, x_test, y_train, y_test = train_test_split(dataset_numpy, labels_numpy, test_size=0.05, random_state=0)

print("Step 9")
x_train_df = pd.DataFrame(dataset_numpy, columns=['User','Latitude','Longitude', '0'])
#x_test_df = pd.DataFrame(x_test, columns=['User','Latitude','Longitude', '0'])
y_train_df = pd.DataFrame(labels_numpy)
#y_test_df = pd.DataFrame(y_test)

#Dataset with Users
dataset1_df = pd.DataFrame(x_train_df['User'])

index = 0
for user in dataset1_df.iterrows():
    user = user[0]
    value = dataset1_df._get_value(user, 'User')
    dataset1_df.xs(user)['User']=encoding_array.get(value)
    #encoding.at[index,'user_id']=encoding_array.get(user)
    index += 1
    
#Dataset with Poi's
dataset2_df = pd.DataFrame(x_train_df[['Latitude', 'Longitude']])

dataset3_df = pd.DataFrame(x_train_df[['0']])

print("Step 10")
dataset1 = tf.convert_to_tensor(
    dataset1_df, dtype=None, dtype_hint=None, name=None)
dataset2 = tf.convert_to_tensor(
    dataset2_df, dtype=None, dtype_hint=None, name=None)
dataset3 = tf.convert_to_tensor(
    dataset3_df, dtype='int64', dtype_hint=None, name=None)
labels = tf.convert_to_tensor(
    y_train_df, dtype=None, dtype_hint=None, name=None)


#Train_data, [dataset.user_id, dataset.poi_id]. Label: ground_truth
model.fit([dataset1, dataset2, dataset3], labels, epochs = 20, batch_size=2)

#model.save('/user/student.aau.dk/lharde18/Data/model_%s' % i, save_format='tf')

Step 1
Step 2
Step 3
Step 3.5
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9
Step 10
Epoch 1/20
  356/44820 [..............................] - ETA: 38:21 - loss: 0.19

KeyboardInterrupt: 

In [49]:
print(dataset1_df)

        User
0      105.0
1      105.0
2      105.0
3      105.0
4      105.0
...      ...
89635  127.0
89636  127.0
89637  127.0
89638  127.0
89639  127.0

[89640 rows x 1 columns]


In [50]:
#Dataset with Poi's
dataset2_df = pd.DataFrame(x_train_df[['Latitude', 'Longitude']])

dataset3_df = pd.DataFrame(x_train_df[['0']])

print("Step 10")
dataset1 = tf.convert_to_tensor(
    dataset1_df, dtype=None, dtype_hint=None, name=None)
dataset2 = tf.convert_to_tensor(
    dataset2_df, dtype=None, dtype_hint=None, name=None)
dataset3 = tf.convert_to_tensor(
    dataset3_df, dtype='int64', dtype_hint=None, name=None)
labels = tf.convert_to_tensor(
    y_train_df, dtype=None, dtype_hint=None, name=None)


#Train_data, [dataset.user_id, dataset.poi_id]. Label: ground_truth
model.fit([dataset1, dataset2, dataset3], labels, epochs = 20, batch_size=2)

#model.save('/user/student.aau.dk/lharde18/Data/model_%s' % i, save_format='tf')

Step 10
Epoch 1/20
 1047/44820 [..............................] - ETA: 38:38 - loss: 0.8372

KeyboardInterrupt: 

In [44]:
len(dataset3_df)

89640

In [32]:
#Partition 1
print("Partition 1")
checkin_data = checkins.merge(pois, on='poi_id')
df = users.set_index('user_id').poi_id.str.get_dummies(',')
df = df.groupby('user_id').max()

Partition 1


In [33]:
checkin_data_no_duplicates = checkin_data.copy()
checkin_data_no_duplicates.drop_duplicates(subset ="poi_id",
                     keep = 'first', inplace = True)
checkin_data_no_duplicates = pd.DataFrame(checkin_data_no_duplicates, columns = ['poi_id', 'category'])

In [34]:
print(checkin_data)

        user_id                    poi_id                       timestamp  \
0        233919  4c2f45da66e40f47aa8ec18b  Tue Apr 03 18:19:44 +0000 2012   
1        190585  4adcdb00f964a520775e21e3  Tue Apr 03 18:30:09 +0000 2012   
2         24779  4adcdb00f964a520775e21e3  Sun Apr 29 11:33:55 +0000 2012   
3         30835  4adcdb00f964a520775e21e3  Wed Jun 27 14:49:26 +0000 2012   
4          3884  4adcdb00f964a520775e21e3  Mon Nov 12 11:53:23 +0000 2012   
...         ...                       ...                             ...   
423991    91075  4c9dc0f2d3c2b60cc657c4bc  Mon Sep 16 18:38:55 +0000 2013   
423992   185228  4ec4c1df77c8d69510ee11ed  Mon Sep 16 18:59:30 +0000 2013   
423993   258948  4cf520ff6195721e82cf55c1  Mon Sep 16 20:26:11 +0000 2013   
423994   209871  4bcf2a839854d13a977ef54d  Mon Sep 16 21:03:21 +0000 2013   
423995   170216  4a27db7ff964a52024941fe3  Mon Sep 16 21:34:37 +0000 2013   

        timezone   latitude  longitude                 category country_cod

In [35]:
#Extract categorical data
print("Extract categorical data")
categories = pd.DataFrame(checkin_data, columns=['category'])
categories.drop_duplicates(subset ="category",
                           keep = 'first', inplace = True)
category_length = len(categories)
categories_numpy = categories.to_numpy()


Extract categorical data


In [36]:
print(categories)

                               category
0                        Ice Cream Shop
1               Scandinavian Restaurant
10                General Entertainment
12                       Home (private)
13                        Train Station
...                                 ...
354939  Southern / Soul Food Restaurant
395454                    Ski Chairlift
417546                       Ski Chalet
422185                    Frozen Yogurt
423150                   Hunting Supply

[415 rows x 1 columns]


In [37]:
#Extracting all of the users and the pois
print("Extracting all of the users and the pois")
listofusers = pd.DataFrame(checkin_data, columns= ['user_id']).groupby('user_id').max().sample(frac=1)
listofpois = pd.DataFrame(checkin_data, columns= ['poi_id', 'latitude', 'longitude']).groupby('poi_id').max().sample(frac=1)
userarray = listofusers.index.to_numpy()
poiarray = listofpois.index.to_numpy()
userdataframe = pd.DataFrame(userarray, columns = ['Users'])
poidataframe = pd.DataFrame(poiarray, columns = ['Poi'])
dot = userdataframe.merge(poidataframe, how='cross')

Extracting all of the users and the pois


MemoryError: Unable to allocate 8.16 GiB for an array with shape (1094737170,) and data type int64

In [None]:
#LatLong
print("LatLong")
rows_list = []
for i in range(len(dot)):
    temp = dot.loc[i, "Poi"]
    latitude = listofpois.loc[temp]['latitude']
    longitude = listofpois.loc[temp]['longitude']
    dict1 = {'latitude':latitude, 'longitude':longitude}
    rows_list.append(dict1)
    #latitude = poiarray[i]
latlong = pd.DataFrame(rows_list)

In [None]:
#Creating dataset
print("Creating dataset")
userdot = pd.DataFrame(dot, columns= ['Users'])
latlong['latitude'] = pd.to_numeric(latlong['latitude'])
latlong['longitude'] = pd.to_numeric(latlong['longitude'])
dataset = pd.concat([userdot, latlong], axis=1)

In [None]:
#Extracting ground_truth from incidence matrix
print("Extracting ground_truth from incidence matrix")
rows_list = []
category_list = []
for i in range(len(dot)):
    temp = df[dot.loc[i, "Poi"]][dot.loc[i, "Users"]]
    dict1 = {'ground_truth':float(temp)}
    rows_list.append(dict1)
    #Extract category from the list
    category = checkin_data_no_duplicates.loc[checkin_data_no_duplicates['poi_id'] == dot.loc[i, "Poi"]]
    cat = category['category']
    index = np.where(categories_numpy == [cat])[0][0]
    category_list.append(index)
#category_label = 
groundtruth = pd.DataFrame(rows_list)
#result = pd.concat([dot, groundtruth], axis=1)

In [None]:
categories = pd.DataFrame(category_list, columns=['category'])
datasetst = pd.concat([dataset, categories], axis=1)

dataset_numpy = datasetst.to_numpy()
labels_numpy = groundtruth.to_numpy()
categories_numpy = categories.to_numpy()
x_train, x_test, y_train, y_test = train_test_split(dataset_numpy, labels_numpy, test_size=0.05, random_state=0)

x_train_df = pd.DataFrame(x_train, columns=['User','Latitude','Longitude', '0'])
x_test_df = pd.DataFrame(x_test, columns=['User','Latitude','Longitude', '0'])
y_train_df = pd.DataFrame(y_train)
y_test_df = pd.DataFrame(y_test)

In [None]:
#Dataset with Users
dataset1_df = pd.DataFrame(x_train_df['User'])

#Dataset with Poi's
dataset2_df = pd.DataFrame(x_train_df[['Latitude', 'Longitude']])

dataset3_df = pd.DataFrame(x_train_df[['0']])

dataset1_df.to_csv(r'C:\Users\lasse\Desktop\RecommenderDL\datasets\dataset1_df_partition1.csv',sep=',', index=['User'])
dataset2_df.to_csv(r'C:\Users\lasse\Desktop\RecommenderDL\datasets\dataset2_df_partition1.csv',sep=',', index=['Latitude', 'Longitude'])
dataset3_df.to_csv(r'C:\Users\lasse\Desktop\RecommenderDL\datasets\dataset3_df_partition1.csv',sep=',', index=['0'])


In [None]:

#######################################
#                                     #
# ____________[:6500]________________ #
#                                     #
#######################################

userset_subset = userset[:6500]

print("Step 1")
checkin_data = userset_subset.merge(pois, on='poi_id')
df = checkin_data.set_index('user_id').poi_id.str.get_dummies(',')
df = df.groupby('user_id').max()

print("Step 2")
checkin_data_no_duplicates = checkin_data.copy()
checkin_data_no_duplicates.drop_duplicates(subset ="poi_id", keep = 'first', inplace = True)
checkin_data_no_duplicates = pd.DataFrame(checkin_data_no_duplicates, columns = ['poi_id', 'category'])

#Extract categorical data
print("Step 3")
categories = pd.DataFrame(checkin_data, columns=['category'])
categories.drop_duplicates(subset ="category", keep = 'first', inplace = True)
category_length = len(categories)
categories_numpy = categories.to_numpy()


#Extracting all of the users and the pois
print("Step 3.5")
listofusers = pd.DataFrame(checkin_data, columns= ['user_id']).groupby('user_id').max().sample(frac=1)
listofpois = pd.DataFrame(checkin_data, columns= ['poi_id', 'latitude', 'longitude']).groupby('poi_id').max().sample(frac=1)
userarray = listofusers.index.to_numpy()
poiarray = listofpois.index.to_numpy()
userdataframe = pd.DataFrame(userarray, columns = ['Users'])
poidataframe = pd.DataFrame(poiarray, columns = ['Poi'])
dot = userdataframe.merge(poidataframe, how='cross')

print("Step 4")
rows_list = []
for i in range(len(dot)):
    temp = dot.loc[i, "Poi"]
    latitude = listofpois.loc[temp]['latitude']
    longitude = listofpois.loc[temp]['longitude']
    dict1 = {'latitude':latitude, 'longitude':longitude}
    rows_list.append(dict1)
    #latitude = poiarray[i]
latlong = pd.DataFrame(rows_list)

#Creating dataset
print("Step 5")
userdot = pd.DataFrame(dot, columns= ['Users'])
latlong['latitude'] = pd.to_numeric(latlong['latitude'])
latlong['longitude'] = pd.to_numeric(latlong['longitude'])
dataset = pd.concat([userdot, latlong], axis=1)

#Extracting ground_truth from incidence matrix
print("Step 6")
rows_list = []
category_list = []
for i in range(len(dot)):
    temp = df[dot.loc[i, "Poi"]][dot.loc[i, "Users"]]
    dict1 = {'ground_truth':float(temp)}
    rows_list.append(dict1)
    #Extract category from the list
    category = checkin_data_no_duplicates.loc[checkin_data_no_duplicates['poi_id'] == dot.loc[i, "Poi"]]
    cat = category['category']
    index = np.where(categories_numpy == [cat])[0][0]
    category_list.append(index)
#category_label = 
groundtruth = pd.DataFrame(rows_list)
#result = pd.concat([dot, groundtruth], axis=1)

print("Step 7")
categories = pd.DataFrame(category_list, columns=['category'])
datasetst = pd.concat([dataset, categories], axis=1)

print("Step 8")
dataset_numpy = datasetst.to_numpy()
labels_numpy = groundtruth.to_numpy()
categories_numpy = categories.to_numpy()
#x_train, x_test, y_train, y_test = train_test_split(dataset_numpy, labels_numpy, test_size=0.05, random_state=0)

print("Step 9")
x_train_df = pd.DataFrame(dataset_numpy, columns=['User','Latitude','Longitude', '0'])
#x_test_df = pd.DataFrame(x_test, columns=['User','Latitude','Longitude', '0'])
y_train_df = pd.DataFrame(labels_numpy)
#y_test_df = pd.DataFrame(y_test)

#Dataset with Users
dataset1_df = pd.DataFrame(x_train_df['User'])

#Dataset with Poi's
dataset2_df = pd.DataFrame(x_train_df[['Latitude', 'Longitude']])

dataset3_df = pd.DataFrame(x_train_df[['0']])

print("Step 10")
dataset1 = tf.convert_to_tensor(
    dataset1_df, dtype=None, dtype_hint=None, name=None)
dataset2 = tf.convert_to_tensor(
    dataset2_df, dtype=None, dtype_hint=None, name=None)
dataset3 = tf.convert_to_tensor(
    dataset3_df, dtype='int64', dtype_hint=None, name=None)
labels = tf.convert_to_tensor(
    y_train_df, dtype=None, dtype_hint=None, name=None)


#Train_data, [dataset.user_id, dataset.poi_id]. Label: ground_truth
model.fit([dataset1, dataset2, dataset3], labels, epochs = 20, batch_size=2)

#######################################
#                                     #
# ____________[9500:]________________ #
#                                     #
#######################################

userset_subset = userset[6500:]

print("Step 1")
checkin_data = userset_subset.merge(pois, on='poi_id')
df = checkin_data.set_index('user_id').poi_id.str.get_dummies(',')
df = df.groupby('user_id').max()

print("Step 2")
checkin_data_no_duplicates = checkin_data.copy()
checkin_data_no_duplicates.drop_duplicates(subset ="poi_id", keep = 'first', inplace = True)
checkin_data_no_duplicates = pd.DataFrame(checkin_data_no_duplicates, columns = ['poi_id', 'category'])

#Extract categorical data
print("Step 3")
categories = pd.DataFrame(checkin_data, columns=['category'])
categories.drop_duplicates(subset ="category", keep = 'first', inplace = True)
category_length = len(categories)
categories_numpy = categories.to_numpy()


#Extracting all of the users and the pois
print("Step 3.5")
listofusers = pd.DataFrame(checkin_data, columns= ['user_id']).groupby('user_id').max().sample(frac=1)
listofpois = pd.DataFrame(checkin_data, columns= ['poi_id', 'latitude', 'longitude']).groupby('poi_id').max().sample(frac=1)
userarray = listofusers.index.to_numpy()
poiarray = listofpois.index.to_numpy()
userdataframe = pd.DataFrame(userarray, columns = ['Users'])
poidataframe = pd.DataFrame(poiarray, columns = ['Poi'])
dot = userdataframe.merge(poidataframe, how='cross')

print("Step 4")
rows_list = []
for i in range(len(dot)):
    temp = dot.loc[i, "Poi"]
    latitude = listofpois.loc[temp]['latitude']
    longitude = listofpois.loc[temp]['longitude']
    dict1 = {'latitude':latitude, 'longitude':longitude}
    rows_list.append(dict1)
    #latitude = poiarray[i]
latlong = pd.DataFrame(rows_list)

#Creating dataset
print("Step 5")
userdot = pd.DataFrame(dot, columns= ['Users'])
latlong['latitude'] = pd.to_numeric(latlong['latitude'])
latlong['longitude'] = pd.to_numeric(latlong['longitude'])
dataset = pd.concat([userdot, latlong], axis=1)

#Extracting ground_truth from incidence matrix
print("Step 6")
rows_list = []
category_list = []
for i in range(len(dot)):
    temp = df[dot.loc[i, "Poi"]][dot.loc[i, "Users"]]
    dict1 = {'ground_truth':float(temp)}
    rows_list.append(dict1)
    #Extract category from the list
    category = checkin_data_no_duplicates.loc[checkin_data_no_duplicates['poi_id'] == dot.loc[i, "Poi"]]
    cat = category['category']
    index = np.where(categories_numpy == [cat])[0][0]
    category_list.append(index)
#category_label = 
groundtruth = pd.DataFrame(rows_list)
#result = pd.concat([dot, groundtruth], axis=1)

print("Step 7")
categories = pd.DataFrame(category_list, columns=['category'])
datasetst = pd.concat([dataset, categories], axis=1)

print("Step 8")
dataset_numpy = datasetst.to_numpy()
labels_numpy = groundtruth.to_numpy()
categories_numpy = categories.to_numpy()
#x_train, x_test, y_train, y_test = train_test_split(dataset_numpy, labels_numpy, test_size=0.05, random_state=0)

print("Step 9")
x_train_df = pd.DataFrame(dataset_numpy, columns=['User','Latitude','Longitude', '0'])
#x_test_df = pd.DataFrame(x_test, columns=['User','Latitude','Longitude', '0'])
y_train_df = pd.DataFrame(labels_numpy)
#y_test_df = pd.DataFrame(y_test)

#Dataset with Users
dataset1_df = pd.DataFrame(x_train_df['User'])

#Dataset with Poi's
dataset2_df = pd.DataFrame(x_train_df[['Latitude', 'Longitude']])

dataset3_df = pd.DataFrame(x_train_df[['0']])

print("Step 10")
dataset1 = tf.convert_to_tensor(
    dataset1_df, dtype=None, dtype_hint=None, name=None)
dataset2 = tf.convert_to_tensor(
    dataset2_df, dtype=None, dtype_hint=None, name=None)
dataset3 = tf.convert_to_tensor(
    dataset3_df, dtype='int64', dtype_hint=None, name=None)
labels = tf.convert_to_tensor(
    y_train_df, dtype=None, dtype_hint=None, name=None)


#Train_data, [dataset.user_id, dataset.poi_id]. Label: ground_truth
model.fit([dataset1, dataset2, dataset3], labels, epochs = 20, batch_size=2)

#######################################
#                                     #
# ____________Categories_____________ #
#                                     #
#######################################

userset_subset = categoryset

print("Step 1")
checkin_data = userset_subset.merge(pois, on='poi_id')
df = checkin_data.set_index('user_id').poi_id.str.get_dummies(',')
df = df.groupby('user_id').max()

print("Step 2")
checkin_data_no_duplicates = checkin_data.copy()
checkin_data_no_duplicates.drop_duplicates(subset ="poi_id", keep = 'first', inplace = True)
checkin_data_no_duplicates = pd.DataFrame(checkin_data_no_duplicates, columns = ['poi_id', 'category'])

#Extract categorical data
print("Step 3")
categories = pd.DataFrame(checkin_data, columns=['category'])
categories.drop_duplicates(subset ="category", keep = 'first', inplace = True)
category_length = len(categories)
categories_numpy = categories.to_numpy()


#Extracting all of the users and the pois
print("Step 3.5")
listofusers = pd.DataFrame(checkin_data, columns= ['user_id']).groupby('user_id').max().sample(frac=1)
listofpois = pd.DataFrame(checkin_data, columns= ['poi_id', 'latitude', 'longitude']).groupby('poi_id').max().sample(frac=1)
userarray = listofusers.index.to_numpy()
poiarray = listofpois.index.to_numpy()
userdataframe = pd.DataFrame(userarray, columns = ['Users'])
poidataframe = pd.DataFrame(poiarray, columns = ['Poi'])
dot = userdataframe.merge(poidataframe, how='cross')

print("Step 4")
rows_list = []
for i in range(len(dot)):
    temp = dot.loc[i, "Poi"]
    latitude = listofpois.loc[temp]['latitude']
    longitude = listofpois.loc[temp]['longitude']
    dict1 = {'latitude':latitude, 'longitude':longitude}
    rows_list.append(dict1)
    #latitude = poiarray[i]
latlong = pd.DataFrame(rows_list)

#Creating dataset
print("Step 5")
userdot = pd.DataFrame(dot, columns= ['Users'])
latlong['latitude'] = pd.to_numeric(latlong['latitude'])
latlong['longitude'] = pd.to_numeric(latlong['longitude'])
dataset = pd.concat([userdot, latlong], axis=1)

#Extracting ground_truth from incidence matrix
print("Step 6")
rows_list = []
category_list = []
for i in range(len(dot)):
    temp = df[dot.loc[i, "Poi"]][dot.loc[i, "Users"]]
    dict1 = {'ground_truth':float(temp)}
    rows_list.append(dict1)
    #Extract category from the list
    category = checkin_data_no_duplicates.loc[checkin_data_no_duplicates['poi_id'] == dot.loc[i, "Poi"]]
    cat = category['category']
    index = np.where(categories_numpy == [cat])[0][0]
    category_list.append(index)
#category_label = 
groundtruth = pd.DataFrame(rows_list)
#result = pd.concat([dot, groundtruth], axis=1)

print("Step 7")
categories = pd.DataFrame(category_list, columns=['category'])
datasetst = pd.concat([dataset, categories], axis=1)

print("Step 8")
dataset_numpy = datasetst.to_numpy()
labels_numpy = groundtruth.to_numpy()
categories_numpy = categories.to_numpy()
#x_train, x_test, y_train, y_test = train_test_split(dataset_numpy, labels_numpy, test_size=0.05, random_state=0)

print("Step 9")
x_train_df = pd.DataFrame(dataset_numpy, columns=['User','Latitude','Longitude', '0'])
#x_test_df = pd.DataFrame(x_test, columns=['User','Latitude','Longitude', '0'])
y_train_df = pd.DataFrame(labels_numpy)
#y_test_df = pd.DataFrame(y_test)

#Dataset with Users
dataset1_df = pd.DataFrame(x_train_df['User'])

#Dataset with Poi's
dataset2_df = pd.DataFrame(x_train_df[['Latitude', 'Longitude']])

dataset3_df = pd.DataFrame(x_train_df[['0']])

print("Step 10")
dataset1 = tf.convert_to_tensor(
    dataset1_df, dtype=None, dtype_hint=None, name=None)
dataset2 = tf.convert_to_tensor(
    dataset2_df, dtype=None, dtype_hint=None, name=None)
dataset3 = tf.convert_to_tensor(
    dataset3_df, dtype='int64', dtype_hint=None, name=None)
labels = tf.convert_to_tensor(
    y_train_df, dtype=None, dtype_hint=None, name=None)


#Train_data, [dataset.user_id, dataset.poi_id]. Label: ground_truth
model.fit([dataset1, dataset2, dataset3], labels, epochs = 20, batch_size=2)

model.save('/user/student.aau.dk/lharde18/Data/model_%s' % i, save_format='tf')