# Gowalla Dataset

In [31]:
# Builds a sample file name to check the '%d_mu%.1e_iter%d.npz' format pattern.
filename = 'sdfsdf' + '%d_mu%.1e_iter%d.npz' % (100, 0.01, 5)
# Call form with a single argument prints identically on Python 2 and Python 3.
print(filename)


sdfsdf100_mu1.0e-02_iter5.npz


In [30]:
import numpy

# Draw one scalar Beta sample, then re-seed and draw a whole array with the same
# parameters, so the scalar can be compared with the array's first element.
numpy.random.seed(12345)

a = [[2,2],[3,4]]
b = a  # same list object: both shape parameters are identical element-wise
y = numpy.random.beta(a[0][0], b[0][0])
# Re-seed so the array draw restarts from the same generator state.
numpy.random.seed(12345)
x = numpy.random.beta(a, b)
# '%s %s' formatting emits the same text as the Python 2 statement `print x,y`
# while also being valid on Python 3.
print('%s %s' % (x, y))

[[ 0.37456294  0.59353422]
 [ 0.52974641  0.19755835]] 0.374562938619


The [Gowalla dataset](https://snap.stanford.edu/data/loc-gowalla.html) contains user-venue checkins. This script pre-processes the full dataset and splits it into non-overlapping training, validation, and test sets. The data is used in the paper ["Modeling User Exposure in Recommendation"](http://arxiv.org/abs/1510.07025).

In [7]:
import json
import os

import numpy as np
import pandas as pd

Change this to wherever you keep the [processed data](http://dawenl.github.io/data/gowalla_pro.zip)

In [8]:
# Directory that holds the input 'checkins.tsv'; the cells below also write their outputs here.
DATA_DIR = '../../data/Yelp/100_feature_restaurant_only_location_only_Numeric_Id_small'

In [9]:
# Load the tab-separated, header-less check-in file and label its three columns.
checkins_path = os.path.join(DATA_DIR, 'checkins.tsv')
df = pd.read_csv(checkins_path, sep='\t', header=None, names=['uid', 'sid', 'rating'])

In [10]:
# Show only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,uid,sid,rating
0,5,1000468,1
1,15,1001500,1
2,17,1000822,1
3,26,1004853,1
4,26,1003800,1
5,26,1004317,1
6,26,1002272,1
7,30,1002674,1
8,33,1000971,1
9,33,1004659,1


In [11]:
def get_count(df, id):
    """Count the rows of `df` that share each distinct value of column `id`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by `id`.
    id : str
        Name of the column to group by.  (The name shadows the builtin but is
        kept so existing keyword callers keep working.)

    Returns
    -------
    pandas.Series
        Row counts, indexed by the distinct values of column `id`.
    """
    # Group on the key column with the default as_index=True: combined with
    # as_index=False, newer pandas makes .size() return a DataFrame with a
    # fresh integer index, which breaks callers that index the result by id.
    return df.groupby(id).size()

def filter_triplets(df, min_sc=20):
    """Drop rows whose 'sid' occurs in fewer than `min_sc` rows, then recount.

    Returns a tuple `(filtered_df, usercount, songcount)`, where the two counts
    are the per-'uid' and per-'sid' results of `get_count` on the filtered frame.
    """
    # Items whose row count reaches the threshold.
    per_item = get_count(df, 'sid')
    frequent_sids = per_item.index[per_item >= min_sc]
    kept = df[df['sid'].isin(frequent_sids)]

    # Both counts are recomputed on the filtered frame, since dropping rows
    # changes them.
    usercount = get_count(kept, 'uid')
    songcount = get_count(kept, 'sid')
    return kept, usercount, songcount

In [12]:
# Uses the default min_sc=20; `df` is rebound to the filtered frame from here on.
df, usercount, songcount = filter_triplets(df)

In [13]:
# Number of rows divided by the number of possible (user, venue) cells.
sparsity_level = float(df.shape[0]) / (usercount.shape[0] * songcount.shape[0])
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("After filtering, there are %d triplets from %d users and %d venues (sparsity level %.3f%%)" % (
    df.shape[0], usercount.shape[0], songcount.shape[0], sparsity_level * 100))

After filtering, there are 9876 triplets from 6262 users and 230 venues (sparsity level 0.686%)


In [14]:
# Distinct raw ids of each kind, in ascending order.
unique_uid = sorted(df['uid'].unique())
unique_sid = sorted(df['sid'].unique())

In [15]:
# Map each raw id to its position in the sorted id list.  `.tolist()` unwraps numpy
# scalars into plain Python values: json.dump rejects numpy integer keys on Python 3,
# while lookups with numpy ids still hit the same entries (equal value, equal hash).
uid2idx = dict((uid, idx) for (idx, uid) in enumerate(np.asarray(unique_uid).tolist()))
sid2idx = dict((sid, idx) for (idx, sid) in enumerate(np.asarray(unique_sid).tolist()))

In [16]:
# Persist the raw-sid -> index mapping as JSON inside DATA_DIR.
sid_map_path = os.path.join(DATA_DIR, 'sid2idx.json')
with open(sid_map_path, 'w') as out:
    json.dump(sid2idx, out)

In [17]:
# Persist the raw-uid -> index mapping as JSON inside DATA_DIR.
uid_map_path = os.path.join(DATA_DIR, 'uid2idx.json')
with open(uid_map_path, 'w') as out:
    json.dump(uid2idx, out)

In [18]:
# One raw user id per line, in the same order as unique_uid.
uid_list_path = os.path.join(DATA_DIR, 'unique_uid.txt')
with open(uid_list_path, 'w') as out:
    out.writelines('%s\n' % raw_uid for raw_uid in unique_uid)

In [19]:
# One raw item id per line, in the same order as unique_sid.
sid_list_path = os.path.join(DATA_DIR, 'unique_sid.txt')
with open(sid_list_path, 'w') as out:
    out.writelines('%s\n' % raw_sid for raw_sid in unique_sid)

## Generate train/test/vad sets

Pick out 20% of the checkins for heldout test

In [20]:
# Seed the global generator so the same rows are drawn on every run.
np.random.seed(12345)
n_ratings = len(df)
# Draw 20% of the row positions, without repeats, for the held-out test set.
test = np.random.choice(n_ratings, size=int(n_ratings * 0.20), replace=False)

In [21]:
# Boolean mask over row positions: True marks a row drawn for the test set.
test_idx = np.zeros(n_ratings, dtype=bool)
test_idx[test] = True

# Masked rows form the test frame; the complement is the initial training frame.
test_df, train_df = df[test_idx], df[~test_idx]

Make sure there is no empty row/column in the training data

In [22]:
# Compare the number of distinct users in the training split with the full data.
n_train_users = len(pd.unique(train_df['uid']))
n_all_users = len(pd.unique(df['uid']))
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("There are total of %d unique users in the training set and %d unique users in the entire dataset" %
      (n_train_users, n_all_users))

There are total of 5317 unique users in the training set and 6262 unique users in the entire dataset


In [23]:
# Compare the number of distinct items in the training split with the full data.
n_train_items = len(pd.unique(train_df['sid']))
n_all_items = len(pd.unique(df['sid']))
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("There are total of %d unique items in the training set and %d unique items in the entire dataset" %
      (n_train_items, n_all_items))

There are total of 230 unique items in the training set and 230 unique items in the entire dataset


We can see that some of the users do not have any checkins in the training set, so we move the checkins of those users from the test set into the training set

In [24]:
# Set of user ids present in the training frame, for fast membership tests.
train_uid = set(train_df['uid'].unique())

In [25]:
# Users that appear in the full data but have no row in the training frame.
# (The enumerate index of the original loop was never used, so it is dropped.)
left_uid = [uid for uid in pd.unique(df['uid']) if uid not in train_uid]

In [26]:
# Boolean Series over test_df rows: True where the row's user is listed in left_uid.
move_idx = test_df['uid'].isin(left_uid)

In [27]:
# Move the flagged rows from the test frame into the training frame.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
train_df = pd.concat([train_df, test_df[move_idx]])
test_df = test_df[~move_idx]

In [28]:
# Sanity check: after the move, the two user counts should match.
n_train_users = len(pd.unique(train_df['uid']))
n_all_users = len(pd.unique(df['uid']))
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("There are total of %d unique users in the training set and %d unique users in the entire dataset" %
      (n_train_users, n_all_users))

There are total of 6262 unique users in the training set and 6262 unique users in the entire dataset


Pick out 10% of the training ratings as the validation set

In [29]:
# Seed the global generator so the same rows are drawn on every run.
np.random.seed(13579)
n_ratings = len(train_df)
# Draw 10% of the training row positions, without repeats, for validation.
vad = np.random.choice(n_ratings, size=int(n_ratings * 0.10), replace=False)

In [30]:
# Boolean mask over training row positions: True marks a validation row.
vad_idx = np.zeros(n_ratings, dtype=bool)
vad_idx[vad] = True

# Masked rows form the validation frame; the complement stays as training data.
vad_df, train_df = train_df[vad_idx], train_df[~vad_idx]

Again make sure there is no empty row/column in the training data

In [31]:
# Compare the number of distinct users in the training split with the full data.
n_train_users = len(pd.unique(train_df['uid']))
n_all_users = len(pd.unique(df['uid']))
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("There are total of %d unique users in the training set and %d unique users in the entire dataset" %
      (n_train_users, n_all_users))

There are total of 5741 unique users in the training set and 6262 unique users in the entire dataset


In [32]:
# Compare the number of distinct items in the training split with the full data.
n_train_items = len(pd.unique(train_df['sid']))
n_all_items = len(pd.unique(df['sid']))
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("There are total of %d unique items in the training set and %d unique items in the entire dataset" %
      (n_train_items, n_all_items))

There are total of 230 unique items in the training set and 230 unique items in the entire dataset


We can see that some of the users do not have any checkins in the training set, so we move the checkins of those users from the validation set into the training set

In [33]:
# Set of user ids present in the training frame, for fast membership tests.
train_uid = set(train_df['uid'].unique())

In [34]:
# Users that appear in the full data but have no row in the training frame.
# (The enumerate index of the original loop was never used, so it is dropped.)
left_uid = [uid for uid in pd.unique(df['uid']) if uid not in train_uid]

In [35]:
# Boolean Series over vad_df rows: True where the row's user is listed in left_uid.
move_idx = vad_df['uid'].isin(left_uid)

In [36]:
# Move the flagged rows from the validation frame into the training frame.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
train_df = pd.concat([train_df, vad_df[move_idx]])
vad_df = vad_df[~move_idx]

In [37]:
# '%s %s' formatting emits the same text as the Python 2 statement
# `print train_df.shape, vad_df.shape` and is also valid on Python 3.
print('%s %s' % (train_df.shape, vad_df.shape))

(8520, 3) (359, 3)


In [38]:
# Sanity check: after the move, the two user counts should match.
n_train_users = len(pd.unique(train_df['uid']))
n_all_users = len(pd.unique(df['uid']))
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("There are total of %d unique users in the training set and %d unique users in the entire dataset" %
      (n_train_users, n_all_users))

There are total of 6262 unique users in the training set and 6262 unique users in the entire dataset


## Numerize the data into (user_index, item_index, count) format

In [39]:
# Translate raw ids into their integer indices.  A list comprehension yields a real
# list on both Python 2 and 3 (on Python 3 `map` returns a one-shot iterator), and
# an id missing from the mapping still raises KeyError.
uid = [uid2idx[raw] for raw in train_df['uid']]
sid = [sid2idx[raw] for raw in train_df['sid']]

In [40]:
# Replace the raw id columns with their integer indices (positional assignment).
train_df = train_df.assign(uid=uid, sid=sid)

In [41]:
# Write the training frame to DATA_DIR; index=False leaves the row labels out of the file.
train_df.to_csv(os.path.join(DATA_DIR, 'train.num.csv'), index=False)

In [42]:
# Translate raw ids into their integer indices.  A list comprehension yields a real
# list on both Python 2 and 3 (on Python 3 `map` returns a one-shot iterator), and
# an id missing from the mapping still raises KeyError.
uid = [uid2idx[raw] for raw in test_df['uid']]
sid = [sid2idx[raw] for raw in test_df['sid']]

In [43]:
# test_df was produced by boolean indexing, so writing columns into it in place
# raises pandas' SettingWithCopyWarning.  `assign` returns a new frame with the
# same column order and the index values filled in positionally.
test_df = test_df.assign(uid=uid, sid=sid)

In [44]:
# Write the test frame to DATA_DIR; index=False leaves the row labels out of the file.
test_df.to_csv(os.path.join(DATA_DIR, 'test.num.csv'), index=False)

In [45]:
# Translate raw ids into their integer indices.  A list comprehension yields a real
# list on both Python 2 and 3 (on Python 3 `map` returns a one-shot iterator), and
# an id missing from the mapping still raises KeyError.
uid = [uid2idx[raw] for raw in vad_df['uid']]
sid = [sid2idx[raw] for raw in vad_df['sid']]

In [46]:
# vad_df was produced by boolean indexing, so writing columns into it in place
# raises pandas' SettingWithCopyWarning.  `assign` returns a new frame with the
# same column order and the index values filled in positionally.
vad_df = vad_df.assign(uid=uid, sid=sid)

In [47]:
# Write the validation frame to DATA_DIR; index=False leaves the row labels out of the file.
vad_df.to_csv(os.path.join(DATA_DIR, 'vad.num.csv'), index=False)