#### Downloading Dataset

In [1]:
!mkdir dataset
!curl https://files.grouplens.org/datasets/hetrec2011/hetrec2011-delicious-2k.zip --output dataset/delicious.zip
!unzip -o dataset/delicious.zip -d dataset
!rm -rf dataset/delicious.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13.2M  100 13.2M    0     0  25.0M      0 --:--:-- --:--:-- --:--:-- 25.0M
Archive:  dataset/delicious.zip
  inflating: dataset/bookmark_tags.dat  
  inflating: dataset/bookmarks.dat   
  inflating: dataset/readme.txt      
  inflating: dataset/tags.dat        
  inflating: dataset/user_contacts.dat  
  inflating: dataset/user_contacts-timestamps.dat  
  inflating: dataset/user_taggedbookmarks.dat  
  inflating: dataset/user_taggedbookmarks-timestamps.dat  


#### Parsing data

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pickle

In [3]:
# User contacts (user -> user links); the raw timestamp column holds epoch milliseconds
user_contacts = (
    pd.read_csv('dataset/user_contacts-timestamps.dat', sep='\t')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms'))
)
user_contacts.sample(5)

Unnamed: 0,userID,contactID,timestamp
1879,14206,88153,2006-08-17 10:40:35
11470,80334,97975,2010-07-22 19:07:15
957,7948,74967,2009-05-16 14:17:45
7925,53844,95213,2006-09-22 06:37:13
1602,12299,101198,2009-07-19 19:17:53


In [4]:
# Bookmarks, bookmark -> tag weights, and tag names
# Some of these files are not encoded in UTF-8 (the pandas default). The text-bearing files
# are therefore decoded as ISO-8859-1.
# NOTE(review): the files look like Windows "ANSI" text - confirm the exact code page if
# non-ASCII titles or tag names matter downstream.
bookmarks = pd.read_csv(
    'dataset/bookmarks.dat',
    sep='\t',
    index_col='id',
    encoding="ISO-8859-1",
)
bookmark_tags = pd.read_csv(
    'dataset/bookmark_tags.dat',
    sep='\t',
)
tags = pd.read_csv(
    'dataset/tags.dat',
    sep='\t',
    index_col='id',
    encoding="ISO-8859-1",
)

In [5]:
bookmarks.sample(5)

Unnamed: 0_level_0,md5,title,url,md5Principal,urlPrincipal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
72965,b13439d1e1d60b3235bdfaf1fd2dfc1f,Understanding Bushonomics: How We Got Into Thi...,http://www.americanprogress.org/issues/2008/08...,e9da54a86fe33ac316e5982fbf61bd54,www.americanprogress.org
72580,70bfc03f155a4b3ba19914b634ddb999,Abstract | Does the mind map learning strategy...,http://www.biomedcentral.com/1472-6920/10/61,f3d298e491e3ec4c723bf0e6afef5824,www.biomedcentral.com
82174,7f080fb69ab5829995d9cbbafb885203,Becta research - Research - Becta research - T...,http://research.becta.org.uk/index.php?section...,2b2175fd72851f5831386880520c97bd,research.becta.org.uk
46177,904877cf44fc3347838b5bb9948b9689,Copyrights in the Digital Age - FREE Presentat...,http://themes.pppst.com/copyrights.html,c26aa475f520e41b8abc3ed256bbe091,themes.pppst.com
50317,c110bd913d55cb390c0c6a07f2a5b326,Museum of London Releases Augmented Reality Ap...,http://www.petapixel.com/2010/05/24/museum-of-...,bd456a9e53edd7a431373b54b404f8ea,www.petapixel.com


In [6]:
bookmark_tags.sample(5)

Unnamed: 0,bookmarkID,tagID,tagWeight
220521,45507,122,9
134731,26680,434,75
125475,24936,1536,6
200261,40908,793,4
371494,80872,174,615


In [7]:
tags.sample(5)

Unnamed: 0_level_0,value
id,Unnamed: 1_level_1
18319,deutsche_bahn
52945,field_trips
32018,aardvark
56977,tyrant
39303,relationality


#### Loading user tag assignments

In [69]:
# User tag assignments (user, bookmark, tag); the raw timestamp column holds epoch milliseconds
user_taggedbookmarks = (
    pd.read_csv('dataset/user_taggedbookmarks-timestamps.dat', sep='\t')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms'))
)
user_taggedbookmarks.sample(5)

Unnamed: 0,userID,bookmarkID,tagID,timestamp
424597,103491,26077,153,2010-06-25 03:14:36
87336,11853,4850,505,2010-10-17 05:28:01
336343,76601,102970,859,2010-05-27 00:57:19
174153,30103,53322,544,2008-01-08 07:44:57
307596,68360,94330,68,2010-03-11 21:50:24


#### Data Analysis and Preprocessing

We have:  
- Users : userID  
- Items : tagID
- Relations : User - User

In [70]:
print(f"No of users : {len(user_taggedbookmarks['userID'].unique())}")
print(f"No of items : {len(user_taggedbookmarks['tagID'].unique())}")
print(f"No of events : {len(user_taggedbookmarks)}")
print(f"No of user links : {len(user_contacts)}")

No of users : 1867
No of items : 40897
No of events : 437593
No of user links : 15328


In [71]:
# Removing bookmarks
data = user_taggedbookmarks.drop("bookmarkID", axis=1)

**Rules for preprocessing dataset:**

1. Dataset format : [index, user, item, timestamp]
2. Sort by users and timestamps
3. Remove users or items with fewer than 10 occurrences
4. Parse dataset into user -> array(sessions) format and session -> array((timestamp, item))
5. Interactions within 3600 seconds are in same session
6. Remove consecutive item repetitions in the same session
7. Remove sessions that have 1 item only or too many items (here > 40)
8. Split sessions longer than 20 items into chunks of at most 20 items, dropping a trailing chunk that has only 1 item
9. Remove users with less data (ex. having < 3 sessions)
10. Map user and item values to sequential labels for further usage

In [72]:
# Parameters for preprocessing
# Consecutive events of one user that are less than this far apart (1 hour) share a session.
SESSION_TIME = timedelta(seconds=60*60)
# Sessions are chunked to at most this many events; also used as the padded session length.
MAX_SESSION_LENGTH = 20
# Users left with fewer sessions than this after cleaning are dropped.
MIN_REQUIRED_SESSIONS = 3
# Minimum number of rows a tag needs to be kept (the same threshold is reused for users).
MIN_ITEM_SUPPORT = 10

In [73]:
# Remove values with insufficient data
# Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
# NOTE: this cell rebinds `data`; re-running it on its own filters an already-filtered
# frame, so re-run from the cell that creates `data` to reproduce the numbers below.

# Items: keep tags that occur in at least MIN_ITEM_SUPPORT rows
tag_support = data.groupby("tagID").size()
data = data[data["tagID"].isin(tag_support[tag_support >= MIN_ITEM_SUPPORT].index)]

# Users: counted on the tag-filtered frame, with the same threshold as for items
user_support = data.groupby("userID").size()
data = data[data["userID"].isin(user_support[user_support >= MIN_ITEM_SUPPORT].index)]

In [74]:
print(f"No of users : {len(data['userID'].unique())}")
print(f"No of items : {len(data['tagID'].unique())}")

No of users : 1743
No of items : 5064


In [75]:
# Sort by users and timestamps
data = data.sort_values(by=["userID", "timestamp"])
data.head(10)

Unnamed: 0,userID,tagID,timestamp
151,8,112,2010-10-28 18:50:28
150,8,111,2010-10-29 16:15:24
149,8,2,2010-10-29 16:52:46
146,8,24,2010-10-29 16:53:56
147,8,25,2010-10-29 16:53:56
148,8,66,2010-10-29 16:53:56
145,8,24,2010-10-29 17:07:15
144,8,24,2010-10-29 17:09:03
143,8,24,2010-10-29 17:09:59
141,8,82,2010-10-29 17:28:55


In [76]:
# session -> list of events in format (timestamp, tag)
def collapse_session(session):
  """Drop consecutive repeats of the same tag within one session.

  Args:
    session: iterable of (timestamp, tag) events in chronological order.

  Returns:
    A new list in which every run of events sharing the same tag is reduced
    to the first event of that run. An empty session yields an empty list
    (previously this raised IndexError).
  """
  new_session = []
  for event in session:
    # Compare against the last event that was KEPT, not the previous raw event.
    if not new_session or event[1] != new_session[-1][1]:
      new_session.append(event)
  return new_session

# user_sessions -> dict(user: list of sessions), each session a list of (timestamp, tag)
def collapse_repeating_session(user_sessions):
  """Run collapse_session over every session of every user, in place.

  The per-user lists inside `user_sessions` are modified element by element;
  nothing is returned.
  """
  for session_list in user_sessions.values():
    for position, session in enumerate(session_list):
      session_list[position] = collapse_session(session)

# Remove sessions that are too short (a single event) or too long
def remove_invalid_sessions(user_sessions, min_len=2, max_len=None):
  """Keep only sessions whose length lies in [min_len, max_len].

  Args:
    user_sessions: dict mapping user -> list of sessions (each a list of events).
    min_len: smallest session length that is kept (default 2, i.e. drop
      single-event sessions).
    max_len: largest session length that is kept. Defaults to
      MAX_SESSION_LENGTH * 2, resolved at call time.

  Returns:
    A new dict with the same users; a user whose sessions are all filtered out
    stays in the dict with an empty list. The input dict is not modified.
  """
  if max_len is None:
    max_len = MAX_SESSION_LENGTH * 2
  return {
      user: [session for session in sessions if min_len <= len(session) <= max_len]
      for user, sessions in user_sessions.items()
  }


# session -> list of chunks, each a list of events in format (timestamp, tag)
def split_session(session, max_len=None):
  """Cut one session into consecutive chunks of at most `max_len` events.

  Args:
    session: list of events.
    max_len: maximum chunk size. Defaults to MAX_SESSION_LENGTH, resolved at
      call time.

  Returns:
    List of chunks in original order. A trailing chunk with fewer than 2 events
    is dropped. An empty session yields an empty list (previously this raised
    IndexError).
  """
  if max_len is None:
    max_len = MAX_SESSION_LENGTH
  splits = [session[i:i + max_len] for i in range(0, len(session), max_len)]
  # Only the last chunk can be short; drop it when it has fewer than 2 events.
  if splits and len(splits[-1]) < 2:
    return splits[:-1]
  return splits

# user_sessions -> same dict, every session replaced by the chunks split_session returns for it
def split_long_sessions(user_sessions):
  """Replace each user's sessions by their chunked version, in place.

  Each user's entry is rebound to a new flat list holding the chunks of all of
  that user's sessions, in order. Nothing is returned.
  """
  for user in user_sessions:
    user_sessions[user] = [
        chunk
        for session in user_sessions[user]
        for chunk in split_session(session)
    ]

# dataset -> session dataset (columns : [index, user, item, timestamp])
# Assumes dataset is sorted by user and timestamp
def split_dataset_to_sessions(dataset):
  """Turn a flat event table into per-user lists of cleaned sessions.

  Args:
    dataset: DataFrame whose rows unpack (after the index) into
      (userID, tagID, timestamp), sorted by user and then by timestamp.

  Returns:
    dict mapping userID -> list of sessions, each session being a list of
    (timestamp, tagID) events. Sessions are collapsed, filtered and chunked by
    the helper functions, and users with fewer than MIN_REQUIRED_SESSIONS
    remaining sessions are left out.
  """
  user_sessions = {}
  open_session = None
  for _, userID, tagID, timestamp in dataset.itertuples():
    event = (timestamp, tagID)
    # An event extends the open session only if it belongs to an already-seen user
    # and follows the previous event by less than SESSION_TIME.
    if userID in user_sessions and timestamp - open_session[-1][0] < SESSION_TIME:
      open_session.append(event)
    else:
      # First event of a new user, or a gap of SESSION_TIME or more: start a new session.
      open_session = [event]
      user_sessions.setdefault(userID, []).append(open_session)

  collapse_repeating_session(user_sessions)
  user_sessions = remove_invalid_sessions(user_sessions)
  split_long_sessions(user_sessions)

  # Keep only users with enough sessions left after cleaning
  return {
      user: sessions
      for user, sessions in user_sessions.items()
      if len(sessions) >= MIN_REQUIRED_SESSIONS
  }

In [77]:
# Convert dataset to sessions
sessions = split_dataset_to_sessions(data)

In [78]:
# Calculate statistics from session data

users = sessions.keys()

# One entry per session: its number of events
interactions_per_session = [
    len(session) for user_sessions in sessions.values() for session in user_sessions
]
# One entry per user: total number of events over all of that user's sessions
interactions_per_user = [
    sum(len(session) for session in user_sessions) for user_sessions in sessions.values()
]
# Distinct tags that survive preprocessing (second field of each event)
items = {
    event[1]
    for user_sessions in sessions.values()
    for session in user_sessions
    for event in session
}
num_sessions = sum(len(user_sessions) for user_sessions in sessions.values())
num_interactions = sum(interactions_per_session)

Results from paper   
- No of users : 1643  
- No of items : 5005  
- No of session : 45603  
- No of interactions : 257639  
- No of interactions per session : 5.6  
- No of interactions per user : 156.8  

In [79]:
print("Results from preprocessing")
print(f"No of users : {len(users)}")
print(f"No of items : {len(items)}")
print(f"No of session : {num_sessions}")
print(f"No of interactions : {num_interactions}")
print(f"No of interactions per session : {np.array(interactions_per_session).mean()}")
print(f"No of interactions per user : {np.array(interactions_per_user).mean()}")

Results from preprocessing
No of users : 1648
No of items : 5038
No of session : 55965
No of interactions : 314096
No of interactions per session : 5.612364870901456
No of interactions per user : 190.59223300970874


In [80]:
# Remapping users to sequential ids 0..N-1, in the iteration order of `sessions`.
# Every session list is copied: the item-remapping step rewrites events in place, and
# without the copy it would also rewrite the lists held by `sessions`.
# NOTE: the original userIDs are not stored anywhere in this mapping.
remapped_sessions = {
    new_user: [list(session) for session in user_sessions]
    for new_user, user_sessions in enumerate(sessions.values())
}

In [81]:
# Raw tag id -> sequential index, assigned in order of first appearance
items = {}
for user_sessions in remapped_sessions.values():
  for session in user_sessions:
    for position, (timestamp, tag) in enumerate(session):
      # setdefault hands out the next free index the first time a tag is seen
      session[position] = (timestamp, items.setdefault(tag, len(items)))

# First index that is not used by any real item
PADDING_ITEM = len(items)

In [82]:
print(list(remapped_sessions.keys())[:10])
print(remapped_sessions[0][:2])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[[(Timestamp('2010-10-29 16:15:24'), 0), (Timestamp('2010-10-29 16:52:46'), 1), (Timestamp('2010-10-29 16:53:56'), 2), (Timestamp('2010-10-29 16:53:56'), 3), (Timestamp('2010-10-29 16:53:56'), 4), (Timestamp('2010-10-29 17:07:15'), 2), (Timestamp('2010-10-29 17:28:55'), 5), (Timestamp('2010-10-29 17:28:55'), 6)], [(Timestamp('2010-10-29 21:21:22'), 1), (Timestamp('2010-10-29 21:21:22'), 3), (Timestamp('2010-10-29 21:21:22'), 4), (Timestamp('2010-10-29 21:21:22'), 5), (Timestamp('2010-10-29 21:21:22'), 7), (Timestamp('2010-10-29 21:25:35'), 1), (Timestamp('2010-10-29 21:25:35'), 3), (Timestamp('2010-10-29 21:25:35'), 5), (Timestamp('2010-10-29 21:25:35'), 8), (Timestamp('2010-10-29 21:37:15'), 9), (Timestamp('2010-10-29 21:37:15'), 10)]]


#### Splitting

In [83]:
def split_to_sets(data, test=0.1, val=0.1): # ignoring split ratios for now
  """Leave-last-out split of each user's sessions.

  Args:
    data: dict mapping user -> list of sessions.
    test, val: accepted but currently unused.

  Returns:
    (train, val, test) dicts keyed by user: the last session goes to test, the
    second to last to val (each wrapped in a one-element list), and everything
    before them to train.
  """
  train_part = {user: sess[:-2] for user, sess in data.items()}
  val_part = {user: [sess[-2]] for user, sess in data.items()}
  test_part = {user: [sess[-1]] for user, sess in data.items()}
  return train_part, val_part, test_part

In [84]:
train, val, test = split_to_sets(remapped_sessions)

In [85]:
# Calculate statistics from session data

def get_data_stats(sessions):
  """Print summary statistics for a user -> sessions mapping.

  Args:
    sessions: dict mapping user -> list of sessions, each session a sequence
      of events whose second field is the item.

  Prints the user, distinct item, session and interaction counts, then the
  mean interactions per session and per user. Returns None.
  """
  session_sizes = []
  user_totals = []
  distinct_items = set()
  for user_sessions in sessions.values():
    sizes = [len(session) for session in user_sessions]
    session_sizes.extend(sizes)
    user_totals.append(sum(sizes))
    distinct_items.update(event[1] for session in user_sessions for event in session)

  print("Results from preprocessing")
  print(f"No of users : {len(sessions)}")
  print(f"No of items : {len(distinct_items)}")
  print(f"No of session : {len(session_sizes)}")
  print(f"No of interactions : {sum(session_sizes)}")
  print(f"No of interactions per session : {np.array(session_sizes).mean()}")
  print(f"No of interactions per user : {np.array(user_totals).mean()}")

In [86]:
get_data_stats(train)

Results from preprocessing
No of users : 1648
No of items : 5036
No of session : 52669
No of interactions : 297192
No of interactions per session : 5.642636085743037
No of interactions per user : 180.33495145631068


In [87]:
get_data_stats(val)

Results from preprocessing
No of users : 1648
No of items : 2491
No of session : 1648
No of interactions : 8845
No of interactions per session : 5.367111650485437
No of interactions per user : 5.367111650485437


In [88]:
get_data_stats(test)

Results from preprocessing
No of users : 1648
No of items : 2367
No of session : 1648
No of interactions : 8059
No of interactions per session : 4.890169902912621
No of interactions per user : 4.890169902912621


In [89]:
def pad_and_clean_sessions(data, max_len=MAX_SESSION_LENGTH, pad_item=PADDING_ITEM):
  """Strip timestamps from sessions and right-pad them to a fixed length.

  Args:
    data: dict mapping user -> list of sessions of (timestamp, item) events.
    max_len: target length; sessions already at least this long are left unpadded
      (they are not truncated).
    pad_item: value appended to short sessions.

  Returns:
    (padded_data, sess_lens): per user, the list of item-only padded sessions
    and the list of the sessions' original lengths.
  """
  padded_data, sess_lens = {}, {}
  for user, sess in data.items():
    item_rows = [[event[1] for event in session] for session in sess]
    sess_lens[user] = [len(row) for row in item_rows]
    # A non-positive repeat count yields an empty list, so long rows stay unchanged.
    padded_data[user] = [row + [pad_item] * (max_len - len(row)) for row in item_rows]
  return padded_data, sess_lens

In [90]:
train, train_lens = pad_and_clean_sessions(train, max_len=MAX_SESSION_LENGTH, pad_item=PADDING_ITEM)

In [91]:
val, val_lens = pad_and_clean_sessions(val, max_len=MAX_SESSION_LENGTH, pad_item=PADDING_ITEM)

In [92]:
test, test_lens = pad_and_clean_sessions(test, max_len=MAX_SESSION_LENGTH, pad_item=PADDING_ITEM)

#### For SASRec

In [None]:
# Flatten each user's sessions into a single item sequence (timestamps dropped, stored order kept).
# A comprehension is used so that no loop variable leaks into the notebook namespace:
# a loop variable named `sessions` would rebind the global `sessions` dict built earlier.
clean_sessions = {
    user: [event[1] for session in user_sessions for event in session]
    for user, user_sessions in remapped_sessions.items()
}

#### Saving the data

In [61]:
# To save data in Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [93]:
PROCESSED_DATA_PATH = "/content/drive/MyDrive/Major Project/delicious_processed_split.pickle"

In [94]:
# Bundle everything downstream notebooks need into one picklable dict.
# item_mapping maps raw tagID -> sequential item index; tagname_mapping is the `tags`
# DataFrame. The padding index itself is not stored; it was set to len(item_mapping).
processed_data = dict(
    train=train,
    train_lens=train_lens,
    test=test,
    test_lens=test_lens,
    val=val,
    val_lens=val_lens,
    item_mapping=items,
    tagname_mapping=tags,
)

with open(PROCESSED_DATA_PATH, "wb") as savefile:
  pickle.dump(processed_data, savefile)