#### Downloading Dataset

In [1]:
# Mount Google Drive into the Colab filesystem so the dataset archive stored
# under /content/drive can be copied by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!mkdir dataset 
!cp "/content/drive/MyDrive/Major Project/reddit_data.csv.zip" dataset
!unzip -o dataset/reddit_data.csv.zip -d dataset
!rm -rf dataset/reddit_data.csv.zip

Archive:  dataset/reddit_data.csv.zip
  inflating: dataset/reddit_data.csv  


#### Parsing data

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn import preprocessing
import pickle

In [4]:
# Reddit data: one row per interaction with columns username, subreddit, utc.
# `utc` is still numeric here; a later cell converts it to datetimes.
reddit_data = pd.read_csv('dataset/reddit_data.csv')
reddit_data.head(5)

Unnamed: 0,username,subreddit,utc
0,kabanossi,photoshopbattles,1482748000.0
1,kabanossi,GetMotivated,1482748000.0
2,kabanossi,vmware,1482748000.0
3,kabanossi,carporn,1482748000.0
4,kabanossi,DIY,1482747000.0


In [5]:
# Interpret the numeric `utc` column as seconds since the Unix epoch and
# replace it in place with pandas datetimes.
reddit_data['utc'] = pd.to_datetime(reddit_data['utc'], unit='s')

In [6]:
# Confirm the conversion: `utc` should now report datetime64[ns].
print(reddit_data.dtypes)

username             object
subreddit            object
utc          datetime64[ns]
dtype: object


#### Data Analysis and Preprocessing

We have:  
- Users : username  
- Items : subreddit

In [7]:
# Size of the raw dataset: distinct users, distinct items and total events.
raw_user_count = len(reddit_data['username'].unique())
raw_item_count = len(reddit_data['subreddit'].unique())
raw_event_count = len(reddit_data)

print(f"No of users : {raw_user_count}")
print(f"No of items : {raw_item_count}")
print(f"No of events : {raw_event_count}")

No of users : 22610
No of items : 34967
No of events : 14000000


In [8]:
# Encoding user and items (use different encoders so inverse map is possible later)
# Each encoder replaces its string column in place with integer labels; the
# fitted encoders are kept so labels can be mapped back to names.
# NOTE(review): re-running this cell would refit the encoders on the already
# encoded integers, presumably losing the name mapping — confirm before re-running.
user_encoder = preprocessing.LabelEncoder()
item_encoder = preprocessing.LabelEncoder()
reddit_data['username'] = user_encoder.fit_transform(reddit_data['username'])
reddit_data['subreddit'] = item_encoder.fit_transform(reddit_data['subreddit'])

In [9]:
# Spot-check a few random rows and confirm both id columns are now integers.
print(reddit_data.sample(5))
print(reddit_data.dtypes)

          username  subreddit                 utc
10062180     15259      25454 2016-12-27 17:28:30
7298981      11714       9895 2016-08-04 15:52:35
12955351      9609      31879 2016-07-18 19:13:07
6787221       4460       2774 2016-11-21 19:21:01
13954870      8886      11716 2016-07-21 08:24:54
username              int64
subreddit             int64
utc          datetime64[ns]
dtype: object


**Rules for preprocessing dataset:**

1. Dataset format : [index, user, item, timestamp]
2. Sort by users and timestamps
3. Remove users or items with fewer than 10 occurrences
4. Parse dataset into user -> array(sessions) format and session -> array((timestamp, item))
5. Interactions within 3600 seconds are in same session
6. Remove consecutive item repetitions in the same session
7. Remove sessions that have 1 item only or too many items (here > 40)
8. Split sessions further if possible using max session size 20, dropping a trailing chunk that has fewer than 2 items
9. Remove users with less data (ex. having < 3 sessions)
10. Map user and item values to sequential labels for further usage

In [10]:
# Parameters for preprocessing
# Consecutive events of a user that are closer together than this gap stay in
# the same session.
SESSION_TIME = timedelta(seconds=60*60)
# Sessions are cut into chunks of at most this many events; sessions longer
# than twice this value are discarded before splitting.
MAX_SESSION_LENGTH = 20
# Users left with fewer sessions than this are dropped.
MIN_REQUIRED_SESSIONS = 3
# Minimum number of occurrences; applied to items and also reused as the
# threshold for users.
MIN_ITEM_SUPPORT = 10

In [11]:
# Remove values with insufficient data

# Items: drop every subreddit that occurs fewer than MIN_ITEM_SUPPORT times
item_support = reddit_data['subreddit'].value_counts()
rare_items = item_support[item_support < MIN_ITEM_SUPPORT].index
has_supported_item = ~reddit_data['subreddit'].isin(rare_items)
data = reddit_data[has_supported_item]

# Users: counted after the item filter, using the same threshold
user_support = data['username'].value_counts()
rare_users = user_support[user_support < MIN_ITEM_SUPPORT].index
has_supported_user = ~data['username'].isin(rare_users)
data = data[has_supported_user]

In [12]:
# Distinct users and items that survive the support filter.
kept_user_count = len(data['username'].unique())
kept_item_count = len(data['subreddit'].unique())

print(f"No of users : {kept_user_count}")
print(f"No of items : {kept_item_count}")

No of users : 21742
No of items : 13937


In [13]:
# Sort by users and timestamps
# The session builder defined below walks the rows in order and documents
# that it assumes this ordering.
data = data.sort_values(by=["username", "utc"])
data.head(10)

Unnamed: 0,username,subreddit,utc
5892046,0,28497,2015-12-29 17:43:17
5892045,0,28497,2015-12-29 18:35:49
5892044,0,608,2015-12-30 15:54:03
5892043,0,1402,2015-12-30 16:19:23
5892042,0,23645,2015-12-30 16:39:05
5892041,0,4837,2015-12-31 16:25:46
5892040,0,15204,2015-12-31 17:20:29
5892039,0,1402,2015-12-31 17:47:43
5892038,0,31004,2015-12-31 19:14:58
5892037,0,24563,2016-01-02 00:32:33


In [14]:
# session -> list of events in format (timestamp, tag)
def collapse_session(session):
  """Drop events that repeat the item of the event kept just before them.

  Only consecutive repetitions are removed: an item may reappear later in
  the session once a different item has occurred in between. The first event
  of every run is the one that is kept.

  Args:
    session: sequence of ``(timestamp, item)`` events.

  Returns:
    A new list with the collapsed events; ``[]`` for an empty session.
  """
  collapsed = []
  for event in session:
    # Keep the event when it starts the session or changes the item.
    if not collapsed or event[1] != collapsed[-1][1]:
      collapsed.append(event)
  return collapsed

# user_sessions -> dict(user: list of sessions), session = list of (timestamp, tag)
def collapse_repeating_session(user_sessions):
  """Apply ``collapse_session`` to every session of every user, in place.

  Each entry of a user's session list is overwritten with its collapsed
  version; nothing is returned.
  """
  for sessions_of_user in user_sessions.values():
    for position, events in enumerate(sessions_of_user):
      sessions_of_user[position] = collapse_session(events)

# Keep only sessions with more than one event and at most 2 * MAX_SESSION_LENGTH events
def remove_invalid_sessions(user_sessions):
  """Return a new user -> sessions dict without too-short or too-long sessions.

  A session is kept when it has more than one event and no more than
  ``MAX_SESSION_LENGTH * 2`` events. Every user of the input stays in the
  result, possibly with an empty session list. The input dict is not modified.
  """
  return {
    user: [
      session
      for session in sessions_of_user
      if 1 < len(session) <= MAX_SESSION_LENGTH * 2
    ]
    for user, sessions_of_user in user_sessions.items()
  }


# session -> list of events in format (timestamp, tag)
def split_session(session):
  """Cut one session into consecutive chunks of at most MAX_SESSION_LENGTH events.

  The trailing chunk is dropped when it holds fewer than two events.

  Returns:
    List of chunks, each a slice of ``session``.
  """
  chunks = []
  for start in range(0, len(session), MAX_SESSION_LENGTH):
    chunks.append(session[start:start + MAX_SESSION_LENGTH])
  # A leftover single event cannot form a session on its own.
  if len(chunks[-1]) < 2:
    chunks.pop()
  return chunks

# user_sessions -> dict(user: list of sessions), session = list of (timestamp, tag)
def split_long_sessions(user_sessions):
    """Replace every user's session list with its ``split_session`` chunks, in place.

    Chunks keep the order of the sessions they came from; nothing is returned.
    """
    for user in user_sessions:
        chunked = []
        for session in user_sessions[user]:
            chunked.extend(split_session(session))
        user_sessions[user] = chunked

# dataset -> session dataset (columns : [index, user, item, timestamp])
# Assumes each user's rows appear in timestamp order
def split_dataset_to_sessions(dataset):
  """Turn a flat interaction table into cleaned per-user session lists.

  Pipeline:
    1. Group each user's events into sessions; a new session starts when the
       gap to the user's previous event is not below ``SESSION_TIME``.
    2. Collapse repeated items (``collapse_repeating_session``).
    3. Drop invalid sessions (``remove_invalid_sessions``).
    4. Split long sessions (``split_long_sessions``).
    5. Drop users with fewer than ``MIN_REQUIRED_SESSIONS`` sessions.

  Progress messages are printed after every step.

  Args:
    dataset: DataFrame with exactly three columns, in the order user id,
      item id, timestamp. Rows of one user must be in chronological order.

  Returns:
    dict mapping user id -> list of sessions, each session a list of
    ``(timestamp, item)`` tuples.
  """
  user_sessions = {}
  for row in dataset.itertuples():
    userID, subID, timestamp = row[1:] # Ignore index
    event = (timestamp, subID)

    sessions_of_user = user_sessions.get(userID)

    # New User: open their first session with this event
    if sessions_of_user is None:
      user_sessions[userID] = [[event]]
      continue

    # Existing user: always compare against that user's own latest session, so
    # grouping stays correct even if another user's rows were interleaved.
    current_session = sessions_of_user[-1]
    gap = event[0] - current_session[-1][0]
    if gap < SESSION_TIME:
      current_session.append(event)
    else:
      sessions_of_user.append([event])

  print("Sessions Created .....")

  collapse_repeating_session(user_sessions)
  print("Duplicates Removed .....")

  user_sessions = remove_invalid_sessions(user_sessions)
  print("Invalid Sessions Removed .....")

  split_long_sessions(user_sessions)
  print("Long Sessions Split .....")

  # Remove users with less sessions
  to_remove = {
    user
    for user, sessions_of_user in user_sessions.items()
    if len(sessions_of_user) < MIN_REQUIRED_SESSIONS
  }
  for user in to_remove:
    del user_sessions[user]
  print(f"{len(to_remove)} Users Removed .....")

  print("Processing complete .....")
  # Final sessions data available for user
  return user_sessions

In [15]:
# Convert dataset to sessions
# Result: dict of user id -> list of sessions, each a list of (timestamp, item).
sessions = split_dataset_to_sessions(data)

Sessions Created .....
Duplicates Removed .....
Invalid Sessions Removed .....
Long Sessions Split .....
3556 Users Removed .....
Processing complete .....


In [16]:
# Calculate statistics from session data

users = sessions.keys()

# One entry per session, in user order then session order
interactions_per_session = [
  len(session)
  for sessions_of_user in sessions.values()
  for session in sessions_of_user
]

# One entry per user: total events over all of that user's sessions
interactions_per_user = [
  sum(len(session) for session in sessions_of_user)
  for sessions_of_user in sessions.values()
]

# Distinct item ids appearing in any event
items = {
  event[1]
  for sessions_of_user in sessions.values()
  for session in sessions_of_user
  for event in session
}

num_sessions = sum(len(sessions_of_user) for sessions_of_user in sessions.values())
num_interactions = sum(interactions_per_session)

Results from paper   
- No of users : 18173  
- No of items : 13521  
- No of session : 1119225  
- No of interactions : 2868050  
- No of interactions per session : 2.6 
- No of interactions per user : 157.8

In [17]:
# Report the statistics computed in the previous cell, in the same order as
# the figures quoted from the paper above.
print("Results from preprocessing")
print(f"No of users : {len(users)}")
print(f"No of items : {len(items)}")
print(f"No of session : {num_sessions}")
print(f"No of interactions : {num_interactions}")
print(f"No of interactions per session : {np.array(interactions_per_session).mean()}")
print(f"No of interactions per user : {np.array(interactions_per_user).mean()}")

Results from preprocessing
No of users : 18186
No of items : 13737
No of session : 1123442
No of interactions : 3388177
No of interactions per session : 3.015889560831801
No of interactions per user : 186.30688441658418


In [18]:
# Remapping users: renumber users 0..N-1 in the iteration order of `sessions`.
# The session lists themselves are reused, not copied.
remapped_sessions = dict(enumerate(sessions.values()))

In [19]:
# Remapping items: give every item a sequential index in order of first
# appearance and rewrite each event in place as (timestamp, new_index).
# `items` ends up mapping the previous item id -> new sequential index.
items = {}
for sessions_of_user in remapped_sessions.values():
  for session in sessions_of_user:
    for position, (timestamp, old_item) in enumerate(session):
      new_item = items.setdefault(old_item, len(items))
      session[position] = (timestamp, new_item)

In [20]:
# Peek at the first remapped user ids and the first two sessions of user 0.
print(list(remapped_sessions.keys())[:10])
print(remapped_sessions[0][:2])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[[(Timestamp('2015-12-30 15:54:03'), 0), (Timestamp('2015-12-30 16:19:23'), 1), (Timestamp('2015-12-30 16:39:05'), 2)], [(Timestamp('2015-12-31 16:25:46'), 3), (Timestamp('2015-12-31 17:20:29'), 4), (Timestamp('2015-12-31 17:47:43'), 1)]]


In [27]:
# Map each original subreddit name to the integer label assigned by `item_encoder`.
# NOTE(review): this cell carries execution count 27 and the same statement is
# repeated in a later cell — looks like an out-of-order leftover; confirm and dedupe.
item_name_mapping = dict(zip(item_encoder.classes_, item_encoder.transform(item_encoder.classes_)))

In [21]:
def split_to_sets(data, test=0.1, val=0.1):
  """Split each user's sessions into train, validation and test sets.

  The split is leave-last-out per user: the final session goes to test, the
  second-to-last to validation, and everything before them to train.

  Args:
    data: dict mapping user -> list of sessions, assumed to be ordered from
      oldest to newest.
    test: Accepted for interface compatibility; the ratio is currently ignored.
    val: Accepted for interface compatibility; the ratio is currently ignored.

  Returns:
    Tuple ``(train, val, test)`` of dicts keyed by user. ``val`` and ``test``
    hold a single-session list per user.

  Raises:
    IndexError: if a user has fewer than two sessions.
  """
  # Separate names so the (ignored) ratio parameters are not overwritten.
  train_set, val_set, test_set = {}, {}, {}
  for user, sess in data.items():
    train_set[user] = sess[:-2]
    val_set[user] = [sess[-2]]
    test_set[user] = [sess[-1]]
  return train_set, val_set, test_set

In [22]:
# Leave-last-out split per user: final session -> test, second-to-last -> val,
# all earlier sessions -> train.
train, val, test = split_to_sets(remapped_sessions)

In [23]:
# Calculate statistics from session data

def get_data_stats(sessions):
  """Print summary statistics for a user -> sessions mapping.

  Reports the number of users, distinct items, sessions and interactions,
  plus the mean interactions per session and per user.

  Args:
    sessions: dict mapping user -> list of sessions, where every event's
      item sits at index 1.

  Returns:
    None; the statistics are only printed.
  """
  per_user_sessions = list(sessions.values())

  # One entry per session, in user order then session order
  session_sizes = [
    len(session)
    for sessions_of_user in per_user_sessions
    for session in sessions_of_user
  ]
  # One entry per user: total events over all sessions
  user_totals = [
    sum(len(session) for session in sessions_of_user)
    for sessions_of_user in per_user_sessions
  ]
  distinct_items = {
    event[1]
    for sessions_of_user in per_user_sessions
    for session in sessions_of_user
    for event in session
  }
  session_count = sum(len(sessions_of_user) for sessions_of_user in per_user_sessions)
  interaction_count = sum(session_sizes)

  print("Results from preprocessing")
  print(f"No of users : {len(sessions.keys())}")
  print(f"No of items : {len(distinct_items)}")
  print(f"No of session : {session_count}")
  print(f"No of interactions : {interaction_count}")
  print(f"No of interactions per session : {np.array(session_sizes).mean()}")
  print(f"No of interactions per user : {np.array(user_totals).mean()}")

In [24]:
# Statistics of the training split (all but each user's last two sessions).
get_data_stats(train)

Results from preprocessing
No of users : 18186
No of items : 13698
No of session : 1087070
No of interactions : 3283757
No of interactions per session : 3.0207410746318084
No of interactions per user : 180.56510502584405


In [25]:
# Statistics of the validation split (each user's second-to-last session).
get_data_stats(val)

Results from preprocessing
No of users : 18186
No of items : 5612
No of session : 18186
No of interactions : 52098
No of interactions per session : 2.864731111844276
No of interactions per user : 2.864731111844276


In [26]:
# Statistics of the test split (each user's last session).
get_data_stats(test)

Results from preprocessing
No of users : 18186
No of items : 5500
No of session : 18186
No of interactions : 52322
No of interactions per session : 2.877048278895854
No of interactions per user : 2.877048278895854


In [None]:
# Padding uses the first index past the remapped item ids, which run from 0 to
# len(items) - 1.
# NOTE(review): relies on `items` being the remapping dict built earlier, not the
# set of the same name from the statistics cell — confirm cell order on re-run.
PADDING_ITEM = len(items)

In [None]:
def pad_and_clean_sessions(data, max_len=MAX_SESSION_LENGTH, pad_item=PADDING_ITEM):
  """Strip timestamps from every session and right-pad it to ``max_len`` items.

  Args:
    data: dict mapping user -> list of sessions of ``(timestamp, item)`` events.
    max_len: target length; sessions already at or above it are left unpadded.
    pad_item: value appended to fill short sessions.

  Returns:
    Tuple ``(padded_data, sess_lens)``: per user, the list of padded item
    lists and the list of original (unpadded) session lengths.
  """
  padded_data = {}
  sess_lens = {}

  for user, sessions_of_user in data.items():
    sess_lens[user] = [len(session) for session in sessions_of_user]
    # A non-positive repeat count yields no padding, so long sessions pass through.
    padded_data[user] = [
      [event[1] for event in session] + [pad_item] * (max_len - len(session))
      for session in sessions_of_user
    ]
  return padded_data, sess_lens

In [None]:
# Convert every split to padded item-id lists plus the true session lengths.
# NOTE(review): this rebinds train/val/test to the padded form, so re-running
# the cell would feed already padded data back in — confirm before re-running.
train, train_lens = pad_and_clean_sessions(train, max_len=MAX_SESSION_LENGTH, pad_item=PADDING_ITEM)
val, val_lens = pad_and_clean_sessions(val, max_len=MAX_SESSION_LENGTH, pad_item=PADDING_ITEM)
test, test_lens = pad_and_clean_sessions(test, max_len=MAX_SESSION_LENGTH, pad_item=PADDING_ITEM)

In [None]:
# Subreddit name -> integer label assigned by `item_encoder`; saved with the
# processed data below.
item_name_mapping = dict(zip(item_encoder.classes_, item_encoder.transform(item_encoder.classes_)))

#### Saving the data

In [None]:
# Destination on the mounted Drive for the pickled, split and remapped dataset.
PROCESSED_DATA_PATH = "/content/drive/MyDrive/Major Project/reddit_processed_split_mapped.pickle"

In [None]:
# Bundle everything a consumer needs into one dict:
#   train / val / test      : user -> list of padded item-id sessions
#   *_lens                  : user -> list of unpadded session lengths
#   item_index_mapping      : label-encoded item id -> sequential item index
#   item_name_mapping       : subreddit name -> label-encoded item id
processed_data = {
    "train": train,
    "train_lens": train_lens,
    "test": test,
    "test_lens": test_lens,
    "val": val,
    "val_lens": val_lens,
    "item_index_mapping": items,
    "item_name_mapping": item_name_mapping
}

# Serialize the bundle to Drive in binary mode.
with open(PROCESSED_DATA_PATH, "wb") as savefile:
  pickle.dump(processed_data, savefile)