In [1]:
from collections import defaultdict
import numpy as np
import math
from collections import Counter
import random

In [2]:
def parseData(fname):
  for l in open(fname):
    yield eval(l)

In [3]:
print("Reading data...")
data_user_items = list(parseData("./australian_users_items.json"))
print("Done for the user_items.")

Reading data...
Done for the user_items.


In [4]:
print("Reading the second dataset...")
data_user_reviews = list(parseData("./australian_user_reviews.json"))
print("Done for the user_reviews.")

Reading the second dataset...
Done for the user_reviews.


In [5]:
############# Preprocess the data and put them into dictionary ############

In [6]:
# Since there are users that have more than 1 data entry, let's ignore the earlier duplicate.
# Sanitize the data where there might be multiple entries of the same user id.
# Ignores duplicate.
user_item = []
users = []
for d in data_user_items[: int(len(data_user_items) / 10)]:
    user = d['user_id']
    
    if user in users:
        continue
        
    users.append(user)
    items = d['items']
    for item in items:
        user_item.append([user, item])

In [7]:
len(user_item)

869621

In [8]:
####### Random shuffle the data #####
np.random.shuffle(user_item)

boundary_1 = 0.7
boundary_2 = 0.9

data_train_userItem = user_item[:int(len(user_item) * boundary_1)]
data_validation_userItem = user_item[int(len(user_item) * boundary_1) : int(len(user_item) * boundary_2)]
data_test_userItem = user_item[int(len(user_item) * boundary_2):]

user_itemIds = defaultdict(set)
users = set()
itemIds = set()
for user, item in data_validation_userItem:
    user_itemIds[user].add(item['item_id'])
    users.add(user)
    itemIds.add(item['item_id'])
    
users = list(users)
itemIds = list(itemIds)

for i in range(len(data_validation_userItem)):
    while True:
        user = users[random.randint(0, len(users) - 1)]
        itemId = itemIds[random.randint(0, len(itemIds) - 1)]
        if not itemId in user_itemIds[user]:
            data_validation_userItem.append([user, {'item_id': itemId}])
            break

In [31]:
for i in range(len(data_test_userItem)):
    while True:
        user = users[random.randint(0, len(users) - 1)]
        itemId = itemIds[random.randint(0, len(itemIds) - 1)]
        if not itemId in user_itemIds[user]:
            data_test_userItem.append([user, {'item_id': itemId}])
            break

In [9]:
data_train_userItem[1]

['76561198073910418',
 {'item_id': '12240',
  'item_name': 'Grand Theft Auto: Vice City',
  'playtime_2weeks': 0,
  'playtime_forever': 0}]

In [10]:
len(data_train_userItem), len(data_validation_userItem), len(data_test_userItem)

(608734, 347848, 86963)

In [11]:
########### Process to add the item ids into the dictionary ###########
def getAllAttributes(data_userItems): 
    user_itemIds = defaultdict(set)
    itemId_users = defaultdict(set)
    user_itemId_playtime = defaultdict(lambda: defaultdict(int))
    itemId_item = {}
    user_itemCount = Counter()
    
    for user, item in data_train_userItem:
        item_id, item_name, playtime = item['item_id'], item['item_name'], item['playtime_forever']
        
        user_itemIds[user].add(item_id)
        itemId_users[item_id].add(user)
        user_itemId_playtime[user][item_id] = playtime
        itemId_item[item_id] = item
        user_itemCount[user] += 1
        
    return user_itemIds, itemId_users, user_itemId_playtime, itemId_item, user_itemCount 

In [12]:
user_itemIds, itemId_users, user_itemId_playtime, itemId_item, user_itemCount = getAllAttributes(data_train_userItem)

In [13]:
######### Feature 1: the most popular items #############
userCount_itemId = [(len(itemId_users[itemId]), itemId) for itemId in itemId_users]
userCount_itemId.sort()
userCount_itemId.reverse()

In [14]:
userCount_itemId[:5]

[(4288, '730'),
 (4207, '4000'),
 (3993, '205790'),
 (3809, '304930'),
 (3486, '223530')]

In [15]:
def getMostKItemIds(userCount_itemId, k): 
    return [x[1] for x in userCount_itemId[:k]]

In [34]:
def eval_on_(data, pred_function):
    tp = fp = tn = fn = 0
    progress = 0
    progress_percentage = 0
    for user, item in data:
        
        progress += 1
        if progress % (len(data) // 500) == 0:
            progress_percentage += .2
            print('progress: ' + str(progress_percentage) + '%')
            
        if pred_function(user, item):
            
            # predicts true.
            if 'item_name' in item:
                tp += 1
            else:
                fp += 1
                
        else:
            
            # Predicts false.
            if not 'item_name' in item:
                tn += 1
            else:
                fn += 1
                
    return (tp + tn) / (tp + tn + fp + fn)

In [17]:
len(data_validation_userItem)

347848

In [18]:
popular_itemIds = getMostKItemIds(userCount_itemId, len(userCount_itemId) // 2) 

In [19]:
def item_popular_f(user, item):
    if item['item_id'] in popular_itemIds:
        return True
    
    return False

eval_on_(data_validation_userItem, item_popular_f)

progress: 20%
progress: 40%
progress: 60%
progress: 80%
progress: 100%


0.6797279271405902

In [20]:
######### Feature 2: Calculate item Jaccard similarity based on purchased Users ############
def item_similarity(user_itemIds):
    N = defaultdict(int) # Item maps to its count.
    C = defaultdict(lambda: defaultdict(int)) # ItemId maps to another itemId to their number of matches.
    W = defaultdict(lambda: defaultdict(int)) # similarity: itemId-itemId intersection / itemId-itemId union
    for user in user_itemIds:
        
        # the items which are purchased by the user.
        for i in user_itemIds[user]:
            N[i] += 1 
            
            for j in user_itemIds[user]: 
                if i == j: 
                    continue
                C[i][j] += 1
                C[i]
                
    print("Finished the first part.")
    for i, related_item in C.items(): 
        for j, cij in related_item.items(): 
            W[i][j] = cij / (N[i] + N[j] - cij)
    return W

In [21]:
W = item_similarity(user_itemIds)

Finished the first part.


In [22]:
sim_threshold = 0.05
cnt_threshold = 0.14
def item_similarity_f(user, item):
    itemId = item['item_id']
    similarCount = 0

    count = 0
    for cur_itemId in user_itemIds[user]:

        if W[itemId][cur_itemId] > sim_threshold:
            similarCount += 1

    return similarCount > len(user_itemIds[user]) * cnt_threshold

print(eval_on_(data_validation_userItem, item_similarity_f))

progress: 20%
progress: 40%
progress: 60%
progress: 80%
progress: 100%
0.8492646213288563


In [23]:
print(eval_on_(data_test_userItem, item_similarity_f))

progress: 20%
progress: 40%
progress: 60%
progress: 80%
progress: 100%
0.8510515966560491


In [24]:
######### Feature 3: calculate user cosine similarity based on playtime forever #############

In [25]:
def playtime_total(user): 
    norm_v = 0 
    for i in user_itemIds[user]:
        norm_v += user_itemId_playtime[user][i]
    return norm_v

In [26]:
def all_playtime_total(): 
    N = defaultdict(int)
    for u in user_itemIds.keys():
        N[u] = playtime_total(u)
    return N 

In [27]:
all_playtime_total_N = all_playtime_total() 

In [28]:
def cosine(itemId_users, user_itemId_playtime, all_playtime_total_N): 
    C = defaultdict(lambda: defaultdict(int))
    for key in itemId_users: 
        for u1 in itemId_users[key]:
            for u2 in itemId_users[key]:
                if u1 == u2:
                    continue
                    
                C[u1][u2] += user_itemId_playtime[u1][key] * user_itemId_playtime[u2][key]

    print("Finished the first part.")
    for key in C:
        for each in C[key]:
            if all_playtime_total_N[key] * all_playtime_total_N[each] == 0:
                C[key][each] = 0
                continue
            C[key][each] = C[key][each] / (all_playtime_total_N[key] * all_playtime_total_N[each])**0.5
    return C

In [29]:
C = cosine(itemId_users, user_itemId_playtime, all_playtime_total_N)

Finished the first part.


In [35]:
len(C)

7679

In [33]:
for i in range(5):
    sim_threshold = 0.05 + i * 0.01
    cnt_threshold = 0.2
    def item_similarity_f(user, item):
        itemId = item['item_id']
        purchasedCount = 0

        cnt_similarUsers = []
        for other_user in C[user]:
            cnt_similarUsers.append([C[user][other_user], other_user])
        cnt_similarUsers.sort(reverse=True)
        cnt_similarUsers = cnt_similarUsers[: int(len(cnt_similarUsers) * sim_threshold)]

        # Finds out how many similar users have purchased this item before.
        similarUsers = [pair[1] for pair in cnt_similarUsers]
        for u in similarUsers:
            if itemId in user_itemIds[u]:
                purchasedCount += 1

        return purchasedCount > len(similarUsers) * cnt_threshold

    print(sim_threshold, eval_on_(data_validation_userItem, item_similarity_f))

KeyboardInterrupt: 