In [4]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


<h1> Imports + Reading csv files + basic info</h1>

In [1]:
import pandas as pd
import os
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import sklearn.preprocessing as pp

def printCustomerActions(customer_id, dataframe):
    print(dataframe[dataframe['customer_id'] == customer_id])

#vyzaduje maticu row(produkt) a stlpec(user)
def cosine_similarities(mat):
    col_normed_mat = pp.normalize(mat.tocsc(), axis=0)
    return col_normed_mat.T * col_normed_mat

def getCustomerItemsDF(customer_id, dataframe):
    return dataframe[dataframe['customer_id'] == customer_id]

def getCustomerItemsSet(customer_id, dataframe):
    return set(dataframe[dataframe['customer_id'] == customer_id]["product_id"].unique())

def getCustomerItemsList(customer_id, dataframe):
    return list(dataframe[dataframe['customer_id'] == customer_id]["product_id"].unique())

def getSimilarCustomers(similarities, targetCustomer, numOfSimilars):
    targetSimilars = similarities[targetCustomer, :].toarray().flatten()
    lst = pd.Series(targetSimilars)
    i = lst.nlargest(numOfSimilars)
    i.index.values.tolist()
    return i

def recommendItems(similarities, targetCustomer, dataframe, numOfSimilars):
    try:
        result = getSimilarCustomers(similarities, targetCustomer, numOfSimilars)
        items = {}
        targetCustomerItems = getCustomerItemsSet(targetCustomer, eventsDF)
        similarCustomers = result.keys()
        finalItems = []
        for similarCustomer in similarCustomers:
            customerActions = getCustomerItemsDF(similarCustomer, eventsDF)
            groupActions = customerActions.groupby("product_id").sum()["type"];
            for item in groupActions.keys():
                #print("{} : {}".format(item, groupActions[item]))
                #if item not in targetCustomerItems:
                if item not in items:
                    items[item] = groupActions[item]
                else:
                    items[item] += groupActions[item]
        #print(items)
        #print(targetCustomerItems)
        for i in range(10):
            if len(items.keys()) != 0:
                maxKey = max(items, key=items.get)
                #print(maxKey)
                finalItems.append(maxKey)
                items.pop(maxKey)
            #printCustomerActions(similarCustomer, eventsDF)
        return finalItems
    except IndexError:
        return list()

def recommendItemBased(similarities, targetItem, numOfSimilars):
    targetSimilars = similarities[targetItem,:].toarray().flatten()
    lst = pd.Series(targetSimilars)
    i = lst.nlargest(numOfSimilars + 1)
    result = i.index.values.tolist()
    result.remove(targetItem)
    return result
    
def splitDataframe(df, train_percent=.8, validate_percent=.2, seed=None):
    m = len(df.index)
    train_end = int(train_percent * m)
    train = df.iloc[:train_end]
    test = df.iloc[train_end:]
    return train, test

def calculatePrediction(recommendeditems, actualItems):
    hits = 0
    for item in actualItems:
        if item in recommendeditems:
            hits += 1
    return hits

def getPopularItems(dataframe, n):
    itemInfo = dataframe[['product_id', 'type']].groupby("product_id").agg("sum")
    itemInfo = itemInfo.sort_values(by=['type'], ascending=False)
    return list(itemInfo.index.values)[:n]

def getUsersBestItem(dataframe, targetUser):
    productInfo = dataframe[dataframe['customer_id'] == targetUser].groupby("product_id").agg("sum")["type"]
    keys = productInfo.keys()
    max = 0
    maxkey = 0
    for key in keys:
        if productInfo[key] > max:
            max = productInfo[key]
            maxkey = key
    return key

eventFile = "vi_dataset_events.csv"
catalogFile = "vi_dataset_catalog.csv"
eventsDF = pd.read_csv(os.path.basename(eventFile))
catalogDF = pd.read_csv(os.path.basename(catalogFile))

eventsDF['type'] = eventsDF['type'].map({'view_product': 0.001, 'add_to_cart': 0.01, 'purchase_item': 0.1})
eventsDF = eventsDF.sort_values(by=['timestamp'])
trainDF, testDF = splitDataframe(eventsDF)
popularItems = getPopularItems(trainDF, 10)
print("Popular items are: {}".format(popularItems))
eventsDF.head()

Popular items are: [11219, 22031, 16959, 3526, 24851, 3617, 20591, 24848, 24846, 20585]


Unnamed: 0,customer_id,product_id,type,timestamp
0,1,19685,0.001,1527812004
1,1,19685,0.001,1527812041
2,1,19685,0.01,1527812046
3,1,19685,0.001,1527812048
4,1,19685,0.001,1527812050


<h1>Making Pivot Table</h1>

In [2]:
file = trainDF
user_u = list(sorted(file.customer_id.unique()))
item_u = list(sorted(file.product_id.unique()))

col = file['customer_id'].tolist()
row = file['product_id'].tolist()
data = file['type'].tolist()
sparse_matrix_user = csc_matrix((data, (row, col)))
sparse_matrix_item = csc_matrix((data, (col, row)))

similarities_user = cosine_similarities(sparse_matrix_user)
similarities_item = cosine_similarities(sparse_matrix_item)
items = recommendItems(similarities_user, 1, trainDF, 50)
print(items)
print(getCustomerItemsList(1, trainDF))

[19685, 21140, 20785, 17621, 22262, 15065, 22614, 17531, 20786, 17551]
[19685]


In [4]:
testUsers = testDF["customer_id"].unique()
hits = 0
misses = 0
print("Total Users: {}".format(len(testUsers)))
for i in range(len(testUsers)):
    user = testUsers[i]
    userItems = getCustomerItemsList(user, trainDF)
    if len(userItems) > 10:
        recommendeditems = recommendItems(similarities_user, user, trainDF, 50)
    elif len(userItems) > 0:
        targetItem = getUsersBestItem(trainDF, user)
        recommendeditems = recommendItemBased(similarities_item, targetItem, 10)
    else:
        recommendeditems = getPopularItems(trainDF, 10)
    actualItems = getCustomerItemsList(user, testDF)
    if len(actualItems) == 0:
        correctPredictions = "No Data in Future"
    else:
        correctPredictions = calculatePrediction(recommendeditems, actualItems)
    if correctPredictions > 0:
        hits += 1
    else:
        misses += 1
        
print("Item Based and User Based Recommender had {} hits and {} misses and overall ratio {}".format(hits,misses, (hits/len(testUsers))))

hits = 0
misses = 0

for i in range(len(testUsers)):
    user = testUsers[i]
    recommendeditems = recommendItems(similarities_user, user, trainDF, 50)
    actualItems = getCustomerItemsList(user, testDF)
    if len(actualItems) == 0 and len(getCustomerItemsList(user, trainDF)):
        correctPredictions = "No Data in Future and Past"
    else:
        correctPredictions = calculatePrediction(recommendeditems, actualItems)
    if correctPredictions > 0:
        hits += 1
    else:
        misses += 1
print("User Based only Recommender had {} hits and {} misses and overall ratio {}".format(hits,misses, (hits/len(testUsers))))


Total Users: 23001
Item Based and User Based Recommender had 2516 hits and 20485 misses and overall ratio 0.10938654841093866
User Based only Recommender had 5041 hits and 17960 misses and overall ratio 0.21916438415721057


UnboundLocalError: local variable 'key' referenced before assignment

In [5]:
hits = 0
misses = 0

for i in range(len(testUsers)):
    user = testUsers[i]
    recommendeditems = getPopularItems(trainDF, 10)
    actualItems = getCustomerItemsList(user, testDF)
    if len(actualItems) == 0:
        correctPredictions = "No Data in Future"
    else:
        correctPredictions = calculatePrediction(recommendeditems, actualItems)
    if correctPredictions > 0:
        hits += 1
    else:
        misses += 1
print("Popular only Recommender had {} hits and {} misses and overall ratio {}".format(hits,misses, (hits/len(testUsers))))

Popular only Recommender had 1608 hits and 21393 misses and overall ratio 0.06991000391287336


In [138]:
#Kaggle
challenge_users = []
lst = []
with open("vi_challenge_uID.csv", "r") as challenge_file:
    for line in challenge_file:
        challenge_users.append(int(line.strip()))
print(challenge_users)

for user in challenge_users:
    userItems = getCustomerItemsList(user, eventsDF)
    if len(userItems) > 10:
        recommendeditems = recommendItems(similarities_user, user, eventsDF, 50)
    elif len(userItems) > 0:
        targetItem = getUsersBestItem(eventsDF, user)
        recommendeditems = recommendItemBased(similarities_item, targetItem, 10)
    else:
        recommendeditems = getPopularItems(trainDF, 10)
    for item in recommendeditems:
        lst.append([user, item])
df_sumbission = pd.DataFrame(lst, columns=['customer_id', 'product_id'])
df_sumbission.to_csv('xvalka_vi_challenge.csv', sep=',', encoding='utf-8', index=False)


[44, 44, 70, 160, 217, 260, 322, 361, 459, 487, 534, 557, 599, 644, 700, 719, 759, 809, 853, 946, 964, 1016, 1052, 1090, 1102, 1127, 1172, 1293, 1376, 1391, 1449, 1553, 1592, 1796, 1811, 1965, 2050, 2240, 2395, 2541, 2727, 2807, 2817, 3424, 3526, 4028, 4037, 4157, 4206, 5024, 5380, 5475, 9652, 9740, 9756, 9871, 10596, 10647, 10673, 10717, 10825, 10871, 11004, 15114, 15117, 15218, 15363, 16467, 16470, 16820, 17640, 17739, 18038, 18260, 21223, 21290, 21452, 21582, 21696, 21868, 22084, 23084, 23648, 24492, 24977, 25603, 25820, 29024, 29991, 30715, 30940, 30965, 33058, 33369, 33565, 36255, 37716, 37991, 38204, 38492, 38660, 39156, 40431, 42108, 42704, 43035, 43270, 44790, 44922, 45462, 45832, 46083, 46290, 49910, 50245, 50248, 50620, 51963, 57064, 57963, 58699, 58869, 59980, 60451, 61272, 63580, 64319, 65588, 68525, 69157, 71192, 71304, 71796, 74047, 74315, 76271, 77181, 79058, 84320]
