In [2]:
from os.path import join as pathJoin
import pandas as pd
import numpy as np
import pickle
import scipy.spatial.distance as dist
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors, LSHForest
import itertools as itt

In [3]:
# Directory holding the tag-genome data files (relative to the notebook).
data_root = "tag-genome/"

# Each path variable is named after the columns stored in its file.
(
    MovieID_TagID_Relevance,
    MovieID_Title_MoviePopularity,
    TagID_Tag_TagPopularity,
) = (
    pathJoin(data_root, file_name)
    for file_name in ("tag_relevance.dat", "movies.dat", "tags.dat")
)

In [4]:
# The three tables are read as tab-separated files without a header row,
# so the column names are supplied explicitly.
tag_relevance = pd.read_csv(
    MovieID_TagID_Relevance,
    delimiter='\t',
    header=None,
    names=['MovieID', 'TagID', 'Relevance'],
)
movies = pd.read_csv(
    MovieID_Title_MoviePopularity,
    delimiter='\t',
    header=None,
    names=['MovieID', 'Title', 'MoviePopularity'],
)
tags = pd.read_csv(
    TagID_Tag_TagPopularity,
    delimiter='\t',
    header=None,
    names=['TagID', 'Tag', 'TagPopularity'],
)

In [6]:
# Load the pre-computed genome and hold it as a NumPy array.
# Elsewhere in this notebook it is indexed as genome[movie_row, tag_column].
# WARNING: pickle.load can execute arbitrary code; only load a file you
# produced yourself or otherwise trust.
with open('pickled/genome.pickle', 'rb') as f:
    genome = np.array(pickle.load(f))

In [8]:
def popularity(tagId):
    """Return the ``TagPopularity`` value stored for ``tagId``, plus one.

    The value is looked up in the ``tags`` table by ``tagId``. With the +1
    offset a raw count of 0 yields 1, so ``np.log(popularity(t))`` is 0
    rather than -inf for such a tag.
    """
    raw_count = tags.TagPopularity[tagId]
    return raw_count + 1

In [9]:
def docFreq(tag):
    """Count the genome rows whose relevance for ``tag`` exceeds 0.5, plus two.

    The +2 offset makes the smallest possible result 2, so ``np.log`` of the
    result is always strictly positive and safe to divide by.
    """
    relevance_column = genome[:, tag]
    n_specific = sum(1 for score in relevance_column if score > 0.5)
    return n_specific + 2

In [11]:
# One weight per row of `tags`: log(popularity) / log(docFreq).
nTags = len(tags)
tag_weights = np.array([
    np.log(popularity(tag_idx)) / np.log(docFreq(tag_idx))
    for tag_idx in range(nTags)
])

In [12]:
# Sanity check: show the five largest tag weights.
# NOTE(review): the earlier "need to get rid of inf" TODO looks resolved --
# popularity() adds 1 and docFreq() adds 2, so (assuming TagPopularity is
# never negative) both logs are finite and the denominator is never 0.
# Confirm, then drop this note.
sorted(tag_weights, reverse=True)[:5]

[6.8201789624151887,
 6.1699250014423122,
 4.5235619560570131,
 3.236217269879349,
 2.2435894449529292]

In [13]:
def weightedCosSimi(u, v):
    '''Cosine distance between ``u`` and ``v`` after scaling both by ``tag_weights``.

    Not exactly like in the article.

    Despite the "Simi" in the name, the value comes straight from
    ``scipy.spatial.distance.cosine``, which is a *distance*
    (1 - cosine similarity): 0 means the weighted vectors point the same way.
    '''
    weighted_u = u * tag_weights
    weighted_v = v * tag_weights
    return dist.cosine(weighted_u, weighted_v)

In [17]:
def articleCosSimi(u, v):
    '''Weighted cosine similarity of ``u`` and ``v``, exactly like in the article.

    Computes ``sum(w*u*v) / (sqrt(sum(w*u*u)) * sqrt(sum(w*v*v)))`` with
    ``w = tag_weights``.

    Every reduction uses ``np.sum``. The denominator previously used the
    builtin ``sum``, which walks the array element by element in Python and,
    for 2-D input, reduces only the first axis -- inconsistent with the
    ``np.sum`` already used for the numerator.

    Returns a float; the result is nan (with a NumPy warning) when either
    weighted norm is 0.
    '''
    w = tag_weights
    numerator = np.sum(w * u * v)
    norm_u = np.sqrt(np.sum(w * u * u))
    norm_v = np.sqrt(np.sum(w * v * v))
    return numerator / (norm_u * norm_v)

In [15]:
def rel(tagId, movId):
    """Return the genome entry for movie row ``movId`` and tag column ``tagId``.

    Note the argument order: tag first, movie second, i.e. the reverse of the
    ``genome[movie, tag]`` indexing order.
    """
    movie_row = genome[movId]
    return movie_row[tagId]

In [16]:
def critiqueDist(critiquedMovieId, retrievedMovieId, tagId, direction):
    """Signed relevance gap between two movies for one tag, clipped at zero.

    Returns ``max(0, (rel(tagId, retrieved) - rel(tagId, critiqued)) * direction)``.

    The difference is parenthesised before it is multiplied by ``direction``.
    The earlier form ``rel(t, ir) - rel(t, ic) * d`` scaled only the critiqued
    movie's relevance (``*`` binds tighter than ``-``), so a negative direction
    added the two relevances instead of flipping the sign of their difference.

    Args:
        critiquedMovieId: genome row of the movie being critiqued.
        retrievedMovieId: genome row of the candidate movie.
        tagId: genome column of the critiqued tag.
        direction: multiplier applied to the gap (presumably +1 / -1 --
            TODO confirm against callers).

    NOTE(review): the operand order (retrieved minus critiqued) is kept from
    the original code; confirm the sign convention against the article.
    """
    gap = rel(tagId, retrievedMovieId) - rel(tagId, critiquedMovieId)
    return max(0, gap * direction)

In [17]:
def printNeigh(randMovNum, indices):
    """Print the title of every neighbour listed in ``indices[randMovNum]``.

    Each entry of ``indices[randMovNum]`` is used as an index label into the
    ``movies`` table.
    """
    for movie_label in indices[randMovNum]:
        movie_record = movies.loc[movie_label]
        print(movie_record["Title"])

In [128]:
# Scale every tag column by its weight once, up front. Plain cosine distance
# on rows of X is then the same quantity weightedCosSimi computes on raw
# genome rows. The multiplication allocates a new array, so `genome` itself
# is left untouched.
X = genome * tag_weights
nbrs_art2 = NearestNeighbors(
    n_neighbors=250,
    algorithm="brute",
    metric=dist.cosine,
)
# nbrs_art2 = LSHForest(n_candidates = 500, n_neighbors = 250)
nbrs_art2.fit(X)

NearestNeighbors(algorithm='brute', leaf_size=30,
         metric=<function cosine at 0x7f270d5d0f28>, metric_params=None,
         n_jobs=1, n_neighbors=250, p=2, radius=1.0)

In [64]:
# distances_art2

In [269]:
# Peek at the first rows of the movies table.
movies.head()

Unnamed: 0,MovieID,Title,MoviePopularity
0,1,Toy Story (1995),53059
1,2,Jumanji (1995),22466
2,3,Grumpier Old Men (1995),15111
3,4,Waiting to Exhale (1995),2898
4,5,Father of the Bride Part II (1995),14323


In [90]:
def neighbours_index(i):
    """Return the row indices of the fitted neighbours of row ``i`` of ``X``.

    Queries ``nbrs_art2`` with the single row ``X[i]`` (reshaped to 2-D, as
    ``kneighbors`` expects) and returns the index array for that one query;
    the distances are discarded.
    """
    query = X[i, :].reshape(1, -1)
    result = nbrs_art2.kneighbors(query)
    neighbour_rows = result[1]
    return neighbour_rows[0]

def N(i):
    """Neighbourhood of item ``i``: short alias for ``neighbours_index(i)``."""
    neighbourhood = neighbours_index(i)
    return neighbourhood

def Np(i, t, Ni):
    """Members ``j`` of ``Ni`` with ``rel(t, j) > rel(t, i) + 0.25``.

    These are the neighbours whose relevance for tag ``t`` is more than
    ``delta`` above that of item ``i``. Order and duplicates of ``Ni`` are
    preserved.
    """
    delta = 0.25
    threshold = rel(t, i) + delta
    higher = []
    for j in Ni:
        if rel(t, j) > threshold:
            higher.append(j)
    return higher

def Nn(i, t, Ni):
    """Members ``j`` of ``Ni`` with ``rel(t, j) < rel(t, i) - 0.25``.

    These are the neighbours whose relevance for tag ``t`` is more than
    ``delta`` below that of item ``i``. Order and duplicates of ``Ni`` are
    preserved.
    """
    delta = 0.25
    threshold = rel(t, i) - delta
    lower = []
    for j in Ni:
        if rel(t, j) < threshold:
            lower.append(j)
    return lower

def Nz(i, t, Ni):
    """Members of ``Ni`` that fall in neither ``Np(i, t, Ni)`` nor ``Nn(i, t, Ni)``.

    ``Np`` and ``Nn`` are each evaluated once and merged into a set, so the
    filter is linear in ``len(Ni)``. Rebuilding both lists for every element
    of ``Ni`` made the original quadratic. Order and duplicates of ``Ni`` are
    preserved.
    """
    moved = set(Np(i, t, Ni))
    moved.update(Nn(i, t, Ni))
    return [j for j in Ni if j not in moved]

In [89]:
def critiqueEntropy(t, i):
    """Sum the ``critiqueEntropyH`` terms of the three-way split of ``N(i)`` by tag ``t``.

    The neighbourhood of ``i`` is fetched once and split into ``Nn``
    (lower relevance), ``Nz`` (neither) and ``Np`` (higher relevance); one
    entropy term is added per part, in that order.
    """
    Ni = N(i)
    parts = (Nn(i, t, Ni), Nz(i, t, Ni), Np(i, t, Ni))
    return sum(critiqueEntropyH(t, i, part, Ni) for part in parts)

In [91]:
def critiqueEntropyH(t, i, Nd, Ni):
    """Return ``-p * log(p)`` with ``p = (len(Nd) + 1) / len(Ni)``.

    The +1 in the numerator keeps ``p`` above zero, so the logarithm is
    defined even when ``Nd`` is empty. ``t`` and ``i`` are accepted but not
    used in the computation.
    """
    proportion = (len(Nd) + 1) / len(Ni)
    return -proportion * np.log(proportion)

In [101]:
# Spot check: critique entropy of tag column 12 for movie row 123.
t = 12
i = 123
critiqueEntropy(t, i)
# b = N(i)
# rel(t, 0)
# for ii in range(i):
#     print(rel(t, ii))
# N(i)
# Np(i, t)
# Nn(i, t)
# Nz(i, t)

0.12746999025767508

In [63]:
# Relevance of tag column 12 across all genome rows.
genome[: , 12]

array([ 0.191,  0.238,  0.131, ...,  0.092,  0.039,  0.104])

In [31]:
# Look up the title stored under index label 0 of the movies table.
mId = 0
movies.loc[mId]['Title']

'Toy Story (1995)'

In [292]:
# Peek at the first rows of the tags table.
tags.head()

Unnamed: 0,TagID,Tag,TagPopularity
0,0,007,61
1,1,007 (series),24
2,2,18th century,37
3,3,1920s,42
4,4,1930s,55


In [32]:
def tagSim(tA, tB):
    """Cosine distance between the genome columns of tags ``tA`` and ``tB``.

    Despite the name, the value comes straight from
    ``scipy.spatial.distance.cosine``, i.e. it is 1 - cosine similarity:
    0 means the two relevance columns point the same way, and larger values
    mean less similar tags.
    """
    return dist.cosine(genome[:, tA], genome[:, tB])

In [126]:
# Objective evaluated over a candidate tag set S for item i.
# NOTE(review): the original comment said this is to be *minimized* over S; a
# sum of entropy * log-popularity terms reads like a quantity to maximize --
# confirm against the article.
def objective_function(S, i):
    """Score the tag set ``S`` (expected to hold 5 tag ids) for item ``i``.

    A tag ``t`` in ``S`` contributes ``critiqueEntropy(t, i) * log(popularity(t))``
    only if all three conditions hold:

    1. ``popularity(t) >= 50``;
    2. its cosine *similarity* to every other tag in ``S`` is below 0.5.
       ``tagSim`` returns scipy's cosine distance, so it is converted with
       ``1 - tagSim(t, u)``. The old test ``tagSim(t, u) < 0.5`` compared the
       distance and therefore kept only tags that were similar to the others;
    3. ``critiqueEntropy(t, i) > 0.325``. The old code called
       ``critiqueEntropy(t)`` without ``i``, which raises TypeError whenever
       conditions 1 and 2 both pass.

    The entropy is computed once per tag and reused for both the filter and
    the score, because each call runs a nearest-neighbour query.

    Returns the sum of the contributions (0.0 when no tag qualifies).
    """
    min_popularity = 50
    max_similarity = 0.5
    min_entropy = 0.325

    def is_distinct(t):
        # True when t is dissimilar enough to every *other* tag in S.
        return all(1.0 - tagSim(t, u) < max_similarity for u in S if t != u)

    contributions = []
    for t in S:
        # Cheap filters first; the entropy needs a neighbour search.
        if popularity(t) < min_popularity or not is_distinct(t):
            continue
        entropy = critiqueEntropy(t, i)
        if entropy > min_entropy:
            contributions.append(entropy * np.log(popularity(t)))
    return np.sum(contributions)

In [127]:
# Evaluate the objective for a hand-picked set of five tag ids and movie row 12.
objective_function([1, 123, 12, 14, 50], 12)

0.0