In [1]:
from os.path import join as pathJoin
import pandas as pd
import numpy as np
import pickle
import scipy.spatial.distance as dist
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
import itertools as itt

In [4]:
# Root directory of the tag-genome data set.
data_root = "../tag-genome"

# Each variable name spells out the columns its file is loaded with.
MovieID_TagID_Relevance, MovieID_Title_MoviePopularity, TagID_Tag_TagPopularity = (
    pathJoin(data_root, file_name)
    for file_name in ("tag_relevance.dat", "movies.dat", "tags.dat")
)

In [38]:
def _read_tsv(path, columns):
    """Load a header-less, tab-separated file and label its columns."""
    return pd.read_csv(path, delimiter='\t', header=None, names=columns)


# One frame per data file; the column names mirror the path variable names.
tag_relevance = _read_tsv(MovieID_TagID_Relevance, ['MovieID', 'TagID', 'Relevance'])
movies = _read_tsv(MovieID_Title_MoviePopularity, ['MovieID', 'Title', 'MoviePopularity'])
tags = _read_tsv(TagID_Tag_TagPopularity, ['TagID', 'Tag', 'TagPopularity'])

In [39]:
# The pickled relevance matrix lives under the same data root as the .dat
# files, so derive its path from data_root instead of repeating the literal.
genome_pickle_path = pathJoin(data_root, "pickled", "genome.pickle")

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by this project, never files from untrusted sources.
with open(genome_pickle_path, 'rb') as f:
    genome = pickle.load(f)

In [49]:
# Turn the unpickled relevance data into an ndarray (later cells index it
# as genome[row, column]).
genome = np.array(genome)
# Show the titles in rows 1 through 5; .loc label slices include both ends.
movies["Title"].loc[1:5].tolist()

['Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Waiting to Exhale (1995)',
 'Father of the Bride Part II (1995)',
 'Heat (1995)']

In [13]:
def popularity(tagId):
    """Return the popularity stored for ``tagId`` in ``tags``, plus one.

    The value is read from the ``TagPopularity`` column of the module-level
    ``tags`` frame. The +1 offset keeps the result at least 1 for a zero
    count — presumably so that its logarithm is defined; confirm.
    """
    recorded = tags["TagPopularity"][tagId]
    return recorded + 1

In [14]:
def docFreq(tag, threshold=0.5):
    """Count the movies for which ``tag`` is relevant, plus two.

    Parameters
    ----------
    tag : int
        Column index of the tag in the module-level ``genome`` matrix
        (rows are movies, columns are tags).
    threshold : float, optional
        A movie is counted when its relevance for ``tag`` is strictly
        greater than this value. Defaults to 0.5.

    Returns
    -------
    int
        Number of entries of ``genome[:, tag]`` above ``threshold``, plus 2.
        The offset keeps the result >= 2, so its logarithm is strictly
        positive when it is used as a divisor.
    """
    # Vectorized comparison instead of a Python-level loop over the column.
    relevant = np.count_nonzero(genome[:, tag] > threshold)
    return int(relevant) + 2

In [15]:
def _tag_weight(tagId):
    """Weight of one tag: log of its popularity over log of its document frequency."""
    return np.log(popularity(tagId)) / np.log(docFreq(tagId))


# One weight per row of the tags table, in row order.
tag_weights = np.array([_tag_weight(tagId) for tagId in range(len(tags))])

In [20]:
# Inspect the five largest tag weights. Any inf showing up here needs to be
# removed before the weights are used.
sorted(tag_weights, reverse=True)[:5]

[6.8201789624151887,
 6.1699250014423122,
 4.5235619560570131,
 3.236217269879349,
 2.2435894449529292]

In [13]:
# Finite stand-in used for weights that are inf or NaN.
NON_FINITE_WEIGHT_CAP = 9

# Index of the largest weight, kept for inspection.
infID = np.argmax(tag_weights)

# Replace only non-finite weights. Finite weights, including the largest one,
# are left untouched, so the cell is safe to run when no inf is present.
tag_weights[~np.isfinite(tag_weights)] = NON_FINITE_WEIGHT_CAP

In [17]:
def weightedCosSimi(u, v):
    """Cosine distance between ``u`` and ``v`` after scaling both by ``tag_weights``.

    Despite the name, the value returned is scipy's cosine *distance*
    (1 - cosine similarity). This is not exactly the formula from the article.
    """
    weighted_u = u * tag_weights
    weighted_v = v * tag_weights
    return dist.cosine(weighted_u, weighted_v)

In [21]:
def articleCosSimi(u, v):
    """Weighted cosine similarity, following the formula of the article.

    Computes ``sum(w*u*v) / (sqrt(sum(w*u*u)) * sqrt(sum(w*v*v)))`` with
    ``w`` being the module-level ``tag_weights``.

    Parameters
    ----------
    u, v : array-like
        Vectors with one entry per tag (same length as ``tag_weights``).

    Returns
    -------
    float
        A similarity (larger means more alike), unlike ``weightedCosSimi``,
        which returns a distance. The result is nan/inf when either weighted
        norm is zero.
    """
    w = tag_weights
    numerator = np.sum(w * u * v)
    # np.sum throughout: the builtin sum would iterate the arrays element by
    # element in Python and was inconsistent with the numerator.
    denominator = np.sqrt(np.sum(w * u * u)) * np.sqrt(np.sum(w * v * v))
    return numerator / denominator

In [22]:
def rel(tagId, movId):
    """Look up the relevance of tag ``tagId`` for movie ``movId`` in ``genome``.

    Note that the argument order (tag first) is the reverse of the matrix
    layout, where rows are movies and columns are tags.
    """
    row, column = movId, tagId
    return genome[row, column]

In [23]:
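# Superseded draft, kept for reference only: critiqueDist is defined as live
# code in a later cell of this notebook.
# def critiqueDist(critiquedMovieId, retrievedMovieId, tagId, direction):
#     ic, ir, t, d  = critiquedMovieId, retrievedMovieId, tagId, direction
#     return max(0, rel(t, ir) - rel(t, ic) * d)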

In [24]:
def printNeigh(randMovNum, indices):
    """Print the title of every neighbour listed for one movie.

    ``indices[randMovNum]`` supplies the neighbour row labels; each one is
    looked up in the module-level ``movies`` frame and its ``Title`` printed.
    """
    for neighbour_id in indices[randMovNum]:
        title = movies.loc[neighbour_id, "Title"]
        print(title)

In [37]:
# Fit a brute-force 250-nearest-neighbour index on a copy of the relevance
# matrix, using the tag-weighted cosine distance as the metric.
X = genome.copy()
nbrs_art2 = NearestNeighbors(
    n_neighbors=250,
    metric=weightedCosSimi,
    algorithm="brute",
)
nbrs_art2.fit(X)

# Query the index with every row of the same matrix.
distances_art2, indices_art2 = nbrs_art2.kneighbors(X)

NearestNeighbors(algorithm='brute', leaf_size=30,
         metric=<function articleCosSimi at 0x7f774cd98400>,
         metric_params=None, n_jobs=1, n_neighbors=250, p=2, radius=1.0)

In [None]:
# Persist the fitted neighbour index.
# NOTE(review): the file name says "metr=article", but nbrs_art2 is built with
# weightedCosSimi — confirm the name is intended.
with open('pickled/KNN-neib=250-metr=article.pickle', 'wb') as knn_file:
    pickle.dump(nbrs_art2, knn_file)

In [269]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,MovieID,Title,MoviePopularity
0,1,Toy Story (1995),53059
1,2,Jumanji (1995),22466
2,3,Grumpier Old Men (1995),15111
3,4,Waiting to Exhale (1995),2898
4,5,Father of the Bride Part II (1995),14323


In [283]:
# Minimum relevance gap for a neighbour to count as having clearly more or
# clearly less of a tag than the reference movie.
CRITIQUE_DELTA = 0.23


def N(i):
    """Return the precomputed nearest neighbours of movie ``i``.

    Reads row ``i`` of the module-level ``indices_art2`` array.
    """
    # Index by the argument; reading a notebook-global row number here made
    # the function ignore ``i`` (and fail when that global was undefined).
    return indices_art2[i]


def Np(i, t, delta=CRITIQUE_DELTA):
    """Neighbours of ``i`` whose relevance for tag ``t`` exceeds that of ``i`` by more than ``delta``."""
    upper = rel(t, i) + delta
    return [j for j in N(i) if rel(t, j) > upper]


def Nn(i, t, delta=CRITIQUE_DELTA):
    """Neighbours of ``i`` whose relevance for tag ``t`` is below that of ``i`` by more than ``delta``."""
    lower = rel(t, i) - delta
    return [j for j in N(i) if rel(t, j) < lower]


def Nz(i, t, delta=CRITIQUE_DELTA):
    """Neighbours of ``i`` that fall in neither ``Np`` nor ``Nn`` for tag ``t``.

    The result keeps the order of ``N(i)``.
    """
    # Build both partitions once instead of once per neighbour.
    moved = set(Np(i, t, delta)) | set(Nn(i, t, delta))
    return [j for j in N(i) if j not in moved]

In [None]:
def critiqueEntropy(t, i):
    """Entropy of the split of ``N(i)`` into less / similar / more of tag ``t``.

    The neighbourhood ``N(i)`` is partitioned by ``Nn``, ``Nz`` and ``Np``;
    with ``p_d = |N_d| / |N|`` the result is ``-sum(p_d * log(p_d))``.

    Parameters
    ----------
    t : int
        Tag index.
    i : int
        Movie index. Note the order: tag first, movie second.

    Returns
    -------
    float
        The entropy in nats; 0.0 when the neighbourhood is empty.
    """
    neighbourhood_size = len(N(i))
    if neighbourhood_size == 0:
        return 0.0

    # Only the bucket sizes matter, so collect lengths rather than trying to
    # stack three lists of different length into one array.
    bucket_sizes = np.array(
        [len(Nn(i, t)), len(Nz(i, t)), len(Np(i, t))], dtype=float
    )
    # Empty buckets contribute nothing (0 * log 0 is taken as 0); dropping
    # them avoids log(0).
    fractions = bucket_sizes[bucket_sizes > 0] / neighbourhood_size
    return float(-np.sum(fractions * np.log(fractions)))

In [286]:
def idToNum(num):
    """Placeholder that is not implemented yet; it always returns None.

    TODO: presumably meant to map a MovieID to its row number — confirm the
    intent and implement.
    """
    pass

In [289]:
# Look up the title stored under row label 0 of the movies table.
mId = 0
movies.loc[mId, 'Title']

'Toy Story (1995)'

In [292]:
# Preview the first rows of the tags table.
tags.head()

Unnamed: 0,TagID,Tag,TagPopularity
0,0,007,61
1,1,007 (series),24
2,2,18th century,37
3,3,1920s,42
4,4,1930s,55


In [None]:
def tagSim(tA, tB):
    """Cosine similarity between two tags' relevance profiles over all movies.

    Parameters
    ----------
    tA, tB : int
        Column indices of the two tags in the module-level ``genome`` matrix.

    Returns
    -------
    float
        ``1 - cosine distance`` of the two relevance columns, so identical
        profiles give 1 and orthogonal ones give 0.
    """
    relevanceA = genome[:, tA]
    relevanceB = genome[:, tB]
    # scipy's cosine() is a distance (1 - similarity); convert it so that the
    # value matches the function's name and its use as a similarity.
    return 1.0 - dist.cosine(relevanceA, relevanceB)

In [None]:
# Objective used to choose the tag set S shown for movie i.
# NOTE(review): the original comment said this is minimised over S; the
# article's tag-selection objective is presumably maximised — confirm.
def objective_function(S, i, min_popularity=50, max_tag_sim=0.5, min_entropy=0.325):
    """Score a candidate tag set ``S`` for movie ``i``.

    A tag ``t`` in ``S`` contributes ``critiqueEntropy(t, i) * log(popularity(t))``
    when it passes all three filters:

    * ``popularity(t) >= min_popularity``
    * ``tagSim(t, u) < max_tag_sim`` for every other tag ``u`` in ``S``
    * ``critiqueEntropy(t, i) > min_entropy``

    Parameters
    ----------
    S : iterable of int
        Candidate tag indices. Expected to hold 5 tags (|S| == 5); this is
        not enforced.
    i : int
        Movie index.
    min_popularity, max_tag_sim, min_entropy : float, optional
        Filter thresholds; the defaults are the values previously hard-coded.

    Returns
    -------
    float
        Sum of the contributions of the tags that pass the filters;
        0.0 when none do.
    """
    S = list(S)  # iterated more than once below
    total = 0.0
    for t in S:
        popular = popularity(t) >= min_popularity
        distinct = all(tagSim(t, u) < max_tag_sim for u in S if t != u)
        if not (popular and distinct):
            continue
        # Compute the entropy once; it is both a filter and part of the score.
        entropy = critiqueEntropy(t, i)
        if entropy > min_entropy:
            total += entropy * np.log(popularity(t))
    return total

In [11]:
def critiqueDist(critiquedMovieId, retrievedMovieId, tagId, direction):
    """How far the retrieved movie moves along a tag in the critiqued direction.

    Computes ``max(0, (rel(t, ir) - rel(t, ic)) * d)``.

    Parameters
    ----------
    critiquedMovieId : int
        Movie the critique was applied to (``ic``).
    retrievedMovieId : int
        Candidate movie being evaluated (``ir``).
    tagId : int
        Tag being critiqued (``t``).
    direction : int
        Sign of the critique (``d``); presumably +1 for "more" of the tag and
        -1 for "less" — confirm against the callers.

    Returns
    -------
    number
        The relevance change in the requested direction, or 0 when the
        retrieved movie moves the other way.
    """
    ic, ir, t, d = critiquedMovieId, retrievedMovieId, tagId, direction
    # The direction must scale the whole difference. Without the parentheses
    # only rel(t, ic) was multiplied, so d = -1 yielded the sum of the two
    # relevances instead of the reversed difference.
    return max(0, (rel(t, ir) - rel(t, ic)) * d)

In [13]:
# for uniform notation
def linearSat(ic, ir, t, d):
    """Linear critique satisfaction: the critique distance itself.

    Takes the same arguments as ``critiqueDist`` (critiqued movie, retrieved
    movie, tag, direction) and returns its value unchanged.
    """
    # The result has to be returned; without it the function yielded None.
    return critiqueDist(ic, ir, t, d)

In [14]:
def diminishSat(ic, ir, t, d):
    """Critique satisfaction with diminishing returns.

    Returns ``1 - exp(-5 * critiqueDist(ic, ir, t, d))``: 0 for a critique
    distance of 0, approaching 1 as the distance grows.
    """
    # The result has to be returned; without it the function yielded None.
    return 1 - np.exp(-5 * critiqueDist(ic, ir, t, d))

In [5]:
import os.path, sys

# NOTE(review): absolute, machine-specific path — consider making it configurable.
MOVIE_SEARCHER_ROOT = "/home/esengie/Programming/au/IR/movie_searcher/"
# Append only once so that re-running the cell does not keep growing sys.path.
if MOVIE_SEARCHER_ROOT not in sys.path:
    sys.path.append(MOVIE_SEARCHER_ROOT)

# NOTE(review): the star import hides which names come from internet.movies;
# only Movies is used in this cell.
from internet.movies import *

# Resolve a title to its IMDb id through the project's Movies helper.
mov = Movies()
mov.imdb_id_from_title("Casino")

'tt0112641'