In [1]:
#!pip install --ignore-installed --upgrade pandas

In [91]:
import pandas as pd
import numpy as np
import gensim
from sklearn.externals import joblib
from gensim import corpora, utils, similarities
from gensim.models.wrappers.dtmmodel import DtmModel
from collections import defaultdict, Counter
import sklearn.metrics
from scipy.spatial.distance import pdist, squareform
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.ldamulticore import LdaMulticore
import nltk
import pyLDAvis
import pyLDAvis.gensim
#nltk.download()

In [29]:
# Load the joblib-serialised n-gram table and renumber its rows from 0 so that
# row labels coincide with row positions.
data = joblib.load('data/ngrams').reset_index(drop=True)

In [30]:
# remove n-grams that only occur once
def _drop_singletons(ngram_counts):
    """Return a new dict holding only the entries of ``ngram_counts`` with count > 1.

    The count is passed through ``int()`` for the comparison only; the stored
    value is copied over unchanged.
    """
    return {ngram: count for ngram, count in ngram_counts.items() if int(count) > 1}


# One Series.map per column replaces six copy-pasted row-wise DataFrame.apply
# calls, each of which materialised a full row object just to read one field.
for _column in ('reviews_mono', 'contents_mono',
                'reviews_bi', 'contents_bi',
                'reviews_tri', 'contents_tri'):
    data[_column] = data[_column].map(_drop_singletons)

In [31]:
# join bi-grams and tri-grams with a space
def _join_ngram_keys(ngram_counts):
    """Return ``ngram_counts`` with every token-sequence key joined by single spaces.

    Keys that are already strings are kept as they are. Without this guard,
    running the cell a second time would join the characters of each key
    (``' '.join('good job')`` gives ``'g o o d   j o b'``) and silently corrupt
    the vocabulary.
    """
    return {
        (ngram if isinstance(ngram, str) else ' '.join(ngram)): count
        for ngram, count in ngram_counts.items()
    }


for _column in ('reviews_bi', 'contents_bi', 'reviews_tri', 'contents_tri'):
    data[_column] = data[_column].map(_join_ngram_keys)

In [32]:
# combine n-grams
def _merge_ngram_counts(row):
    """Add up the six per-source n-gram count mappings of one row as Counters."""
    sources = ('reviews_mono', 'contents_mono', 'reviews_bi',
               'contents_bi', 'reviews_tri', 'contents_tri')
    merged = Counter(row[sources[0]])
    # Same left-to-right Counter addition as writing the six terms out by hand.
    for source in sources[1:]:
        merged = merged + Counter(row[source])
    return merged


data['ngrams'] = data.apply(_merge_ngram_counts, axis=1)


In [33]:
# Preview the first rows, including the combined 'ngrams' column.
data.head()

Unnamed: 0,title,year,reviews_mono,contents_mono,reviews_bi,contents_bi,reviews_tri,contents_tri,ngrams
0,Dead Awake,2016,"{'actual': 4, 'sleep': 12, 'paralys': 3, 'prob...","{'dead': 2, 'awak': 2, 'supernatur': 2, 'jocel...","{'actual like': 2, 'good job': 2, 'good horror...",{},{'nightmar elm street': 4},{},"{'actual': 4, 'sleep': 12, 'paralys': 3, 'prob..."
1,A Good American,2015,"{'situat': 2, 'read': 2, 'documentari': 2, 'sn...","{'documentari': 3, 'work': 2, 'whistleblow': 3...",{},"{'good american': 2, 'attack film': 2, 'said f...",{},{},"{'situat': 2, 'read': 2, 'documentari': 5, 'sn..."
2,Hard Tide,2015,"{'watch': 4, 'decid': 2, 'girl': 4, 'start': 2...","{'hard': 3, 'tide': 3, '2015': 2, 'nathanael':...","{'accident kill': 2, 'thing life': 2, 'old gir...","{'drug deal': 2, 'girl play': 2, 'take care': ...",{'year old girl': 2},{},"{'watch': 4, 'decid': 2, 'girl': 4, 'start': 2..."
3,Carrie Pilby,2016,"{'geniu': 2, 'colleg': 2, 'adulthood': 3, 'the...","{'carri': 5, 'pilbi': 6, 'susan': 3, 'johnson'...","{'see film': 2, 'base book': 2, 'love movi': 2...","{'princip photographi': 2, 'releas march': 2, ...","{'live new york': 2, 'intern film festiv': 2, ...","{'intern film festiv': 2, 'toronto intern film...","{'geniu': 2, 'colleg': 2, 'adulthood': 3, 'the..."
4,A Dark Song,2016,"{'writer': 2, 'felt': 2, 'review': 7, 'mere': ...","{'dark': 6, 'song': 6, '2016': 5, 'irish': 3, ...","{'watch film': 5, 'fast forward': 2, 'thing st...","{'rotten tomato': 2, 'film festiv': 2}","{'best ive seen': 2, 'dont wast time': 2, 'two...",{},"{'writer': 2, 'felt': 2, 'review': 9, 'mere': ..."


### LDA

In [8]:
# Index the unique terms: one list of n-gram strings per row feeds the gensim
# Dictionary, which is then saved to 'idtowords.dict'.
keys = [list(ngram_counts) for ngram_counts in data['ngrams']]
dictionary = corpora.Dictionary(keys)
dictionary.save('idtowords.dict')

In [9]:
def get_key(v):
    """Return the dictionary id of term ``v``, or ``None`` if the term is unknown.

    Uses the dictionary's ``token2id`` mapping for a constant-time lookup
    instead of scanning every (id, term) pair for each call.
    """
    return dictionary.token2id.get(v)
# vectorize function
get_key_ = np.vectorize(get_key)

In [None]:
def _to_bow(ngram_counts):
    """Pair each n-gram's dictionary id with its count, in the mapping's own order."""
    term_ids = get_key_(list(ngram_counts.keys()))
    return list(zip(term_ids, ngram_counts.values()))


# map n-grams with keys
data['tokens'] = data.apply(lambda row: _to_bow(row['ngrams']), axis=1)
## Create a sparsely formatted corpus:
corpus = data['tokens'].tolist()
# store to disk
corpora.MmCorpus.serialize('corpus.mm', corpus)

In [None]:
# Specify a number of topics:
K = 50
# Seed for gensim's random number generator, so repeated fits start from the
# same state instead of producing different topics on every run.
LDA_SEED = 42
# Fit the LDA model with K topics and 20 passes. This is slow: earlier timing
# notes report about 1/2 hour for 100 topics / 10 passes and about 5 min for
# 3 topics / 3 passes.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=K, id2word=dictionary, passes=20,
                                           alpha=.1/K, eta=.1/K, random_state=LDA_SEED)

# Save the model object for visualization:
ldamodel.save('wiki.model')

In [16]:
# Reload the serialised corpus and the saved model from disk.
corpus = corpora.MmCorpus('corpus.mm')

lda_mult = gensim.models.ldamodel.LdaModel.load('wiki.model')
# Later cells refer to the model as `ldamodel` and to the topic count as `K`,
# names otherwise bound only by the fitting cell. Bind them from the loaded
# model so those cells also run on a fresh kernel where the fit was skipped,
# and so K always matches the model actually in use.
ldamodel = lda_mult
K = lda_mult.num_topics

In [25]:
# Print the corpus summary line (documents, features, non-zero entries).
print(corpus)

MmCorpus(1549 documents, 86042 features, 1973414 non-zero entries)


In [17]:
# Collect phi matrix of topic word proportions, theta matrix of document topic proportions, f_v of word frequencies,
# p_v of term probabilities, V_n of unique terms in each document, W_n of document lengths, f_k of number of words
# used in each topic, p_k probabilities of each topic occurring, bayes probabilities of a topic given a word,
# and a vocab list of terms.
phi_kv = np.zeros((K, len(dictionary)))
theta_nk = np.zeros((len(corpus), K))
for k in range(K):
    # get_topic_terms returns (term_id, probability) pairs ordered by
    # descending probability, not by term id. Each probability therefore has to
    # be written into the column of its own term id; assigning the list
    # positionally would misalign phi_kv with the vocabulary order.
    topic_terms = lda_mult.get_topic_terms(k, len(dictionary))
    term_ids = [term_id for term_id, _ in topic_terms]
    phi_kv[k, term_ids] = [prob for _, prob in topic_terms]

In [18]:
# NOTE(review): `ls` is not read anywhere in the visible notebook; kept in case
# another cell relies on it.
ls = np.zeros([len(corpus),1])
# Fill theta_nk row by row with each document's topic proportions. The loaded
# model `lda_mult` is used here as well, so phi and theta come from the same
# model object and the cell does not depend on the fitting cell having run.
for n, doc in enumerate(corpus):
    for topic_id, prob in lda_mult.get_document_topics(doc, minimum_probability=0):
        theta_nk[n, topic_id] = prob

In [19]:
# Corpus-wide n-gram counts. Counter.update accumulates in place, whereas
# `Counter(...) + counts` rebuilt the whole running total once per row.
counts = Counter()
for ngram_counts in data['ngrams']:
    counts.update(ngram_counts)

# Terms in dictionary-id order; every per-term array below shares this order.
vocab = [dictionary[i] for i in range(len(dictionary))]
# f_v: total count of each term; p_v: its share of all counted terms.
f_v = np.array([counts[term] for term in vocab])
p_v = f_v / f_v.sum()
# V_n: number of distinct terms per document; W_n: summed term counts per document.
V_n = np.array([len(doc) for doc in corpus])
W_n = np.array([sum(count for _, count in doc) for doc in corpus])
# f_k: document lengths weighted by each topic's proportion; p_k: normalised f_k.
f_k = [np.dot(theta_nk[:, k], W_n) for k in range(K)]
p_k = np.array(f_k) / np.sum(f_k)
# bayes_kv[k, v] = phi_kv[k, v] * p_k[k] / p_v[v], computed for all k at once.
bayes_kv = phi_kv * p_k[:, np.newaxis] / p_v

In [27]:
# Keyword arguments for pyLDAvis.prepare, which receives them as **data2.
data2 = dict(
    topic_term_dists=phi_kv,
    doc_topic_dists=theta_nk,
    doc_lengths=W_n,
    vocab=vocab,
    term_frequency=f_v,
)

In [28]:
# Build the pyLDAvis data structure from the matrices collected in data2.
movies_vis_data = pyLDAvis.prepare(**data2)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [22]:
# Show the prepared topic visualisation as this cell's output.
pyLDAvis.display(movies_vis_data)


In [143]:
# Show top words from each topic: 10 (word, weight) pairs per topic, returned
# unformatted; num_topics=-1 requests all topics rather than a subset.
ldamodel.show_topics(num_topics=-1, num_words=10, log=False, formatted=False)

[(0,
  [('v', 0.007955280346320124),
   ('v vendetta', 0.004144908760715557),
   ('govern', 0.0039497779514525416),
   ('polit', 0.0030598583835260576),
   ('natali portman', 0.00274356719684559),
   ('movi', 0.0026963305290861406),
   ('mask', 0.0026219933346697754),
   ('like', 0.002600213288089462),
   ('film', 0.002598547801420312),
   ('evey', 0.00233786014175698)]),
 (1,
  [('christin', 0.0054148832441379855),
   ('car', 0.005099247638184713),
   ('bee', 0.004374876310619607),
   ('hollywood', 0.003769244952787116),
   ('norma', 0.003685903875891477),
   ('film', 0.0035396839571629514),
   ('play', 0.0034532419719825224),
   ('like', 0.003006817590549787),
   ('joe', 0.0027863310378728715),
   ('charact', 0.002626056105831778)]),
 (2,
  [('titan', 0.005680310211267982),
   ('film', 0.0028832828125581397),
   ('ship', 0.002769335523053863),
   ('movi', 0.0026100650367330194),
   ('like', 0.002396302580978527),
   ('rose', 0.0022903707071592443),
   ('stori', 0.002272656109318733),

In [202]:
# Specify number of top words:
#num_top_words = 10

# Show top words from each topic:
#for k in range(K):
#    print("topic " + str(k) + ":")
#    topic_top_words = ldamodel.show_topic(k)
#    for top_word in topic_top_words:
#        print((top_word[0],format(top_word[1],".4f")))
#    print("\n")

In [218]:
# Obtain topic distribution for each movie review and every movie content:
#topic_probs = []
#for document in corpus:
#    topic_probs.append(np.array([topic_probs_double[1] for topic_probs_double in ldamodel.get_document_topics(document, minimum_probability=0)]))
#topic_probs_array = np.asarray(topic_probs)

In [23]:
# One {topic id: probability} dict per document in the corpus. Every topic id
# starts at 0 and is then overwritten by whatever the model reports.
movie_topics = []
for document in corpus:
    topic_probs = dict.fromkeys(range(K), 0)
    topic_probs.update(ldamodel.get_document_topics(document, minimum_probability=0))
    movie_topics.append(topic_probs)

### Application

In [252]:
# Look up the row(s) whose title contains 'Titanic'; the row label is the
# index used as the query below.
data.loc[data['title'].str.contains('Titanic')]

Unnamed: 0,title,year,reviews_mono,contents_mono,reviews_bi,contents_bi,reviews_tri,contents_tri,ngrams
157,Titanic,1997,"{'convers': 38, 'turn': 145, 'ill': 83, 'menti...","{'titan': 95, '1997': 13, 'american': 3, 'epic...","{'die hard': 5, 'first world': 2, 'russel crow...","{'second film': 4, 'film depict': 6, 'show fil...","{'lot peopl say': 3, 'didnt know expect': 2, '...","{'special effect film': 2, 'box offic mojo': 3...","{'convers': 45, 'turn': 148, 'ill': 83, 'menti..."


In [253]:
# convert the query to model space
# `query` is used as a document position in `corpus`; the commented-out values
# below are other queries that were tried.
# Moana: animation, adventure
#query = 168
#query = 243
query = 157
# Bag-of-words vector of the query document.
vec_bow = corpus[query]
# The same document transformed by the LDA model.
vec_lda = ldamodel[vec_bow]

In [237]:
# build the index
# num_features is given explicitly. Without it MatrixSimilarity first scans the
# whole transformed corpus only to find the largest topic id, which means an
# extra round of LDA inference over every document, and it can undercount the
# dimensions if the highest topic id never appears in any document vector.
index = similarities.MatrixSimilarity(ldamodel[corpus], num_features=ldamodel.num_topics)
# Save the index, then load it back from the same file.
index.save('similarity.index')
index = similarities.MatrixSimilarity.load('similarity.index')

In [254]:
# Score the query against every indexed document and order the resulting
# (document position, similarity) pairs from most to least similar.
sims = sorted(enumerate(index[vec_lda]), key=lambda pair: -pair[1])

In [257]:
# Ten best matches as (document position, similarity) pairs.
sims[0:10]

[(157, 1.0),
 (720, 1.0),
 (1448, 0.99970907),
 (1449, 0.99965554),
 (326, 0.99958664),
 (81, 0.99954736),
 (327, 0.9995186),
 (1212, 0.9043089),
 (1547, 0.90419513),
 (1083, 0.86937153)]

In [258]:
# Titles of the 20 best-matching documents, best match first.
[data.loc[doc_id, 'title'] for doc_id, _score in sims[0:20]]

['Titanic',
 'No Country for Old Men',
 'The Double',
 'The Double',
 'The Double',
 'Cloud Atlas',
 'The Double',
 'Seeking a Friend for the End of the World',
 'Chevolution',
 'Valhalla Rising',
 'Outrage',
 'Kurt & Courtney',
 'Aziz Ansari: Buried Alive',
 "Women He's Undressed",
 'We Need to Talk About Kevin',
 'Soaked in Bleach',
 'Amu',
 'Waking Life',
 'Radio Free Albemuth',
 'Whitey: United States of America v. James J. Bulger']