In [13]:
#NLP libraries
import findspark
findspark.init()

import re

import pyspark.sql.functions as psf
from pyspark.sql import *
from pyspark.sql.types import *

import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.session.timeZone', 'UTC')
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [14]:
# load data 
# keeping only the subtiles
df_text = spark.read.parquet('parquets/films2.parquet').select('tconst','subtitles')
df_ratings = spark.read.parquet('parquets/ratings.parquet')

In [15]:
df_text.show()

+---------+--------------------+
|   tconst|           subtitles|
+---------+--------------------+
|tt5275892|[[As, a, kid, gro...|
|tt2318527|[[[, BELL, TOLLIN...|
|tt2234155|[[#, An, old, man...|
|tt2404463|[[Cleaned, correc...|
|tt1398426|[[[, Police, Radi...|
|tt3311384|[[(, CROWD, CHEER...|
|tt2080374|[[MAN, #, 1, :], ...|
|tt4257858|[[We, will, begin...|
|tt4540710|[[Subtitle, made,...|
|tt3152624|[[Girls, ,, your,...|
|tt4425064|[[After, hours, o...|
|tt4987556|[[Define, your, e...|
|tt1229340|[[NARRATOR, :], [...|
|tt1638364|[[SUBTITLES, BY, ...|
|tt1980209|[[I, 'm, strong, ...|
|tt0884732|[[(, DIALING, PHO...|
|tt3628584|[[Yes, ,, sir, ,,...|
|tt4501244|[[(, CELL, PHONE,...|
|tt5213534|[[INTERVIEWER, :]...|
|tt1824254|[[Anti-nuclear, g...|
+---------+--------------------+
only showing top 20 rows



In [16]:
df_ratings.show()

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.8|    1440|
|tt0000002|          6.3|     172|
|tt0000003|          6.6|    1041|
|tt0000004|          6.4|     102|
|tt0000005|          6.2|    1735|
|tt0000006|          5.5|      91|
|tt0000007|          5.5|     579|
|tt0000008|          5.6|    1539|
|tt0000009|          5.6|      74|
|tt0000010|          6.9|    5127|
|tt0000011|          5.4|     214|
|tt0000012|          7.4|    8599|
|tt0000013|          5.7|    1318|
|tt0000014|          7.2|    3739|
|tt0000015|          6.2|     660|
|tt0000016|          5.9|     982|
|tt0000017|          4.8|     197|
|tt0000018|          5.5|     414|
|tt0000019|          6.6|      13|
|tt0000020|          5.1|     232|
+---------+-------------+--------+
only showing top 20 rows



In [17]:
# Joining with ratings to get best and worst movies 
df_tr = df_text.join(other=df_ratings, on='tconst', how='inner')

In [18]:
# check that we have at least 5k votes par movie
df_tr.show()

+---------+--------------------+-------------+--------+
|   tconst|           subtitles|averageRating|numVotes|
+---------+--------------------+-------------+--------+
|tt5275892|[[As, a, kid, gro...|          9.0|   14176|
|tt2318527|[[[, BELL, TOLLIN...|          4.9|    8282|
|tt2234155|[[#, An, old, man...|          6.3|  176282|
|tt2404463|[[Cleaned, correc...|          6.6|  150013|
|tt1398426|[[[, Police, Radi...|          7.9|  160317|
|tt3311384|[[(, CROWD, CHEER...|          8.1|    7005|
|tt2080374|[[MAN, #, 1, :], ...|          7.2|  134255|
|tt4257858|[[We, will, begin...|          8.0|   33233|
|tt4540710|[[Subtitle, made,...|          7.5|   47445|
|tt3152624|[[Girls, ,, your,...|          6.2|  118553|
|tt4425064|[[After, hours, o...|          6.9|    5214|
|tt4987556|[[Define, your, e...|          8.5|   12593|
|tt1229340|[[NARRATOR, :], [...|          6.3|  151320|
|tt1638364|[[SUBTITLES, BY, ...|          7.3|    6718|
|tt1980209|[[I, 'm, strong, ...|          6.4|  

In [19]:
df_best = df_tr.filter(df_tr['averageRating'] >= 8)

In [20]:
df_best.count()

363

In [22]:
df_worst = df_tr.filter(df_tr['averageRating'] <= 5)

In [23]:
df_worst.count()

260

# Detokinizing the text

In [24]:
from sacremoses import MosesDetokenizer
def subtitles_to_string(subtitles):
    result = ""
    for subtitle in subtitles:
        test = MosesDetokenizer().detokenize(subtitle)
        result += test + "\n"
    return result
udf_subtitles_to_string = psf.udf(subtitles_to_string, StringType())

In [25]:
df_best = df_best.withColumn("text",udf_subtitles_to_string("subtitles"))
df_best = df_best.drop('subtiles')
df_best.show()

+---------+--------------------+-------------+--------+--------------------+
|   tconst|           subtitles|averageRating|numVotes|                text|
+---------+--------------------+-------------+--------+--------------------+
|tt5275892|[[As, a, kid, gro...|          9.0|   14176|As a kid growing ...|
|tt3311384|[[(, CROWD, CHEER...|          8.1|    7005|( CROWD CHEERING)...|
|tt4257858|[[We, will, begin...|          8.0|   33233|We will begin the...|
|tt4987556|[[Define, your, e...|          8.5|   12593|Define your enemy...|
|tt3417422|[[Tobacco, use, l...|          8.8|   21918|Tobacco use leads...|
|tt5895028|[[[, Barack, Obam...|          8.2|   18043|[ Barack Obama] S...|
|tt0102138|[[I, deliver, per...|          8.0|  127966|I deliver perfect...|
|tt0106469|[[So, what, 's, t...|          8.0|   26067|So what's the dif...|
|tt0108052|[[[, Rabbi, ], Sa...|          8.9| 1042076|[ Rabbi] Savree m...|
|tt0117293|[[Captions, made,...|          8.3|   13153|Captions made pos...|

In [26]:
df_worst = df_worst.withColumn("text",udf_subtitles_to_string("subtitles"))
df_worst = df_worst.drop('subtiles')
df_best.show()

+---------+--------------------+-------------+--------+--------------------+
|   tconst|           subtitles|averageRating|numVotes|                text|
+---------+--------------------+-------------+--------+--------------------+
|tt5275892|[[As, a, kid, gro...|          9.0|   14176|As a kid growing ...|
|tt3311384|[[(, CROWD, CHEER...|          8.1|    7005|( CROWD CHEERING)...|
|tt4257858|[[We, will, begin...|          8.0|   33233|We will begin the...|
|tt4987556|[[Define, your, e...|          8.5|   12593|Define your enemy...|
|tt3417422|[[Tobacco, use, l...|          8.8|   21918|Tobacco use leads...|
|tt5895028|[[[, Barack, Obam...|          8.2|   18043|[ Barack Obama] S...|
|tt0102138|[[I, deliver, per...|          8.0|  127966|I deliver perfect...|
|tt0106469|[[So, what, 's, t...|          8.0|   26067|So what's the dif...|
|tt0108052|[[[, Rabbi, ], Sa...|          8.9| 1042076|[ Rabbi] Savree m...|
|tt0117293|[[Captions, made,...|          8.3|   13153|Captions made pos...|

# NLP modafucker

In [27]:
nlp = spacy.load('en')
nlp.remove_pipe('parser')
nlp.remove_pipe('tagger')

('tagger', <spacy.pipeline.Tagger at 0x1a1ed16470>)

In [29]:
df_best_pd = df_best.toPandas()

In [None]:
df_genres[]

In [30]:
texts_bests = df_best_pd['text'].values

In [31]:
# remove new lines 
texts_bests = [" ".join(t.split()) for t in texts_bests]

### Chunking

In [32]:
size =  # how many sentences per chunk/page

In [33]:
# transform data into blocks 
def get_chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [34]:
chunks = list()
for text in texts_bests: 
    
    # get list of sentences for considered sub
    sentences = nltk.tokenize.sent_tokenize(text)
    
    # create chunks of subs with "size" sentences in it 
    chunks_of_sents = [x for x in get_chunks(sentences,size)] 
    chs = list()
    
    # regroup so to have a list of chunks which are strings
    for c in chunks_of_sents:
        grouped_chunk = list()
        for s in c:
            grouped_chunk.extend(' ')
            grouped_chunk.extend(s)
        chs.append("".join(grouped_chunk))
    #print("Number of chunks:",len(chs),'\n')
    
    # append all chunks to the list 
    chunks.extend(chs[:])

### NLP pipe

In [35]:
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

processed_docs = list()
for doc in nlp.pipe(chunks, n_threads=5, batch_size=10):

    # Process document using Spacy NLP pipeline.
    ents = doc.ents  # Named entities

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list and keep only words of length 3 or more.
    doc = [token for token in doc if token not in STOPWORDS and len(token) > 2]

    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])

    processed_docs.append(doc)
    
docs = processed_docs ############### variable pour la suite

del processed_docs

In [36]:
# Add bigrams too
from gensim.models.phrases import Phrases

# Add bigrams to docs (only ones that appear 15 times or more).
bigram = Phrases(docs, min_count=4)

for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

























































In [37]:
# Create a dictionary representation of the documents, and filter out frequent and rare words.
from gensim.corpora import Dictionary
dictionary = Dictionary(docs)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 5
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
#MmCorpus.serialize("models/corpus.mm", corpus)

print('Number of unique tokens: %d' % len(dictionary))
print('Number of chunks: %d' % len(corpus))

Number of unique tokens: 19764
Number of chunks: 24065


In [41]:
seed = 42
# models
from gensim.models import LdaMulticore
params = {'passes': 10, 'random_state': seed}
base_models = dict()

model = LdaMulticore(corpus=corpus,
                     num_topics=3,
                     id2word=dictionary,
                     workers=6,
                     passes=params['passes'],
                     random_state=params['random_state'])

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [42]:
model.show_topics(num_words=10)

[(14,
  '0.020*"like" + 0.018*"want" + 0.015*"And" + 0.011*"come" + 0.010*"think" + 0.008*"money" + 0.008*"day" + 0.007*"look" + 0.007*"good" + 0.007*"The"'),
 (17,
  '0.013*"The" + 0.012*"want" + 0.010*"And" + 0.007*"But" + 0.007*"tell" + 0.006*"think" + 0.006*"need" + 0.006*"man" + 0.006*"right" + 0.005*"sir"'),
 (16,
  '0.012*"think" + 0.011*"Yes" + 0.009*"The" + 0.009*"love" + 0.008*"like" + 0.007*"But" + 0.007*"tell" + 0.007*"night" + 0.006*"come" + 0.006*"And"'),
 (13,
  '0.024*"right" + 0.019*"Come" + 0.013*"Hello" + 0.013*"All" + 0.011*"want" + 0.011*"All_right" + 0.009*"Yes" + 0.008*"like" + 0.008*"let" + 0.008*"door"'),
 (10,
  '0.012*"The" + 0.011*"man" + 0.008*"This" + 0.007*"They" + 0.007*"And" + 0.007*"people" + 0.006*"like" + 0.005*"# #" + 0.005*"good" + 0.005*"right"'),
 (15,
  '0.018*"The" + 0.012*"And" + 0.011*"man" + 0.008*"like" + 0.007*"people" + 0.006*"think" + 0.006*"time" + 0.005*"right" + 0.005*"want" + 0.005*"This"'),
 (11,
  '0.019*"come" + 0.012*"man" + 0.01

In [43]:
model.show_topic(1,20)

[('The', 0.015443483),
 ('come', 0.009858928),
 ('And', 0.009274823),
 ('They', 0.0073754354),
 ('time', 0.006892856),
 ('want', 0.006511657),
 ('day', 0.006442665),
 ('long', 0.0055920207),
 ('like', 0.0053547323),
 ('tell', 0.0053147315),
 ('way', 0.004886512),
 ('Now', 0.0046205223),
 ('kill', 0.004406466),
 ('leave', 0.0043987664),
 ('But', 0.0041654375),
 ('let', 0.004156266),
 ('right', 0.0040718946),
 ('Scarlett', 0.0040097516),
 ('live', 0.0038918569),
 ('think', 0.0038595481)]

In [44]:
sorted(model[corpus[0]],key=lambda x:x[1],reverse=True)

[(5, 0.48382002), (15, 0.42382395), (10, 0.0839402)]

In [45]:
# plot topics
data =  pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.display(data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
