# 1. Web scraping from SENSACINE.COM

First, we need to import Beautiful Soup along with some other packages.

In [1]:
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse
import concurrent.futures
import pandas as pd

This application needs to download data from a large number of Sensacine URLs. We will be using Python’s concurrent API to make the process parallel and seamless.

In [2]:
# Maximum number of threads that will be spawned
MAX_THREADS = 50

Attributes that we are interested in

We will mostly focus on the below-mentioned attributes:

    Movie title
    Synopsis of the movie

In [3]:
movie_title_arr = []
movie_synopsis_arr =[]

Utility functions for Scraping above Data using Beautiful Soup

In [4]:
def getMovieTitle(title):
    try:
        return title[0].find("a", {"class":  "meta-title-link"}).getText()
    except:
        return 'NA'

def getsynopsis(synopsis):
    try:
        return synopsis[0].find("div", {"class":  "content-txt"}).getText()
    except:
        return 'NA'

The main function that will utilize the URL provided to scrape data

This will be our main function that will be responsible for iterating through the various attributes of the Sensacine data. We will be providing this function with URLs for various Sensacine pages and this will help us extract information from the pages.

In [5]:
def main(sensacine_url):
    response = requests.get(sensacine_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Movie Name
    movies_list  = soup.find_all("div", {"class": "card entity-card entity-card-list cf"})
    
    # iterate over all movies
    for movie in movies_list: 
        title = movie.find_all("h2", {"class":  "meta-title"})
        synopsis = movie.find_all("div", {"class":  "synopsis"})
        
        #  Movie Title
        movie_title =  getMovieTitle(title)
        movie_title_arr.append(movie_title)
        
        # Movie Synopsys
        movie_synopsis = getsynopsis(synopsis)
        movie_synopsis_arr.append(movie_synopsis)

Note below mentioned for loop helps in generating URLs for the list of movies according to the filter that we have specified.

In [6]:
# An array to store all the URL that are being queried
url_arr = []

# Maximum number of pages one wants to iterate over. 
# As we want to retrieve all the movies, we indicate the max number of pages possible
MAX_PAGE = 3108

# Loop to generate all the URLS.
for i in range(0,MAX_PAGE):
    totalRecords = 0 if i==0 else (250*i)+1
    #print(totalRecords)
    sensacine = f'https://www.sensacine.com/peliculas/todas-peliculas/'
    url_arr.append(sensacine)

The below-mentioned download function takes up the URLs and calls the main function with those. It does this in parallel with MAX_THREADS as the maximum number of requests.

In [7]:
def download_movies(movie_urls):
    threads = min(MAX_THREADS, len(movie_urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(main, movie_urls)

Finally, we call the download function and then get our required data.

In [8]:
# Call the download function with the array of URLS called url_arr
download_movies(url_arr)

# Attach all the data to the pandas dataframe
movie_df = pd.DataFrame({
    "Title": movie_title_arr,
    "Synopsis": movie_synopsis_arr,
})

movie_df.head()

Unnamed: 0,Title,Synopsis
0,Spider-Man: No Way Home,\nDespués de que Mysterio desvelara la identid...
1,A través de mi ventana,"\nLa historia sigue a Raquel, una joven que ll..."
2,Spider-Man: No Way Home,\nDespués de que Mysterio desvelara la identid...
3,Encanto,"\nEncanto nos sitúa en el corazón de Brasil, n..."
4,Uncharted,\nEsta adaptación de la exitosa serie de video...


There are *\n*'s in the beggining of each synopsis string. We can get rid of it in order to clean the dataset

In [9]:
movie_df = movie_df.replace(r'\n','', regex=True)
movie_df

Unnamed: 0,Title,Synopsis
0,Spider-Man: No Way Home,Después de que Mysterio desvelara la identidad...
1,A través de mi ventana,"La historia sigue a Raquel, una joven que llev..."
2,Spider-Man: No Way Home,Después de que Mysterio desvelara la identidad...
3,Encanto,"Encanto nos sitúa en el corazón de Brasil, nar..."
4,Uncharted,Esta adaptación de la exitosa serie de videoju...
...,...,...
46615,Eternals,Esta película basada en los cómics de Marvel n...
46616,Cincuenta sombras de Grey,Anastasia Steele es una joven e inocente estud...
46617,Five Nights At Freddy's,Este proyecto que dirige Gil Kenan (Poltergeis...
46618,La abuela,Susana tiene que dejar su vida en París trabaj...


As we can see, we obtained description for 46620 movies which seems enough for the purpose of our project

We can store our dataframe to a CSV file

In [10]:
movie_df.to_csv('sensacine.csv', index=False, sep='\t')

Exploratory Data Analysis for Natural Language Processing

In [11]:
!pip install \
   pandas matplotlib numpy \
   nltk seaborn sklearn gensim pyldavis \
   wordcloud textblob spacy textstat

Collecting wordcloud
  Using cached wordcloud-1.8.1.tar.gz (220 kB)
Collecting textstat
  Downloading textstat-0.7.2-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 1.9 MB/s ta 0:00:01
Building wheels for collected packages: wordcloud
  Building wheel for wordcloud (setup.py) ... [?25ldone
[?25h  Created wheel for wordcloud: filename=wordcloud-1.8.1-cp37-cp37m-macosx_10_9_x86_64.whl size=158971 sha256=e0d5f7d4073e03f9d4ca47dc7e9c2d91597f6b1168c60ead9759c85f3c8e4dcb
  Stored in directory: /Users/polina/Library/Caches/pip/wheels/f8/f6/55/6bd394c32a844a621ca0fe5dbf563c8d71d71edaf095656991
Successfully built wordcloud
Installing collected packages: wordcloud, textstat
Successfully installed textstat-0.7.2 wordcloud-1.8.1


In [12]:
movie_df.head(10)

Unnamed: 0,Title,Synopsis
0,Spider-Man: No Way Home,Después de que Mysterio desvelara la identidad...
1,A través de mi ventana,"La historia sigue a Raquel, una joven que llev..."
2,Spider-Man: No Way Home,Después de que Mysterio desvelara la identidad...
3,Encanto,"Encanto nos sitúa en el corazón de Brasil, nar..."
4,Uncharted,Esta adaptación de la exitosa serie de videoju...
5,El callejón de las almas perdidas,Remake de la cinta de 1947. Se desconoce el re...
6,The Fallout,Vada es una estudiante de secundaria que no so...
7,Moonfall,La Luna sale de su órbita y se dirige hacia la...
8,My Hero Academia: Misión mundial de héroes,Una organización criminal conocida como Humani...
9,Cásate conmigo,Bastian (Maluma) y Kat Valdez (Jennifer López)...


In [13]:
!pip install es_core_news_sm



In [14]:
import spacy

nlp = spacy.load("es_core_news_sm")

def normaliza(texto):
    #separamos después de ciertos signos de puntuación
    texto = re.sub(r"([\.\?])", r"\1 ", texto)
    doc = nlp(texto)
    tokens = [t for t in doc if not t.is_punct and not t.is_stop and not t.is_space and len(t.text)>1]
    palabras = []
    for t in tokens:
        if t.ent_iob_=='B' and t.ent_type_=='PER':
            palabras.append('persona')
        elif t.ent_iob_=='I' and t.ent_type_=='PER':
            continue
        else:
            palabras.append(t.lemma_.lower()) 
    salida = ' '.join(palabras)
    
    return salida

TOPIC MODELING.EDA, GENSIM, LDA model

First of all, we install libraries and dependencies needed.

For LDA model we would need to download a spacy model for a spanish languages as follows:

In [15]:
!pip install spacy



In [16]:
!python -m spacy download es_core_news_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_md')


In [17]:
#nlp is now the variable that has the pre-trained spacy model
import spacy
nlp = spacy.load("es_core_news_md")

Others:

In [18]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

In [19]:
## NLTK Libraries
import nltk; 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/polina/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
# Core Packages
import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay attention to them right now

In [21]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


Let's get a sample to work with:

In [22]:
sample = movie_df[:15]
print(sample)

                                         Title  \
0                      Spider-Man: No Way Home   
1                       A través de mi ventana   
2                      Spider-Man: No Way Home   
3                                      Encanto   
4                                    Uncharted   
5            El callejón de las almas perdidas   
6                                  The Fallout   
7                                     Moonfall   
8   My Hero Academia: Misión mundial de héroes   
9                               Cásate conmigo   
10           Hotel Transilvania: Transformanía   
11                      A través de mi ventana   
12                                     Encanto   
13                                   Uncharted   
14           El callejón de las almas perdidas   

                                             Synopsis  
0   Después de que Mysterio desvelara la identidad...  
1   La historia sigue a Raquel, una joven que llev...  
2   Después de que Mysterio des

**Approach by Understanding LDA**
LDA stands for Latent Dirichlet Allocation, and it is a type of topic modeling algorithm. The purpose of LDA is to learn the representation of a fixed number of topics, and given this number of topics learn the topic distribution that each document in a collection of documents has.

In [23]:
## Stopwords from NTLK Libraries


from nltk.corpus import stopwords
stop_words = stopwords.words('spanish') 
#This will import all the stopwords in Spanish language , similarly you can do for any other language."""

#stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
#This attribute allows you to update your stop words list and make you pre-processing more aggressive and accurrate"""

In [24]:
### To Iterate over each news in every line, we will have to convert it into a list

data_list = sample['Synopsis'].tolist()
data_list[1]

'La historia sigue a Raquel, una joven que lleva casi toda la vida enamorada de su vecino Ares. Nunca le ha dicho lo que siente por él porque nunca han cruzado palabra alguna. Sin embargo, esto cambiará radicalmente cuando comiencen a surgir una serie de acontecimientos que les hará unirse cada vez más y más.'

In [25]:
data_list  = [re.sub(r'[^\w\s]', ' ', line) for line in data_list]
data_list[1]

'La historia sigue a Raquel  una joven que lleva casi toda la vida enamorada de su vecino Ares  Nunca le ha dicho lo que siente por él porque nunca han cruzado palabra alguna  Sin embargo  esto cambiará radicalmente cuando comiencen a surgir una serie de acontecimientos que les hará unirse cada vez más y más '

In [26]:
#nlp is the model  that has the Spacy model pipleline built for English language """

import spacy
nlp = spacy.load("es_core_news_md")

In [27]:
## Define the stop words using NLTK corpus data

from nltk.corpus import stopwords
stop_words = stopwords.words("spanish")
stop_words[1:10]

['la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se']

Tokenize the text to convert them into words format from the sentence format.


In [28]:
# Gensim’s simple_preprocess() is great for this. Additionally we have set deacc=True to remove the punctuations.

def sentence_to_word(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sentence_to_word(data_list))

print(data_words[:1])

[['despues', 'de', 'que', 'mysterio', 'desvelara', 'la', 'identidad', 'de', 'spider', 'man', 'todo', 'el', 'mundo', 'en', 'lejos', 'de', 'casa', 'peter', 'parker', 'tom', 'holland', 'desesperado', 'por', 'volver', 'la', 'normalidad', 'recuperar', 'su', 'anterior', 'vida', 'pide', 'ayuda', 'doctor', 'strange', 'para', 'enmendar', 'tal', 'accion', 'pero', 'alterar', 'la', 'realidad', 'tendra', 'consecuencias', 'nefastas', 'para', 'el', 'heroe', 'de', 'nueva', 'york']]


**Bigrams and Trigrams creation using Gensim Phrase.**

*Automatically detect common phrases – aka multi-word expressions, word n-gram collocations – from a stream of sentences.*

In [29]:
# Build the bigram and trigram models

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['despues', 'de', 'que', 'mysterio', 'desvelara', 'la', 'identidad', 'de', 'spider', 'man', 'todo', 'el', 'mundo', 'en', 'lejos', 'de', 'casa', 'peter', 'parker', 'tom', 'holland', 'desesperado', 'por', 'volver', 'la', 'normalidad', 'recuperar', 'su', 'anterior', 'vida', 'pide', 'ayuda', 'doctor', 'strange', 'para', 'enmendar', 'tal', 'accion', 'pero', 'alterar', 'la', 'realidad', 'tendra', 'consecuencias', 'nefastas', 'para', 'el', 'heroe', 'de', 'nueva', 'york']


Below is the utility functions used for basic preprocessing of Text data.

In [30]:
# def remove_stopwords(text):
#     for doc in text:
#         if doc not in stop_words:
#             return [words for words in simple_preprocess(str(doc))]
                                    
                    
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]



def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]



def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [31]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en_core_web_sm
nlp = spacy.load('es_core_news_md', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['despues', 'mysterio', 'desvelar', 'identidad', 'man', 'mundo', 'lejos', 'casar', 'peter', 'tom', 'holland', 'desesperar', 'volver', 'normalidad', 'recuperar', 'anterior', 'vida', 'pedir', 'ayudar', 'doctor', 'enmendar', 'accion', 'alterar', 'realidad', 'tendra', 'consecuencia', 'nefasto', 'heroe', 'nuevo', 'york']]


Data Transformation : - Dictionary and Corpus

The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus.

In [32]:
# Create Dictionary

"""This module implements the concept of the Dictionary in Pytons, a mapping between words and their integer ids."""
id2word = corpora.Dictionary(data_lemmatized)


# Create Corpus
texts = data_lemmatized

# Term Document Frequency
"""doc2bow us Document to Bag of Words format, It outputs a tuple of token id and token words"""
corpus = [id2word.doc2bow(text) for text in texts]

# View

print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]]


In bag of Words as you might know, that the model creates a corpus with assigning unique ID to individual word, and then providing the frequency of the word. This helps to define how important that word is in that particular document.

For example, (0, 1) above implies, word id 0 occurs once in the first document. Likewise, word id 1 occurs twice and so on.

This is used as the input by the LDA model.

In [33]:
# Human readable format of corpus (term-frequency)
#This fucntion creates the reverse mapping of ID to original lemmatized word"""
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('accion', 1),
  ('alterar', 1),
  ('anterior', 1),
  ('ayudar', 1),
  ('casar', 1),
  ('consecuencia', 1),
  ('desesperar', 1),
  ('despues', 1),
  ('desvelar', 1),
  ('doctor', 1),
  ('enmendar', 1),
  ('heroe', 1),
  ('holland', 1),
  ('identidad', 1),
  ('lejos', 1),
  ('man', 1),
  ('mundo', 1),
  ('mysterio', 1),
  ('nefasto', 1),
  ('normalidad', 1),
  ('nuevo', 1),
  ('pedir', 1),
  ('peter', 1),
  ('realidad', 1),
  ('recuperar', 1),
  ('tendra', 1),
  ('tom', 1),
  ('vida', 1),
  ('volver', 1),
  ('york', 1)]]

Building the Topic Model

Refer this Genesims document for detailed information of LDA model and its hyperparameter for tuning purposes.
https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html#sphx-glr-auto-examples-tutorials-run-lda-py

Important Parameters however which you must know:
Number of Topics;- It can be anything as per output expectations, or the data itself. For our data we will take K = 20, as we know the data has 20 news groups.

chuncksize: - Controls how many documents are processed at a time in the training algorithm.

passes: - controls how often we train the model on the entire corpus, same like " epochs"

We set alpha = 'auto' and eta = 'auto'. Again this is somewhat technical, but essentially we are automatically learning two parameters in the model that we usually would have to specify explicitly.

In [34]:

#Enable logging so we can see the progress of ducment convergence and it can help us tune our parameters for model"""

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


#If you set passes = 10 you will see this line 10 times. 
#Make sure that by the final passes, most of the documents have converged. 
#So you want to choose both passes and iterations to be high enough for this to happen."""


lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                            eval_every=1,                                        
                                           per_word_topics=True)

2022-02-21 17:45:38,456 : INFO : using autotuned alpha, starting with [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]
2022-02-21 17:45:38,457 : INFO : using symmetric eta at 0.05
2022-02-21 17:45:38,459 : INFO : using serial LDA version on this node
2022-02-21 17:45:38,461 : INFO : running online (multi-pass) LDA training, 20 topics, 10 passes over the supplied corpus of 15 documents, updating model once every 15 documents, evaluating perplexity every 15 documents, iterating 50x with a convergence threshold of 0.001000
2022-02-21 17:45:38,471 : INFO : -25.817 per-word bound, 59131536.4 perplexity estimate based on a held-out corpus of 15 documents with 384 words
2022-02-21 17:45:38,471 : INFO : PROGRESS: pass 0, at document #15/15
2022-02-21 17:45:38,481 : INFO : optimized alpha [0.044027206, 0.04006174, 0.04006174, 0.04728905, 0.043974794, 0.05174021, 0.04363027, 0.055702336, 0.043817613, 0.043595154, 0.041958645

2022-02-21 17:45:38,523 : INFO : topic #16 (0.044): 0.040*"joven" + 0.040*"seriar" + 0.040*"sullivan" + 0.040*"mentor" + 0.040*"mark" + 0.040*"drake" + 0.040*"crear" + 0.040*"descubrir" + 0.040*"adaptacion" + 0.040*"victor"
2022-02-21 17:45:38,524 : INFO : topic #5 (0.046): 0.050*"cintar" + 0.049*"manipular" + 0.027*"mas" + 0.026*"historia" + 0.025*"amante" + 0.025*"hombre" + 0.025*"incluso" + 0.025*"antojar" + 0.025*"desconocer" + 0.025*"asi"
2022-02-21 17:45:38,524 : INFO : topic #7 (0.056): 0.030*"alterar" + 0.030*"mundo" + 0.020*"ayudar" + 0.020*"mysterio" + 0.020*"man" + 0.020*"nefasto" + 0.020*"pedir" + 0.020*"normalidad" + 0.020*"volver" + 0.020*"despues"
2022-02-21 17:45:38,524 : INFO : topic diff=0.109707, rho=0.408248
2022-02-21 17:45:38,528 : INFO : -5.625 per-word bound, 49.4 perplexity estimate based on a held-out corpus of 15 documents with 384 words
2022-02-21 17:45:38,528 : INFO : PROGRESS: pass 5, at document #15/15
2022-02-21 17:45:38,530 : INFO : optimized alpha [0.0

2022-02-21 17:45:38,565 : INFO : topic #17 (0.024): 0.004*"orbitar" + 0.004*"oportunidad" + 0.004*"salvar" + 0.004*"salir" + 0.004*"provocar" + 0.004*"planeta" + 0.004*"tierra" + 0.004*"improbable" + 0.004*"junto" + 0.004*"unica"
2022-02-21 17:45:38,566 : INFO : topic #16 (0.040): 0.040*"joven" + 0.040*"seriar" + 0.040*"sullivan" + 0.040*"mentor" + 0.040*"mark" + 0.040*"drake" + 0.040*"crear" + 0.040*"descubrir" + 0.040*"adaptacion" + 0.040*"victor"
2022-02-21 17:45:38,566 : INFO : topic #5 (0.042): 0.051*"cintar" + 0.051*"manipular" + 0.026*"mas" + 0.026*"historia" + 0.026*"amante" + 0.026*"hombre" + 0.026*"incluso" + 0.026*"antojar" + 0.026*"desconocer" + 0.026*"asi"
2022-02-21 17:45:38,566 : INFO : topic #7 (0.054): 0.030*"alterar" + 0.030*"mundo" + 0.020*"ayudar" + 0.020*"mysterio" + 0.020*"man" + 0.020*"nefasto" + 0.020*"pedir" + 0.020*"normalidad" + 0.020*"volver" + 0.020*"despues"
2022-02-21 17:45:38,566 : INFO : topic diff=0.025139, rho=0.301511


In [35]:
# Print the Keyword in the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]


2022-02-21 17:45:38,570 : INFO : topic #0 (0.031): 0.049*"heroes" + 0.025*"mision" + 0.025*"partir" + 0.025*"criminal" + 0.025*"oseon" + 0.025*"humanize" + 0.025*"seleccion" + 0.025*"pretender" + 0.025*"poseedor" + 0.025*"tambien"
2022-02-21 17:45:38,570 : INFO : topic #1 (0.024): 0.004*"orbitar" + 0.004*"oportunidad" + 0.004*"salvar" + 0.004*"salir" + 0.004*"provocar" + 0.004*"planeta" + 0.004*"tierra" + 0.004*"improbable" + 0.004*"junto" + 0.004*"unica"
2022-02-21 17:45:38,571 : INFO : topic #2 (0.024): 0.004*"orbitar" + 0.004*"oportunidad" + 0.004*"salvar" + 0.004*"salir" + 0.004*"provocar" + 0.004*"planeta" + 0.004*"tierra" + 0.004*"improbable" + 0.004*"junto" + 0.004*"unica"
2022-02-21 17:45:38,571 : INFO : topic #3 (0.026): 0.005*"encantar" + 0.005*"miembro" + 0.005*"increible" + 0.005*"aventurar" + 0.005*"situa" + 0.005*"magicos" + 0.005*"poder" + 0.005*"narrar" + 0.005*"protagonista" + 0.005*"corazon"
2022-02-21 17:45:38,572 : INFO : topic #4 (0.031): 0.031*"deberan" + 0.031*"s

[(0, '0.049*"heroes" + 0.025*"mision" + 0.025*"partir" + 0.025*"criminal" + 0.025*"oseon" + 0.025*"humanize" + 0.025*"seleccion" + 0.025*"pretender" + 0.025*"poseedor" + 0.025*"tambien"'), (1, '0.004*"orbitar" + 0.004*"oportunidad" + 0.004*"salvar" + 0.004*"salir" + 0.004*"provocar" + 0.004*"planeta" + 0.004*"tierra" + 0.004*"improbable" + 0.004*"junto" + 0.004*"unica"'), (2, '0.004*"orbitar" + 0.004*"oportunidad" + 0.004*"salvar" + 0.004*"salir" + 0.004*"provocar" + 0.004*"planeta" + 0.004*"tierra" + 0.004*"improbable" + 0.004*"junto" + 0.004*"unica"'), (3, '0.005*"encantar" + 0.005*"miembro" + 0.005*"increible" + 0.005*"aventurar" + 0.005*"situa" + 0.005*"magicos" + 0.005*"poder" + 0.005*"narrar" + 0.005*"protagonista" + 0.005*"corazon"'), (4, '0.031*"deberan" + 0.031*"sera" + 0.031*"junto" + 0.031*"colosal" + 0.031*"oportunidad" + 0.031*"trabajar" + 0.031*"provocar" + 0.031*"planeta" + 0.031*"orbitar" + 0.031*"contar"'), (5, '0.051*"cintar" + 0.051*"manipular" + 0.026*"mas" + 0.026*

comment!

In [36]:
## Compute Model Perplexity and Coherence Score


# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

2022-02-21 17:45:38,584 : INFO : -5.553 per-word bound, 46.9 perplexity estimate based on a held-out corpus of 15 documents with 384 words
2022-02-21 17:45:38,587 : INFO : using ParallelWordOccurrenceAccumulator(processes=7, batch_size=64) to estimate probabilities from sliding windows



Perplexity:  -5.552844027367731


2022-02-21 17:45:38,723 : INFO : serializing accumulator to return to master...
2022-02-21 17:45:38,725 : INFO : serializing accumulator to return to master...
2022-02-21 17:45:38,724 : INFO : serializing accumulator to return to master...
2022-02-21 17:45:38,723 : INFO : serializing accumulator to return to master...
2022-02-21 17:45:38,724 : INFO : serializing accumulator to return to master...
2022-02-21 17:45:38,725 : INFO : serializing accumulator to return to master...
2022-02-21 17:45:38,767 : INFO : serializing accumulator to return to master...
2022-02-21 17:45:38,758 : INFO : accumulator serialized
2022-02-21 17:45:38,759 : INFO : accumulator serialized
2022-02-21 17:45:38,765 : INFO : accumulator serialized
2022-02-21 17:45:38,822 : INFO : 7 accumulators retrieved from output queue
2022-02-21 17:45:38,759 : INFO : accumulator serialized
2022-02-21 17:45:38,757 : INFO : accumulator serialized
2022-02-21 17:45:38,757 : INFO : accumulator serialized
2022-02-21 17:45:38,774 : IN


Coherence Score:  0.8043368443079659


In [42]:
!pip install watermark




Collecting watermark
  Downloading watermark-2.3.0-py2.py3-none-any.whl (7.2 kB)
Installing collected packages: watermark
Successfully installed watermark-2.3.0
Python implementation: CPython
Python version       : 3.7.6
IPython version      : 7.12.0

wget         : not installed
pandas       : 1.3.5
numpy        : 1.20.0
geopy        : not installed
altair       : 4.1.0
vega         : not installed
vega_datasets: not installed
watermark    : 2.3.0

Compiler    : Clang 4.0.1 (tags/RELEASE_401/final)
OS          : Darwin
Release     : 20.2.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

 
Last updated: Mon Feb 21 2022 17:49:05CET



In [43]:
%load_ext watermark

# python, ipython, packages, and machine characteristics
%watermark -v -m -p wget,pandas,numpy,geopy,altair,vega,vega_datasets,watermark 

# date
print (" ")
%watermark -u -n -t -z 

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.7.6
IPython version      : 7.12.0

wget         : not installed
pandas       : 1.3.5
numpy        : 1.20.0
geopy        : not installed
altair       : 4.1.0
vega         : not installed
vega_datasets: not installed
watermark    : 2.3.0

Compiler    : Clang 4.0.1 (tags/RELEASE_401/final)
OS          : Darwin
Release     : 20.2.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

 
Last updated: Mon Feb 21 2022 17:49:31CET



Visualize the topics-**keywords**

In [37]:
!pip install pyLDAvis



In [38]:
!pip install pandas==1.3.1

Collecting pandas==1.3.1
  Downloading pandas-1.3.1-cp37-cp37m-macosx_10_9_x86_64.whl (11.0 MB)
[K     |████████████████████████████████| 11.0 MB 1.8 MB/s eta 0:00:01
[31mERROR: sklearn-pandas 2.2.0 has requirement scikit-learn>=0.23.0, but you'll have scikit-learn 0.22.1 which is incompatible.[0m
[31mERROR: sklearn-pandas 2.2.0 has requirement scipy>=1.5.1, but you'll have scipy 1.4.1 which is incompatible.[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.2.4
    Uninstalling pandas-1.2.4:
      Successfully uninstalled pandas-1.2.4
Successfully installed pandas-1.3.1


In [39]:
# Visualize the topics
!pip install pandas --upgrade
import pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

Collecting pandas
  Downloading pandas-1.3.5-cp37-cp37m-macosx_10_9_x86_64.whl (11.0 MB)
[K     |████████████████████████████████| 11.0 MB 281 kB/s eta 0:00:01
[31mERROR: sklearn-pandas 2.2.0 has requirement scikit-learn>=0.23.0, but you'll have scikit-learn 0.22.1 which is incompatible.[0m
[31mERROR: sklearn-pandas 2.2.0 has requirement scipy>=1.5.1, but you'll have scipy 1.4.1 which is incompatible.[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.1
    Uninstalling pandas-1.3.1:
      Successfully uninstalled pandas-1.3.1
Successfully installed pandas-1.3.5


TypeError: import_optional_dependency() got an unexpected keyword argument 'errors'

In [None]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
import gensim
import matplotlib.pyplot as plt
from wordcloud import WordCloud

common_dictionary = Dictionary(common_texts) # create corpus
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

lda = gensim.models.LdaModel(common_corpus, num_topics=10) # train model on corpus
for t in range(lda.num_topics):
    plt.figure()
    plt.imshow(WordCloud().fit_words(dict(lda_model.show_topic(t, 200))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()

In [None]:
# lda is assumed to be the variable holding the LdaModel object
import matplotlib.pyplot as plt
for t in range(lda.num_topics):
   plt.figure()
plt.imshow(WordCloud().fit_words(dict(lda_model.show_topic(t, 200))))
#if we use "lda" instead if "lda_model", numbers (id of a token would appear)
plt.axis("off")
plt.title("Topic #" + str(t))
plt.show()

In [None]:
!pip install pandas --upgrade

You can get the topn words from an LDA model using Gensim's built-in method show_topic.

In [None]:
lda = models.LdaModel.load('lda.model')

for i in range(0, lda.num_topics):
    with open('output_file.txt', 'w') as outfile:
        outfile.write('{}\n'.format('Topic #' + str(i + 1) + ': '))
        for word, prob in lda.show_topic(i, topn=20):
            outfile.write('{}\n'.format(word.encode('utf-8')))
        outfile.write('\n')

In [None]:
import pyLDAvis.gensim_models as gensimvis

gensimvis.prepare(lda_model, corpus, id2word)

In [None]:
lda_model.show_topics()

In [None]:
lda.show_topics()

In [None]:
%load_ext watermark

# python, ipython, packages, and machine characteristics
%watermark -v -m -p wget,pandas,numpy,geopy,altair,vega,vega_datasets,watermark 

# date
print (" ")
%watermark -u -n -t -z 