In [3]:
import nltk
from nltk.text import Text
from bs4 import BeautifulSoup
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from prettytable import PrettyTable
import re
import pickle
import math
import numpy as np
import os

In [4]:
#Convertimos en un Pk los objetos
def serializeObject(obj, fname="lemmas2.pkl"):
    with open(fname, "wb") as f:
        pickle.dump(obj, f, -1)
#desconvertimos los pkl
def deserializeObject(fname="lemmas2.pkl"):
    with open(fname, "rb") as f:
        return pickle.load(f)

In [5]:
def get_articles(fname):
        
    f=open(fname, encoding='utf-8')
    text_string=f.read()
    f.close()
    list_articles = text_string.split('<h3>')
    return list_articles
    

In [6]:
def get_text_string(articles):
    '''Receives an html file with a Spanish text, deletes html tags.
    Returns text as a string.'''
    list_articles = []
    for text_string in articles:
        soup = BeautifulSoup(text_string, 'lxml')
        text_string = soup.get_text()
        list_articles.append(text_string)
    return list_articles

In [7]:
def tagSentencesSpanishTagger(sentences):
    articles = []
    spanish_tagger = deserializeObject("tagger.pkl")#agarramos nuestro tagger entrenado
    for s in sentences:
        aux = []
        tokens = nltk.word_tokenize(s)#Tokenizamos los articulos
        s_tagged = spanish_tagger.tag(tokens)#Taggeamos los tokens
        s_tagged = [(it[0].lower(), it[1][0].lower()) for it in s_tagged]#Pasamos a minusculas
        for word in (lemmatize(removeStopWords(clearTokens(s_tagged)))):
            aux.append(word[0])
        articles.append(aux)#Agregamos las oraciones taggeadas
        
    return articles

#Sacamos las palabras de las oraciones de todas las sentencias
def getWordsFromSentences(sentences):
    words = []
    for sentence in sentences:
        for word in sentence:
            words.append(word)
    return words

#Limpiamos los tokens
def removeStopWords(wordsWithTag, language='spanish'):
    '''Receives a list of words and returns another list without stop words'''
    return [ word for word in wordsWithTag if word[0] not in stopwords.words(language) ]


def clearTokens(tokensWithTag):
    '''Receives a list of  with tag and returns another list with the same tokens but only with letters'''
    result = []
    for token in tokensWithTag:
        clearToken = ""
        for c in token[0]:
            if re.match(r'[a-záéíóúñüA-ZÁÉÍÓÚÑÜ]', c):
                clearToken += c
        if len(clearToken) > 0:
            result.append((clearToken, token[1]))
    return result

def lemmatize(wordsTagged, fname="lemmas2.pkl"):
    lemmas = deserializeObject(fname)
    wordsLemmatized = []
    for word in wordsTagged:
        if word in lemmas.keys():#Si la palabra la encuentra en el dic
            wordsLemmatized.append((lemmas[word], word[1]))#Guardamos el lemma con su tag y la agregamos a las palabras lematizadas
        else:
            wordsLemmatized.append(word)#Si no la encuentra guarda la palabra en el vocabulario en el diccionario
    return wordsLemmatized


In [8]:
def one_article(articles):
    full =[]
    for article in articles:
        for word in article:
            full.append(word)
    return full

In [9]:
def frecuency_vec(articles,vocabulary):
    frecuency = []
    for vocabulary_word in vocabulary:
        count = 0
        for word in articles:
            if word == vocabulary_word:
                count +=1
        frecuency.append(count)
    return np.array(frecuency)
    

In [10]:
def background(frecuency,voc):
    words = []
    total = np.sum(frecuency)
    for i,word in enumerate(voc):
        #list =[] #LA lista se crea para guardar palabra y probabilidad 
        #list.append(word)
        words.append(( frecuency[i]/total ))
        #words.append(list)
    #words.sort(key = lambda x: x[1], reverse=True) # ya no se ocupa
    return np.array(words)


In [11]:
def topic(voc):
    words = []
    m = 1/len(voc)
    for word in voc:
        words.append(m)
    return np.array(words)


In [12]:
def compute_document_likelihood(background_words,
                                topic_words, counts,
                                prob_background,prob_topic,
                                num_iterations = 500,
                                print_likelihood = False):
    argument_for_logarithm = (background_words * prob_background) + (topic_words * prob_topic)
    logarithms = np.log(argument_for_logarithm)
    for i in range(len(logarithms)):
        product = logarithms[i]*counts[i]
        logarithms[i] = product
    document_likelihood = np.sum(logarithms)
    if print_likelihood == True:
        if num_iterations % 20 == 0:
            print('Document likelihood is %f', document_likelihood)
    return logarithms

In [13]:
def topic_vec(proba,voc):
    words = []
    a = proba
    a.tolist()
    for i,word in enumerate(voc):
        list =[] #LA lista se crea para guardar palabra y probabilidad 
        list.append(word)
        list.append(a[i])
        words.append(list)
    words.sort(key = lambda x: x[1], reverse=False) # ya no se ocupa
    return np.array(words)

In [14]:
def e_step(prob_topic, topic_words,
           prob_background, background_words):
    z_0_prob = np.divide( (prob_topic*topic_words),( (prob_topic*topic_words) + (prob_background*background_words) ) )
    return z_0_prob

In [15]:
def m_step(counts,z_0_probs):
    numerator = counts*z_0_probs
    denominator = np.sum(numerator)
    topic_word_probs = numerator/denominator
    return topic_word_probs

In [16]:
if __name__=='__main__':
    #Cadena de texto
    #---------------PRIMERA PARTE--------------#
    #fname='text.htm'#Cambiamos por txt despues de procesar htm------2
    #articles = get_articles(fname)
    #articles = get_text_string(articles)
    #articles = tagSentencesSpanishTagger(articles)
    articles = deserializeObject('articles.pkl')
    full = one_article(articles)
    vocabulary = set(full)
    #_---PARTE 2------
    frecuency_brute = frecuency_vec(full,vocabulary)
    background_words = background(frecuency_brute,vocabulary)
    aux=[]
    for article in articles:
        background_words = background(frecuency_brute,vocabulary)
        topic_words = topic( vocabulary )
        prob_topic = .5
        prob_background =.5
        iterations = 500
        counts_vec = frecuency_vec(article, vocabulary)
        
        for i in range(iterations):
            
            z_0_probs = e_step(prob_topic, topic_words, prob_background,background_words)
            topic_words = m_step(counts_vec,z_0_probs)
            compute_document_likelihood(background_words,topic_words, counts_vec, prob_background,prob_topic,num_iterations = i,print_likelihood = True)
        
        topic_vector = topic_vec(topic_words,vocabulary)
        aux.append(topic_vector)
    #Instalar gensim

Document likelihood is %f -22.34741284014499
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.372443245878873
Document likelihood is %f -18.37244

In [28]:
print("The topics words are:")
aux[2][-15:]
aux2 =[]
print("The background words are:")
background_vector = topic_vec(background_words,vocabulary)

background_vector[-15:] 

The topics words are:


array([['moción', '0.007874666682046833'],
       ['hitler', '0.007874666682046833'],
       ['rutina', '0.007874666682046833'],
       ['palacio', '0.007874666682046833'],
       ['gobierno', '0.008236101578586072'],
       ['día', '0.00917477492026317'],
       ['momento', '0.01056043556750079'],
       ['julio', '0.011454410178621833'],
       ['alemán', '0.011722602561958149'],
       ['roma', '0.0117673012925142'],
       ['scalfaro', '0.01181200002307025'],
       ['duce', '0.015749333364093666'],
       ['italiano', '0.015749333364093666'],
       ['italia', '0.01968666670511708'],
       ['mussolini', '0.0236240000461405']], dtype='<U34')

In [25]:
serializeObject(aux, "espectation_max_all.pkl")