### En este notebook finalmente probaré si un Topic extraction no supervisado funciona mejor que los anteriores modelos.

In [1]:
import spacy
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

import gensim.corpora as corpora
from gensim.models import LdaModel

In [2]:
nlp = spacy.load('es_core_news_md', disable = ['parser', 'ner'])
stop_words = nlp.Defaults.stop_words

def lemmatize_doc(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']):
    text_out = [t.lemma_.lower() for t in nlp(text)
                if t.pos_ in allowed_postags
                and len(t.lemma_)>3
                and not t.is_stop]
    return text_out

def build_texts(fname):
    with open(fname) as f:
        for line in f:
            yield lemmatize_doc(line)

In [3]:
lee_data_file = 'sem_eval_train_es_topic.csv'

In [4]:
with open(lee_data_file) as f:
    for line in f:
        print(line)
        break

2018-Es-01643,"@aliciaenp Ajajjaa somos del clan twitteras perdidas pa eventos ""importantes"" "



In [5]:
texto = build_texts(lee_data_file)

In [6]:
lista_procesado = [c for c in texto]

In [7]:
len(lista_procesado)

3561

Crear diccionario

In [8]:
diccionario = corpora.Dictionary(build_texts(lee_data_file))
corpus = [diccionario.doc2bow(text) for text in build_texts(lee_data_file)]

print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]


In [9]:
len(diccionario.items())

7509

In [10]:
warnings.filterwarnings('ignore')

ldamodel = LdaModel(corpus = corpus, num_topics=11, id2word=diccionario, iterations=5000)
pprint(ldamodel.print_topics())

[(0,
  '0.013*"feliz" + 0.012*"querer" + 0.010*"mirar" + 0.009*"amigo" + '
  '0.008*"grave" + 0.007*"terminar" + 0.007*"llorar" + 0.007*"amar" + '
  '0.007*"horrible" + 0.006*"imagen"'),
 (1,
  '0.016*"divertido" + 0.011*"triste" + 0.010*"enojo" + 0.007*"hijo" + '
  '0.007*"ironía" + 0.007*"volver" + 0.007*"mundo" + 0.006*"culpa" + '
  '0.006*"dejar" + 0.006*"venganza"'),
 (2,
  '0.010*"amiga" + 0.010*"quedar" + 0.008*"pasar" + 0.008*"poner" + '
  '0.007*"papá" + 0.007*"suerte" + 0.007*"vida" + 0.007*"imaginar" + '
  '0.007*"buscar" + 0.006*"peligroso"'),
 (3,
  '0.018*"querer" + 0.011*"gustar" + 0.010*"depresión" + 0.009*"cosa" + '
  '0.008*"noche" + 0.007*"llamar" + 0.007*"jajajaja" + 0.007*"hablar" + '
  '0.007*"peligro" + 0.006*"entender"'),
 (4,
  '0.019*"dormir" + 0.013*"horror" + 0.010*"pasar" + 0.008*"salir" + '
  '0.008*"genial" + 0.007*"mierda" + 0.007*"leer" + 0.007*"depender" + '
  '0.007*"llorar" + 0.006*"único"'),
 (5,
  '0.014*"terrible" + 0.013*"orgulloso" + 0.010*"maña

In [15]:
ldamodel[corpus[4]]

[(5, 0.9090491)]