In [None]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

import xml.etree.ElementTree as ET 

# Plotting tools
# import pyLDAvis
# import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Absolute path of the XML corpus file to analyse.
# NOTE(review): hard-coded absolute path (looks like a Colab Google Drive
# mount) - adjust when running in another environment.
xmlpath = '/content/drive/MyDrive/2021/stuttgart/Text Tech Team/resources/interspeech/all_formatted.xml'
# Parse the whole document into memory; `root` is the element iterated over
# when the corpus is collected.
tree = ET.parse(xmlpath)
root = tree.getroot() 

In [None]:
# Flatten the XML tree into one list holding one text per paper.
# NOTE(review): the layout is addressed positionally - the second child of
# each top-level element is taken to be the container of papers, and the
# second child of each paper the element carrying its text. Confirm against
# the XML schema.
corpus = []
for conf in root:
    papers = conf[1]
    # Element.text is None for an empty element; substitute '' so that the
    # later str() conversion does not turn it into the literal word "None".
    # The entry is kept (not skipped) so positions still match the papers.
    corpus.extend(paper[1].text or '' for paper in papers)

In [None]:
def sent_to_words(sentences):
    """Lazily tokenise every item of ``sentences``.

    Each item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
    yielded per input item, in input order.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Materialise the generator: one token list per corpus entry.
words = [tokens for tokens in sent_to_words(corpus)]

In [None]:
# Load the small English pipeline by its package name. The 'en' shortcut link
# was removed in spaCy 3 and fails to load there, whereas the full package
# name loads under both spaCy 2 and 3.
# Parser and NER are disabled; the code below only reads pos_ and lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def lemmatisation(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Parameters
    ----------
    words : iterable of iterable of str
        One token sequence per document.
    allowed_postags : container of str
        Coarse POS tags (``token.pos_``) to keep. The default is never
        mutated here.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input
        order. A document with no surviving token yields ``''``.
    """
    # Re-join each token list so the module-level spaCy pipeline `nlp` can
    # tag it.
    texts = (" ".join(sent) for sent in words)
    ret = []
    # nlp.pipe processes the texts as a batched stream and preserves order,
    # which is cheaper than one nlp() call per document.
    for doc in nlp.pipe(texts):
        # '-PRON-' is the placeholder lemma spaCy 2 gives pronouns. Such
        # tokens are dropped entirely instead of being replaced by '', which
        # used to leave stray leading or doubled spaces in the result.
        kept = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ != '-PRON-'
        ]
        ret.append(" ".join(kept))
    return ret

# Lemmatise the tokenised corpus with the default POS filter of
# lemmatisation() (NOUN, ADJ, VERB, ADV), i.e. keep only open-class words.
lemmas = lemmatisation(words)

In [None]:
# Bag-of-words settings.
vectorizer_options = dict(
    analyzer='word',                   # features are word tokens
    min_df=10,                         # ignore terms found in fewer than 10 documents
    stop_words='english',              # scikit-learn's built-in English stop list
    lowercase=True,                    # lowercase before tokenising
    token_pattern='[a-zA-Z0-9]{3,}',   # alphanumeric runs of 3+ characters
)
count_vectorizer = CountVectorizer(**vectorizer_options)

# Document-term count matrix built from the lemmatised documents.
word_vecs = count_vectorizer.fit_transform(lemmas)

# Topic-model settings.
lda_options = dict(
    n_components=50,            # number of topics
    max_iter=20,                # maximum passes over the data
    learning_method='online',   # mini-batch variational Bayes
    random_state=771,           # fixed seed for reproducible topics
    batch_size=128,             # documents per mini-batch
    evaluate_every=-1,          # no perplexity evaluation during training
    n_jobs=-1,                  # use all available processors
)
lda_model = LatentDirichletAllocation(**lda_options)

# Fit the model and keep the per-document topic distribution.
lda_output = lda_model.fit_transform(word_vecs)

In [None]:
def show_topk(vectorizer, lda, topk=10, n=20):
    """Return the top-weighted vocabulary terms of the first topics of a model.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        older ``get_feature_names()``.
    lda : fitted topic model
        Must expose ``components_``, one row of term weights per topic.
    topk : int
        Number of highest-weighted terms to keep per topic.
    n : int or None
        Maximum number of topics to report. ``None`` or a non-positive value
        reports every topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per reported topic, highest weight first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keys = np.array(vectorizer.get_feature_names_out())
    else:
        keys = np.array(vectorizer.get_feature_names())

    # Read the model that was passed in; this previously used the notebook
    # global `lda_model`, silently ignoring the `lda` argument.
    components = lda.components_
    if n is not None and n > 0:
        components = components[:n]

    topics = []
    for weights in components:
        # Negating the weights makes the ascending argsort rank them descending.
        keys_idx = (-weights).argsort()[:topk]
        topics.append(keys.take(keys_idx))
    return topics

topics = show_topk(count_vectorizer, lda_model)

# Tabulate the keywords: one row per topic, one column per keyword rank.
fig = pd.DataFrame(topics)
topic_count, word_count = fig.shape
fig.index = ['Topic ' + str(row) for row in range(topic_count)]
fig.columns = ['Word ' + str(col) for col in range(word_count)]
fig



Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,angular,inconsistency,deceptive,pruning,articulation,squeeze,empirically,encode,disfluency,concurrent
Topic 1,unit,segment,acoustic,event,infant,game,audiovisual,linguistic,forensic,discover
Topic 2,human,user,conversation,turn,dialog,interaction,conversational,spoof,automate,switching
Topic 3,vowel,perception,cue,study,participant,formant,perceptual,effect,stimulus,experiment
Topic 4,datum,use,text,language,task,dialogue,speech,automatic,transcription,paper
Topic 5,noise,speech,condition,noisy,enhancement,clean,signal,environment,background,hour
Topic 6,untranscribed,gan,pack,randomly,computer,vocalisation,limited,affective,investigate,assign
Topic 7,articulatory,acoustic,speech,use,tongue,production,articulation,movement,analysis,time
Topic 8,separation,mixture,music,resolution,magnitude,variational,seq,identification,temporal,blind
Topic 9,assistant,smart,home,worker,streaming,biometric,deployment,aggregation,vulnerable,mediate
