In [2]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

import xml.etree.ElementTree as ET 

# Plotting tools
# import pyLDAvis
# import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
xmlpath = '/content/drive/MyDrive/2021/stuttgart/Text Tech Team/resources/interspeech/all_formatted.xml'
tree = ET.parse(xmlpath)
root = tree.getroot() 

In [4]:
corpus = []
for conf in root:
    meta = conf[0]
    papers = conf[1]
    for paper in papers:
        corpus.append(paper[1].text)

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  

words = list(sent_to_words(corpus))

In [6]:
nlp = spacy.load('en', disable=['parser', 'ner'])
def lemmatisation(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    ret = []
    for sent in words:
        doc = nlp(" ".join(sent)) 
        ret.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return ret

# Only use open class data
lemmas = lemmatisation(words)

In [27]:
def lists_to_sents(word_list):
    ret = []
    for x in word_list:
        ret.append(" ".join(x))
    return ret 

# the original corpus with lowercase and without punctuations
naive_corpus = lists_to_sents(words)
# form sentences with lemmas 
lemma_corpus = lemmas

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer 

def train_one_corpus(corpus, sample_id = 2):
    tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
    fitted_vectorizer=tfidf_vectorizer.fit(corpus)
    tfidf_vectorizer_vectors=fitted_vectorizer.transform(corpus)
    sample = tfidf_vectorizer_vectors[sample_id]
    df = pd.DataFrame(sample.T.todense(), index=fitted_vectorizer.get_feature_names(), columns=["tfidf"]) 
    print(naive_corpus[sample_id])
    res = df.sort_values(by=["tfidf"],ascending=False)
    print(res)

In [24]:
train_one_corpus(naive_corpus)

linear dynamic models ldms have been shown to be viable alternative to hidden markov models hmms on small vocabulary recognition tasks such as phone classification in this paper we investigate various statistical model combination approaches for hybrid hmm ldm recognizer resulting in phone classification performance that outperforms the best individual classifier further we report on continuous speech recognition experiments on the aurora corpus where the model combination is carried out on wordgraph rescoring while the hybrid system improves the hmm system in the case of monophone hmms the performance of the triphone hmm model could not be improved by monophone ldms asking for the need to introduce context dependency also in the ldm model inventory
               tfidf
ldm         0.349247
ldms        0.337932
monophone   0.276649
hmm         0.231771
hmms        0.215474
...              ...
fb          0.000000
favours     0.000000
favoured    0.000000
favourably  0.000000
𝛿t       



In [30]:
train_one_corpus(lemma_corpus)

linear dynamic models ldms have been shown to be viable alternative to hidden markov models hmms on small vocabulary recognition tasks such as phone classification in this paper we investigate various statistical model combination approaches for hybrid hmm ldm recognizer resulting in phone classification performance that outperforms the best individual classifier further we report on continuous speech recognition experiments on the aurora corpus where the model combination is carried out on wordgraph rescoring while the hybrid system improves the hmm system in the case of monophone hmms the performance of the triphone hmm model could not be improved by monophone ldms asking for the need to introduce context dependency also in the ldm model inventory
              tfidf
ldms       0.413690
monophone  0.333849
hmms       0.270670
model      0.252178
wordgraph  0.223532
...             ...
fave       0.000000
favor      0.000000
favorable  0.000000
favorably  0.000000
𝛿f         0.000000


