# NLU_Fallback topic modeling with LDA
Topic modeling to find the main topics in the NLU_Fallback intents detected by the Rasa bot.

## Setup

In [28]:
import os
from urllib.parse import quote_plus

import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.lda_model
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymongo import MongoClient
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Turn on pyLDAvis notebook integration so prepared panels can be shown inline.
pyLDAvis.enable_notebook()

In [29]:
# MongoDB credentials come from the environment (each is None when unset).
MONGO_USER = os.environ.get("MONGO_USER")
MONGO_PASSWORD = os.environ.get("MONGO_PASSWORD")

# LDA settings: number of topics to extract and the seed passed to the model.
TOPIC_COUNT = 3
RANDOM_STATE = 2055

# Spanish language resources: NLTK stop-word set and the spaCy pipeline.
STOP_WORDS = set(stopwords.words('spanish'))
NLP = spacy.load("es_core_news_md")

## Helper functions

In [30]:
def remove_stop_words(sentence):
    """Return `sentence` without the tokens listed in `STOP_WORDS`.

    The sentence is split with NLTK's `word_tokenize`; a token is dropped when
    its lowercase form is in `STOP_WORDS`. Surviving tokens keep their original
    casing and are re-joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(sentence):
        if token.lower() in STOP_WORDS:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [31]:
def lemmatize_text(text):
    """Run `text` through the `NLP` pipeline and return its lemmas joined by spaces."""
    return ' '.join(token.lemma_ for token in NLP(text))

In [32]:
def create_lda_model(data, topic_number, max_iter=5):
    """Fit an LDA model on `data` and return it with its two distributions.

    Args:
        data: Document-term matrix to fit on (e.g. the output of
            `CountVectorizer.fit_transform`).
        topic_number: Number of topics (`n_components`) to extract.
        max_iter: Maximum number of passes over the data. Defaults to 5,
            the value this notebook previously hard-coded.

    Returns:
        A `(model, doc_topic_distribution, topic_word_distribution)` tuple:
        the fitted estimator, the result of `model.transform(data)`, and
        `model.components_`.
    """
    model = LatentDirichletAllocation(
        n_components=topic_number,
        max_iter=max_iter,
        random_state=RANDOM_STATE,  # fixed seed so re-runs give the same topics
    )
    model.fit(data)

    doc_topic_distribution = model.transform(data)
    # NOTE: per the scikit-learn docs, `components_` holds per-topic word
    # weights that are not normalized; they are fine for ranking words within
    # a topic but are not probabilities as-is.
    topic_word_distribution = model.components_

    return model, doc_topic_distribution, topic_word_distribution

In [33]:
def print_lda_topics(topic_word_distribution, feature_names, top_word_number=5):
    """Print the highest-weighted words of every topic.

    Args:
        topic_word_distribution: 2-D array-like with one row of word weights
            per topic; each row must support `.argsort()`.
        feature_names: Sequence mapping a column index to its word.
        top_word_number: How many words to list per topic. Zero (or a
            negative value) lists no words.
    """
    # Clamp so a negative count behaves like zero instead of slicing oddly.
    top_word_number = max(top_word_number, 0)

    print("\nDistribución de palabras para cada tópico:")
    for topic_number, topic_words in enumerate(topic_word_distribution, start=1):
        # Rank descending first and truncate afterwards: slicing with
        # `[-top_word_number:]` would select *every* word when the count is 0.
        ranked_indices = topic_words.argsort()[::-1]
        top_words = [feature_names[index] for index in ranked_indices[:top_word_number]]
        print(f"Tópico {topic_number}: {top_words}")

In [34]:
def display_lda_topics(model, X, vectorizer):
    """Build the pyLDAvis panel for a fitted model and return its display object.

    Args:
        model: Fitted LDA estimator.
        X: Document-term matrix the model was fitted on.
        vectorizer: Vectorizer that produced `X`.

    The panel is prepared with the t-SNE (`mds='tsne'`) projection.
    """
    return pyLDAvis.display(
        pyLDAvis.lda_model.prepare(model, X, vectorizer, mds='tsne')
    )

## Database connection

In [35]:
# Connect to MongoDB
# Fail fast with a clear message: an unset variable would otherwise be
# formatted into the URI as the literal string "None".
if not MONGO_USER or not MONGO_PASSWORD:
    raise RuntimeError("MONGO_USER and MONGO_PASSWORD environment variables must be set")

# Credentials must be percent-escaped, otherwise characters such as '@', ':'
# or '/' in the password corrupt the connection string.
uri = (
    f"mongodb+srv://{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASSWORD)}"
    "@mycluster.xkgnpk7.mongodb.net/?retryWrites=true&w=majority&appName=MyCluster"
)
client = MongoClient(uri)
db = client.rasa
conversations = db.conversations

## Fetch user queries detected as nlu_fallback

In [36]:
# Fetch texts detected as nlu_fallback
pipeline = [
    {"$unwind": "$events"},
    {"$match": {"events.event": "user", "events.parse_data.intent.name": "nlu_fallback"}},
    {"$project": {"text": "$events.text"}},
]
try:
    # Drain the cursor while the connection is still open: closing the client
    # first only works as long as every document fits in the first batch.
    results = list(conversations.aggregate(pipeline))
finally:
    # Always release the connection, even if the query fails.
    client.close()

# Preprocess each message: lemmatize first, then drop stop words.
raw_texts = [result["text"] for result in results]
corpus = [remove_stop_words(lemmatize_text(text)) for text in raw_texts]
print(corpus)

['querer saber plazo presentar matricula', 'lugar mancha cuyo nombre querer acordar', 'vaca hacer mooo bicho hacer siuuuuuuuuuu', 'exit', '/exit', 'documento necesitar inscribir ?', 'haber plazo apuntar ?', 'dia empezar matricula ?', 'haber plazo apuntar ?', 'Grado Medio', 'interesar Grado Superior', '- módulo informática ?', 'módulo informática ?', 'Necisito mas informacion DAW', 'DAW', 'cuál ser horario módulo DAW ?', '/exit', '- ¿ cómo poder renunciar convocatoria ?', '', '¿ cómo poder renunciar convocatoria ?', '¿ cuándo poder cancelar examen ?', 'cuál ser plazo anular convocatoria ?', '¿ cuál ser horario clase manana', 'ser horario clase tarde ?', 'Quer horario tener clase tarde ?', 'poder estudiar ?', 'cunado haber plazo apuntar ?', 'si']


## Topic modeling

In [37]:
# Build the document-term count matrix from the already preprocessed corpus
# (lemmatization and stop-word removal happened when `corpus` was created).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# Column index -> term lookup, used to label the topic words.
terms_dictionary = vectorizer.get_feature_names_out()

In [38]:
# Fit the LDA model on the count matrix and keep its distributions
# (nothing is printed in this cell).
lda_model, lda_doc_topic_distribution, lda_topic_word_distribution = create_lda_model(X, TOPIC_COUNT)

## Results

In [39]:
# List the top words of each topic (uses the function's default word count).
print_lda_topics(lda_topic_word_distribution, terms_dictionary)


Distribución de palabras para cada tópico:
Tópico 1: ['exit', 'hacer', 'tarde', 'bicho', 'siuuuuuuuuuu']
Tópico 2: ['ser', 'horario', 'cuál', 'daw', 'clase']
Tópico 3: ['plazo', 'poder', 'apuntar', 'haber', 'convocatoria']


In [40]:
# Show the pyLDAvis panel for the fitted model.
display_lda_topics(lda_model, X, vectorizer)

