In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS


In [49]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from typing import Dict
from langchain.output_parsers import PydanticOutputParser
from pydantic import RootModel
import os

# Gemini credential read from the environment; None when GEMINI_API_KEY is unset.
api_key = os.environ.get("GEMINI_API_KEY")


class TopicNames(RootModel[Dict[str, str]]):
    """Map each original topic label to a descriptive name."""



class TopicAgent:
    """LLM-backed helper that assigns readable names to LDA topics.

    Holds a ``ChatGoogleGenerativeAI`` client (model ``gemini-2.5-flash``), a
    ``PydanticOutputParser`` targeting ``TopicNames``, and the system prompt
    that instructs the model how to answer.
    """

    def __init__(self):
        # `api_key` is the module-level value read from GEMINI_API_KEY.
        self.llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", api_key=api_key)
        self.parser = PydanticOutputParser(pydantic_object=TopicNames)
        # The example must itself be valid JSON (double-quoted keys and values),
        # otherwise it contradicts the "STRICTLY valid JSON" instruction and
        # invites output the parser cannot read. Topic IDs start at 0 to match
        # the "Topic {i}" labels the LDA handler feeds in.
        self.context_prompt = (
            "You are a market intelligence expert and trend analyst in science and technology. "
            "Your task is to analyze the output of an LDA model and assign meaningful, short, and descriptive names "
            "to each topic based on its keywords. "
            "Return your output STRICTLY as a valid JSON object mapping the topic ID to its name.\n"
            "Example:\n"
            "{\n"
            '  "Topic 0": "topic name",\n'
            '  "Topic 1": "topic name"\n'
            "}\n"
        )

    def Generate_names(self, topic_string: str) -> Dict[str, str]:
        """Ask the LLM to name the topics described in ``topic_string``.

        Args:
            topic_string: Text listing the topics and their keywords; it is
                substituted into the human message of the prompt.

        Returns:
            The ``root`` mapping of the parsed ``TopicNames`` result
            (topic ID -> descriptive name).
        """
        # The system text is passed as a ready-made SystemMessage rather than a
        # template tuple; NOTE(review): presumably this is what keeps the literal
        # braces of the JSON example from being read as placeholders - confirm.
        prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content=self.context_prompt),
            ("human", "Generate topic names considering this output: {topic_string}")
        ])

        topic_chain = prompt | self.llm | self.parser
        result = topic_chain.invoke({"topic_string": topic_string})
        return result.root


In [50]:
class LDA_handler():
    """LDA topic-modelling pipeline over the ``Title`` column of a CSV file.

    The steps are exposed as separate methods (text extraction, tokenization,
    bag-of-words conversion, model training, keyword dump, LLM naming) and
    chained together by ``main_process``.
    """

    def __init__(self,csv_file):
        """Load ``csv_file`` with pandas and create the topic-naming agent."""
        self.data_src = csv_file
        self.df = pd.read_csv(csv_file)
        self.classify_agent = TopicAgent()

    def Generate_article_text_list(self):
        """Return the ``Title`` column as a list of strings, one per row.

        Missing titles become empty strings and other non-string values are
        converted with ``str`` so the tokenizer can call ``.lower()`` on every
        entry. Rows are never dropped, so the list stays aligned with
        ``self.df``.
        """
        return ["" if pd.isna(title) else str(title) for title in self.df['Title']]

    def Generate_tokenized_list(self,text_list):
        """Lower-case each text, split on whitespace and drop gensim STOPWORDS.

        Returns one token list per input text. Splitting is whitespace-only, so
        punctuation attached to a word stays part of the token.
        """
        return [
            [word for word in text.lower().split() if word not in STOPWORDS]
            for text in text_list
        ]

    def Transform_data_numeric(self,tokens_list):
        """Build a gensim ``Dictionary`` and the bag-of-words corpus.

        Returns:
            ``(dictionary, corpus)`` where ``corpus`` holds ``doc2bow`` output
            for each token list, in input order.
        """
        dictionary = corpora.Dictionary(tokens_list)
        corpus = [dictionary.doc2bow(text) for text in tokens_list]
        return dictionary, corpus

    def Generate_lda_model(self,topics_number,corpus,dictionary,passes):
        """Train and return an ``LdaModel`` with a fixed ``random_state`` of 42."""
        return models.LdaModel(
            corpus,
            num_topics=topics_number,
            id2word=dictionary,
            passes=passes,
            random_state=42,
        )

    def Get_lda_topics_keywords(self, lda_model):
        """Return one ``"Topic i: <keywords>\\n"`` line per topic as a single string."""
        return "".join(
            f"Topic {i}: {lda_model.print_topic(i)}\n"
            for i in range(lda_model.num_topics)
        )

    def Append_model_to_df(self,lda_model,corpus):
        """Store each document's ``get_document_topics`` result in ``self.df['topicos']``.

        ``corpus`` must contain one entry per row of ``self.df``. This method is
        not called by ``main_process``.
        """
        self.df['topicos'] = [
            lda_model.get_document_topics(doc_bow) for doc_bow in corpus
        ]

    def Create_model_names(self,topics_keywords):
        """Delegate topic naming to the ``TopicAgent``."""
        return self.classify_agent.Generate_names(topics_keywords)

    def main_process(self, topics_number=5, passes=20):
        """Run the full pipeline and print the generated topic names.

        Args:
            topics_number: Number of LDA topics to fit (default 5).
            passes: Number of training passes handed to the model (default 20).
        """
        text_list = self.Generate_article_text_list()
        tokenized_list = self.Generate_tokenized_list(text_list)
        dictionary,corpus = self.Transform_data_numeric(tokenized_list)
        lda_model = self.Generate_lda_model(topics_number,corpus,dictionary,passes)
        topics_keywords = self.Get_lda_topics_keywords(lda_model)
        print(self.Create_model_names(topics_keywords))


In [51]:
# Build the pipeline for the publications CSV and run it end to end;
# main_process() prints the topic-name mapping returned by the naming agent.
handler = LDA_handler("SB_publication_PMC.csv")
handler.main_process()

E0000 00:00:1759607270.201785    1104 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


{'Topic 0': 'ISS Biological Research', 'Topic 1': 'Space Genomics & Bone Health', 'Topic 2': 'Microgravity Physiology: Plants & Animals', 'Topic 3': 'Spaceflight Effects: Muscle & Radiation', 'Topic 4': 'Spaceflight Molecular & Cellular Responses'}
