In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS


In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from typing import Dict
from langchain.output_parsers import PydanticOutputParser
from pydantic import RootModel
import os
from langchain.schema.output_parser import StrOutputParser

api_key = os.getenv("GEMINI_API_KEY")


class TopicNames(RootModel[Dict[str, str]]):
    pass



class TopicAgent:
    def __init__(self):
        self.llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash",api_key=api_key)
        self.context_prompt = (
            "You are a market intelligence expert and trend analyst in science and technology. "
        )


    def Generate_names(self, topic_string: str) -> str:
        prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content=self.context_prompt),
            ("Your task is to analyze the output of an LDA model and assign meaningful, short, and descriptive names "
            "to each topic based on its keywords. "
            "Return your output STRICTLY as a valid JSON object mapping the topic ID to its name.\n"
            "Example:\n"
            "{{\n"
            "  'topic 1': 'topic name',\n"
            "  'topic 2': 'topic name'\n"
            "}}\n"),
            ("human", "Generate topic names considering this output: {topic_string}")
        ])


        topic_chain = prompt | self.llm | PydanticOutputParser(pydantic_object=TopicNames)
        result = topic_chain.invoke({"topic_string": topic_string})
        return result.root
    
    def Generate_summary(self, topic_string: str) -> str:
        prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content=self.context_prompt),
            ("Your task is to analyze a topic name for a give category and the titles from the articles that belong to this category"
            " and write a small descriptive summary to understant site visitors to understand what that topic is about "
            ),
            ("human", "Generate text considering this this informations: {topic_string}")
        ])


        topic_chain = prompt | self.llm | StrOutputParser()
        result = topic_chain.invoke({"topic_string": topic_string})
        return result


In [None]:
class LDA_handler():
    def __init__(self,csv_file):
        self.data_src = csv_file
        self.df = pd.read_csv(csv_file)
        self.classify_agent = TopicAgent()

    def Generate_article_text_list(self):
        return self.df['Title'].tolist()

    def Generate_tokenized_list(self,text_list):
        return [
            [word for word in text.lower().split() if word not in STOPWORDS]
            for text in text_list
        ]

    def Transform_data_numeric(self,tokens_list):
        dictionary = corpora.Dictionary(tokens_list)
        corpus = [dictionary.doc2bow(text) for text in tokens_list]
        return dictionary, corpus
    
    def Generate_lda_model(self,topics_number,corpus,dictionary,passes):
        lda_model = models.LdaModel(corpus, num_topics=topics_number, id2word=dictionary, passes=passes, random_state=42)  
        return lda_model

    def Get_lda_topics_keywords(self, lda_model):
        result = ""
        for i in range(lda_model.num_topics):
            topic_text = f"topic {i}: {lda_model.print_topic(i)}\n"
            result += topic_text
        return result

    def Append_model_to_df(self,lda_model,corpus):
        title_topics = []
        for doc_bow in corpus:
            title_topics.append(lda_model.get_document_topics(doc_bow))

        self.df['topics'] = title_topics

    def Create_model_names(self,topics_keywords):
        return self.classify_agent.Generate_names(topics_keywords)

    def Get_all_titles_from_topic(self, topic_num, threshold=0.4):

        def topic_greater_than_threshold(topicos):
            if topic_num >= len(topicos):
                return False
            _, valor = topicos[topic_num]
            return valor > threshold

        return self.df[self.df['topics'].apply(topic_greater_than_threshold)]['Title'].tolist()
    
    def Create_topics_dict(self,number_of_topics):
        topics_dict = {}
        text_list = self.Generate_article_text_list()
        tokenized_list = self.Generate_tokenized_list(text_list)
        dictionary,corpus = self.Transform_data_numeric(tokenized_list)
        lda_model = self.Generate_lda_model(number_of_topics,corpus,dictionary,20)
        topics_keywords = self.Get_lda_topics_keywords(lda_model)
        self.Append_model_to_df(lda_model,corpus)
        model_names = self.Create_model_names(topics_keywords)
        print(model_names)
        for i in range(number_of_topics):
            topic_dict = {}
            topic_name = model_names[f"topic {i}"]
            topic_dict["name"] = topic_name
            articles_list = self.Get_all_titles_from_topic(i,0.95)
            summary_prompt =(f"""
            Topic name: {topic_name}
            Article list: {articles_list}
            """)
            topic_dict["description"] = self.classify_agent.Generate_summary(summary_prompt)
            topics_dict[i] = topic_dict
        return topics_dict



In [11]:
handler = LDA_handler("SB_publication_PMC.csv")
db_topics = handler.Create_topics_dict(5)

E0000 00:00:1759610647.962304  101145 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


{'Topic 0': 'ISS Biomedical & Genetic Research', 'Topic 1': 'Spaceflight Genomics & Bone Health', 'Topic 2': 'Microgravity Plant & Bone Studies', 'Topic 3': 'Spaceflight Effects on Muscle & Cells', 'Topic 4': 'Model Organism Space Biology'}


KeyError: 'topic 0'

In [None]:
articles_list = handler.Get_all_titles_from_topic(0,0.95)
test_string = f""""
Topic name: ISS Microgravity Biology
articles list: {articles_list}
"""

handler.classify_agent.Generate_summary(test_string)

'ISS Microgravity Biology explores how diverse living organisms respond and adapt to the unique conditions of the International Space Station, with a particular focus on microgravity. This research spans from the molecular level, investigating gene expression, DNA repair, and stress responses in microorganisms like *Bacillus subtilis* and *Salmonella*, to physiological changes in mammalian systems, such as bone formation and reproductive tissue health. The topic also highlights the development of specialized methodologies and technologies essential for conducting biological experiments in space, including novel systems for RNA analysis and protocols for tissue preservation. Furthermore, it delves into the adaptation of new bacterial species to space environments, utilizing advanced genomic analysis to understand their resilience and functional changes. This field ultimately seeks to unravel the fundamental biological impacts of spaceflight and advance our understanding of life beyond E

In [None]:
my_dict = {"topic 0":"topic1_name",
           "topic 1":"topic2_name",
           "topic 2":"topic3_name",
           "topic 3":"topic4_name",
           }

topic_list = {}
for i in range(4):
    new_dict = {}
    new_dict["name"] = my_dict[f"topic {i}"]
    new_dict["description"] = f"minha_desc{i}"
    topic_list[i] = new_dict

print(topic_list)

{0: {'name': 'topic1_name', 'description': 'minha_desc0'}, 1: {'name': 'topic2_name', 'description': 'minha_desc1'}, 2: {'name': 'topic3_name', 'description': 'minha_desc2'}, 3: {'name': 'topic4_name', 'description': 'minha_desc3'}}


V: teste
