# <h1> Topic Modeling

#### *pip installs if needed for first-time run:*

In [102]:
# !pip install neo4j bertopic sentence-transformers scikit-learn
# !python -m spacy download fr_core_news_sm

In [103]:
import hashlib
from sklearn.feature_extraction.text import CountVectorizer 
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase

# For file conversion: 
import spacy
import json
import logging
from pathlib import Path
from spacy_layout import spaCyLayout

### *This part below would be for the end-users to run on their data*

In [104]:
#Setting up paths from directory
# Testing our multiple paths at once with different file types
input_paths = [
    Path("../Radicalism_Verbalized_NLP/sample_data/casa tomada en Francais.docx"),
    Path("../Radicalism_Verbalized_NLP/sample_data/37-septembre-2008.pdf"),
    Path("../Radicalism_Verbalized_NLP/sample_data/fr_stopwords.txt")
]

In [105]:
#Getting sample doc:
# docs = []
# with open('../Radicalism_Verbalized_NLP/sample_data/fra_news_2023_100K-sentences.txt', 'r') as f:
#     read_text = f.read()
#     for index in read_text.split('\t')[:10000]: #To shorten runtime
#         docs.append(index)

In [106]:
# Setting up spacy file converter for french
nlp_converter = spacy.blank('fr')
layout = spaCyLayout(nlp_converter)
nlp_converter.add_pipe('sentencizer')

# processing a test document and creating a spacy object
test_pdf = Path("../Radicalism_Verbalized_NLP/sample_data/37-septembre-2008.pdf")
user_text = layout(test_pdf)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [122]:
# converting pdf into docs to process
docs = []
# iterating through and grabbing raw text to iterate through
# needed for topic modeler
for line in user_text:
    if line.text_with_ws.isalnum():
        docs.append(line.text_with_ws)

## Testing the Sentence-Transformer based Topic Modeling:'


##### Removing stop words¶
         - "At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context in order to create accurate embeddings."

         - "Instead, we can use the CountVectorizer to preprocess our documents after having generated embeddings and clustered our documents." 
          - "we can use a KeyBERT-Inspired model to reduce the appearance of stop words. This also often improves the topic representation:"

In [123]:
#Creating a list of basic french stopwords to remove using CountVectorizer:
fr_stopwords = []
with open('../Radicalism_Verbalized_NLP/sample_data/fr_stopwords.txt', 'r') as f:
    text = f.read()
    for i in text.split('\n'):
        fr_stopwords.append(i)

In [124]:
#Creating a custom vectorizer model & representation model to help remove stopwords
vectorizer_model = CountVectorizer(stop_words=fr_stopwords)
representation_model = KeyBERTInspired()

In [125]:
#Setting up our French-supported embedding model from HF embeddings:
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

#Initializing our BERTopic model for the HF embeddings
topic_model = BERTopic(embedding_model=embedding_model, vectorizer_model=vectorizer_model, representation_model=representation_model)

In [126]:
# topics, probs = topic_model.fit_transform(docs)
topics, probs = topic_model.fit_transform(docs)

In [127]:
# Iterating getting each topic:
# keywords = []
# num_topics = len(topic_model.get_topics())
# for i in range(num_topics):
#     # print(topic_model.get_topic(i))
#     keywords.append(topic_model.get_topic(i))

In [128]:
#-1 refers to outliers and should be ignored
topic_model.get_topic_info() 
topic_model.get_document_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,235,-1_femmes_claudia_épouse_prestigieuses,"[femmes, claudia, épouse, prestigieuses, moniq...","[femmes, Femmes, femmes]"
1,0,50,0_19_29_18_28,"[19, 29, 18, 28, 174, 27, 175, 211, 37, 163]","[29, 19, 225]"
2,1,40,1_2006_2005_2007_2004,"[2006, 2005, 2007, 2004, 1960, , , , , ]","[2006, 2006, 2006]"
3,2,37,2_métiers_technologiques_ouvrières_carrière,"[métiers, technologiques, ouvrières, carrière,...","[technologique, métiers, métiers]"
4,3,37,3_2003_2002_1999_1994,"[2003, 2002, 1999, 1994, 1988, 1996, 1998, siè...","[2003, 2003, 2003]"
5,4,36,4_ainsi_puis_également_pourtant,"[ainsi, puis, également, pourtant, corresponda...","[Ainsi, Ainsi, Ainsi]"
6,5,33,5_dir_ans_échéant_jour,"[dir, ans, échéant, jour, pre, , , , , ]","[dir, dir, dir]"
7,6,27,6_vst_vol_dcsf_,"[vst, vol, dcsf, , , , , , , ]","[VST, VST, VST]"
8,7,25,7_scolaire_scolaires_école_écoles,"[scolaire, scolaires, école, écoles, schooling...","[scolaire, scolaire, scolaire]"
9,8,22,8_sexes_sexe_sex_sexués,"[sexes, sexe, sex, sexués, sexuées, sexuée, ma...","[sexes, sexes, hommes]"


In [129]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,technologique,2,2_métiers_technologiques_ouvrières_carrière,"[métiers, technologiques, ouvrières, carrière,...","[technologique, métiers, métiers]",métiers - technologiques - ouvrières - carrièr...,0.310063,True
1,information,-1,-1_femmes_claudia_épouse_prestigieuses,"[femmes, claudia, épouse, prestigieuses, moniq...","[femmes, Femmes, femmes]",femmes - claudia - épouse - prestigieuses - mo...,0.000000,False
2,2008,12,12_2008___,"[2008, , , , , , , , , ]","[2008, 2008, 2008]",2008 - - - - - - - - -,1.000000,True
3,ligne,5,5_dir_ans_échéant_jour,"[dir, ans, échéant, jour, pre, , , , , ]","[dir, dir, dir]",dir - ans - échéant - jour - pre - - - - -,0.487460,False
4,éducation,14,14_éducation_education_éducatif_enseignement,"[éducation, education, éducatif, enseignement,...","[éducation, éducation, éducation]",éducation - education - éducatif - enseignemen...,0.943007,True
...,...,...,...,...,...,...,...,...
784,Fax,-1,-1_femmes_claudia_épouse_prestigieuses,"[femmes, claudia, épouse, prestigieuses, moniq...","[femmes, Femmes, femmes]",femmes - claudia - épouse - prestigieuses - mo...,0.000000,False
785,93,11,11_93_94_81_84,"[93, 94, 81, 84, 79, 75, 71, 90, 78, 77]","[94, 81, 93]",93 - 94 - 81 - 84 - 79 - 75 - 71 - 90 - 78 - 77,0.863262,True
786,VST,6,6_vst_vol_dcsf_,"[vst, vol, dcsf, , , , , , , ]","[VST, VST, VST]",vst - vol - dcsf - - - - - - -,0.989249,True
787,2008,12,12_2008___,"[2008, , , , , , , , , ]","[2008, 2008, 2008]",2008 - - - - - - - - -,0.510151,True


In [130]:
topic_fig = topic_model.visualize_topics()
topic_fig.show()
# topic_model.visual

In [131]:
h_fig = topic_model.visualize_hierarchy()
h_fig.show()

In [132]:
bar_fig = topic_model.visualize_barchart()
bar_fig.show()

## Building a Knowledge Graph from the extracted topics & docs as an example:

In [118]:
# defining a function that allows us to have unique keys for each topic
def generate_topic_key(keywords):
    keywords_string = "_".join([kw for kw, _ in keywords])
    return hashlib.md5(keywords_string.encode()).hexdigest()

In [119]:
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = ... #database uri link
AUTH = ... #username + password
#NOTE: Unfortunately had to remove these login credentials for security reasons. Can be presented in real-time

#Verifying connection
driver = GraphDatabase.driver(URI, auth=('sample username','sample password'))
driver.verify_connectivity()

#### *The function below should now handle preventing duplicates.*

In [120]:
#Defining a function to create topic graphs
def create_topic_graph(driver, docs, topics, topic_model):
    """To create a neo4j Knowledge Graph based on the extracted data from the topic model. We accomplish this by the following:
    1. Creating Topic Nodes.
    2. Creating Document Nodes and relationships between each doc and their corresponding topic.
    3. Creating Keyword Nodes and Strength relationships to their associated topic.

    Args:
        driver : Variable representing the connected GraphDatabase
        docs (_type_): Docs used for topic modeling
        topics (_type_): topics extracted from BERTopic
        topic_model (_type_): Topic Model fit during topic modeling
    """
    #Initializing driver session to write to KG:
    with driver.session() as session:
        #Creating the topic nodes:
        topic_ids = set(topics)
        topic_queries = [
            {
                "key": generate_topic_key(topic_model.get_topic(topic_id)),
                "keywords": ", ".join([word for word, proba in topic_model.get_topic(topic_id)])
            }
            for topic_id in topic_ids
        ]

        # Merge/creating topic nodes with keywords
        session.run(
            """
            UNWIND $topics AS topic
            MERGE (t:Topic {key: topic.key})
            SET t.keywords = topic.keywords
            """,
            topics=topic_queries
        )
        
        # querying doc relationships
        doc_queries = [
            {
                "content": doc,
                "topic_key": generate_topic_key(topic_model.get_topic(topic))
            }
            for doc, topic in zip(docs, topics)
        ]
        
        #Creating document nodes and relationships between the doc and topic
        session.run(
            """
            UNWIND $docs AS doc
            MERGE (d:Document {content: doc.content})
            WITH d, doc
            MATCH (t:Topic {key: doc.topic_key})
            MERGE (d)-[:BELONGS_TO]->(t)
            """,
            docs=doc_queries
        )
        
        
        # Adding keyword and strength relationships to the topic
        for topic_id in topic_ids:
            topic_key = generate_topic_key(topic_model.get_topic(topic_id))
            keywords = topic_model.get_topic(topic_id)
            for keyword, strength in keywords:
                session.run(
                    """
                    MERGE (k:Keyword {word: $keyword})
                    WITH k
                    MATCH (t:Topic {key: $topic_key})
                    MERGE (k)-[r:REPRESENTS]->(t)
                    SET r.strength = $strength
                    """,
                    keyword=keyword, topic_key=topic_key, strength=strength
                )
    
    # original version for creating without merging in mind:
     
    # #Initializing driver session to write to KG:
    # with driver.session() as session:
    #     #Creating the topic nodes:
    #     topic_ids = set(topics)
    #     for topic_id in set(topics):
    #         topic_keywords = ', '.join([word for word, proba in topic_model.get_topic(topic_id)])
    #         session.run(
    #             """CREATE (t:Topic {id: $id, keywords: $keywords})""",
    #             id=topic_id, keywords=topic_keywords
    #         )
            
    #     #Creating document nodes and relationships between the doc and topic
    #     for doc, topic in zip(docs, topics):
    #         session.run(
    #             """
    #             CREATE (d:Document {content: $content})
    #             with d
    #             MATCH (t:Topic {id: $topic_id})
    #             CREATE (d)-[:BELONGS_TO]->(t)
    #             """,
    #             content=doc, topic_id=topic
    #         )
        
    #     # Adding keyword and strength relationships to the topic
    #     for topic_id in topic_ids:
    #         keywords = topic_model.get_topic(topic_id)
    #         for keyword, strength in keywords:
    #             session.run(
    #                 """
    #                 MERGE (k:Keyword {word: $keyword})
    #                 WITH k
    #                 MATCH (t:Topic {id: $topic_id})
    #                 MERGE (k)-[r:REPRESENTS]->(t)
    #                 SET r.strength = $strength
    #                 """,
    #                 keyword=keyword, topic_id=topic_id, strength=strength
    #             ) 
    

In [121]:
create_topic_graph(driver=driver, docs=docs, topics=topics, topic_model=topic_model) #Actually writes to our Neo4j Knowledge Graph Database