# <h1> Topic Modeling

#### *pip installs if needed for first-time run:*

In [3]:
# !pip install neo4j bertopic sentence-transformers scikit-learn
# !python -m spacy download fr_core_news_sm

In [4]:
from sklearn.feature_extraction.text import CountVectorizer 
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase

# For file conversion: 
import spacy
import json
import logging
from pathlib import Path
from spacy_layout import spaCyLayout

### *This part below would be for the end-users to run on their data*

In [5]:
#Setting up paths from directory
# Testing our multiple paths at once with different file types
input_paths = [
    Path("../Radicalism_Verbalized_NLP/sample_data/casa tomada en Francais.docx"),
    Path("../Radicalism_Verbalized_NLP/sample_data/37-septembre-2008.pdf"),
    Path("../Radicalism_Verbalized_NLP/sample_data/fr_stopwords.txt")
]

In [6]:
#Getting sample doc:
# docs = []
# with open('../Radicalism_Verbalized_NLP/sample_data/fra_news_2023_100K-sentences.txt', 'r') as f:
#     read_text = f.read()
#     for index in read_text.split('\t')[:10000]: #To shorten runtime
#         docs.append(index)

In [30]:
# Setting up spacy file converter for french
nlp_converter = spacy.blank('fr')
layout = spaCyLayout(nlp_converter)
nlp_converter.add_pipe('sentencizer')

# processing a test document and creating a spacy object
test_pdf = Path("../Radicalism_Verbalized_NLP/sample_data/37-septembre-2008.pdf")
user_text = layout(test_pdf)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [54]:
# converting pdf into docs to process
docs = []
# iterating through and grabbing raw text to iterate through
# needed for topic modeler
for line in user_text:
    if line.text_with_ws.isalnum():
        docs.append(line.text_with_ws)
# print(docs)
# print(docs)

['technologique', 'information', '2008', 'ligne', 'éducation', 'Marguerite', 'années', 'laïque', 'scolaire', 'égalité', 'sexuées', 'désormais', 'différemment', 'genre', 'et', 'échéant', 'payant', 'institution', 'contraire', 'l', 'formation', 'nationale', 'effet', 'nationale', 'Solidarité', 'Pêche', 'ministères', '2008', 'École', 'IPSOS', 'hommes', 'filles', 'DGESCO', 'ESEN', 'ESEN', 'éducation', 'Freinet', 'Moderne', 'nouvelle', '2007', 'Social', 'Bourven', 'définitions', 'mixité', 'américaines', 'femmes', 'autres', 'biologique', 'culture', 'étude', 'Novelle', 'plus', 'Terret', 'Chiland', 'Establet', 'aussi', 'Buchmann', 'McDaniel', '2008', 'VST', '2008', '1', 'sexe', 'auparavant', 'biologiques', 'biologie', 'sociaux', 'plus', 'Establet', 'Ainsi', 'femmes', 'filles', 'différemment', 'différences', 'naissance', 'disparaître', 'naissance', '90', '15', 'naissance', 'sexes', 'rares', 'égales', 'Benoit', 'Browaeys', 'scolaire', 'siècle', 'école', 'mixité', 'filles', 'sexes', 'Chaponnière', 

## Testing the Sentence-Transformer based Topic Modeling:'


##### Removing stop words¶
         - "At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context in order to create accurate embeddings."

         - "Instead, we can use the CountVectorizer to preprocess our documents after having generated embeddings and clustered our documents." 
          - "we can use a KeyBERT-Inspired model to reduce the appearance of stop words. This also often improves the topic representation:"

In [55]:
#Creating a list of basic french stopwords to remove using CountVectorizer:
fr_stopwords = []
with open('../Radicalism_Verbalized_NLP/sample_data/fr_stopwords.txt', 'r') as f:
    text = f.read()
    for i in text.split('\n'):
        fr_stopwords.append(i)

In [56]:
#Creating a custom vectorizer model & representation model to help remove stopwords
vectorizer_model = CountVectorizer(stop_words=fr_stopwords)
representation_model = KeyBERTInspired()

In [57]:
#Setting up our French-supported embedding model from HF embeddings:
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

#Initializing our BERTopic model for the HF embeddings
topic_model = BERTopic(embedding_model=embedding_model, vectorizer_model=vectorizer_model, representation_model=representation_model)

In [58]:
# topics, probs = topic_model.fit_transform(docs)
topics, probs = topic_model.fit_transform(docs)

In [59]:
#-1 refers to outliers and should be ignored
topic_model.get_topic_info() 

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,206,-1_marry_single_christine_correspondantes,"[marry, single, christine, correspondantes, pr...","[Marry, Marry, Marry]"
1,0,50,0_116_115_28_29,"[116, 115, 28, 29, 175, 211, 19, 174, 27, 124]","[116, 115, 225]"
2,1,41,1_dir_ans_échéant_jour,"[dir, ans, échéant, jour, pre, freinet, , , , ]","[dir, dir, dir]"
3,2,40,2_2006_2005_2007_2004,"[2006, 2005, 2007, 2004, 1960, , , , , ]","[2006, 2006, 2006]"
4,3,37,3_2003_2002_1999_1994,"[2003, 2002, 1999, 1994, 1988, 1996, 1998, siè...","[2003, 2003, 2003]"
5,4,36,4_ainsi_puis_également_pourtant,"[ainsi, puis, également, pourtant, corresponda...","[Ainsi, Ainsi, Ainsi]"
6,5,30,5_technologiques_technologique_métiers_profess...,"[technologiques, technologique, métiers, profe...","[métiers, technologique, technologique]"
7,6,28,6_vst_vol_dcsf_,"[vst, vol, dcsf, , , , , , , ]","[VST, VST, VST]"
8,7,24,7_école_scolaires_scolaire_écoles,"[école, scolaires, scolaire, écoles, schooling...","[école, école, école]"
9,8,23,8_égales_mathématiques_égale_maths,"[égales, mathématiques, égale, maths, égalité,...","[mathématiques, différemment, mathématiques]"


In [60]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,technologique,5,5_technologiques_technologique_métiers_profess...,"[technologiques, technologique, métiers, profe...","[métiers, technologique, technologique]",technologiques - technologique - métiers - pro...,0.304126,True
1,information,24,24_recherches_recherche_débats_étude,"[recherches, recherche, débats, étude, informa...","[débats, débats, recherches]",recherches - recherche - débats - étude - info...,1.000000,False
2,2008,15,15_2008___,"[2008, , , , , , , , , ]","[2008, 2008, 2008]",2008 - - - - - - - - -,0.468530,True
3,ligne,1,1_dir_ans_échéant_jour,"[dir, ans, échéant, jour, pre, freinet, , , , ]","[dir, dir, dir]",dir - ans - échéant - jour - pre - freinet - ...,0.545141,False
4,éducation,14,14_éducation_education_éducatif_enseignement,"[éducation, education, éducatif, enseignement,...","[éducation, éducation, éducation]",éducation - education - éducatif - enseignemen...,1.000000,True
...,...,...,...,...,...,...,...,...
784,Fax,20,20_esf_esen_epf_fax,"[esf, esen, epf, fax, ipsos, insa, , , , ]","[EPF, ESEN, ESEN]",esf - esen - epf - fax - ipsos - insa - - - -,1.000000,False
785,93,12,12_93_94_81_84,"[93, 94, 81, 84, 79, 75, 71, 90, 78, 77]","[94, 93, 81]",93 - 94 - 81 - 84 - 79 - 75 - 71 - 90 - 78 - 77,0.680267,True
786,VST,6,6_vst_vol_dcsf_,"[vst, vol, dcsf, , , , , , , ]","[VST, VST, VST]",vst - vol - dcsf - - - - - - -,1.000000,True
787,2008,15,15_2008___,"[2008, , , , , , , , , ]","[2008, 2008, 2008]",2008 - - - - - - - - -,1.000000,True


In [61]:
topic_fig = topic_model.visualize_topics()
topic_fig.show()
# topic_model.visual

In [62]:
h_fig = topic_model.visualize_hierarchy()
h_fig.show()

In [63]:
bar_fig = topic_model.visualize_barchart()
bar_fig.show()

## Building a Knowledge Graph from the extracted topics & docs as an example:

In [15]:
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = ... #database uri link
AUTH = ... #username + password
#NOTE: Unfortunately had to remove these login credentials for security reasons. Can be presented in real-time

#Verifying connection
driver = GraphDatabase.driver(URI, auth=('sample username','sample password'))
driver.verify_connectivity()

In [1]:
#Defining a function to create topic graphs
def create_topic_graph(driver, docs, topics, topic_model):
    """To create a neo4j Knowledge Graph based on the extracted data from the topic model. We accomplish this by the following:
    1. Creating Topic Nodes
    2. Creating Document Nodes and relationships between each doc and their corresponding topic

    Args:
        driver : Variable representing the connected GraphDatabase
        docs (_type_): Docs used for topic modeling
        topics (_type_): topics extracted from BERTopic
        topic_model (_type_): Topic Model fit during topic modeling
    """
    #Initializing driver session to write to KG:
    with driver.session() as session:
        #Creating the topic nodes:
        for topic_id in set(topics):
            topic_keywords = ', '.join([word for word, proba in topic_model.get_topic(topic_id)])
            session.run(
                """CREATE (t:Topic {id: $id, keywords: $keywords})""",
                id=topic_id, keywords=topic_keywords
            )
            
        #Creating document nodes and relationships between the doc and topic
        for doc, topic in zip(docs, topics):
            session.run(
                """
                CREATE (d:Document {content: $content})
                with d
                MATCH (t:Topic {id: $topic_id})
                CREATE (d)-[:BELONGS_TO]->(t)
                """,
                content=doc, topic_id=topic
            )
    

#### *NOTE:* The create function below will create duplicates if run a second time, need to create a seperate function that merges instead to prevent this:

In [174]:
create_topic_graph(driver=driver, docs=docs, topics=topics, topic_model=topic_model) #Actually writes to our Neo4j Knowledge Graph Database