# Setup Environment <a class=anchor id=section1></a>

In [2]:
%%capture
# Refresh the apt package index, then (re)install the build-essential
# toolchain without prompting (--yes). %%capture hides the cell's output.
# NOTE(review): presumably needed so packages installed in the next cell can
# compile native extensions — confirm.
!apt-get update
!apt-get install --reinstall build-essential --yes

In [3]:
%%capture
!pip install bertopic==0.11.0
!pip install farm-haystack
!pip install spacy
!pip install gensim
!pip install sagemaker_pyspark
!python -m spacy download en_core_web_sm
!pip install joblib==1.1.0

In [4]:
import re
import string
import pickle
import logging
import pandas as pd
import plotly.io as pio
import matplotlib.pyplot as plt
import gensim.corpora as corpora

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from nltk.corpus import stopwords
from haystack.nodes import PreProcessor
from nltk.corpus import PlaintextCorpusReader
from haystack.utils import convert_files_to_docs
from sentence_transformers import SentenceTransformer
from gensim.models.coherencemodel import CoherenceModel

# Only let ERROR-level (and above) records through from haystack's
# preprocessing logger.
logging.getLogger("haystack.utils.preprocessing").setLevel(logging.ERROR)

# Use plotly's "iframe" renderer as the default for figures.
pio.renderers.default = "iframe"

In [5]:
import nltk

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the WordNet corpora (used by WordNetLemmatizer below).
# NOTE(review): word_tokenize additionally relies on NLTK's 'punkt' resource,
# which is not downloaded here — confirm it is available in the environment.
for _corpus in ('wordnet', 'omw-1.4'):
    nltk.download(_corpus)

# Compiled once at module level; LemmaTokenizer reads this global.
token_pattern = re.compile(r"(?u)\b\w\w+\b")

# Sentence-embedding model, identified by name.
sentence_model = SentenceTransformer("all-distilroberta-v1")

class LemmaTokenizer:
    """Callable that splits a document into tokens and lemmatizes the kept ones.

    A token is kept when it is exactly ``'ai'``, or when it is longer than
    three characters, starts with a lowercase ASCII letter and matches the
    module-level ``token_pattern`` at its start.

    NOTE(review): the lemmatizer attribute is named ``wnl``; keep that name if
    pickled objects hold instances of this class (presumably the saved
    BERTopic model does — confirm).
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def _keep(self, token):
        """Return True when ``token`` passes the filter described on the class."""
        if token == 'ai':
            # Explicit exception to the minimum-length rule.
            return True
        if len(token) <= 3:
            return False
        if not re.match("[a-z].*", token):
            return False
        return bool(re.match(token_pattern, token))

    def __call__(self, doc):
        """Tokenize ``doc`` with ``word_tokenize`` and return the lemmas of kept tokens.

        Args:
            doc: Text of one document.

        Returns:
            A list with ``self.wnl.lemmatize(token)`` for every kept token, in
            the order the tokens appear.
        """
        lemmas = []
        for token in word_tokenize(doc):
            if self._keep(token):
                lemmas.append(self.wnl.lemmatize(token))
        return lemmas

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
# nr_bins = 7
# Directory prefix (with trailing slash) under which the saved artifacts live.
output_path = "BERT_DTM/"

# Restore the saved BERTopic model.
topic_model = BERTopic.load(f"{output_path}bert_dtm_model")

# NOTE(review): unlike the other artifacts in this notebook, the embeddings
# pickle is opened relative to the working directory, not output_path —
# confirm this location is intended.
# pickle.load can execute arbitrary code; only load files you produced/trust.
with open('bert_dtm_embeddings.pkl', 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)

In [7]:
def _load_pickle(filename):
    """Unpickle and return the object stored in ``output_path + filename``.

    pickle.load can execute arbitrary code; only use on trusted files.
    """
    with open(output_path + filename, 'rb') as handle:
        return pickle.load(handle)


# Same files, same load order and same target names as before.
timestamps = _load_pickle('bert_dtm_timestamps.pkl')
prob = _load_pickle('bert_dtm_prob.pkl')
coherence = _load_pickle('bert_dtm_coherence.pkl')
topic = _load_pickle('bert_dtm_topics.pkl')
hierarchical_topics = _load_pickle('bert_dtm_hierarchical_topics.pkl')
docs = _load_pickle('bert_dtm_docs.pkl')

In [9]:
# One row per document, carrying its topic and timestamp.
documents = pd.DataFrame(
    {
        "Document": docs,
        "Topic": topic,
        "Timestamps": timestamps,
    }
)

# Distinct topic values in sorted order, and each value's position in that order.
all_topics = sorted(documents.Topic.unique())
all_topics_indices = {
    topic_id: position for position, topic_id in enumerate(all_topics)
}

# Parse the timestamps only when they are strings (decided by looking at the
# first entry).
if isinstance(timestamps[0], str):
    #     infer_datetime_format = True if not datetime_format else False
    infer_datetime_format = True
    documents["Timestamps"] = pd.to_datetime(
        documents["Timestamps"],
        infer_datetime_format=infer_datetime_format,
        format=None,
    )
    #                                              format=datetime_format)

# Binning of the timestamps is currently disabled:
# if nr_bins:
#     documents["Bins"] = pd.cut(documents.Timestamps, bins=nr_bins)
#     documents["Timestamps"] = documents.apply(lambda row: row.Bins.left, 1)

In [8]:
# Display the documents ordered by topic, then by timestamp. The sorted frame
# is only shown; it is not assigned back to `documents`.
# NOTE(review): the stored output of this cell has a `Bins` column that the
# current code never creates (the binning code is commented out), so the
# output looks stale — confirm by re-running on a fresh kernel.
documents.sort_values(by=['Topic', 'Timestamps'])

Unnamed: 0,Document,Topic,Timestamps,Bins
0,mayors statement on budget economies following is the tgxt of the portion of...,-1,1958-12-10 02:02:24.000000000,"(1958-12-10 02:02:24, 1967-07-28 17:08:34.285714272]"
3,by wtt t tajvi m freeman the parker pen company is making plans for what pro...,-1,1958-12-10 02:02:24.000000000,"(1958-12-10 02:02:24, 1967-07-28 17:08:34.285714272]"
9,new office plans break tradition modern buildings try to fit space and equip...,-1,1958-12-10 02:02:24.000000000,"(1958-12-10 02:02:24, 1967-07-28 17:08:34.285714272]"
10,following is the text of the report of the board of directors of the associa...,-1,1958-12-10 02:02:24.000000000,"(1958-12-10 02:02:24, 1967-07-28 17:08:34.285714272]"
13,economics and finance timing sequence in business cycles by edward h collins...,-1,1958-12-10 02:02:24.000000000,"(1958-12-10 02:02:24, 1967-07-28 17:08:34.285714272]"
...,...,...,...,...
12630,on a recent morning in may carlos barrientos drove up to a belt line road in...,16,2010-06-06 06:51:25.714285568,"(2010-06-06 06:51:25.714285568, 2019-01-01]"
12878,updated 4 40 p m on tuesday the reporter michael luo of the times is answeri...,16,2010-06-06 06:51:25.714285568,"(2010-06-06 06:51:25.714285568, 2019-01-01]"
12903,researchers recently learned that immigration and customs enforcement used f...,16,2010-06-06 06:51:25.714285568,"(2010-06-06 06:51:25.714285568, 2019-01-01]"
13004,mayor bill de blasio on tuesday unveiled a sweeping set of proposals aimed a...,16,2010-06-06 06:51:25.714285568,"(2010-06-06 06:51:25.714285568, 2019-01-01]"


In [9]:
# Write the documents table next to the other artifacts, without the index
# column. The path is built from output_path (as for the pickles loaded
# earlier) instead of repeating the directory name, so changing output_path
# moves this file too.
documents.to_csv(output_path + "bert_dtm_documents.csv", index=False)