### Set Up

In [1]:
import re
import string
import pickle
import logging
import pandas as pd
import json
from tqdm import tqdm
import plotly.io as pio
import matplotlib.pyplot as plt
import gensim.corpora as corpora
from collections import defaultdict
import os

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from nltk.corpus import stopwords
from haystack.nodes import PreProcessor
from nltk.corpus import PlaintextCorpusReader
from haystack.utils import convert_files_to_docs
from sentence_transformers import SentenceTransformer
from gensim.models.coherencemodel import CoherenceModel

OpenAI tiktoken module is not available for Python < 3.8,Linux ARM64 and AARCH64. Falling back to GPT2TokenizerFast.


In [2]:
# Render plotly figures through the 'iframe' renderer by default.
pio.renderers.default='iframe'
# Only show ERROR-level (and above) messages from haystack's preprocessing logger.
logging.getLogger("haystack.utils.preprocessing").setLevel(logging.ERROR)

In [3]:
import nltk

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Two or more consecutive word characters delimited by word boundaries.
token_pattern = re.compile(r"(?u)\b\w\w+\b")


class LemmaTokenizer(object):
    """Callable that tokenizes a document and returns the lemmas of the kept tokens.

    A token is kept when it is exactly ``'ai'``, or when it is longer than three
    characters, starts with a lowercase ASCII letter and matches ``token_pattern``.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    @staticmethod
    def _is_kept(token):
        """Return True when ``token`` passes the filter described on the class."""
        if token == 'ai':
            return True
        if len(token) <= 3:
            return False
        if not re.match("[a-z].*", token):
            return False
        return token_pattern.match(token) is not None

    def __call__(self, doc):
        kept_tokens = filter(self._is_kept, word_tokenize(doc))
        return [self.wnl.lemmatize(token) for token in kept_tokens]

In [4]:
# Fetch the NLTK 'wordnet' package (presumably the data WordNetLemmatizer needs).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vedantgupta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Fetch the NLTK 'omw-1.4' package (presumably required alongside wordnet).
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/vedantgupta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Load the "all-distilroberta-v1" sentence-embedding model.
# NOTE(review): not referenced again in the cells visible here -- confirm it is still needed.
sentence_model = SentenceTransformer("all-distilroberta-v1")

In [7]:
# Folder holding the saved BERTopic model and its pickled artefacts.
# NOTE(review): hard-coded absolute local path; this name is rebound to the raw
# data folder further down, so cells that use it are sensitive to execution order.
output_path = "/Users/vedantgupta/Desktop/Oncampus - Research/topic modelling/BERT_DTM/"

### Getting the results from the pretrained model

In [8]:
# Load the previously saved BERTopic model from output_path.
topic_model = BERTopic.load(output_path + "bert_dtm_model")

In [9]:
def _load_pickle(file_name):
    """Unpickle ``file_name`` from ``output_path`` and return the stored object."""
    # pickle can execute arbitrary code on load: only point this at trusted files.
    with open(output_path + file_name, 'rb') as handle:
        return pickle.load(handle)


# Artefacts saved next to the model by the training run.
timestamps = _load_pickle('bert_dtm_timestamps.pkl')
prob = _load_pickle('bert_dtm_prob.pkl')
coherence = _load_pickle('bert_dtm_coherence.pkl')
topic = _load_pickle('bert_dtm_topics.pkl')
hierarchical_topics = _load_pickle('bert_dtm_hierarchical_topics.pkl')
docs = _load_pickle('bert_dtm_docs.pkl')

In [22]:
# One row per document: its text, its assigned topic id and its timestamp.
documents = pd.DataFrame({"Document": docs, "Topic": topic, "Timestamps": timestamps})

# Sorted distinct topic ids, and the position of each id within that ordering.
all_topics = sorted(documents.Topic.unique())
all_topics_indices = dict(zip(all_topics, range(len(all_topics))))

# Timestamps stored as strings are parsed into datetimes; other types are kept as-is.
if isinstance(timestamps[0], str):
    infer_datetime_format = True
    documents["Timestamps"] = pd.to_datetime(
        documents["Timestamps"],
        infer_datetime_format=infer_datetime_format,
        format=None,
    )

In [39]:
# Per-topic summary table from the model; the 'Topic' and 'Name' columns are used below.
topicName = topic_model.get_topic_info()

In [40]:
# Index the table by topic id so names can be looked up with .loc[topic_id].
# NOTE(review): not idempotent -- once 'Topic' is the index it is no longer a
# column, so re-running this cell without re-running the previous one fails.
topicName = topicName.set_index('Topic')

In [31]:
# Create the Topic_Name column up front (as the last column) before it is filled in.
documents['Topic_Name'] = None

In [58]:
# Look up every document's topic name in one vectorized pass.
# This avoids rebinding the global `topic` (the unpickled list of topic ids) to a
# scalar, which would break a re-run of the cell that builds `documents`, and it
# does not depend on Topic_Name being the last column.
topic_names = documents['Topic'].map(topicName['Name'])

# Series.map yields NaN for ids absent from topicName's index; fail loudly, as the
# original .loc lookup did, instead of storing NaN names.
missing_topics = documents.loc[topic_names.isna(), 'Topic'].unique()
if len(missing_topics):
    raise KeyError(f"Topics missing from topic info: {sorted(missing_topics)}")

documents['Topic_Name'] = topic_names

### Matching the documents to the files for sentiment analysis

In [60]:
# Order rows by topic, then chronologically within a topic, and renumber the index.
documents = (
    documents
    .sort_values(by=['Topic', 'Timestamps'])
    .reset_index(drop=True)
)

In [61]:
# Collect the text (column 0) of every row whose topic id (column 1) is -1,
# i.e. the documents that were not assigned to any topic.
non_classified_documents = [
    doc_text
    for doc_text, topic_id in zip(documents.iloc[:, 0], documents.iloc[:, 1])
    if topic_id == -1
]

In [62]:
# Draw up to 100 random unclassified documents for manual inspection.
import random

# min() keeps random.sample from raising ValueError when fewer than 100 documents
# are unclassified; the dedicated, seeded generator makes the exported sample
# reproducible without touching the global random state.
non_classified_documents = random.Random(42).sample(
    non_classified_documents,
    min(100, len(non_classified_documents)),
)

In [63]:
# Dump the sampled documents to a text file, numbered from 0, for manual review.
with open("non_classified_documents.txt", 'w') as file:
    for doc_number, doc_text in enumerate(non_classified_documents):
        file.write("Document " + str(doc_number) + " :" + "\n\n" + doc_text + "\n\n\n")

In [64]:
# Group the classified documents by decade so that each one only has to be
# compared against the files of its own decade when matching texts to file names.
dict_keys = ["1950-1959", "1960-1969", "1970-1979", "1980-1989", "1990-1999", "2000-2009", "2010-2019", "2020-2029"]
Bertdocs = defaultdict(list) # decade key -> list of [document text, topic id, topic name]
for i in tqdm(range(documents.shape[0])):
    topic_id = documents.iloc[i, 1]
    if topic_id == -1:
        # Unclassified documents are not matched to files.
        continue
    year = documents.iloc[i, 2].year
    # Decade offset from 1950, clamped so anything before 1950 lands in the first
    # bucket and anything after 2029 in the last -- the same assignment the
    # previous if/elif chain produced, without eight copy-pasted branches.
    decade_index = min(max((year - 1950) // 10, 0), len(dict_keys) - 1)
    Bertdocs[dict_keys[decade_index]].append([documents.iloc[i, 0], topic_id, documents.iloc[i, -1]])

100%|███████████████████████████████████| 13191/13191 [00:01<00:00, 9081.62it/s]


In [65]:
# # Reading the jsonl files
# data_path = "/Users/vedantgupta/Desktop/Oncampus - Research/topic modelling/data/JSONL/"
# file_names = ["1950-1959.jsonl", "1960-1969.jsonl", "1970-1979.jsonl", "1980-1989.jsonl", "1990-1999.jsonl", "2000-2009.jsonl", "2010-2019.jsonl", "2020-2029.jsonl"]
# data_map = {}

# for file in tqdm(file_names):
#     json_path = data_path + file
#     key = file.split(".")[0]
#     with open(json_path) as f:
#         data = [json.loads(line) for line in f]
#     data_map[key] = data

# final_map = defaultdict(list)

# # Reading filenames from folders for the year after 1980 and storing in a dict to match with jsonl filenames
# # 1980-1989
# path = "/Users/vedantgupta/Desktop/Oncampus - Research/topic modelling/data/TRANSFORMED/1980-1989"
# files_1980_1989 = os.listdir(path)
# dict_1980_1989 = {}
# for i in range(len(files_1980_1989)):
#     key = files_1980_1989[i].split("_")[1].split(".")[0]
#     dict_1980_1989[key] = files_1980_1989[i]

# # 1990 - 1999
# path = "/Users/vedantgupta/Desktop/Oncampus - Research/topic modelling/data/TRANSFORMED/1990-1999"
# files_1990_1999 = os.listdir(path)
# dict_1990_1999 = {}
# for i in range(len(files_1990_1999)):
#     key = files_1990_1999[i].split("_")[1].split(".")[0]
#     dict_1990_1999[key] = files_1990_1999[i]

# # 2000 - 2009
# path = "/Users/vedantgupta/Desktop/Oncampus - Research/topic modelling/data/TRANSFORMED/2000-2009"
# files_2000_2009 = os.listdir(path)
# dict_2000_2009 = {}
# for i in range(len(files_2000_2009)):
#     key = files_2000_2009[i].split("_")[1].split(".")[0]
#     dict_2000_2009[key] = files_2000_2009[i]

# # 2010 - 2019
# path = "/Users/vedantgupta/Desktop/Oncampus - Research/topic modelling/data/TRANSFORMED/2010-2019"
# files_2010_2019 = os.listdir(path)
# dict_2010_2019 = {}
# for i in range(len(files_2010_2019)):
#     key = files_2010_2019[i].split("_")[1].split(".")[0]
#     dict_2010_2019[key] = files_2010_2019[i]

# # 2020 - 2029
# path = "/Users/vedantgupta/Desktop/Oncampus - Research/topic modelling/data/TRANSFORMED/2020-2029"
# files_2020_2029 = os.listdir(path)
# dict_2020_2029 = {}
# for i in range(len(files_2020_2029)):
#     key = files_2020_2029[i].split("_")[1].split(".")[0]
#     dict_2020_2029[key] = files_2020_2029[i]
    
# for key, val in tqdm(docs.items()):
#     documents_to_search = data_map[key]
#     for j in range(len(val)):
#         text = val[j][0].strip()
#         topic = val[j][1]
#         for i in range(len(documents_to_search)):
#             text_to_compare = documents_to_search[i]['fullText'].strip()
#             if text == text_to_compare:
#                 if key == "1950-1959" or key == "1960-1969" or key == "1970-1979":
#                     doc_id = documents_to_search[i]['id']+"_1.txt"
#                 elif key == "1980-1989":
#                     doc_id = documents_to_search[i]['id']
#                     doc_id = dict_1980_1989[doc_id]
#                 elif key == "1990-1999":
#                     doc_id = documents_to_search[i]['id']
#                     doc_id = dict_1990_1999[doc_id]
#                 elif key == "2000-2009":
#                     doc_id = documents_to_search[i]['id']
#                     doc_id = dict_2000_2009[doc_id]
#                 elif key == "2010-2019":
#                     doc_id = documents_to_search[i]['id']
#                     doc_id = dict_2010_2019[doc_id]
#                 elif key == "2020-2029":
#                     doc_id = documents_to_search[i]['id']
#                     doc_id = dict_2020_2029[doc_id]
#                 final_map[topic].append([text, doc_id])

# final_map

In [66]:
Bertdocs["1950-1959"][0]

['servo to expand plant construction has been started on an addition to the assembly plant of the servo corporation of america in new hyde park l i it was announced yesterday by henry blackstone president of the company the addition will increase by 50 per cent the area used for assembling and testing precision electromechanical products the company makes electronic automation equipment automation testing equipment radiation detection and measurement systems and radio communication and navigation instruments ',
 0,
 '0_company_system_worker_state']

In [67]:
# Root folder of the raw corpus.
# NOTE(review): this rebinds `output_path`, which earlier in the notebook pointed at
# the BERT_DTM folder; re-running the model/pickle loading cells after this one
# would read from the wrong directory. Also a hard-coded absolute local path.
output_path = "/Users/vedantgupta/Desktop/Oncampus - Research/topic modelling/data/"
# Per-decade folders, relative to output_path. There is no 2020-2029 entry here.
sources = ['CLEANSED/1950-1959', 'CLEANSED/1960-1969', 'CLEANSED/1970-1979', 'CLEANSED/1980-1989', 
          'CLEANSED/1990-1999', 'CLEANSED/2000-2009', 'CLEANSED/2010-2019']
# NLTK's English stop-word list, extended below with corpus-specific additions.
stop_words = stopwords.words('english')
# Extra words to ignore; 'said' and 'still' each appear twice (harmless duplicates).
common_words = ['from', 'subject', 're', 'edu', 'use', 'said', 'find', 'still', 'take', 'year', 'first', 'be', 'am', 
                'are', 'is', 'was', 'were', 'being', 'can', 'could', 'do', 'did', 'does', 'doing', 'have', 'had', 
                'has', 'having', 'may', 'might', 'must', 'shall', 'should', 'will', 'would', 'still', 'going', 'never', 
                'incb21', 'outst21', '000c8', 'sfas', 'rev11', 'inct', 'amer', 'thet', 'inds_inct', 'adrt', 'natl', 
                'benihana_natl', 'tofruzen', 'indst', 'finl', 'inc10', 'amer_commun', 'incn', 'said', 'like', 'sept', 
                'mon', 'tues', 'wed', 'thu', 'fri', 'sat', 'sun', 'east', 'meany', 'corp', 'intl', 'inds', 'pas', 'return', 
                'trela', 'foot', 'time', 'named', 'also']
stop_words.extend(common_words)

In [68]:
# Remove copyright parts from articles
def remove_copyright_footer(sentence):
    """Return ``sentence`` lower-cased with the known copyright footer phrases removed.

    The whole text is lower-cased first, so the phrases are matched
    case-insensitively and the returned text is entirely lower case.
    """
    footer_phrases = (
        'reproduced with permission of the copyright owner.',
        'further reproduction prohibited without permission.',
        'further reproduction is prohibited without permission.',
    )
    cleaned = sentence.lower()
    for phrase in footer_phrases:
        cleaned = cleaned.replace(phrase, '')
    return cleaned

In [69]:
# Rebuild the cleaned corpus per decade, keeping each text next to its file name,
# so the topic-model documents can later be matched back to their source files.
docsDict = defaultdict(list)  # decade key -> [processed texts, file names]
#docs = []
# NOTE(review): this rebinds `timestamps`, replacing the list loaded from the pickle.
timestamps = []
#doc_names = []

# Neither the preprocessor configuration nor the punctuation table depends on the
# source folder, so build them once instead of once per decade.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by=None
)
punctuation_to_space = str.maketrans(string.punctuation, ' '*len(string.punctuation))

for source in sources:
    key = source.split("/")[1]
    all_docs = convert_files_to_docs(dir_path=output_path+source)
    # Drop files with fewer than 10 whitespace-separated words.
    selected_docs = [doc for doc in all_docs if len(doc.content.split()) >= 10]
    print(len(selected_docs))
    # Preprocess once and read both the text and the file name from the same
    # result; previously process() ran twice per decade, doubling the work.
    cleaned_docs = preprocessor.process(selected_docs)
    data = [item.content for item in cleaned_docs]
    names = [item.meta['name'] for item in cleaned_docs]
    # Remove Links
    data = [re.sub(r'http\S+', '', sent) for sent in data]
    # Remove footer (this also lower-cases the text)
    data = [remove_copyright_footer(sent) for sent in data]
    # Fix spaces
    data = [sent.strip() for sent in data]
    # Remove multiple space and new line characters
    # (raw strings: same patterns, without invalid-escape warnings)
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    # Remove Emails
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
    # Remove special characters (anything outside ASCII)
    data = [re.sub(r'[^\x00-\x7f]',r'', sent) for sent in data]
    # Replace every punctuation character with a space
    data = [item.translate(punctuation_to_space) for item in data]
    # Remove multiple space and new line characters
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes (string.punctuation already covers "'",
    # so this pass is kept only for parity with the earlier pipeline)
    processed_docs = [re.sub("'", "", sent) for sent in data]
    print(f"Number of input files: {len(all_docs)}\n"+
          f"Number of selected files: {len(selected_docs)}\n"+
          f"Number of output files: {len(processed_docs)}")
    docsDict[key].append(processed_docs)
    docsDict[key].append(names)
    #docs.extend(processed_docs)
    # The decade's end year (e.g. '1959') is used as the timestamp of each document.
    timestamps.extend([source.split("-")[1]]*len(processed_docs))
    #doc_names.extend(names)

1052


Preprocessing:   0%|          | 0/1052 [00:00<?, ?docs/s]







Preprocessing:   0%|          | 0/1052 [00:00<?, ?docs/s]







Number of input files: 1052
Number of selected files: 1052
Number of output files: 1052
2900


Preprocessing:   0%|          | 0/2900 [00:00<?, ?docs/s]







Preprocessing:   0%|          | 0/2900 [00:00<?, ?docs/s]







Number of input files: 2900
Number of selected files: 2900
Number of output files: 2900
1258


Preprocessing:   0%|          | 0/1258 [00:00<?, ?docs/s]















Preprocessing:   0%|          | 0/1258 [00:00<?, ?docs/s]















Number of input files: 1258
Number of selected files: 1258
Number of output files: 1258
1753


Preprocessing:   0%|          | 0/1753 [00:00<?, ?docs/s]

















Preprocessing:   0%|          | 0/1753 [00:00<?, ?docs/s]

















Number of input files: 1771
Number of selected files: 1753
Number of output files: 1753
1368


Preprocessing:   0%|          | 0/1368 [00:00<?, ?docs/s]















Preprocessing:   0%|          | 0/1368 [00:00<?, ?docs/s]















Number of input files: 1520
Number of selected files: 1368
Number of output files: 1368
2009


Preprocessing:   0%|          | 0/2009 [00:00<?, ?docs/s]















Preprocessing:   0%|          | 0/2009 [00:00<?, ?docs/s]















Number of input files: 2010
Number of selected files: 2009
Number of output files: 2009
2852


Preprocessing:   0%|          | 0/2852 [00:00<?, ?docs/s]































Preprocessing:   0%|          | 0/2852 [00:00<?, ?docs/s]































Number of input files: 2852
Number of selected files: 2852
Number of output files: 2852


In [70]:
Bertdocs["1950-1959"][0]

['servo to expand plant construction has been started on an addition to the assembly plant of the servo corporation of america in new hyde park l i it was announced yesterday by henry blackstone president of the company the addition will increase by 50 per cent the area used for assembling and testing precision electromechanical products the company makes electronic automation equipment automation testing equipment radiation detection and measurement systems and radio communication and navigation instruments ',
 0,
 '0_company_system_worker_state']

In [73]:
# Match every classified document to the file(s) whose cleaned text is identical.
final_map = defaultdict(list)  # topic id -> list of [text, file name, topic id, topic name]
docs_not_matched = []          # [text, topic id] for documents with no identical file text
for key, val in tqdm(Bertdocs.items()):
    # A decade without a processed source folder has no entry in docsDict; treat it
    # as "no candidate files" so its documents are reported as unmatched instead of
    # raising IndexError.
    decade_entry = docsDict.get(key, [])
    all_docs, file_names = decade_entry if len(decade_entry) == 2 else ([], [])

    # Index file names by their cleaned text once per decade, replacing the
    # per-document linear scan. Files with identical text keep their original order.
    files_by_text = defaultdict(list)
    for file_text, file_name in zip(all_docs, file_names):
        files_by_text[file_text].append(file_name)

    # `topic_id` (rather than `topic`) keeps the unpickled global `topic` list intact.
    for documents_to_search, topic_id, topic_name in val:
        matched_files = files_by_text.get(documents_to_search)
        if not matched_files:
            docs_not_matched.append([documents_to_search, topic_id])
            continue
        for file_name in matched_files:
            final_map[topic_id].append([documents_to_search, file_name, topic_id, topic_name])

100%|█████████████████████████████████████████████| 7/7 [00:02<00:00,  2.81it/s]


In [74]:
# Total number of (document, file) matches across all topics.
s = sum(len(matches) for matches in final_map.values())

In [75]:
print(s)

8815


In [76]:
print(len(docs_not_matched))

0


In [77]:
import pickle

In [78]:
# Persist the topic -> matched documents mapping in the current working directory.
with open('final_map.pkl', 'wb') as out_file:
    pickle.dump(final_map, out_file)