## Load in Required Packages

In [None]:
from bertopic import BERTopic
from sqlalchemy import create_engine

import numpy as np
import pandas as pd
import pymysql

## Choose AWS or Local

This code does some initial setup that depends on whether you're running it on an AWS EC2 instance or locally via Docker containers.

In [None]:
# Run-mode identifiers that the later cells compare `run_mode` against.
# Do not edit these 2 lines.
LOCAL_RUN_MODE = "LOCAL"
AWS_EC2_RUN_MODE = "AWS_EC2"

# Select where the notebook is running by leaving exactly one of the two
# assignments below uncommented. In LOCAL mode the later cells read from and
# write to MySQL; in AWS_EC2 mode they read Parquet and write CSV files.
run_mode = LOCAL_RUN_MODE
# run_mode = AWS_EC2_RUN_MODE

In [None]:
if run_mode == LOCAL_RUN_MODE:
    # Configure the MySQL connection. `dbConnection` is kept open on purpose:
    # the output cell at the end of the notebook reuses it to write results.
    #
    # The password is "p@ssw0rd1". Its "@" has to be percent-encoded as "%40"
    # inside the URL; left raw, SQLAlchemy's URL parser can treat it as the
    # separator between credentials and host and derive the wrong password
    # and host name. SQLAlchemy decodes "%40" back to "@" before connecting.
    sqlEngine = create_engine(
        'mysql+pymysql://root:p%40ssw0rd1@cse6242_team094_mysqldb/cse6242_team094'
    )
    dbConnection = sqlEngine.connect()

    # Load the preprocessed abstracts table into a DataFrame.
    abstracts = pd.read_sql_table("processed_abstracts", con=dbConnection)

In [None]:
if run_mode == AWS_EC2_RUN_MODE:
    # In EC2 mode the preprocessed abstracts come from Parquet data on the
    # instance instead of the MySQL table used for local runs.
    parquet_source = "abstract_parquet"
    abstracts = pd.read_parquet(parquet_source)

## Process the Loaded, Preprocessed Abstracts using BERTopic

In [None]:
# Pull the token string of every abstract out of the frame as a plain list.
abstract_li = abstracts["abstract_tokens"].values.tolist()

# Replace the commas in each token string with spaces so every abstract is
# handed to BERTopic as a single space-delimited document.
abstract_li2 = [tokens.replace(",", " ") for tokens in abstract_li]

# Fit the topic model and get, per document, the assigned topic (`topics`)
# and the probability output (`probs`) requested via calculate_probabilities.
topic_model = BERTopic(language="english", calculate_probabilities=True)
topics, probs = topic_model.fit_transform(abstract_li2)

# Attach the results to the abstracts: the topic id, plus the document's
# whole probability entry stored in a single column.
abstracts["topic"] = topics
abstracts["topic_prob"] = list(probs)

# "Old" output layout: id, topic and the probability entry in one column.
old_version = abstracts[["cord_uid", "topic", "topic_prob"]]

# Wide layout: one row per document, one numbered column per probability.
prob_df = pd.DataFrame(list(probs))

In [None]:
# Document-level output: id and assigned topic of every abstract, followed by
# the numbered probability columns of `prob_df`.
doc_topic_df = pd.concat([abstracts[["cord_uid", "topic"]], prob_df], axis=1)

# Topic-level summary from BERTopic; the code below relies on its "Topic" column.
topic_words = topic_model.get_topic_freq()

# Fetch each topic's word list once and reuse it for both outputs below.
# NOTE: this rebinds `probs` (previously the per-document probabilities from
# fit_transform) to the per-topic word lists; the name is kept so the
# notebook's variables stay the same as before.
probs = [topic_model.get_topic(x) for x in topic_words["Topic"]]

# String form: the complete word list of a topic rendered as one text value.
topic_words["related_words"] = probs
topic_words["related_words"] = topic_words["related_words"].astype(str)

# Wide form: flatten each topic's entries into one row by taking the first
# two items of every entry in order (presumably word and score -- confirm
# against the BERTopic `get_topic` documentation).
flat_probs = [
    [value for word in words for value in (word[0], word[1])]
    for words in probs
]

# Build the wide frame on topic_words' own index. get_topic_freq() may return
# rows whose index labels are not 0..n-1 in row order (e.g. after sorting by
# count); with a fresh default index, the axis=1 concat below would align by
# label and could attach a topic's words to a different topic's row.
flat_probs_df = pd.DataFrame(flat_probs, index=topic_words.index)

extended_topic_df = pd.concat([topic_words, flat_probs_df], axis=1)

## Output the Data

In [None]:
if run_mode == LOCAL_RUN_MODE:
    # Write each result frame to its own MySQL table over the connection
    # opened when the abstracts were loaded, replacing any existing table.
    mysql_targets = (
        ("02a_bert_string_doc_to_topic", old_version),
        ("02a_bert_doc_topic", doc_topic_df),
        ("02a_bert_string_topic_to_words", topic_words),
        ("02a_bert_extended_topic", extended_topic_df),
    )
    for table_name, frame in mysql_targets:
        frame.to_sql(table_name, con=dbConnection, if_exists="replace")

In [None]:
if run_mode == AWS_EC2_RUN_MODE:
    # Save each result frame to its own CSV file in the working directory.
    csv_targets = (
        ("string_doc_to_topic.csv", old_version),
        ("doc_topic_df.csv", doc_topic_df),
        ("string_topic_to_words.csv", topic_words),
        ("extended_topic_df.csv", extended_topic_df),
    )
    for csv_path, frame in csv_targets:
        frame.to_csv(csv_path)