## Load in Required Packages

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
from sqlalchemy import create_engine

import numpy as np
import nltk
import pandas as pd
import pymysql

## Choose AWS or Local

This code performs some initial setup that depends on whether you are running it on AWS EC2 or locally via Docker containers.

In [None]:
# Run-mode identifiers; later cells compare `run_mode` against these to choose
# between MySQL and file-based input/output.
# Do not edit these 2 lines
LOCAL_RUN_MODE = "LOCAL"
AWS_EC2_RUN_MODE = "AWS_EC2"

# Uncomment the line you want to use, matching to where you're running the code
run_mode = LOCAL_RUN_MODE
# run_mode = AWS_EC2_RUN_MODE

In [None]:
if run_mode == LOCAL_RUN_MODE:
    # Configure MySQL Connection.
    # The password contains a literal '@'. Inside a SQLAlchemy URL it must be
    # percent-encoded ('%40') so it cannot be confused with the '@' that
    # separates the credentials from the host.
    # NOTE(review): credentials are hard-coded here; consider moving them to
    # configuration outside the notebook.
    sqlEngine = create_engine('mysql+pymysql://root:p%40ssw0rd1@cse6242_team094_mysqldb/cse6242_team094')
    # The connection is intentionally left open: later cells write their
    # results through `dbConnection`.
    dbConnection = sqlEngine.connect()

    # Load the whole "processed_abstracts" table into a DataFrame.
    table = pd.read_sql_table("processed_abstracts", con=dbConnection)

In [None]:
if (run_mode == AWS_EC2_RUN_MODE):
    # Read the data from the Parquet files on the EC2 instance.
    # "abstract_parquet" is a relative path, so it is resolved against the
    # current working directory.
    table = pd.read_parquet("abstract_parquet")

In [None]:
# Discard every row that contains a missing value, then renumber the index
# from 0 without keeping the old index as a column.
table = table.dropna().reset_index(drop=True)

## Transform the Preprocessed Abstracts

In [None]:
# Rename columns in one pass: "abstract_tokens" becomes "abstract", and the
# column previously called "abstract" becomes "abstract_raw".
table.rename(columns={"abstract_tokens": "abstract", "abstract": "abstract_raw"}, inplace=True)

In [None]:
# Build TF-IDF features from the "abstract" column.
text = table["abstract"]
# max_features caps the vocabulary at 2**12 (4096) terms.
vectored = TfidfVectorizer(max_features=2**12)
X = vectored.fit_transform(text)
print("vectors formed")

In [None]:
# Reduce dimensionality before clustering. A float n_components asks PCA to
# keep as many components as needed to explain that fraction (here 95%) of the
# variance.
pca = PCA(n_components=0.95, random_state=42)
# X is converted to a dense array with toarray() before being passed to PCA.
X_reduced = pca.fit_transform(X.toarray())
print("pca fit")

In [None]:
# Number of mixture components used when clustering the documents.
k = 20

In [None]:
# Gaussian mixture with k spherical-covariance components, initialised from
# k-means and seeded for reproducibility.
gm = GaussianMixture(
    n_components=k,
    covariance_type="spherical",
    init_params="kmeans",
    random_state=42,
)

In [None]:
# Fit the Gaussian mixture on the PCA-reduced features.
gm.fit(X_reduced)
print("Gaussian Fit")

In [None]:
# Predicted mixture-component index for each document.
y_pred = gm.predict(X_reduced)

In [None]:
# Per-document probability of belonging to each mixture component.
y_gm_proba = gm.predict_proba(X_reduced)

In [None]:
# Progress marker: the per-component probabilities have been computed.
print("probability of topic gained")

In [None]:
# Store each document's predicted component index in the "topic" column.
table['topic'] = y_pred

In [None]:
# One row per document, one column per mixture component.
prob_df = pd.DataFrame(list(y_gm_proba))

In [None]:
# Place the document id and assigned topic next to the per-component
# probabilities. concat(axis=1) aligns rows on the index.
doc_topic_df = pd.concat([table[["cord_uid","topic"]],prob_df],axis=1)

In [None]:
if (run_mode == LOCAL_RUN_MODE):
    # Write to MySQL; if_exists='replace' overwrites a table that already
    # exists under this name.
    doc_topic_df.to_sql("02b_lda_doc_to_topic", con=dbConnection, if_exists='replace')

In [None]:
if (run_mode == AWS_EC2_RUN_MODE):
    # Save to a CSV (relative path, so it lands in the working directory)
    doc_topic_df.to_csv("lda_doc_to_topic.csv")

    # Confirm the write and show the last few rows as a sanity check.
    print("saved lda_doc_to_topic.csv")
    print(doc_topic_df.tail())

In [None]:
# String rendering of each document's probability row.
str_prob = list(map(str, y_gm_proba))

In [None]:
# Keep the stringified probability row alongside each document.
table["topic_prob"] = str_prob

In [None]:
if (run_mode == LOCAL_RUN_MODE):
    # Write to MySQL: document id, assigned topic and the probability row as
    # text. if_exists='replace' overwrites a table that already exists.
    table[["cord_uid","topic","topic_prob"]].to_sql("02b_lda_string_doc_to_topic", con=dbConnection, if_exists='replace')

In [None]:
if (run_mode == AWS_EC2_RUN_MODE):
    # Save to a CSV: document id, assigned topic and the probability row as text
    table[["cord_uid","topic","topic_prob"]].to_csv("string_lda_doc_to_topic.csv")

    # Confirm the write and show the last few rows as a sanity check.
    print("saved string_lda_doc_to_topic.csv")
    print(table[["cord_uid","topic","topic_prob"]].tail())

In [None]:
# One CountVectorizer per cluster, so each cluster gets its own vocabulary.
# The number of vectorizers follows `k`, the component count given to the
# Gaussian mixture, so the two cannot drift apart.
# The token pattern is a raw string so that `\-` reaches the regex engine
# verbatim instead of relying on Python tolerating an unknown string escape.
vectorizers = [
    CountVectorizer(
        min_df=5,
        max_df=0.9,
        stop_words='english',
        lowercase=True,
        token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
    )
    for _ in range(k)
]

In [None]:
# Fit each cluster's vectorizer on the abstracts assigned to that cluster and
# keep the resulting document-term matrices in cluster order.
vectorized_data = []
print("cvec")
for current_cluster, cvec in enumerate(vectorizers):
    in_cluster = table['topic'] == current_cluster
    cluster_abstracts = table.loc[in_cluster, 'abstract']
    vectorized_data.append(cvec.fit_transform(cluster_abstracts))

In [None]:
TOPICS_PER_CLUSTER = 10
# One unfitted Latent Dirichlet Allocation model per cluster. The count follows
# `k`, the component count given to the Gaussian mixture, so there is exactly
# one model for every cluster id.
lda_models = [
    LatentDirichletAllocation(
        n_components=TOPICS_PER_CLUSTER,
        max_iter=10,
        learning_method='online',
        verbose=False,
        random_state=42,
    )
    for _ in range(k)
]

In [None]:
# NOTE(review): this value does not appear to be referenced by any other cell
# visible in this notebook — confirm whether it is still needed.
n_top_words = 10

In [None]:
# Fit each cluster's LDA model on that cluster's document-term matrix and keep
# the fit_transform() result for every cluster.
clusters_lda_data = []
print("fitting lda")
for current_cluster, lda in enumerate(lda_models):
    doc_term_matrix = vectorized_data[current_cluster]
    # Use an identity test: comparing a SciPy sparse matrix with `!= None`
    # goes through the matrix's own comparison operator rather than a plain
    # "is this entry missing" check.
    if doc_term_matrix is not None:
        clusters_lda_data.append(lda.fit_transform(doc_term_matrix))

In [None]:
# Function for collecting the top keywords across a model's topics
def selected_topics(model, vectorizer, top_n=3):
    """Collect the highest-weighted distinct words across all topics of a model.

    For every row of ``model.components_`` the ``top_n`` largest entries are
    looked up in the vectorizer's vocabulary. A word is kept only the first
    time it is encountered, paired with its weight in that topic.

    Args:
        model: Object exposing ``components_``, an iterable of 1-D NumPy
            arrays holding one weight per vocabulary entry.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or,
            on older scikit-learn releases, ``get_feature_names()``.
        top_n: Number of top-weighted words taken from each topic.

    Returns:
        A list of ``(word, weight)`` tuples ordered by descending weight.
    """
    # Resolve the vocabulary once. The original lookup ran once per selected
    # word, and `get_feature_names` no longer exists in recent scikit-learn.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    seen_words = set()  # O(1) duplicate check
    keywords = []
    for topic in model.components_:
        # argsort is ascending, so walk the last `top_n` indices backwards.
        for i in topic.argsort()[:-top_n - 1:-1]:
            word = feature_names[i]
            if word not in seen_words:
                seen_words.add(word)
                keywords.append((word, topic[i]))

    # Ascending stable sort followed by reverse, which keeps the established
    # ordering of entries with equal weights.
    keywords.sort(key=lambda pair: pair[1])
    keywords.reverse()
    return keywords

In [None]:
# Collect the keyword list of every cluster from its fitted LDA model and the
# vectorizer that produced that cluster's vocabulary.
all_keywords = []
print("getting topic words")
for current_vectorizer, lda in enumerate(lda_models):
    # Use an identity test: comparing a SciPy sparse matrix with `!= None`
    # goes through the matrix's own comparison operator rather than a plain
    # "is this entry missing" check.
    if vectorized_data[current_vectorizer] is not None:
        all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))

In [None]:
# Keep at most the first 10 (word, weight) pairs of each cluster's keyword list.
# NOTE(review): the literal 10 equals n_top_words but does not reference it —
# confirm whether the two are meant to be linked.
top_topic_words = [x[:10] for x in all_keywords]

In [None]:
# One row per cluster: the cluster id and its keyword list. Ids are generated
# from `k`, the component count given to the Gaussian mixture, so they always
# cover exactly the clusters that were fitted.
word_per_topic = pd.DataFrame({"Topic": list(range(k)), "related_words": top_topic_words})
# Store the keyword list as text.
word_per_topic["related_words"] = word_per_topic["related_words"].astype(str)

In [None]:
if (run_mode == LOCAL_RUN_MODE):
    # Write to MySQL; if_exists='replace' overwrites a table that already
    # exists under this name.
    word_per_topic.to_sql("02b_lda_string_topic_to_words", con=dbConnection, if_exists='replace')

In [None]:
if (run_mode == AWS_EC2_RUN_MODE):
    # Save to a CSV (relative path, so it lands in the working directory)
    word_per_topic.to_csv("lda_string_topic_to_words.csv")

    # Report the file name and show the last few rows as a sanity check.
    print("making lda_string_topic_to_words.csv")
    print(word_per_topic.tail())

In [None]:
# Flatten each cluster's (word, weight) pairs into a single row of the form
# [word, weight, word, weight, ...].
probs = top_topic_words
flat_probs = [
    [field for pair in cluster_pairs for field in (pair[0], pair[1])]
    for cluster_pairs in probs
]

In [None]:
# Tabular form of flat_probs: one row per cluster, with columns alternating
# between a word and its weight.
flat_probs_df = pd.DataFrame(flat_probs)

In [None]:
# Prefix the flattened word/weight columns with the cluster id.
# concat(axis=1) aligns rows on the index.
extended_topic_df = pd.concat([word_per_topic["Topic"],flat_probs_df],axis=1)

In [None]:
if (run_mode == LOCAL_RUN_MODE):
    # Write to MySQL; if_exists='replace' overwrites a table that already
    # exists under this name.
    extended_topic_df.to_sql("02b_lda_topic_to_words", con=dbConnection, if_exists='replace')

In [None]:
if (run_mode == AWS_EC2_RUN_MODE):
    # Save to a CSV (relative path, so it lands in the working directory)
    extended_topic_df.to_csv("lda_topic_to_words.csv")

    # Report the file name and show the last few rows as a sanity check.
    print("making lda_topic_to_words.csv")
    print(extended_topic_df.tail())