In [None]:
# Install BERTopic into the runtime via a shell call.
# NOTE(review): no version is pinned here, so the installed release can change between runs.
!pip install bertopic

In [None]:
import pandas as pd
from google.colab import drive
import umap
import sqlalchemy
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from scipy.spatial import KDTree

import nltk
from sentence_transformers import SentenceTransformer
import random

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import collections
from typing import List
import re

from sklearn.metrics.pairwise import cosine_similarity
import sys
import os
import json
import numpy as np
from pprint import pprint
import torch

from random import shuffle

In [None]:
# Mount Google Drive at /content/drive (google.colab helper imported above).
drive.mount('/content/drive')

In [None]:
class CustomClusterer():
  """Embed, reduce, cluster and keyword-label a collection of short texts.

  The pipeline is: sentence embeddings -> UMAP reduction -> HDBSCAN clustering
  -> one bag-of-words document per cluster -> class-based TF-IDF keywords.
  Several helpers are adapted from BERTopic (links are kept next to the code
  they relate to).
  """

  def __init__(self, num_reduced_dimensions: int, num_top_words: int, bm25_weighting: bool = False, reduce_frequent_words: bool = False):
    """Build the models used by ``fit`` and ``inference``.

    Arguments:
        num_reduced_dimensions: Number of UMAP output components
        num_top_words: Stored on the instance as ``num_top_words``
        bm25_weighting: Forwarded to ``ClassTfidfTransformer``
        reduce_frequent_words: Forwarded to ``ClassTfidfTransformer``
    """
    self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    self.umap_model = umap.UMAP(n_neighbors=15, n_components=num_reduced_dimensions, min_dist=0.0, metric='cosine')
    self.hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    self.vectorizer_model = CountVectorizer(stop_words="english")
    self.ctfidf_model = ClassTfidfTransformer(bm25_weighting=bm25_weighting, reduce_frequent_words=reduce_frequent_words)

    # NOTE(review): stored but not read anywhere in this class; ``fit`` always
    # extracts 50 words per topic. Confirm whether it should drive that number.
    self.num_top_words = num_top_words

  def preprocess_text(self, documents: np.ndarray) -> List[str]:
    """ Basic preprocessing of text

    Steps:
        * Replace newlines and tabs with whitespace
        * Only keep ASCII letters, digits and spaces (assumes English text)
        * Replace documents that end up empty with the placeholder "emptydoc"

    Arguments:
        documents: The raw documents, one string per entry

    Returns:
        cleaned_documents: The cleaned documents, in the same order
    """
    cleaned_documents = []
    for doc in documents:
      doc = doc.replace("\n", " ").replace("\t", " ")
      doc = re.sub(r'[^A-Za-z0-9 ]+', '', doc)
      cleaned_documents.append(doc if doc != "" else "emptydoc")
    return cleaned_documents


  def top_n_idx_sparse(self, matrix: csr_matrix, n: int) -> np.ndarray:
    """ Return indices of top n values in each row of a sparse matrix

    Retrieved from:
        https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix

    Arguments:
        matrix: The sparse matrix from which to get the top n indices per row
        n: The number of highest values to extract from each row

    Returns:
        indices: The top n indices per row. Rows with fewer than n stored
                 entries are padded with None so every row has length n.
    """
    indices = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
      n_row_pick = min(n, ri - le)
      if n_row_pick == 0:
        # Nothing stored in this row (or n == 0): argpartition cannot be
        # called on an empty slice, so emit an all-None row directly.
        indices.append([None] * n)
        continue
      values = matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]]
      values = [values[index] if len(values) >= index + 1 else None for index in range(n)]
      indices.append(values)
    return np.array(indices)

  def top_n_values_sparse(self, matrix: csr_matrix, indices: np.ndarray) -> np.ndarray:
    """ Return the top n values for each row in a sparse matrix

    Arguments:
        matrix: The sparse matrix from which to get the top n indices per row
        indices: The top n indices per row (None marks a padded slot)

    Returns:
        top_values: The top n scores per row; padded slots get a score of 0
    """
    top_values = []
    for row, values in enumerate(indices):
      scores = np.array([matrix[row, value] if value is not None else 0 for value in values])
      top_values.append(scores)
    return np.array(top_values)


  def create_topic_vectors(self, documents: pd.DataFrame = None, embeddings: np.ndarray = None):
    """ Average the document embeddings of every topic

    Arguments:
        documents: Frame with a ``topics`` column; rows are matched to
                   ``embeddings`` by position
        embeddings: One embedding row per document

    Returns:
        A dict mapping each topic id (ascending order) to the mean embedding
        of the documents assigned to it
    """
    topic_embeddings = []
    topics = documents.sort_values("topics").topics.unique()
    for topic in topics:
      # positional row numbers, so the frame's index labels do not matter
      indices = np.where(documents.topics == topic)[0]
      indices = [int(index) for index in indices]
      topic_embedding = np.mean(embeddings[indices], axis=0)
      topic_embeddings.append(topic_embedding)
    return dict(zip(topics, np.array(topic_embeddings)))

  def guided_topics(self, embeddings: np.ndarray = None, seed_topics: List[List[str]] = None) -> None:
    """ Unfinished hook for seeded topics

    Only the seed words are embedded so far; nothing is done with the result
    and the method returns None.

    Arguments:
        embeddings: Currently unused
        seed_topics: One list of seed words per topic
    """
    if not seed_topics:
      return None
    seed_topic_words = [" ".join(seed_words) for seed_words in seed_topics]
    seed_topic_words = self.embedding_model.encode(seed_topic_words)
    # TODO: use the seed embeddings; the original implementation stops here.
    return None

  def fit(self, data: pd.DataFrame, data_key: str, seed_topics:List[List[str]] = None) -> dict:
    """ Cluster the texts in ``data[data_key]`` and extract keywords per cluster

    ``data`` is modified in place: a ``topics`` column is added (or
    overwritten) holding the cluster id of each row. Ids are renumbered so
    that 0 is the largest cluster, 1 the second largest, and so on; the
    label -1 (the "other" topic) is left untouched.

    Arguments:
        data: Frame holding the texts
        data_key: Name of the column with the texts
        seed_topics: Accepted but not used by this implementation

    Returns:
        A dict with the keys
        "data" (the same frame, now with ``topics``),
        "embeddings" (the unreduced sentence embeddings),
        "topic_vectors" (topic id -> mean embedding) and
        "topic_names" (topic id -> list of (word, score), best first).
    """
    # The encoder is given a plain list of strings rather than the Series, so
    # the frame's index labels cannot interfere with how texts are looked up.
    texts = data[data_key].tolist()
    embeddings = self.embedding_model.encode(texts, show_progress_bar = True)
    raw_embeddings = embeddings
    self.umap_model.fit(embeddings) # fit to embeddings manifold
    embeddings = np.nan_to_num(self.umap_model.transform(X=embeddings))

    self.hdbscan_model.fit(embeddings)
    data['topics'] = self.hdbscan_model.labels_

    # literally all this code does is sort the topics by its frequency, so topic 0 is remapped to the most frequent topic, 1 to the second-most, etc.
    # https://github.com/MaartenGr/BERTopic/blob/master/bertopic/_bertopic.py#L3830
    size_sorted_topics = pd.DataFrame(collections.Counter(data.topics.values.tolist()).items(), columns=("topics", "size")).sort_values("size", ascending=False)
    size_sorted_topics = size_sorted_topics[size_sorted_topics.topics != -1] # ignore "other" topic for now
    size_sorted_topics = {**{-1:-1}, **dict(zip(size_sorted_topics.topics, range(len(size_sorted_topics))))}
    data['topics'] = data.topics.map(size_sorted_topics).fillna(data.topics).astype(int)

    # one BIG string per topic: the concatenation of every text assigned to it
    # (groupby sorts by topic id, which matches the order of ``labels`` below)
    topic_documents = data.groupby(['topics'], as_index=False).agg({data_key: ' '.join})
    topic_documents = self.preprocess_text(topic_documents[data_key].values)

    self.vectorizer_model.fit(topic_documents)
    X = self.vectorizer_model.transform(topic_documents)

    ctfidf_model = self.ctfidf_model.fit(X, multiplier=None)

    c_tf_idf = ctfidf_model.transform(X)
    words = self.vectorizer_model.get_feature_names_out()

    labels = [int(label) for label in sorted(data.topics.unique())]

    # default num words = 50
    indices = self.top_n_idx_sparse(c_tf_idf, 50)

    scores = self.top_n_values_sparse(c_tf_idf, indices)
    # sort ascending per row; the rows are reversed below to get best-first
    sorted_indices = np.argsort(scores, 1)
    indices = np.take_along_axis(indices, sorted_indices, axis=1)
    scores = np.take_along_axis(scores, sorted_indices, axis=1)

    # padded / zero-score slots become an empty word with a tiny score
    topics = {label: [(words[word_index], score) if word_index is not None and score > 0
                else ("", 0.00001) for word_index, score in zip(indices[index][::-1], scores[index][::-1])]
                for index, label in enumerate(labels)}

    topic_vectors = self.create_topic_vectors(data, raw_embeddings)
    return {"data":data, "embeddings":raw_embeddings, "topic_vectors":topic_vectors, "topic_names":topics}

  def inference(self, complaint:str = None, topic_names:dict = None, topic_embeddings:dict = None) -> tuple[list, List[float], tuple]:
    """ Rank all topics by cosine similarity to a single text

    Arguments:
        complaint: The text to classify
        topic_names: Topic id -> description, as returned by ``fit`` under "topic_names"
        topic_embeddings: Topic id -> embedding, as returned by ``fit`` under "topic_vectors"

    Returns:
        top_topics: The entries of ``topic_names``, most similar topic first
        sims: The cosine similarities in the same order
        topic_ids: The topic ids in the same order
    """
    complaint_embedding = self.embedding_model.encode([complaint])[0]
    complaint_embedding = complaint_embedding / np.linalg.norm(complaint_embedding)

    # Work with positions into a sorted id list instead of assuming that the
    # ids are exactly -1..n-2; that assumption breaks when there is no -1 topic.
    topic_ids = sorted(topic_embeddings.keys())
    cosine_similarities = np.zeros((len(topic_ids),))

    for position, topic_id in enumerate(topic_ids):
      curr_topic_embedding = topic_embeddings[topic_id]
      cosine_similarities[position] = np.dot(curr_topic_embedding / np.linalg.norm(curr_topic_embedding), complaint_embedding)

    order = (-1 * cosine_similarities).argsort()
    sims = [cosine_similarities[position] for position in order]
    ranked_ids = tuple(topic_ids[position] for position in order)
    top_topics = [topic_names[topic_id] for topic_id in ranked_ids]

    return top_topics, sims, ranked_ids


In [None]:
# Shared clusterer instance used by the cells below: 5 UMAP components, BM25-weighted c-TF-IDF
# with frequent-word reduction. num_top_words is only stored on the instance by the constructor.
engine = CustomClusterer(num_reduced_dimensions=5, num_top_words=20, bm25_weighting=True, reduce_frequent_words=True)

In [None]:
path = "./drive/MyDrive/data_analysis/.angelina_data.txt"

# Header lines in the export: every message that follows a header belongs to
# the sender that header selects, until the next header appears.
TO_HEADER = 'to AARP Shayla Angelina Riann Puglia'
FROM_HEADER = 'from AARP Shayla Angelina Riann Puglia'

# Read the export with an explicit encoding so the result does not depend on
# the platform default, then drop surrounding whitespace and blank lines.
with open(path, "r", encoding="utf-8") as data:
  file_data = [elem.strip() for elem in data]
file_data = [elem for elem in file_data if elem != '']

angelina = []
aayush = []
all_msgs = []

# Until the first header is seen, messages go to this throwaway list
# (they are still recorded in all_msgs).
adder = []
for msg in file_data:
  if msg == TO_HEADER:
    adder = aayush
  elif msg == FROM_HEADER:
    adder = angelina
  else:
    adder.append(msg)
    all_msgs.append(msg)

In [None]:
# Message counts, one per line: everything, then each sender.
print(len(all_msgs), len(aayush), len(angelina), sep="\n")

In [None]:
# Wrap each message list in a single-column frame; the column name "data"
# is what the clustering calls below use as data_key.
aayush, angelina, all_msgs = (
  pd.DataFrame(messages, columns=["data"])
  for messages in (aayush, angelina, all_msgs)
)

In [None]:
def get_topics(words):
  """Cluster the messages in ``words`` with the shared ``engine``.

  Arguments:
      words: DataFrame with the message texts in a "data" column

  Returns:
      A ``(frame, all_topics)`` tuple. ``frame`` is the frame returned by
      ``engine.fit`` under "data" (it carries the per-row ``topics`` column)
      and ``all_topics`` maps each topic id to its list of keywords with the
      scores dropped.
  """
  output = engine.fit(words, data_key="data")
  all_topics = {
    topic_id: [word for word, _score in scored_words]
    for topic_id, scored_words in output["topic_names"].items()
  }
  # Return the frame handed back by fit instead of relying on the input
  # having been modified in place.
  return (output["data"], all_topics)

In [None]:
# Cluster aayush's messages. After this line `aayush` is no longer the messages DataFrame but the
# (DataFrame, topic -> keywords dict) tuple returned by get_topics, so this cell is not re-runnable as is.
aayush = get_topics(aayush)


In [None]:
# `aayush` is the (DataFrame, topic -> keywords) tuple returned by get_topics;
# the second element is the dict inspected here.
aayush_topics = aayush[1]
print(len(aayush_topics))
# keywords of topic -1, the "other" topic
pprint(aayush_topics[-1])

In [None]:
# Cluster angelina's messages. After this line `angelina` is no longer the messages DataFrame but the
# (DataFrame, topic -> keywords dict) tuple returned by get_topics, so this cell is not re-runnable as is.
angelina = get_topics(angelina)


In [None]:
current_analysis = all_msg_topics

analysis_topics = current_analysis[1]
analysis_df = current_analysis[0]

N = 590
print(f"{len(analysis_topics)} total topics")
print(f"current topic: {N}\n---------------------------\ntopic description:")
pprint(analysis_topics[N][:10])
print("---------------------------\n")
print("\nsample texts\n---------------------------")

samples = list(analysis_df[analysis_df["topics"] == N]["data"])
print(len(samples))
shuffle(samples)
pprint(samples[:50])

In [None]:
all_msg_topics = get_topics(all_msgs)

In [None]:
# get_topics already reduced the fit output to topic id -> keyword list;
# it is the second element of the tuple stored in `aayush`.
topics_aayush = dict(aayush[1])
print(len(topics_aayush))

In [None]:
# By this point `angelina` may already be the (DataFrame, topics) tuple
# returned by get_topics; fit needs the DataFrame itself.
angelina_df = angelina[0] if isinstance(angelina, tuple) else angelina

angelina_output = engine.fit(angelina_df, data_key="data")
data_angelina = angelina_output["data"]
# unreduced sentence embeddings, one row per message
embeddings_angelina = angelina_output["embeddings"]
tv_angelina = angelina_output["topic_vectors"]
tn_angelina = angelina_output["topic_names"]

In [None]:
# NOTE(review): `text_data` is not assigned in any cell visible here — confirm where it is
# supposed to come from, otherwise this raises NameError on a fresh kernel.
print(text_data)