In [4]:
import numpy as np
import pandas as pd
import pickle
import plotly.graph_objects as go

from pydantic import BaseModel
from typing import List, Dict

In [3]:
class MainOut(BaseModel):
  """Container for the summarizer's outputs that this notebook reads back from a pickle."""
  # Per-chunk records; this notebook reads the 'title' and 'summary' keys of each entry.
  stage_1_outputs: List[Dict[str, str]]
  # Second-stage records; not read anywhere in this notebook.
  stage_2_outputs: List[Dict[str, str]]
  final_summary: str
  markdown_summary: str
  # Raw numpy array field; this is why Config.arbitrary_types_allowed is set below.
  summary_similarity_matrix: np.ndarray
  # One integer per chunk; used below as the marker color in the t-SNE / PCA scatter plots.
  chunk_topics: List[int]

  class Config:
    # Lets pydantic accept field types it has no built-in validator for (np.ndarray above).
    arbitrary_types_allowed = True

In [9]:
import aiohttp
import asyncio
import torch

from loguru import logger
from tqdm import tqdm
from typing import Union, List


url = "https://fun-sentence-embedder-c8f3c4818216.herokuapp.com"


def init():
  """No-op placeholder: takes no arguments, does nothing, returns None."""
  return None

class _Encoder:
  """Split a list of texts into batches and POST them concurrently to `<url>/embed_batch`.

  Each batch is sent as a JSON list of strings; every successful JSON response is
  expected to carry an 'embeddings' entry that `torch.tensor` can convert. Batches are
  tagged with their position so the concatenated result follows the order of
  `text_list` even though the workers finish in arbitrary order.
  """

  def __init__(
    self,
    url,
    session: aiohttp.ClientSession,
    text_list: List[str],
    batch_size: int=8,
    num_workers: int=8
  ):
    """Store the request parameters and set up the work queue and progress bar.

    Args:
      url: Base URL of the embedding service ("/embed_batch" is appended per request).
      session: Open aiohttp session used for every POST.
      text_list: Texts to embed.
      batch_size: Number of texts per request.
      num_workers: Number of concurrent worker tasks.
    """
    self.url = url
    self.session = session
    self.text_list = text_list
    self.batch_size = batch_size
    self.num_workers = num_workers
    # Ceiling division: a trailing partial batch is still one request, so the bar's
    # total matches the number of updates it will actually receive.
    num_batches = (len(text_list) + batch_size - 1) // batch_size
    self.pbar = tqdm(total=num_batches, desc="Encoding")

    self._todo = asyncio.Queue()
    # batch index -> decoded JSON response, filled in by the workers as they finish.
    self._responses_by_batch = {}
    # Successful responses in batch order; populated at the end of _encode_batch().
    self.responses = []

  async def _encode_batch(self) -> torch.Tensor:
    """Embed every batch and return the embeddings concatenated along dim 0.

    Failed batches are logged and left out, so the result then has fewer rows than
    `text_list`; a warning summarising the number of failed batches is logged.
    """
    num_queued = 0
    for start in range(0, len(self.text_list), self.batch_size):
      text_batch = self.text_list[start : start + self.batch_size]
      await self._todo.put((num_queued, text_batch))
      num_queued += 1

    workers = [asyncio.create_task(self.worker()) for _ in range(self.num_workers)]
    try:
      await self._todo.join()
    finally:
      # Workers loop forever; cancel them and wait until they have actually stopped
      # so no task is left pending when the caller closes the session.
      for w in workers: w.cancel()
      await asyncio.gather(*workers, return_exceptions=True)
      self.pbar.close()

    # Restore input order: workers complete in arbitrary order.
    self.responses = [self._responses_by_batch[i] for i in sorted(self._responses_by_batch)]
    num_failed = num_queued - len(self.responses)
    if num_failed:
      logger.warning(f"{num_failed} of {num_queued} batches failed; the returned embeddings have fewer rows than the input texts")
    return torch.cat([torch.tensor(res['embeddings']) for res in self.responses], dim=0)

  async def worker(self):
    """Process queued batches until this task is cancelled."""
    while True:
      try: await self.process_one()
      except asyncio.CancelledError: return

  async def process_one(self):
    """Take one (index, batch) item off the queue, POST it, and record the response.

    Any request or HTTP-status failure is logged rather than raised; the queue item
    is always marked done so `_todo.join()` cannot block on a failed batch.
    """
    batch_index, text_batch = await self._todo.get()
    try:
      async with self.session.post(self.url + "/embed_batch", json=text_batch) as response:
        if response.status != 200: raise Exception(f"Failed to encode text: {text_batch}. Got text: {await response.text()}")
        self._responses_by_batch[batch_index] = await response.json()
    except Exception as e: logger.error(e)
    finally:
      self.pbar.update(1)
      self._todo.task_done()
        

async def _encode(text: Union[str, list[str]], batch_size: int=8) -> torch.Tensor:
  """Open a short-lived aiohttp session and embed `text` through an `_Encoder`.

  The session is closed when the coroutine finishes; the encoder always runs with
  8 workers against the module-level `url`.
  """
  async with aiohttp.ClientSession() as http_session:
    batch_encoder = _Encoder(
      url,
      http_session,
      text,
      batch_size=batch_size,
      num_workers=8,
    )
    embeddings = await batch_encoder._encode_batch()
  return embeddings

def encode(text: Union[str, list[str]]) -> "concurrent.futures.Future":
  """Start embedding `text` and return a future holding the resulting tensor.

  Args:
    text: A single string (wrapped into a one-element list) or a list of strings.

  Returns:
    A `concurrent.futures.Future`; call `.result()` on it to obtain the tensor
    produced by `_encode`.

    * When an event loop is already running in this thread (as inside a notebook
      cell), the coroutine is only scheduled on that loop. The future completes
      once the loop gets to run it, so wait for the result in a later cell rather
      than blocking on it in the same one.
    * When no loop is running, the coroutine is run to completion right here and an
      already-finished future is returned, so callers can use `.result()` either way.
  """
  import concurrent.futures  # local import: only needed to build the finished future

  if isinstance(text, str): text = [text]

  try:
    loop = asyncio.get_running_loop()
  except RuntimeError:
    loop = None

  if loop is not None:
    return asyncio.run_coroutine_threadsafe(_encode(text, batch_size=8), loop)

  # No running loop: scheduling onto an idle loop would never complete, so run the
  # coroutine synchronously and hand back its outcome in a completed future.
  finished = concurrent.futures.Future()
  try:
    finished.set_result(asyncio.run(_encode(text, batch_size=8)))
  except Exception as exc:
    finished.set_exception(exc)
  return finished

In [75]:
# Version tag; selects which pickle is loaded here and is reused in the exported HTML file names.
VERSION = 1.1
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
fn = f'/Users/rohan/1_Project/fun_podsmart_summarizer/summary_obj_v{VERSION}.pkl'
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
# Presumably the pickle holds a MainOut instance (its attributes are read below) — confirm.
with open(fn, 'rb') as f: summary_obj = pickle.load(f)

In [76]:
# Split the stage-1 records into two parallel lists: one of titles, one of summaries.
stage_1_outputs = summary_obj.stage_1_outputs
titles = [record['title'] for record in stage_1_outputs]
summaries = [record['summary'] for record in stage_1_outputs]

# Preview the first three (title, summary) pairs.
list(zip(titles[:3], summaries[:3]))

[('A Computational and Engineering View of Biology',
  'The text discusses the importance of abstractions in understanding the functionality of biological systems, drawing parallels with computer science. It emphasizes the need to consider biological phenomena at different scales and describes the use of abstractions in biology, highlighting the potential for insights from both biologists and computer scientists. The text also includes a quote from a study on cells as information-processing devices and explains the concept of abstractions in computer science and engineering.'),
 ('Abstractions in Computer Science and Biological Information',
  'The text discusses the importance of abstractions in computer science, which help in managing complexity and organizing computer systems. It also highlights the role of information flow in biological processes, emphasizing the need to consider information as a fundamental aspect of biological systems. The comparison is drawn between the historic

In [None]:
# encode() returns a future, not the tensor itself; the tensor is taken out with
# .result() in the following cell.
summary_embeds = encode(summaries)

Encoding:   0%|          | 0/4 [00:00<?, ?it/s]

Encoding: 5it [00:03,  1.46it/s]                       


In [78]:
# Swap the pending future for the tensor it resolves to. The isinstance guard makes
# the cell safe to re-run: once the name holds the tensor there is no .result() to call.
if not isinstance(summary_embeds, torch.Tensor):
  summary_embeds = summary_embeds.result()

In [None]:
# Same as for the summaries: this yields a future that is resolved in the following cell.
title_embeds = encode(titles)

Encoding:   0%|          | 0/4 [00:00<?, ?it/s]

Encoding: 5it [00:01,  2.75it/s]                       


In [80]:
# Swap the pending future for the tensor it resolves to. The isinstance guard makes
# the cell safe to re-run: once the name holds the tensor there is no .result() to call.
if not isinstance(title_embeds, torch.Tensor):
  title_embeds = title_embeds.result()

In [81]:
# Sanity check: both tensors should have one row per entry in titles / summaries.
summary_embeds.shape, title_embeds.shape

(torch.Size([34, 768]), torch.Size([34, 768]))

In [82]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of every summary embedding against every other one.
similarity_matrix = cosine_similarity(summary_embeds, summary_embeds)

In [83]:
# Expect a square matrix with one row and one column per summary.
similarity_matrix.shape

(34, 34)

In [84]:
# Eyeball the values: symmetric, with the diagonal at ~1 up to float32 rounding.
similarity_matrix

array([[1.        , 0.2217517 , 0.35288817, ..., 0.24696626, 0.3594935 ,
        0.4104817 ],
       [0.2217517 , 1.0000001 , 0.18700896, ..., 0.15392655, 0.3371789 ,
        0.49257052],
       [0.35288817, 0.18700896, 0.9999998 , ..., 0.38240036, 0.47766694,
        0.3121438 ],
       ...,
       [0.24696626, 0.15392655, 0.38240036, ..., 1.        , 0.15975073,
        0.2576941 ],
       [0.3594935 , 0.3371789 , 0.47766694, ..., 0.15975073, 1.0000005 ,
        0.35839382],
       [0.4104817 , 0.49257052, 0.3121438 , ..., 0.2576941 , 0.35839382,
        1.0000001 ]], dtype=float32)

In [85]:
# Build a plotly heatmap of the summary-to-summary similarity matrix, with the
# chunk titles as tick labels on both axes.
fig = go.Figure(data=go.Heatmap(
    z=similarity_matrix,
    x=titles,
    y=titles,
    colorscale='Viridis'))

In [86]:
# Write the heatmap to a standalone HTML file in the working directory and open it
# in the browser (auto_open=True); nothing is returned as a string.
fig.write_html(f'similarity_matrix_v{VERSION}.html', auto_open=True)

In [87]:
# Project the summary embeddings down to 2-D with t-SNE for plotting.

from sklearn.manifold import TSNE

# random_state=0 pins the seed that is passed to t-SNE.
tsne = TSNE(n_components=2, random_state=0)
tsne_obj = tsne.fit_transform(summary_embeds)

# One 2-D point per summary.
tsne_obj.shape

(34, 2)

In [88]:
import textwrap
# Hover text per chunk: bold title, then the summary wrapped at 80 columns with
# line breaks converted to <br> so plotly renders them.
formatted_summaries = [
  f"<b>{title}</b><br>{summary}"
  for title, summary in zip(
    titles,
    (textwrap.fill(raw_summary, width=80).replace('\n', '<br>') for raw_summary in summaries),
  )
]

In [89]:
# Preview the first three hover strings.
formatted_summaries[:3]

['<b>A Computational and Engineering View of Biology</b><br>The text discusses the importance of abstractions in understanding the<br>functionality of biological systems, drawing parallels with computer science. It<br>emphasizes the need to consider biological phenomena at different scales and<br>describes the use of abstractions in biology, highlighting the potential for<br>insights from both biologists and computer scientists. The text also includes a<br>quote from a study on cells as information-processing devices and explains the<br>concept of abstractions in computer science and engineering.',
 '<b>Abstractions in Computer Science and Biological Information</b><br>The text discusses the importance of abstractions in computer science, which<br>help in managing complexity and organizing computer systems. It also highlights<br>the role of information flow in biological processes, emphasizing the need to<br>consider information as a fundamental aspect of biological systems. The<br>com

In [90]:
import networkx as nx
from networkx.algorithms import community

def get_topics(title_similarity, num_topics = 8, bonus_constant = 0.25, min_size = 3, seed = None):
  """Group chunks into topics with Louvain community detection on a similarity graph.

  A proximity bonus of `bonus_constant / |i - j|` is added to the similarity of every
  pair of distinct chunks so that chunks close together in document order are pulled
  into the same topic. The Louvain resolution is then raised in small steps until the
  number of communities is `num_topics`, `num_topics + 1` or `num_topics + 2`, and
  Louvain is re-run several times at that resolution; the first run with the most
  evenly sized communities (lowest standard deviation of sizes) whose smallest
  community has at least `min_size` members is kept.

  Args:
    title_similarity: Square numpy similarity matrix. It is not modified.
    num_topics: Desired number of topics (up to two more are accepted).
    bonus_constant: Scale of the proximity bonus.
    min_size: Minimum community size for a partitioning to be eligible.
    seed: Optional seed making the Louvain runs reproducible. None (the default)
      leaves the random source up to networkx, as before.

  Returns:
    dict with
      'chunk_topics': list giving the topic index of each row of `title_similarity`,
      'topics': list of topics, each a list of row indices, ordered by mean row index.

  Raises:
    RuntimeError: if no resolution within the search range yields an accepted
      number of communities.
  """
  import random  # local import: only needed for the optional seed

  # One shared generator across all Louvain calls, so that repeated runs differ from
  # each other but the whole sequence is reproducible for a given seed.
  rng = random.Random(seed) if seed is not None else None

  # Proximity bonus: (1 / |row - col|) * bonus_constant off the diagonal, 0 on it.
  proximity_bonus_arr = np.zeros_like(title_similarity)
  n_rows, n_cols = proximity_bonus_arr.shape
  index_gap = np.abs(np.arange(n_rows)[:, None] - np.arange(n_cols)[None, :])
  off_diagonal = index_gap > 0
  proximity_bonus_arr[off_diagonal] = 1 / index_gap[off_diagonal] * bonus_constant

  # Build a new array instead of `+=` so the caller's matrix is left untouched.
  title_similarity = title_similarity + proximity_bonus_arr

  title_nx_graph = nx.from_numpy_array(title_similarity)

  accepted_counts = (num_topics, num_topics + 1, num_topics + 2)

  resolution = 0.85
  resolution_step = 0.01
  max_resolution_steps = 1000  # upper bound on the search so it cannot loop forever
  iterations = 40

  # Find the resolution that gives the desired number of topics. `resolution` is only
  # advanced after a rejected attempt, so it still holds the accepted value afterwards.
  for _ in range(max_resolution_steps):
    topics_title = community.louvain_communities(title_nx_graph, weight = 'weight', resolution = resolution, seed = rng)
    if len(topics_title) in accepted_counts:
      break
    resolution += resolution_step
  else:
    raise RuntimeError(
      f'No resolution up to {resolution:.2f} produced between {num_topics} and {num_topics + 2} topics'
    )

  # Re-run Louvain at the chosen resolution and remember every partitioning.
  topics_title_accepted = []
  lowest_sd_iteration = 0
  # Set lowest sd to inf
  lowest_sd = float('inf')

  for i in range(iterations):
    topics_title = community.louvain_communities(title_nx_graph, weight = 'weight', resolution = resolution, seed = rng)

    # Check SD
    topic_sizes = [len(c) for c in topics_title]
    sizes_sd = np.std(topic_sizes)

    topics_title_accepted.append(topics_title)

    if sizes_sd < lowest_sd and min(topic_sizes) >= min_size:
      lowest_sd_iteration = i
      lowest_sd = sizes_sd

  # Choose the partitioning with the lowest SD of topic sizes. If none met min_size,
  # this falls back to the first run and lowest_sd stays inf.
  topics_title = topics_title_accepted[lowest_sd_iteration]
  print(f'Best SD: {lowest_sd}, Best iteration: {lowest_sd_iteration}')

  topic_id_means = [sum(e)/len(e) for e in topics_title]
  # Arrange title_topics in order of topic_id_means
  topics_title = [list(c) for _, c in sorted(zip(topic_id_means, topics_title), key = lambda pair: pair[0])]
  # Create an array denoting which topic each chunk belongs to
  chunk_topics = [None] * title_similarity.shape[0]
  for i, c in enumerate(topics_title):
    for j in c:
      chunk_topics[j] = i

  return {
    'chunk_topics': chunk_topics,
    'topics': topics_title
    }


In [91]:
# chunk_topics = get_topics(similarity_matrix, num_topics=len(summaries) // 4, bonus_constant=0.25, min_size=3)['chunk_topics']

In [92]:
# Scatter plot of the 2-D t-SNE points, one marker per chunk.
fig = go.Figure(data=go.Scatter(
  x=tsne_obj[:, 0],
  y=tsne_obj[:, 1],
  mode='markers',
  text=titles,
  hovertext=formatted_summaries,  # bold title + wrapped summary, built in the cell above
  marker=dict(
    size=16,
    color=summary_obj.chunk_topics, # topic ids stored on the loaded summary object
    colorscale='Viridis', # one of plotly colorscales
    showscale=True
  )
))
# Anchor each axis to the other at ratio 1 so distances look the same in x and y.
fig.update_layout(xaxis=dict(scaleanchor="y", scaleratio=1), yaxis=dict(scaleanchor="x", scaleratio=1))


In [93]:
# Export the t-SNE scatter to a standalone HTML file and open it in the browser.
fig.write_html(f'tsne_v{VERSION}.html', auto_open=True)

In [94]:
# Project the summary embeddings down to 2-D with PCA for plotting.

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_obj = pca.fit_transform(summary_embeds)

# One 2-D point per summary.
pca_obj.shape

(34, 2)

In [96]:
# Scatter plot of the 2-D PCA points, one marker per chunk.
fig = go.Figure(data=go.Scatter(
  x=pca_obj[:, 0],
  y=pca_obj[:, 1],
  mode='markers',
  text=titles,
  hovertext=formatted_summaries,  # bold title + wrapped summary, built earlier
  marker=dict(
    size=16,
    color=summary_obj.chunk_topics, # topic ids stored on the loaded summary object
    colorscale='Viridis', # one of plotly colorscales
    showscale=True
  )
))
# Anchor each axis to the other at ratio 1 so distances look the same in x and y.
fig.update_layout(xaxis=dict(scaleanchor="y", scaleratio=1), yaxis=dict(scaleanchor="x", scaleratio=1))

In [97]:
# Export the PCA scatter to a standalone HTML file and open it in the browser.
fig.write_html(f'pca_v{VERSION}.html', auto_open=True)