<a href="https://colab.research.google.com/github/RomanEngeler1805/cohere-hackathon-Sep22/blob/main/Cohere_Embed_Analyse_Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cohere Analyse
This notebook was developed during the *Cohere AI Hackathon #2* to make the Cohere embed endpoints more accessible. <br><br>

It runs and displays all outputs within a streamlit app. It is meant for demonstration purposes; expect some latency while streamlit starts up.
<br><br>

It consists of four parts:
<ul>
<li>Data upload</li>
<li>Exploratory data analysis (EDA)</li>
<li>Cluster analysis</li>
<li>Semantic search</li>
</ul>

<br>

TODO: search for the TODO keywords and insert a valid Cohere API key and a valid ngrok authtoken.

<br>
Make sure to run all cells sequentially and do not skip any section.<br>
A CPU is enough to run it, as all the heavy lifting is done on Cohere's side.


## Install & Imports

In [1]:
# install the packages imported by the scripts below (quiet mode); only pyngrok is pinned to a version
!pip install cohere umap-learn altair annoy bertopic streamlit pyngrok==4.1.1 -q

[K     |████████████████████████████████| 88 kB 4.1 MB/s 
[K     |████████████████████████████████| 647 kB 17.4 MB/s 
[K     |████████████████████████████████| 76 kB 1.7 MB/s 
[K     |████████████████████████████████| 9.1 MB 43.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 12.0 MB/s 
[K     |████████████████████████████████| 5.2 MB 7.8 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 636 kB 56.0 MB/s 
[K     |████████████████████████████████| 85 kB 5.7 MB/s 
[K     |████████████████████████████████| 4.7 MB 54.2 MB/s 
[K     |████████████████████████████████| 1.3 MB 47.2 MB/s 
[K     |████████████████████████████████| 120 kB 62.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 43.0 MB/s 
[K     |████████████████████████████████| 164 kB 61.7 MB/s 
[K     |████████████████████████████████| 4.7

## Streamlit Scripts

In [2]:
%%writefile helper.py
import streamlit as st
import pandas as pd
import cohere
import umap
import warnings
from sklearn.cluster import KMeans
from bertopic._ctfidf import ClassTFIDF
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from typing import Tuple

# TODO: insert valid api key
# NOTE(review): as committed the key is None, so the client below is created
# without credentials — presumably every embed call fails until it is replaced
api_key = None
# module-level cohere client used by get_embeddings
co = cohere.Client(api_key)
# NOTE(review): this module-level name is not read anywhere in this file —
# looks like a leftover placeholder; confirm before removing
title = 'Some title needs to be inputted here'

@st.cache(allow_output_mutation=True)
# Load & Prepare dataset
def get_dataset(df: pd.DataFrame, text: str, title: str, max_length: int=100) -> pd.DataFrame:
  '''
  Pick the title and text columns of a dataframe and limit the number of rows.

  inputs:
  - df: dataframe of data (not modified; a new dataframe is returned)
  - text: name of text column
  - title: name of title column
  - max_length: parameter to limit length for the sake of speed

  outputs:
  - df: dataframe with 'title' and 'text' column and at most max_length rows
  '''
  # rename out of place so the caller's dataframe keeps its column names
  renamed = df.rename(columns={text: 'text', title: 'title'})
  # head() already stops at the last available row; copy() detaches the result
  # from the input so that columns can be added to it later without warnings
  return renamed[['title', 'text']].head(max_length).copy()

@st.cache(allow_output_mutation=True)
def get_embeddings(df: pd.DataFrame) -> Tuple[list, list]:
  '''
  Embed the 'text' column with cohere and project the embeddings with umap.

  input:
  - df: dataframe with 'text' column

  output:
  - embeds: cohere embedding
  - umap_embeds: umap embeddings -> dimensionality reduction technique
  '''
  texts = list(df['text'])
  # one request for all texts
  response = co.embed(texts=texts, model='medium', truncate='LEFT')
  embeds = response.embeddings
  # fit the projection on the embeddings and transform them in one go
  umap_embeds = umap.UMAP(n_neighbors=100).fit_transform(embeds)
  return (embeds, umap_embeds)

@st.cache(allow_output_mutation=True)
def get_keywords(df: pd.DataFrame, n_clusters: int=8) -> pd.DataFrame:
  '''
  Cluster the documents with k-means and describe every cluster by keywords
  taken from the titles of its documents.

  inputs:
  - df: dataframe with columns ('text', 'title', 'embeds', 'x', 'y')
  - n_clusters: number of clusters in k-means (capped at the number of rows)

  outputs:
  - df: copy of the input with the additional columns ('cluster', 'keywords')

  raises:
  - ValueError: if df has no rows
  '''
  if len(df) == 0:
    raise ValueError('get_keywords needs at least one document to cluster')

  # work on a copy so the caller's dataframe is left untouched
  df = df.copy()

  # k-means clustering; it cannot form more clusters than there are documents
  n_clusters = min(n_clusters, len(df))
  kmeans_model = KMeans(n_clusters=n_clusters, random_state=0)
  classes = kmeans_model.fit_predict(list(df['embeds'].values))

  # get keywords from each cluster
  # - group documents by cluster assignment
  # - get tf-idf for the topic words in each cluster
  # missing titles become empty strings, otherwise ' '.join fails on NaN
  titles = df['title'].fillna('').astype(str).values
  documents = pd.DataFrame({"Document": titles, "Topic": classes})
  documents_per_topic = documents.groupby('Topic', as_index=False).agg({'Document': ' '.join})
  count_vectorizer = CountVectorizer(stop_words="english").fit(documents_per_topic.Document)
  count = count_vectorizer.transform(documents_per_topic.Document)
  # get_feature_names() was removed in scikit-learn 1.2; only fall back to it
  # on versions that predate get_feature_names_out()
  if hasattr(count_vectorizer, 'get_feature_names_out'):
    words = count_vectorizer.get_feature_names_out()
  else:
    words = count_vectorizer.get_feature_names()
  ctfidf = ClassTFIDF().fit_transform(count).toarray()
  # the rows of ctfidf follow the row order of documents_per_topic, so each
  # label is paired with its row by position instead of indexing by label;
  # argsort is ascending, i.e. the strongest keyword comes last
  words_per_class = {label: [words[index] for index in ctfidf[row].argsort()[-10:]]
                     for row, label in enumerate(documents_per_topic.Topic)}

  # add cluster assignment and keywords per cluster to dataframe
  df['cluster'] = classes
  df['keywords'] = df['cluster'].map(lambda topic_num: ", ".join(words_per_class[topic_num]))

  return df

Writing helper.py


In [3]:
%%writefile semantic_search.py
import streamlit as st
import pandas as pd
import numpy as np
from annoy import AnnoyIndex
from helper import get_dataset, get_embeddings, get_keywords
from typing import Tuple

@st.cache(allow_output_mutation=True)
def search(df: pd.DataFrame, query: str, n_relevantDocs: int=20) -> pd.DataFrame:
  '''
  Flag the documents whose embeddings are closest to the embedding of a query.

  inputs:
  - df: dataframe with embeds column
  - query: search query
  - n_relevantDocs: number of documents to return for query

  outputs:
  - df: new dataframe with additional column 'relevance', which is 1 for the
        documents returned by the nearest-neighbour lookup and 0 otherwise
  '''
  # nothing to index: return the empty frame with the expected column and
  # skip the embedding request
  if len(df) == 0:
    return df.assign(relevance=np.zeros(0, dtype=int))

  # embed query (the umap projection of the query is not needed here)
  df_query = pd.DataFrame({'text': query}, index=[0])
  query_embed, _ = get_embeddings(df_query)

  # create search index; it is only used inside this call, so it stays in
  # memory and is not written to disk
  embeds = np.array(list(df['embeds'].values))
  search_index = AnnoyIndex(embeds.shape[1], 'angular')
  for position, vector in enumerate(embeds):
    search_index.add_item(position, vector)
  search_index.build(10) # 10 trees

  # Retrieve the nearest neighbors as row positions; a set keeps the
  # membership test below cheap
  nearest = set(search_index.get_nns_by_vector(query_embed[0], n_relevantDocs))

  # relevance is assigned by row position, so it stays correct even when the
  # index of df is not 0..n-1
  relevance = [1 if position in nearest else 0 for position in range(len(df))]
  return df.assign(relevance=relevance)

Writing semantic_search.py


In [8]:
%%writefile app.py
import numpy as np
import pandas as pd
import umap
import altair as alt
import warnings
from PIL import Image
import requests
from sklearn.cluster import KMeans
from bertopic._ctfidf import ClassTFIDF
from sklearn.feature_extraction.text import CountVectorizer
from typing import Tuple
import streamlit as st
from helper import get_dataset, get_embeddings, get_keywords
from semantic_search import search
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

# silence all python warnings for this process
warnings.filterwarnings('ignore')
# do not truncate long cell contents when pandas renders a dataframe
pd.set_option('display.max_colwidth', None)

# work-around to not trigger get_embeddings at every tab change
# NOTE(review): file_name is never read or reassigned in this file, so the
# work-around looks unfinished — confirm before relying on it
file_name = None

# number of rows that are embedded; also quoted in the help text and the EDA overview
_MAX_ROWS = 100

def _load_logo(url):
  '''
  Download the header logo.

  outputs:
  - PIL image, or None when the download or the decoding fails
  '''
  import io  # local import: only needed to wrap the downloaded bytes
  try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return Image.open(io.BytesIO(response.content))
  except (requests.RequestException, OSError):
    # the logo is purely decorative, the app carries on without it
    return None

def _add_embeddings(df):
  '''
  Make sure df carries the cohere embeddings ('embeds') and their 2D umap
  projection ('x', 'y'); the columns are added to df itself.
  '''
  if 'embeds' not in df:
    # get cohere and umap embeddings
    embeds, umap_embeds = get_embeddings(df)
    # store umap embeddings in dataframe for plotting
    df['embeds'] = embeds
    df['x'] = umap_embeds[:,0]
    df['y'] = umap_embeds[:,1]
  return df

def _bare_axis(channel, field):
  '''
  Encode field on channel (alt.X or alt.Y) without forcing zero into the
  scale and without labels, ticks or axis line.
  '''
  return channel(field,
                 scale=alt.Scale(zero=False),
                 axis=alt.Axis(labels=False, ticks=False, domain=False))

def _render_cluster(df):
  '''
  Cluster page: elbow plot, slider for the number of clusters and a scatter
  plot of the umap projection coloured by the cluster keywords.
  '''
  df = _add_embeddings(df)
  nembeds = np.array(list(df['embeds'].values))

  # number of clusters; k-means cannot use more clusters than documents
  low, high = 1, min(10, nembeds.shape[0])
  med = min(8, high)

  with st.expander("Help", expanded=False):
    st.write(
        """     
          One of the ways to determine the optimal number of clusters, is to choose the number corresponding to an elbow.
          """
    )
    st.markdown("")

    # determine distortion as a function of the number of clusters in kmeans
    # (fixed random_state so the curve does not change between reruns)
    distortions = []
    for k in range(low, high + 1):
      km = KMeans(n_clusters=k, random_state=0).fit(nembeds)
      nearest = cdist(nembeds, km.cluster_centers_, 'euclidean').min(axis=1)
      distortions.append(nearest.mean())

    fig = plt.figure(figsize=(10, 4))
    plt.plot(range(low, high + 1), distortions, 'bx-')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.title('Determine the optimal number of clusters')
    st.pyplot(fig)
    # release the figure, otherwise one more stays open after every rerun
    plt.close(fig)

  # user can choose number of clusters (no choice to offer with one document)
  if high > low:
    n_clusters = st.slider('Select number of clusters', low, high, med)
  else:
    n_clusters = low
  df = get_keywords(df, n_clusters=n_clusters)

  selection = alt.selection_multi(fields=['keywords'], bind='legend')
  # configure() replaces the whole chart config, so it has to come before the
  # more specific configure_legend / configure_view calls
  chart = alt.Chart(df).transform_calculate(
      url=alt.datum.id
  ).mark_circle(size=60, stroke='#666', strokeWidth=1, opacity=0.3).encode(
      x=_bare_axis(alt.X, 'x'),
      y=_bare_axis(alt.Y, 'y'),
      href='url:N',
      color=alt.Color('keywords:N',
                      legend=alt.Legend(columns=1, symbolLimit=0, labelFontSize=14)
                    ),
      opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
      tooltip=['title', 'keywords', 'cluster']
  ).properties(
      width=800,
      height=500,
      title='K-Means clustering in 2D umap visualisation'
  ).add_selection(
      selection
  ).configure(background="#FAFAFA").configure_legend(labelLimit= 0).configure_view(
      strokeWidth=0
  ).interactive()
  st.altair_chart(chart, use_container_width=True)

def _render_search(df):
  '''
  Search page: text box for the query and a scatter plot of the umap
  projection in which the documents returned by search() are highlighted.
  '''
  df = _add_embeddings(df)

  # call the search Function
  query = st.text_input(label='Search query', value='Show me something important')
  df = search(df, query)

  # Plot; configure() first, see _render_cluster
  chart = alt.Chart(df).transform_calculate(
      url= alt.datum.id
  ).mark_circle(size=60, stroke='#666', strokeWidth=1, opacity=0.3).encode(
      x=_bare_axis(alt.X, 'x'),
      y=_bare_axis(alt.Y, 'y'),
      color=alt.Color('relevance', scale=alt.Scale(domain=[0, 1], range=['blue', 'red'])),
      tooltip=['title']
  ).properties(
      width=800,
      height=500
  ).configure(background="#FAFAFA").configure_legend(labelLimit= 0).configure_view(
      strokeWidth=0
  ).interactive()
  st.altair_chart(chart, use_container_width=True)

def main():
  '''
  Build the streamlit page: header, help text, csv upload and the task that
  is selected in the sidebar (Import, EDA, Cluster or Search).
  '''
  df = pd.DataFrame({'title': [], 'text': []})
  # rows in the uploaded file before it is cut down to _MAX_ROWS
  total_rows = 0

  # title
  col1, _, col2 = st.columns([1,1,15])

  image_cohere = _load_logo('https://avatars.githubusercontent.com/u/54850923?s=280&v=4')
  if image_cohere is not None:
    col1.image(image_cohere, width=80)
  #
  col2.title('Analyze')

  with st.expander("How this works.", expanded=False):
    st.write(
      f"""     
      Embeddings are hard to vizualize. Analyze makes it a breeze.
      1. Go ahead and upload a csv file that you want to examine.
          The csv file needs to have atleast 2 columns:
          - The first column being shorter text - a title for example.
          - The second column being the longer text - the body of the text for example.
      2. You then have 3 options:
          - EDA: Get an overview of the file and get some general exploratory data analysis.
          - Cluster: Do some cluster analysis, with keywords generated from the body of the text and using the titles.
          - Search: Query the data and retrieve the closest match.

      To make sure this app works quickly, we only capture the first {_MAX_ROWS} lines of text.     
      """
    )
    st.markdown('')

  # file upload
  uploaded_file = st.file_uploader("Choose a file")

  if uploaded_file is not None:
    # NOTE(review): the column positions below do not match the help text
    # above (first column title, second column body) — confirm which is meant
    df = pd.read_csv(uploaded_file, usecols=[5, 6]) # TODO: remove hard coding

    df.columns = ['text', 'title']
    total_rows = df.shape[0]
    df = get_dataset(df, text='text', title='title', max_length=_MAX_ROWS)

  #
  app_mode = st.sidebar.selectbox('Task', ['Import', 'EDA', 'Cluster', 'Search'])

  if app_mode == "Import":
      st.markdown('')

      st.markdown(
          """
          <style>
          [data-testid="stSidebar"][aria-expanded="true"] > div:first-child{
              width: 350px
          }
          [data-testid="stSidebar"][aria-expanded="false"] > div:first-child{
              width: 350px
              margin-left: -350px
          }
          </style>
          """,

          unsafe_allow_html=True,
      )

  elif app_mode == 'EDA':
    st.sidebar.subheader(' Quick  Explore')
    st.markdown("Tick the box on the side panel to explore the dataset.")
    if st.sidebar.checkbox('Basic Info'):
        if st.sidebar.checkbox("Show Columns"):
            st.subheader('Show Columns List')
            all_columns = df.columns.to_list()
            st.write(all_columns)

        if st.sidebar.checkbox('Overview'):
            st.subheader('File contents')
            st.write(df)
            # df is already cut down here, so report the size of the upload
            # next to the number of rows that are actually processed
            st.write(f'The number of lines is {total_rows}. We will only process {df.shape[0]}')
        if st.sidebar.checkbox('Missing Values?'):
            st.subheader('Missing values')
            st.write(df.isnull().sum())

  elif app_mode == 'Cluster':
    # without data there is nothing to embed, and the embed request would fail
    if df.empty:
      st.warning('Please upload a csv file first.')
    else:
      _render_cluster(df)

  elif app_mode == 'Search':
    if df.empty:
      st.warning('Please upload a csv file first.')
    else:
      _render_search(df)

# build the page when this file is executed as a script
if __name__ == '__main__':
  main()

Overwriting app.py


## Run Streamlit App

In [5]:
# get authtoken from https://dashboard.ngrok.com/get-started/setup 
# TODO: insert valid authtoken
!ngrok authtoken 2EGJ6BBj3YLPwKOmEih64tTW8Ix_2bY2s98krjdTQmjLgNYFf

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [6]:
# Note: opening the ngrok url in the browser shows an "insecure site" warning
# open a tunnel to local port 8090, the port streamlit is started on in the next cell
from pyngrok import ngrok
public_url = ngrok.connect(port=8090)
print(public_url)

http://b87f-35-199-175-165.ngrok.io


In [9]:
# start the app in the background on port 8090 and discard its output
!streamlit run --server.port 8090 app.py &>/dev/null&

## Close Down Streamlit App

In [15]:
# close the tunnel first, then stop the ngrok process
# (disconnect() talks to the ngrok process, so it has to run before kill())
ngrok.disconnect(public_url)
ngrok.kill()

In [16]:
# list the process ids of the running streamlit server (nothing is killed here)
!pgrep streamlit

271


In [17]:
# TODO: insert streamlit process number
!kill 271