In [1]:
!pip install cohere umap-learn altair annoy datasets tqdm bertopic transformers datasets streamlit pyngrok==4.1.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cohere
  Downloading cohere-2.2.5.tar.gz (9.3 kB)
Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 4.0 MB/s 
Collecting annoy
  Downloading annoy-1.17.1.tar.gz (647 kB)
[K     |████████████████████████████████| 647 kB 11.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 40.2 MB/s 
Collecting bertopic
  Downloading bertopic-0.11.0-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 3.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 70.7 MB/s 
[?25hCollecting streamlit
  Downloading streamlit-1.12.2-py2.py3-none-any.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 37.0 MB/s 
[?25hCollec

In [2]:
import cohere
import numpy as np
import re
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex
import warnings
from sklearn.cluster import KMeans
from bertopic._ctfidf import ClassTFIDF
from sklearn.feature_extraction.text import CountVectorizer

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Do not truncate long strings when pandas displays a DataFrame.
pd.set_option('display.max_colwidth', None)

In [45]:
%%writefile helper.py
import streamlit as st
import pandas as pd
import cohere
import umap
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from sklearn.cluster import KMeans
from bertopic._ctfidf import ClassTFIDF
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

import os

# The Cohere key is read from the environment so that no secret is written into
# helper.py. NOTE: the key that used to be hard-coded here has been exposed and
# should be revoked and replaced.
api_key = os.environ.get('COHERE_API_KEY')
if not api_key:
  raise RuntimeError('Set the COHERE_API_KEY environment variable before starting the app.')
co = cohere.Client(api_key)
title = 'Some title needs to be inputted here'

@st.cache(allow_output_mutation=True)
def get_dataset(df, text, title, max_length=500):
  """Return the first ``max_length`` rows of ``df`` as a ``title``/``text`` frame.

  Args:
    df: source DataFrame.
    text: name of the column in ``df`` holding the longer text.
    title: name of the column in ``df`` holding the short title.
    max_length: maximum number of rows to keep (default 500).

  Returns:
    A new DataFrame with exactly the columns ``['title', 'text']``. The input
    frame is left untouched (the previous in-place rename modified the
    caller's object), and the result is an independent copy so columns can be
    added to it later without touching ``df``.
  """
  renamed = df.rename(columns={text: 'text', title: 'title'})
  # head() already returns every row when the frame is shorter than max_length.
  return renamed[['title', 'text']].head(max_length).copy()

@st.cache(allow_output_mutation=True)
def get_embeddings(df):
    """Embed the ``text`` column with Cohere and project the vectors with UMAP.

    Returns:
        A tuple ``(embeds, umap_embeds)``: the embeddings returned by
        ``co.embed`` and the output of ``umap.UMAP(...).fit_transform`` on them.
    """
    texts = list(df['text'])
    response = co.embed(texts=texts, model='large', truncate='LEFT')
    raw_vectors = response.embeddings
    projected = umap.UMAP(n_neighbors=100).fit_transform(raw_vectors)
    return (raw_vectors, projected)

@st.cache(allow_output_mutation=True)
def get_keywords(df, n_clusters=8, chart_title='This is the title'):
  """Cluster the rows of ``df`` and label each row with its cluster's keywords.

  The ``text`` column is embedded via ``get_embeddings``, the embeddings are
  clustered with KMeans, and the titles of each cluster are scored with
  c-TF-IDF to pick up to ten keywords per cluster.

  Args:
    df: DataFrame with ``title`` and ``text`` columns.
    n_clusters: number of KMeans clusters.
    chart_title: passed through unchanged in the return value.

  Returns:
    ``(df, chart_title)`` where ``df`` is a copy of the input with the added
    columns ``x``, ``y`` (UMAP coordinates), ``cluster`` and ``keywords``.
  """
  # Work on a copy so the caller's frame is not modified as a side effect.
  df = df.copy()
  embeds, umap_embeds = get_embeddings(df)
  df['x'] = umap_embeds[:, 0]
  df['y'] = umap_embeds[:, 1]

  classes = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(embeds)

  # One "document" per cluster: all of its titles joined together. Titles are
  # coerced to strings so missing or numeric values cannot break the join.
  documents = pd.DataFrame({'Document': df['title'].fillna('').astype(str).to_numpy(),
                            'Topic': classes})
  documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})

  count_vectorizer = CountVectorizer(stop_words="english").fit(documents_per_topic.Document)
  count = count_vectorizer.transform(documents_per_topic.Document)
  # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
  # prefer the replacement and fall back only on older installations.
  if hasattr(count_vectorizer, 'get_feature_names_out'):
    words = count_vectorizer.get_feature_names_out()
  else:
    words = count_vectorizer.get_feature_names()
  ctfidf = ClassTFIDF().fit_transform(count).toarray()

  # ctfidf rows follow the row order of documents_per_topic, so pair them by
  # position instead of assuming a cluster label equals its row number.
  # argsort is ascending, so each list runs from lower to higher score.
  words_per_class = {
      label: [words[index] for index in ctfidf[row].argsort()[-10:]]
      for row, label in enumerate(documents_per_topic.Topic)
  }
  df['cluster'] = classes
  df['keywords'] = df['cluster'].map(lambda topic_num: ", ".join(words_per_class[topic_num]))
  return df, chart_title

Overwriting helper.py


In [44]:
%%writefile semantic_search.py
import streamlit as st
import cohere
from datasets import load_dataset
import pandas as pd
import numpy as np
from annoy import AnnoyIndex
from helper import get_dataset, get_embeddings, get_keywords

@st.cache(allow_output_mutation=True)
def search(df, query, n_results=10, n_trees=10):
  """Flag the rows of ``df`` that are nearest to ``query`` in embedding space.

  Args:
    df: DataFrame with a ``text`` column (other columns are carried through).
    query: the search string.
    n_results: how many nearest neighbours to flag (default 10).
    n_trees: number of trees built for the Annoy index (default 10).

  Returns:
    A copy of ``df`` with the added columns ``x``, ``y`` (UMAP coordinates) and
    ``neighbour`` (1 for rows returned by the nearest-neighbour lookup, else 0).

  No Cohere client is created here: embedding goes through
  ``helper.get_embeddings``, so the API key that used to be hard-coded in this
  function was never used. That key has been exposed and should be revoked.
  """
  # Copy so the added columns are not written into the caller's frame.
  df = df.copy()

  # Embed the documents and keep their 2-D coordinates for plotting.
  embeds, umap_embeds = get_embeddings(df)
  df['x'] = umap_embeds[:, 0]
  df['y'] = umap_embeds[:, 1]

  # Embed the query through the same helper; only the raw embedding is needed.
  query_embed, _ = get_embeddings(pd.DataFrame({'text': query}, index=[0]))

  # Build an in-memory angular-distance index over the document embeddings.
  # It is not saved to disk: nothing reads the file back, and a fixed file
  # name would be shared by every session of the app.
  embeds = np.array(embeds)
  search_index = AnnoyIndex(embeds.shape[1], 'angular')
  for i, vector in enumerate(embeds):
    search_index.add_item(i, vector)
  search_index.build(n_trees)

  similar_item_ids = search_index.get_nns_by_vector(query_embed[0], n_results)

  # Annoy ids are row positions, so assign the flag positionally rather than
  # joining on the index, which misaligns when df has a non-default index.
  df['neighbour'] = np.isin(np.arange(len(df)), similar_item_ids).astype(int)
  return df

Overwriting semantic_search.py


In [53]:
%%writefile app.py
import streamlit as st
from datasets import load_dataset
import cohere
import numpy as np
import re
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex
import warnings
from sklearn.cluster import KMeans
from bertopic._ctfidf import ClassTFIDF
from sklearn.feature_extraction.text import CountVectorizer
from PIL import Image
import requests
from helper import get_dataset, get_embeddings, get_keywords
from semantic_search import search
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

import os

# Suppress all Python warnings and show untruncated text in DataFrames.
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)


# The Cohere key is read from the environment so that no secret is written into
# app.py. NOTE: the key that used to be hard-coded here has been exposed and
# should be revoked and replaced.
api_key = os.environ.get('COHERE_API_KEY')
if not api_key:
  raise RuntimeError('Set the COHERE_API_KEY environment variable before starting the app.')
co = cohere.Client(api_key)
title = 'Some title needs to be inputted here'



def get_keywords(df, n_clusters=8, chart_title='This is the title'):
  """Cluster ``df`` and attach per-cluster keywords; returns ``(df, chart_title)``.

  Thin wrapper around ``helper.get_keywords``. This module used to carry a
  copy of that function (including a nested, uncached copy of
  ``get_embeddings``) that shadowed the name imported from ``helper``.
  Delegating keeps a single implementation and goes through helper's cached
  ``get_embeddings``. No ``st.cache`` decorator is needed here because
  ``helper.get_keywords`` is already cached.
  """
  # Aliased local import: at module level the name get_keywords is this wrapper.
  from helper import get_keywords as _helper_get_keywords
  return _helper_get_keywords(df, n_clusters=n_clusters, chart_title=chart_title)

def _hidden_axis(channel, field):
  """Build an x/y encoding for ``field`` with a non-zero-based scale and no axis labels, ticks or domain line."""
  return channel(field,
                 scale=alt.Scale(zero=False),
                 axis=alt.Axis(labels=False, ticks=False, domain=False))


def _prepare_dataset(df):
  """Name the two uploaded columns ``title``/``text`` and cap the rows via ``get_dataset``."""
  df.columns = ['title', 'text']
  # get_dataset is cached with allow_output_mutation=True; copy its result so
  # columns added later do not end up in the cached object.
  return get_dataset(df, text='text', title='title').copy()


def main():
  """Render the Streamlit "Analyze" page.

  Lets the user upload a CSV (first two columns are read) and, depending on
  the task chosen in the sidebar, shows basic EDA, a clustered scatter plot
  with keywords, or a scatter plot highlighting the nearest matches to a query.
  """
  df = pd.DataFrame({'title': [], 'text': []})

  # Header: logo and title.
  col1, _, col2 = st.columns([1,1,15])
  # The timeout keeps a stalled logo download from hanging the page render.
  image_cohere = Image.open(requests.get('https://avatars.githubusercontent.com/u/54850923?s=280&v=4', stream=True, timeout=10).raw)
  col1.image(image_cohere, width=80)
  col2.title('Analyze')

  app_mode = st.sidebar.selectbox('Task', ['Import', 'EDA', 'Cluster', 'Search'])

  with st.expander("How this works.", expanded=False):
    st.write(
      """     
      Embeddings are hard to vizualize. Analyze makes it a breeze.
      1. Go ahead and upload a csv file that you want to examine.
          The csv file needs to have atleast 2 columns:
          - The first column being shorter text - a title for example.
          - The second column being the longer text - the body of the text for example.
      2. You then have 3 options:
          - EDA: Get an overview of the file and get some general exploratory data analysis.
          - Cluster: Do some cluster analysis, with keywords generated from the body of the text and using the titles.
          - Search: Query the data and retrieve the closest match.

      To make sure this app works quickly, we only capture the first 500 lines of text.     
      """
    )
    st.markdown('')
  uploaded_file = st.file_uploader("Choose a file")
  if uploaded_file is not None:
    df = pd.read_csv(uploaded_file, usecols=[0, 1])

  # Cluster and Search embed the text column, which cannot work on an empty
  # frame; ask for data instead of failing inside the embedding call.
  if app_mode in ('Cluster', 'Search') and df.empty:
    st.info('Upload a CSV file with at least one row to use this task.')
    return

  if app_mode == "Import":
    st.markdown('')

    # Each CSS declaration ends with ';' so that margin-left is parsed as its
    # own declaration rather than being fused with the width above it.
    st.markdown(
        """
        <style>
        [data-testid="stSidebar"][aria-expanded="true"] > div:first-child{
            width: 350px;
        }
        [data-testid="stSidebar"][aria-expanded="false"] > div:first-child{
            width: 350px;
            margin-left: -350px;
        }
        </style>
        """,

        unsafe_allow_html=True,
    )
  elif app_mode == 'EDA':
    st.sidebar.subheader(' Quick  Explore')
    st.markdown("Tick the box on the side panel to explore the dataset.")
    if st.sidebar.checkbox('Basic Info'):
      if st.sidebar.checkbox("Show Columns"):
        st.subheader('Show Columns List')
        all_columns = df.columns.to_list()
        st.write(all_columns)

      if st.sidebar.checkbox('Overview'):
        st.subheader('File contents')
        st.write(df)
        st.write(f'The number of lines is {df.shape[0]}. We will only process {min(500, df.shape[0])}')
      if st.sidebar.checkbox('Missing Values?'):
        st.subheader('Missing values')
        st.write(df.isnull().sum())

  elif app_mode == 'Cluster':
    # Cap the rows before embedding so the elbow plot and the clustering use
    # the same capped data that the help text promises.
    df = _prepare_dataset(df)
    if len(df) < 2:
      st.warning('At least two rows are needed to cluster.')
      return
    embeds, umap_embeds = get_embeddings(df)
    # KMeans cannot fit more clusters than there are rows.
    low = 1
    high = min(10, len(df))
    med = min(8, high)

    with st.expander("Help", expanded=False):
      st.write(
          """     
          One of the ways to determine the optimal number of clusters, is to choose the number corresponding to an elbow, if it exists.
          """
      )
      st.markdown("")
      # Distortion = mean distance from each point to its nearest centroid.
      distortions = []
      nembeds = np.array(embeds)
      for k in range(low, high + 1):
        # Seeded like the clustering in get_keywords so the curve is stable across reruns.
        km = KMeans(n_clusters=k, random_state=0)
        km.fit(nembeds)
        distortions.append(np.min(cdist(nembeds, km.cluster_centers_, 'euclidean'), axis=1).mean())

      fig = plt.figure(figsize=(10, 4))
      plt.plot(range(low, high + 1), distortions, 'bx-')
      plt.xlabel('Number of clusters')
      plt.ylabel('Distortion')
      plt.title('Determine the optimal number of clusters')
      st.pyplot(fig)
      # Close the figure so reruns of the script do not accumulate open figures.
      plt.close(fig)

    n_clusters = st.slider('Select number of clusters', low, high, med)
    chart_title = uploaded_file.name.split('.')[0] if uploaded_file is not None else 'Title TBD'
    df, chart_title = get_keywords(df, n_clusters=n_clusters, chart_title=chart_title)
    selection = alt.selection_multi(fields=['keywords'], bind='legend')
    # configure() sets the chart config as a whole, so it is applied before the
    # targeted configure_legend()/configure_view() calls rather than after them.
    chart = alt.Chart(df).transform_calculate(
        url=alt.datum.id
    ).mark_circle(size=60, stroke='#666', strokeWidth=1, opacity=0.3).encode(
        x=_hidden_axis(alt.X, 'x'),
        y=_hidden_axis(alt.Y, 'y'),
        href='url:N',
        color=alt.Color('keywords:N',
                        legend=alt.Legend(columns=1, symbolLimit=0, labelFontSize=14)
                      ),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        tooltip=['title', 'keywords', 'cluster']
    ).properties(
        width=800,
        height=500,
        title=chart_title
    ).add_selection(
        selection
    ).configure(background="#FAFAFA").configure_legend(labelLimit= 0).configure_view(
        strokeWidth=0
    ).interactive()
    st.altair_chart(chart, use_container_width=True)
  elif app_mode == 'Search':
    query = st.text_input(label='Search query', value='Show me something important')
    if not query.strip():
      st.warning('Enter a search query.')
      return
    # Apply the same row cap as the Cluster task before embedding.
    df = _prepare_dataset(df)
    df = search(df, query)
    # Plot: rows flagged as neighbours of the query are red, the rest blue.
    chart = alt.Chart(df).transform_calculate(
        url= alt.datum.id
    ).mark_circle(size=60, stroke='#666', strokeWidth=1, opacity=0.3).encode(
        x=_hidden_axis(alt.X, 'x'),
        y=_hidden_axis(alt.Y, 'y'),
        color=alt.Color('neighbour', scale=alt.Scale(domain=[0, 1], range=['blue', 'red'])),
        tooltip=['title']
    ).properties(
        width=800,
        height=500
    ).configure(background="#FAFAFA").configure_legend(labelLimit= 0).configure_view(
        strokeWidth=0
    )
    st.altair_chart(chart.interactive(), use_container_width=True)

# Render the page only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
  main()





Overwriting app.py


In [17]:
!ngrok authtoken 2EFrz6ZHXEJzLa2gEluLjJONyo6_2T1iQUfKihu5g8gftsLDw

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [61]:
# List the processes in this runtime (the recorded output includes a streamlit process).
!ps

    PID TTY          TIME CMD
      1 ?        00:00:00 docker-init
      8 ?        00:00:08 node
     18 ?        00:00:00 tail
     31 ?        00:00:05 python3 <defunct>
     32 ?        00:00:00 colab-fileshim.
     45 ?        00:00:05 jupyter-noteboo
     46 ?        00:00:09 dap_multiplexer
     66 ?        00:01:10 python3
     88 ?        00:00:17 python3
    288 ?        00:05:35 node
   1865 ?        00:03:58 streamlit
   1973 ?        00:00:00 ps


In [83]:
from pyngrok import ngrok
# Open an ngrok tunnel to local port 8090 and print the value returned by connect().
public_url = ngrok.connect(port=8090)
print(public_url)

http://66a8-34-66-124-73.ngrok.io


In [84]:
# Serve app.py with Streamlit on port 8090 (the port passed to ngrok.connect); stdout is discarded.
!streamlit run --server.port 8090 app.py >/dev/null

2022-09-04 18:57:38.887 INFO    numexpr.utils: NumExpr defaulting to 4 threads.


In [79]:
# List the working directory (the recorded output shows app.py, helper.py and semantic_search.py).
!ls

app.py	helper.py  __pycache__	sample_data  semantic_search.py  test.ann


In [82]:
# Colab-only: download semantic_search.py from the runtime through the browser.
from google.colab import files
files.download('semantic_search.py')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>