<a href="https://colab.research.google.com/github/TollanBerhanu/Semantic-search-on-Slack/blob/main/slack_semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementing Semantic Search on Exported Slack Data

This notebook contains an implementation of semantic search on exported slack data

*   *This implementation utilizes the following tools:*

>

    1.   'Pandas' - to load and extract relevant information from the dataset
    2.   'SentenceTransformers embedding model' - to generate embeddings for each chunk of data
    3.   'Pinecone' - to store and query the vector embeddings with some metadata
    4.   'Alpaca / LLaMA model' - to present the results in natural language


## 1. Installing dependencies

In [1]:
# Install dependinces for the LLM (Alpaca)
!pip install -q datasets loralib sentencepiece
!pip uninstall transformers
!pip install -q git+https://github.com/zphang/transformers@c3dc391
!pip -q install git+https://github.com/huggingface/peft.git
!pip -q install bitsandbytes

# Install sentence transformers for generating embeddings
!pip install --upgrade langchain  -q
!pip install sentence_transformers > /dev/null

# Install the pinecone client
!pip install pinecone-client

# Install ngrok to run a server in colab
!pip install pyngrok

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m91.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.5 MB/s[0m eta

## 2. Extracting the messages from the exported slack data

In [3]:
import os
import pandas as pd

slack_data_path = '/content/drive/MyDrive/Colab Notebooks/dataset/slack-data/'
# cwd = os.path.join (os.getcwd(), slack_data)  # join with current_working_directory just in case

def get_all_channels(path):
  df = pd.read_json(path + 'channels.json')

  channel_ids = [id for id in df['id']]
  channel_names = [ name for name in df['name']]

  return pd.DataFrame({ 'channel_id': channel_ids, 'channel_name': channel_names } )

channels = get_all_channels(slack_data_path)
channels

Unnamed: 0,channel_id,channel_name
0,C05D1SE01B7,random
1,C05D77W3N76,general
2,C05D7863DRA,test


In [4]:
import glob
import json

# Return the metadata of each message in the channel
def extract_channel_data(path, channel_name):
  # use glob to get all the json files in the folder
  daily_json_files = glob.glob(path + channel_name +'/*.json')

  # just return if the channel doesn't exist (or hasn't been exported yet)
  if not daily_json_files:
    return

  metadata = pd.DataFrame(columns = ['message', 'channel', 'date', 'time', 'user_id', 'user_name'])

  # loop over the list of json files (each json file includes every post in that channel for a single day)
  for f in daily_json_files:
    # read the json file
    # today_data = pd.read_json(f)
    with open(f, 'r') as file:
        # Read the contents
        data = file.read()
        # Parse the JSON data
        today_data = json.loads(data)

    today_date = f.split("/")[-1]  # 'f' is the full file path and file name
    print('Extracting...', today_date) # the file name is the date

    # iterate through all the messages of the day
    for msg_data in today_data:
      # Skip if its a "channel_join" type message or if the actual message content is empty
      if ('subtype' in msg_data) or (msg_data['text'] == "") or (msg_data['type'] != 'message'):
        continue
        # TODO: filter out any links, stickers, and other junk
        # TODO: replace @Member references by their real names

      metadata.loc[len(metadata)] = {
            'message': msg_data['text'],
            'channel': channel_name,
            'date': today_date.split(".json")[0], # omit the file extension '.json'
            'time': msg_data['ts'],
            'user_id': msg_data['user'],
            'user_name': msg_data['user_profile']['first_name'] # We can also use 'real_name' if we wanted the full name of the user
      }

  return metadata

In [None]:
extract_channel_data(slack_data_path, 'general').to_json(orient="records")

Extracting... 2023-06-19.json


'[{"message":"hello","channel":"general","date":"2023-06-19","time":"1687166197.580079","user_id":"U05DHDPL4FK","user_name":"kenenisaalemayhu0"},{"message":"<https:\\/\\/haystack.deepset.ai\\/tutorials\\/08_preprocessing>","channel":"general","date":"2023-06-19","time":"1687166202.864639","user_id":"U05DHDPL4FK","user_name":"kenenisaalemayhu0"},{"message":"Good work.. now we don\'t have to worry about exporting data.","channel":"general","date":"2023-06-19","time":"1687166814.786429","user_id":"U05CQ93C3FZ","user_name":"Tollan"},{"message":"It\'s best if we just post random topics here to test the semantic search.","channel":"general","date":"2023-06-19","time":"1687166901.338569","user_id":"U05CQ93C3FZ","user_name":"Tollan"},{"message":"yeah then we\\u2019ll see how we can clean the data","channel":"general","date":"2023-06-19","time":"1687167171.439409","user_id":"U05DHDPL4FK","user_name":"kenenisaalemayhu0"},{"message":"but we can\\u2019t use it for search since it\\u2019ll be very 

## 3. Generating Embeddings

**We use SentenceTransformers to generate the embeddings. In this implementation, embeddings are generated for each message on slack.**

In [5]:
# Loading the embedding model
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# ... is equivalent to ...
# SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [6]:
# Return a list of embeddings for all messages of the channel
def embed_channel_messages(channel_data):
  msg_list = channel_data['message'].astype(str).tolist()
  return embedding_model.embed_documents(msg_list)

In [None]:
los = extract_channel_data('general') # ['message'].tolist()
embed_channel_messages(los)

Extracting... 2023-06-19.json


## 4. Storing the embeddings in Pinecone

**We must initialize pinecone before we do anything**

In [7]:
# import getpass
import pinecone
from langchain.vectorstores import Pinecone

# PINECONE_API_KEY = getpass.getpass("Your API key: ")
# PINECONE_ENV = getpass.getpass("Your env't name: ")

PINECONE_API_KEY = "3c7987b8-6c6b-46a3-8adb-667f21b05421"
PINECONE_ENV = "us-west1-gcp-free"

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV,  # next to api key in console
)

# Make a sample embedding to determine the index dimension
sample_embedding = embedding_model.embed_query('Sample text')

# all_indices = pinecone.list_indexes() # List all the indexed in our pinecone workspace
index_name = "slack-embeddings"
index_dimension = len(sample_embedding)


**We create the pinecone index with the same dimensions as the embeddings. This should only be run the first time.**

In [None]:
# Create a pinecone index
print('Creating an index of dimension "'+ str(index_dimension) +'" ...')
pinecone.create_index(index_name, index_dimension)

pinecone.describe_index(index_name)
print('Pinecone index created!')

Creating an index of dimension "384" ...


ApiException: ignored

**Connecting to our pinecone index**

In [8]:
# Connect to the index
index = pinecone.Index(index_name)
# Current index statistics
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'first-upsert': {'vector_count': 19}},
 'total_vector_count': 19}

**This is a function to upsert new data to the pinecone index**

In [None]:
# This will be the size of the batch of vectors sent to pinecone at a time
step = 100

def upsert_channel_embeddings(channel_name, channel_embeddings, channel_data):

  # channel_name = 'general'
  # channel_data = extract_channel_data(channel_name)
  # channel_embeddings = embed_channel_messages(channel_data)

  no_embeddings = len(channel_embeddings)
  print(channel_embeddings)

  parsed_channel_data = json.loads(channel_data.to_json(orient="records"))

  for start in range(0, no_embeddings, step):
    # The end location of the current batch
    end = min(no_embeddings, start+step)    # If it reached the last batch, the end should be the total amount of vectors
                                            # [0..99], [100..199], ... , [1600..1678]  (The last batch should end at 1678)

    # create IDs for all embedded chunks (vectors) ... [channelname_0 -> ... -> channelname_..]
    ids = [channel_name+str(x) for x in range(start, end)]

    # create a records list of current batch for upsert
    records = zip(ids, channel_embeddings[start:end], parsed_channel_data[start:end]) # "vector_id" , [embeddings], {metadata}

    # upsert to Pinecone
      # vectors = [ ( "id1", [0.1,0.2,..], {metadata1} )  ,  ( "id2", [0.4,0.6,..], {metadata2} )  , .. ]
      # namespace = "my-namespace"
    index.upsert(vectors=records, namespace="first-upsert")

    # index stat after current batch upsert
    print('Batch no. ' + str(int( start/step + 1 )) )
    index.describe_index_stats()

  # index stats after all upsert batch
  print('Completed upserting all batches in '+channel_name+' ... ')
  index.describe_index_stats()

**Here, we are upserting messages from all channels. This must only be run the first time (when a new index is created).**

In [None]:
# Upsert all messages using the helper functions defined above
def upsert_all():
  channel_names = channels['channel_name'].tolist()

  for channel_name in channel_names:

    channel_data = extract_channel_data(channel_name)

    channel_embeddings = embed_channel_messages(channel_data)

    upsert_channel_embeddings(channel_name, channel_embeddings, channel_data)

Extracting... 2023-06-19.json
[[-0.07584918290376663, -0.021358055993914604, -0.047360558062791824, 0.024587076157331467, -0.05030532553792, -0.07117699831724167, 0.10254044830799103, -0.06500173360109329, 0.08284778147935867, -0.022649534046649933, 0.06542233377695084, -0.05668095499277115, 0.021410761401057243, -0.020286956802010536, -0.04972054436802864, -0.021618368104100227, -0.028856726363301277, -0.039371080696582794, -0.09512753784656525, 0.006624280475080013, -0.05642743408679962, -0.09569098800420761, 0.0026799780316650867, 0.01117519661784172, -0.0966227799654007, -0.0653490200638771, 0.05503646284341812, 0.03184431418776512, 0.040415842086076736, -0.111893430352211, 0.07045644521713257, 0.1069994568824768, 0.01636463589966297, -0.06084941700100899, -0.013645083643496037, -0.00085640192264691, -0.04624719172716141, -0.05976218730211258, -0.0004942309460602701, 0.04147835075855255, 0.0014390397118404508, -0.05436088517308235, 0.05043290555477142, -0.029950425028800964, 0.0157

## 5. Querying the Messages From Pinecone

**Here we are querying our pinecone index. This involves using embeddings to query the index. We can also filter the results based on their metadata.**

In [21]:
def get_context_from_pinecone(query):
  # Generate embeddings for the query
  embedded_query = embedding_model.embed_query(query)

  query_response = index.query(
      namespace="first-upsert",
      top_k=5,
      include_values=False,
      include_metadata=True,
      vector=embedded_query,

      filter={
          "channel": {"$in": ["general", "random"]}
          # "user_id": {"$in": ["U05D1SQDNSH", "U05DHDPL4FK", "U05CQ93C3FZ", "U05D4M7RGQ3"]}
      }
  )

  matches = query_response['matches']
  max_score = matches[0]['score']

  context = ""
  for ctx in matches:
    context += ctx['metadata']['user_name'] + ' said ' + ctx['metadata']['message'] + '\n'

  return (context, max_score)

example_context = get_context_from_pinecone("What shouldn't we worry about?")
print('Context: \n', example_context[0])
print('Max Score: \n', example_context[1])

Context: 
 Tollan said Good work.. now we don't have to worry about exporting data.
kenenisaalemayhu0 said yeah then we’ll see how we can clean the data
Tollan said And we should also post some stickers... :grinning: :smile: :grin:
kenenisaalemayhu0 said but we can’t use it for search since it’ll be very small
kenenisaalemayhu0 said Random shiii

Max Score: 
 0.264166862


## 6. Getting the response from Alpaca

**We should load the model first**

In [11]:
from peft import PeftModel
from transformers import LLaMATokenizer, LLaMAForCausalLM, GenerationConfig
import textwrap

# Load the model
tokenizer = LLaMATokenizer.from_pretrained("decapoda-research/llama-7b-hf")

model = LLaMAForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, "samwit/alpaca7B-lora")



Downloading (…)l-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00015-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00016-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00017-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00018-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00019-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00020-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00021-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00022-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00023-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00024-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00025-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00026-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00027-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00028-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00029-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00030-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00031-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00032-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading (…)l-00033-of-00033.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)/adapter_config.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

In [22]:
# Define a function that runs the model
def alpaca_talk(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].cuda()

    generation_config = GenerationConfig(
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.2,
    )
    print("Generating... \n")
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
    alpaca_output = ""
    for s in generation_output.sequences:
        # print(tokenizer.decode(s))
        alpaca_output += tokenizer.decode(s)
    return alpaca_output

In [23]:
# Formatting the query to include the context along with some instructions
def format_query(query, context):
  return '''
Below is sequence of chat messages related to a certain topic. Write a response that answers the question below based on
what is discussed in the messages. Do not mention anything outside of what is discussed below. If there isn't enough
context, simply reply "This topic was not discussed previously"

### Messages:
{context}

### Question:
{query}

### Response:
'''.format(context=context, query=query)

In [24]:
def extract_response(response):
  return response.partition("### Response:")[2].strip()

In [26]:
def semantic_search(query):
  # Query pinecone and get the context and the max_score
  context_score = get_context_from_pinecone(query)
  context = context_score[0]
  max_score = context_score[1]

  # Format the query to include the context and some instructions for the LLM
  formatted_query = format_query(query, context)

  # Check if the context retrieved from pinecone is worth it
  if max_score > 0.2:
    # Get the full response from alpaca (including the instructions and the context)
    alpaca_response = alpaca_talk(formatted_query)
  else:
    return 'This topic was not discussed previously!'

  # Extract only the response and return it
  return extract_response(alpaca_response)

In [27]:
# Watch the magic happen... maybe not as magical as you'd expect
semantic_search("What shouldn't we worry about?")

Generating...


"We shouldn't worry about exporting our data because Tollan has already taken care of this issue and we will no longer need to do so. We should however still make sure that all of our data is properly formatted before uploading it into the database."

## 7. Creating an API Endpoint for the Semantic Search

In [28]:
from flask import Flask, request
from pyngrok import ngrok

port_no = 5000

app = Flask(__name__)
ngrok.set_auth_token("2S3sOVlp9tBJmDvErbUdCPoE2ri_3yrw7sVk12An4TZieirkT")
public_url =  ngrok.connect(port_no).public_url

@app.route("/")
def semantic_search_query():
    query = request.args.get('query')
    return semantic_search(query)


print(f"To acces the Gloable link please click... {public_url}")

app.run(port=port_no)



Exception in thread Thread-13 (_monitor_process):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pyngrok/process.py", line 146, in _monitor_process
    self._log_line(self.proc.stdout.readline())
  File "/usr/lib/python3.10/encodings/ascii.py", line 26, in decode
    return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 184: ordinal not in range(128)


To acces the Gloable link please click... https://1f97-35-197-142-67.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
ERROR:__main__:Exception on / [GET]
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
  File "/usr/local/lib/python3.10/dist-packages/flask/app.py", line 1799, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "<ipython-input-28-bede39f70c84>", line 13, in semantic_search_query
    return semantic_search(query)
  File "<ipython-input-26-84f36d5c1197>", line 3, in semantic_search
    context_score = get_context_from_pinecone(query)
  File "<ipython-input-21-3970b9c05999>", line

Generating...


INFO:werkzeug:127.0.0.1 - - [04/Jul/2023 09:30:27] "GET /?query=what%should%i%worry%ABout HTTP/1.1" 200 -


Generating...


INFO:werkzeug:127.0.0.1 - - [04/Jul/2023 09:31:17] "GET /?query=what%should%i%worry%ABout%BAsed%on%tollan HTTP/1.1" 200 -


Generating...


INFO:werkzeug:127.0.0.1 - - [04/Jul/2023 09:32:06] "GET /?query=What%shouldn't%we%worry%ABout HTTP/1.1" 200 -
