<a href="https://colab.research.google.com/github/TollanBerhanu/Semantic-search-on-Slack/blob/main/slack_semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Loading the dataset

In [None]:
import pandas as pd

In [None]:
# Location of the exported chat-log spreadsheet (an absolute path under /content/drive)
DATASET_PATH = '/content/drive/MyDrive/Colab Notebooks/dataset/discord_chatlogs.xlsx'

# Read the spreadsheet into a DataFrame and preview the message-text column
df = pd.read_excel(DATASET_PATH)
df['Content']

0      Thank you! I attempted to address the issue, b...
1                                 already solved! thanks
2      yes! your file has too big name. reduce it and...
3                             and i'm trying to send 100
4                                  which has 10.000 adas
                             ...                        
496    Thanks for the info. Yes, `sh query-tip.sh` wo...
497    Hi, <@588140556958302209>! No, it's an overall...
498    <@665713765743853600> Could you explain what's...
499    also `^M` usually happens when a windows edito...
500                                                  NaN
Name: Content, Length: 501, dtype: object

In [None]:
# Rows with no message text hold NaN; casting those to str would produce the
# literal string 'nan' and inject it into the corpus, so drop them first.
# `df` itself is left unmodified, which keeps this cell safe to re-run.
messages = df['Content'].dropna().astype(str)

# Join the remaining messages into one large corpus of text
conversations = ' '.join(messages)

Splitting the giant string into chunks

In [None]:
# Install langchain (upgrading it if already present); -q keeps pip's output quiet
!pip install --upgrade langchain  -q

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Splitter settings, measured with len(), i.e. in characters
CHUNK_SIZE = 100      # Usually chunk sizes are much larger than this
CHUNK_OVERLAP = 20    # Overlap is needed in case the text is split in odd places

text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

In [None]:
# Split the corpus into chunk documents (the corpus is passed as a one-element list)
# Alternative kept for reference: text_splitter.split_text(conversations)[:2]
chunks = text_splitter.create_documents([conversations])

# Run the splitter once more over the chunk documents
docs = text_splitter.split_documents(chunks)

# Preview the 1st and 2nd chunk
for position in (0, 1):
    print(chunks[position])


page_content='Thank you! I attempted to address the issue, but the error persists. already solved! thanks yes!' metadata={}
page_content="solved! thanks yes! your file has too big name. reduce it and it should work and i'm trying to send" metadata={}


Embedding

Generating embeddings using sentence transformers

In [None]:
# Install sentence_transformers; pip's stdout is redirected to /dev/null to keep the cell output clean
!pip install sentence_transformers > /dev/null

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

In [None]:
# Name of the sentence-transformers model used to embed chunks and queries
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
# ... is equivalent to ...
# SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Collect the raw text (page_content) of every chunk document ...
chunk_texts = [document.page_content for document in chunks]

# ... and embed all of the texts in one call
doc_embeddings = embeddings.embed_documents(chunk_texts)

KeyboardInterrupt: ignored

Storing the embeddings in a vector database (Pinecone)

In [None]:
pip install pinecone-client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import getpass    # To prompt the user for a password without echoing.
import os

# Never store credentials in the notebook: read the API key from the
# PINECONE_API_KEY environment variable, falling back to a hidden prompt.
# NOTE: the key that used to be hardcoded in this cell is exposed in the
# notebook's history and should be revoked/rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Your API key: ")

# The environment name is not a secret, so the previous value stays as the
# default; it can be overridden with the PINECONE_ENV environment variable.
PINECONE_ENV = os.environ.get("PINECONE_ENV", "us-west1-gcp-free")

In [None]:
import pinecone
from langchain.vectorstores import Pinecone

# Initialize the pinecone client with the credentials defined earlier
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV,  # next to api key in console
)

# all_indices = pinecone.list_indexes() # List all the indexes in our pinecone workspace
index_name = "discord-embeddings"

# The index dimension is taken from the length of the first embedding vector
index_dimension = len(doc_embeddings[0])

# The number of embedded chunks
no_embeddings = len(chunks)

# Both values are ints: concatenating them to a str with '+' raises TypeError,
# so format them with f-strings instead.
print(f'No of embeddings: {no_embeddings}')   # The free pinecone API can't take more than 1000 vectors
print(f'Index dimension: {index_dimension}')

384

In [None]:
# Get a handle to the pinecone index named by `index_name`
index = pinecone.Index(index_name)
# Current index statistics (last expression of the cell, so the notebook displays the result)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'first-upsert': {'vector_count': 1679}},
 'total_vector_count': 1679}

In [None]:
#doc_embeddings

In [None]:
step = 100  # This will be the size of the batch of vectors sent to pinecone at a time

# ids, vectors and metadata are paired positionally with zip() below, which
# silently drops the surplus when its inputs differ in length. If the
# embeddings are out of sync with the chunks (e.g. an interrupted or stale
# embedding run), fail loudly instead of upserting a mismatched or partial set.
if len(doc_embeddings) != no_embeddings:
  raise ValueError(
      f'Expected {no_embeddings} embeddings (one per chunk) but found {len(doc_embeddings)}; '
      're-run the embedding cell before upserting.'
  )

for start in range(0, no_embeddings, step):
  # The end location (exclusive) of the current batch; the last batch is capped at the total amount of vectors
  #   [0..99], [100..199], ... , up to no_embeddings - 1
  end = min(no_embeddings, start + step)

  # create IDs for the embedded chunks (vectors) of this batch: "start" .. "end - 1"
  ids = [str(x) for x in range(start, end)]

  # create metadata for each vector ... (ideally, this should be as minimal as possible)
  #   In this case, the original message text is given as the metadata
  metadatas = [{'messages': chunk.page_content} for chunk in chunks[start:end]]

  # create the records of the current batch for upsert
  #   vectors = [ ( "id1", [0.1,0.2,..], {metadata1} )  ,  ( "id2", [0.4,0.6,..], {metadata2} )  , .. ]
  records = zip(ids, doc_embeddings[start:end], metadatas)

  # upsert to Pinecone
  index.upsert(vectors=records, namespace="first-upsert")

  # Progress report; integer division gives the 1-based batch number directly.
  # (The per-batch describe_index_stats() call was dropped: its result was never
  #  displayed, so it only cost an extra network round-trip per batch.)
  print('Batch no. ' + str(start // step + 1))

# index stats after all upsert batches (last expression, so the notebook displays it)
print('Completed upserting all batches: ')
index.describe_index_stats()

Batch no. 1
Batch no. 2
Batch no. 3
Batch no. 4
Batch no. 5
Batch no. 6
Batch no. 7
Batch no. 8
Batch no. 9
Batch no. 10
Batch no. 11
Batch no. 12
Batch no. 13
Batch no. 14
Batch no. 15
Batch no. 16
Batch no. 17
Completed upserting all batches: 


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'first-upsert': {'vector_count': 1679}},
 'total_vector_count': 1679}

In [None]:

# The natural-language question to search the stored messages for
query_text = "How do I open the PPP repo in a docker container"

# Embed the question with the same model that embedded the chunks
embedded_query = embeddings.embed_query(query_text)

# Ask for the 10 nearest vectors in the namespace, with their metadata
# (the original message text) but without the raw vector values
query_response = index.query(
    vector=embedded_query,
    namespace="first-upsert",
    top_k=10,
    include_metadata=True,
    include_values=False,
)

query_response

{'matches': [{'id': '144',
              'metadata': {'messages': 'Look in the PPP Docker container in '
                                       'the scripts folder for a script to '
                                       'query UTxOs.'},
              'score': 0.592360675,
              'values': []},
             {'id': '1651',
              'metadata': {'messages': 'Then a dialog asked if I wanted to '
                                       'open the project from within Docker. I '
                                       'said yes.'},
              'score': 0.543517053,
              'values': []},
             {'id': '1673',
              'metadata': {'messages': '<@665713765743853600> Could you '
                                       "explain what's happen with docker file "
                                       'in our repository after the'},
              'score': 0.541511595,
              'values': []},
             {'id': '1645',
              'metadata': {'messages': 'From t