# Hybrid search
- What is Hybrid Search?
    - Hybrid search performs semantic search and keyword search over the data in a single query and combines both result sets to return more relevant results.
    - Here is the workflow :
        - Create dense vectors using an external embedding model.
        - Create sparse vectors using an external model.
        - Create an index that supports sparse-dense vectors (s1 or p1 with the dotproduct metric).
        - Upsert dense and sparse vectors to your index.
        - Search the index using sparse-dense vectors.
        - Pinecone returns sparse-dense vectors.
        
### Read further
- https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/pinecone_hybrid_search.html
- https://docs.pinecone.io/docs/hybrid-search

## Prepare environment

In [None]:
%pip install python-dotenv

In [23]:
import os
from pathlib import Path
# Directory the notebook is running from, and its parent directory
# (the parent is where the .env file is looked up when loading settings).
path_dir_script = Path.cwd()
path_dir_root = path_dir_script.parent

In [None]:
from dotenv import load_dotenv, find_dotenv
# The .env file is expected directly under the root directory computed above.
path_file_dotenv = path_dir_root / '.env'
# Kept as the last expression of the cell so the notebook echoes the result.
load_dotenv(path_file_dotenv)

True

## Load data

In [86]:
!pip install GitPython



In [87]:
from langchain.document_loaders import GitLoader # https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/git.html
import os

In [88]:
# Load only Swift and Objective-C sources (.swift, .h, .m) from the "master"
# branch of the repository whose path is given by PATH_CODE_REPO_1.
loader = GitLoader(
    repo_path=os.environ['PATH_CODE_REPO_1'],
    branch="master",
    # str.endswith accepts a tuple and matches if any suffix applies.
    file_filter=lambda file_path: file_path.endswith((".swift", ".h", ".m")),
)

In [89]:
data = loader.load()

In [90]:
len(data)

3

## Split text

In [91]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  """Split documents into chunks with a RecursiveCharacterTextSplitter.

  Args:
    documents: Documents to split; passed unchanged to ``split_documents``.
    chunk_size: Forwarded to the splitter as ``chunk_size``.
    chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

  Returns:
    Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
    the given documents.
  """
  return RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
  ).split_documents(documents)

texts = split_docs(data)
print(len(texts))

17


## Create index

In [95]:
import pinecone

In [96]:
# Initialize the Pinecone client with credentials read from the environment.
# NOTE(review): the variable names are spelled PINECODE_* (not PINECONE_*) —
# confirm they match the keys defined in the .env file.
# os.getenv returns None for an unset variable, so a mismatched name does not
# raise on lookup; None would simply be passed to init.
pinecone.init(
    api_key=os.getenv('PINECODE_API_KEY'),
    environment=os.getenv('PINECODE_ENVIRONMENT')
)

In [97]:
# Check existing indexes
print(pinecone.list_indexes())

['ai-repo-reader-1']


In [98]:
index_name = "ai-repo-reader-1"

In [99]:
# Clean up previous index before creating new one.
# Delete only when the index actually exists, so a first run (or a re-run
# after the index was already removed) does not fail on a missing index.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

In [100]:
# Note: it is important to make sure that the "context" field that holds the document text in the metadata is not indexed.
# Currently you need to specify explicitly the fields you do want to index. For more information, check out Pinecone's docs.
# The empty "indexed" list below therefore selects no metadata fields for indexing.
pinecone.create_index(
    name = index_name, 
    dimension = 1536, # dimensionality of dense model
    metric = "dotproduct", # sparse values supported only for dotproduct
    pod_type = "s1",
    metadata_config={"indexed": []}
)

In [101]:
index = pinecone.Index(index_name)

## Get embeddings

In [102]:
import os
from langchain.embeddings import OpenAIEmbeddings

In [103]:
embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])

In [48]:
!pip install pinecone_text

Collecting pinecone_text
  Downloading pinecone_text-0.4.2-py3-none-any.whl (17 kB)
Collecting mmh3<4.0.0,>=3.1.0
  Downloading mmh3-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl (12 kB)
Collecting torch<2.0.0,>=1.13.1
  Downloading torch-1.13.1-cp310-none-macosx_10_9_x86_64.whl (135.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.3/135.3 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:03[0m
[?25hCollecting wget<4.0,>=3.2
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting torchvision
  Downloading torchvision-0.14.1-cp310-cp310-macosx_10_9_x86_64.whl (1.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.w

In [104]:
from pinecone_text.sparse import BM25Encoder
# or from pinecone_text.sparse import SpladeEncoder if you wish to work with SPLADE
# use default tf-idf values
bm25_encoder = BM25Encoder().default()

100% [........................................................................] 65406227 / 65406227

In [105]:
corpus = [t.page_content for t in texts]

# fit tf-idf values on your corpus
bm25_encoder.fit(corpus)

# store the values to a json file
bm25_encoder.dump("bm25_values.json")

# load to your BM25Encoder object
bm25_encoder = BM25Encoder().load("bm25_values.json")

  0%|          | 0/17 [00:00<?, ?it/s]

## Load Retriever

In [106]:
from langchain.retrievers import PineconeHybridSearchRetriever
retriever = PineconeHybridSearchRetriever(embeddings=embeddings, sparse_encoder=bm25_encoder, index=index)

In [107]:
retriever.add_texts(corpus)

  0%|          | 0/1 [00:00<?, ?it/s]

## Use Retriever

In [108]:
result = retriever.get_relevant_documents("Where is frequency measured?")

In [109]:
print(result[0].page_content)

AudioKit.stop()
    timer?.pause()
  }
  
  /**
   Exponential smoothing:
   https://en.wikipedia.org/wiki/Exponential_smoothing
  */
  fileprivate func smooth(_ value: Double) -> Double {
    var frequency = value
    if smoothingBuffer.count > 0 {
      let last = smoothingBuffer.last!
      frequency = (smoothing * value) + (1.0 - smoothing) * last
      if smoothingBuffer.count > smoothingBufferCount {
        smoothingBuffer.removeFirst()
      }
    }
    smoothingBuffer.append(frequency)
    return frequency
  }
  
  static func newOutput(_ frequency: Double, _ amplitude: Double) -> TunerOutput {
    let output = TunerOutput()
    
    var norm = frequency
    while norm > frequencies[frequencies.count - 1] {
      norm = norm / 2.0
    }
    while norm < frequencies[0] {
      norm = norm * 2.0
    }
    
    var i = -1
    var min = Double.infinity
    for n in 0...frequencies.count-1 {
      let diff = frequencies[n] - norm
      if abs(diff) < abs(min) {


In [110]:
result = retriever.get_relevant_documents("Where is AudioKit instance used?")

In [111]:
print(result[0].page_content)

// MARK:- Imports

import AudioKit
import Chronos


// MARK:- Constants


In [112]:
print(result[1].page_content)

/**
A Tuner uses the devices microphone and interprets the frequency, pitch, etc.
*/
@objc public class Tuner: NSObject {
  
  fileprivate let updateInterval: TimeInterval = 0.03
  fileprivate let smoothingBufferCount = 30
    
  /**
  Object adopting the TunerDelegate protocol that should receive callbacks
  from this tuner.
  */
  public var delegate: TunerDelegate?
  
  fileprivate let threshold: Double
  fileprivate let smoothing: Double
  fileprivate let microphone: AKMicrophone
  fileprivate let tracker: AKFrequencyTracker
  fileprivate let silence: AKBooster
  fileprivate var timer: DispatchTimer?
  fileprivate var smoothingBuffer: [Double] = []
  
  /**
  Initializes a new Tuner.
  
   - parameter threshold: The minimum amplitude to recognize, 0 < threshold < 1
   - parameter smoothing: Exponential smoothing factor, 0 < smoothing < 1
   
  */
  public init(threshold: Float = 0.0, smoothing: Float = 0.25) {
    self.threshold = Double(min(abs(threshold), 1.0))
