# Search Cisco Corpus
Given a query (question), search for the best-matching chunks in the whole Cisco corpus using the Facebook AI Similarity Search (FAISS) library. FAISS has an excellent GPU implementation of "brute-force" kNN (meaning that no approximation techniques are used that would compromise the accuracy of the search).

In [None]:
import os
import pickle
import sys

import faiss


def search_corpus(query, indir, n_gpu):
    """
    Load the FAISS index and the pickled chunks of the Cisco corpus from disk.

    NOTE(review): the search step itself is not implemented here -- `query`
    is currently unused and nothing is returned.

    Args:
        query: question to search for (unused for now, see note above).
        indir: directory holding "cisco_corpus.index" and "cisco_chunks.txt".
        n_gpu: number of GPUs to shard the index across; 0 keeps it on CPU.

    Raises:
        SystemExit: with status 1 if the index or the chunks file is missing.
    """
    index_path = os.path.join(indir, "cisco_corpus.index")
    # Must be the file name that create_corpus_index writes the chunks to.
    chunks_path = os.path.join(indir, "cisco_chunks.txt")

    # Check both files up front so that a missing chunks file is reported
    # before the (potentially large) index is loaded and copied to the GPUs.
    if not os.path.exists(index_path):
        print("Error: no index found in {}... Make sure to create the index before searching in corpus. Exiting...".format(indir))
        sys.exit(1)
    if not os.path.exists(chunks_path):
        print("Error: no chunks found in {}... Make sure to create the index before searching in corpus. Exiting...".format(indir))
        sys.exit(1)

    # Load FAISS index.
    index = faiss.read_index(index_path)
    if n_gpu > 0:
        co = faiss.GpuMultipleClonerOptions()  # If using multiple GPUs, enable sharding so that the dataset is divided across the GPUs rather than replicated.
        co.shard = True
        index = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=n_gpu)  # Convert CPU index to GPU index.

    # Load the chunks.
    # NOTE: pickle can execute arbitrary code on load; only point `indir` at
    # directories whose files were produced by a trusted run.
    with open(chunks_path, "rb") as f:
        chunks = pickle.load(f)
        

## Create FAISS GPU index

In [5]:
import os
import sys
import glob
import pickle

import faiss
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm


def load_embeddings(input_dir):
    """
    Load the chunk embeddings stored in every '*.h5' file of a directory.

    Each file is read with pandas.read_hdf and must contain a 'Chunk' column;
    every other column is treated as an embedding feature. Rows of all files
    are concatenated and, when the same chunk text appears several times,
    only its first occurrence is kept.

    Args:
        input_dir: directory containing the '*.h5' embedding files.

    Returns:
        chunks: 1-D array with the chunk texts.
        embeddings: C-contiguous float32 array with one row per chunk, in the
            same order as `chunks`. If no file is found, both are empty and
            `embeddings` has 768 columns.
    """
    # os.path.join works whether or not input_dir ends with a separator.
    # Sorting makes the row order (and which duplicate is kept) reproducible.
    filepaths = sorted(glob.glob(os.path.join(input_dir, '*.h5')))

    # Collect the per-file frames and concatenate once: concatenating inside
    # the loop copies the accumulated data again for every file.
    frames = []
    for file in tqdm(filepaths, desc='Files'):
        df_file = pd.read_hdf(file)
        df_file['Chunk'] = df_file['Chunk'].astype(str)
        frames.append(df_file)

    if frames:
        df = pd.concat(frames, ignore_index=True, sort=False)
    else:
        # No input: fall back to an empty frame with the default layout.
        cols = ['feat' + str(i + 1) for i in range(768)]
        cols.append('Chunk')
        df = pd.DataFrame(columns=cols)

    # Drop duplicated chunks from the concatenated dataframe.
    df = df.drop_duplicates(subset=['Chunk'], keep='first').reset_index(drop=True)

    # Get chunks and their embeddings (selected by name, not by position).
    chunks = df['Chunk'].values
    embeddings = df.drop(columns=['Chunk']).values
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)  # Necessary for FAISS indexing afterwards.

    return chunks, embeddings


def create_faiss_index(vecs, method='l2', n_gpu=0):
    """
    Create a flat (brute-force) FAISS index holding `vecs`.

    To create a GPU index with FAISS, one first needs to create it on CPU then
    copy it on GPU. Note that a "flat" index means that it is brute-force,
    with no approximation techniques.

    Args:
        vecs: 2-D array of vectors to index; the index dimension is
            `vecs.shape[1]`.
        method: 'l2' for L2 distance or 'ip' for inner product (also usable
            for cosine similarity if the vectors are normalized beforehand).
        n_gpu: number of GPUs used to add the vectors; 0 stays on CPU.

    Returns:
        A CPU index containing `vecs`, ready to be written to disk.

    Raises:
        SystemExit: with status 1 if `method` is neither 'l2' nor 'ip'.
    """
    if method not in ('l2', 'ip'):
        print("Error: Please choose between L2 distance ('l2') or Inner Product ('ip') as brute-force method for exact search. Exiting...")
        sys.exit(1)

    # Build flat CPU index given the chosen method.
    dim = vecs.shape[1]
    if method == 'l2':
        index = faiss.IndexFlatL2(dim)  # Exact Search for L2
    else:
        index = faiss.IndexFlatIP(dim)  # Exact Search for Inner Product

    # Convert to flat GPU index.
    use_gpu = n_gpu > 0
    if use_gpu:
        co = faiss.GpuMultipleClonerOptions()  # If using multiple GPUs, enable sharding so that the dataset is divided across the GPUs rather than replicated.
        co.shard = True
        index = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=n_gpu)  # Convert CPU index to GPU index.

    # Add vectors to the index.
    index.add(vecs)

    # Convert back to cpu index (needed for saving it to disk). Only done when
    # the index was actually moved to GPU: with n_gpu == 0 it never left the
    # CPU, and CPU-only FAISS builds may not provide index_gpu_to_cpu at all.
    if use_gpu:
        index = faiss.index_gpu_to_cpu(index)

    return index


def create_corpus_index(input_dir, output_dir, n_gpu):
    """
    Build the FAISS index of the Cisco corpus and save it with its chunks.

    Writes two files to `output_dir`: "cisco_corpus.index" (the FAISS index)
    and "cisco_chunks.txt" (the pickled array of chunks, despite the '.txt'
    extension).

    Args:
        input_dir: directory passed to load_embeddings to read the embeddings.
        output_dir: directory where the index and the chunks are written; it
            is created if it does not exist.
        n_gpu: number of GPUs passed to create_faiss_index.
    """
    # Create the output directory before the expensive steps, so a bad
    # location fails immediately rather than after loading and indexing.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print("\nLoad all embeddings of Cisco corpus from {}...".format(input_dir))
    chunks, embeddings = load_embeddings(input_dir)

    print("Create FAISS (GPU) index...")
    index = create_faiss_index(vecs=embeddings, n_gpu=n_gpu)

    print("\nSave index to {}...".format(output_dir))
    faiss.write_index(index, os.path.join(output_dir, "cisco_corpus.index"))

    print("\nSave chunks to {}...".format(output_dir))
    with open(os.path.join(output_dir, "cisco_chunks.txt"), "wb") as f:
        pickle.dump(chunks, f)

    print("\nFAISS index created.")

In [7]:
# Index the test embeddings; the index and chunks are written next to the inputs.
EMBEDDINGS_DIR = '/raid/antoloui/Master-thesis/_data/embeddings/test/'
create_corpus_index(input_dir=EMBEDDINGS_DIR, output_dir=EMBEDDINGS_DIR, n_gpu=1)


Load all embeddings of Cisco corpus from /raid/antoloui/Master-thesis/_data/embeddings/test/...


HBox(children=(FloatProgress(value=0.0, description='  Files', max=2.0, style=ProgressStyle(description_width=…


Create FAISS (GPU) index...

Save index to /raid/antoloui/Master-thesis/_data/embeddings/test/...

Save chunks to /raid/antoloui/Master-thesis/_data/embeddings/test/...

FAISS index created.
