In [1]:
#@title install
# Setup cell: installs the Python packages (fasttext, faiss) and the libomp-dev
# system package via shell commands before anything is imported.
# NOTE(review): both `faiss` and `faiss-gpu` are requested and nothing is
# version-pinned — presumably only one FAISS build is needed; confirm before sharing.
!pip install fasttext
!apt install libomp-dev
!python -m pip install --upgrade faiss faiss-gpu

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
0 upgraded, 0 newly installed, 0 to remove and 17 not upgraded.
Requirement already up-to-date: faiss in /usr/local/lib/python3.6/dist-packages (1.5.3)
Requirement already up-to-date: faiss-gpu in /usr/local/lib/python3.6/dist-packages (1.7.0)


In [2]:
#@title imports
import json
import re
import string

import faiss
import fasttext as ft
import fasttext.util as ftu
import numpy as np
import pandas as pd
import scipy.spatial.distance
# Set of the characters in string.printable; remonveNonPrintables() keeps only
# characters that are members of this set.
printable = set(string.printable)


In [3]:
#@title constants { form-width: "20%" }
# Location of the fastText binary model file handed to ft.load_model() in the model-loading cell.
MODEL_PATH = '/content/drive/MyDrive/models/fasttext/cc.en.300.bin'
# Location of the SQuAD training JSON read by squad_json_to_dataframe_train().
SQUAD_TRAINING_DATA_PATH = "/content/drive/MyDrive/Colab data/squad/train-v2.0.json"
# Number of nearest neighbours requested from the FAISS index for each query.
FAISS_TOPK = 10

# **FUNCTIONS**

In [4]:
#@title embedding functions: create vector and get distance
def getSentenceVector(s):
  """Return the sentence embedding of `s` computed by the globally loaded fastText `model`."""
  sentenceVector = model.get_sentence_vector(s)
  return sentenceVector

def cosine_distance_wordembedding_method(vector_1,vector_2,doEuclidean=False):
  """Return ``1 - distance`` between two vectors, rounded to two decimals.

  @params:
  vector_1, vector_2: 1-D numeric sequences/arrays of equal length
  doEuclidean: bool -> when True the Euclidean distance is used instead of the
      cosine distance
  @returns:
  float -> round(1 - distance, 2). With the cosine distance this is the cosine
      similarity. With the Euclidean distance the value is simply 1 minus the
      distance, so it is negative whenever the vectors are more than 1 apart.
  """
  # Relies on the top-level `import scipy.spatial.distance`; without it every
  # call to this function fails with a NameError on `scipy`.
  if doEuclidean:
    metric = scipy.spatial.distance.euclidean
  else:
    metric = scipy.spatial.distance.cosine
  distance = metric(vector_1, vector_2)
  return round(1 - distance, 2)

In [5]:
#@title FAISS functions : create and search

def searchQueryInFaiss(query,index,top_k):
  """Embed `query` and look up its `top_k` nearest neighbours in a FAISS index.

  @params:
  query: string -> query text; it is embedded with getSentenceVector
  index: faiss.swigfaiss.IndexIDMap -> faiss index map which is returned by createFaissIndexing
  top_k: int -> number of neighbours to request from the index
  @returns:
  similarityScores: list of floats -> the score matrix returned by index.search, flattened.
      For the IndexFlatL2 index built by createFaissIndexing_internal these are
      distances, so a lower value means a closer match.
  topIndexes: list of ints -> ids of the matched vectors, in the same order as the scores
  """
  # index.search takes a 2-D batch of float32 query vectors; build a one-row batch explicitly
  # instead of relying on the dtype getSentenceVector happens to return.
  queryBatch = np.asarray([getSentenceVector(query)], dtype='float32')
  D, I = index.search(queryBatch, k=top_k)
  return D.flatten().tolist(), I.flatten().tolist()

def createFaissIndexing_internal(embeddings,embeddingsCounter):
  """Build a flat L2 FAISS index whose entries are addressed by caller-supplied ids.

  @params:
  embeddings: 2-D array-like (n_vectors, dim) -> vectors to index
  embeddingsCounter: 1-D array-like of ints -> one id per row of `embeddings`
  @returns:
  faiss.IndexIDMap wrapping an IndexFlatL2, with all vectors added
  @raises:
  ValueError -> when `embeddings` is not 2-D (e.g. it was built from an empty list)
  """
  # add_with_ids is given float32 vectors and int64 ids; coerce here so the result
  # does not depend on the platform's default integer width or on how the caller
  # assembled the arrays.
  vectors = np.ascontiguousarray(embeddings, dtype='float32')
  ids = np.ascontiguousarray(embeddingsCounter, dtype='int64')
  if vectors.ndim != 2:
    raise ValueError(
        "embeddings must be a 2-D array of shape (n_vectors, dim); got shape {}".format(vectors.shape))

  # Step 2: Instantiate the index
  flatIndex = faiss.IndexFlatL2(vectors.shape[1])

  # Step 3: Pass the index to IndexIDMap
  index = faiss.IndexIDMap(flatIndex)

  # Step 4: Add vectors and their IDs
  index.add_with_ids(vectors, ids)

  return index

def createEmbeddings(data):
  """Create one embedding vector per text, plus the matching positional ids.

  @params
  data : list of string -> data list
  @returns
  embeddings: float32 numpy array -> one row per text, produced by getSentenceVector
  embeddingsCounter: int64 numpy array -> the ids 0 .. n-1, n being the number of texts;
      id i refers to data[i]
  """
  vectors = [getSentenceVector(eachText) for eachText in data]

  embeddings = np.array(vectors).astype('float32')
  # The ids are just the positions, so build them directly with an explicit int64
  # dtype (a Python list converted with np.array would be float64 when empty and
  # follow the platform's default integer width otherwise).
  embeddingsCounter = np.arange(len(vectors), dtype='int64')
  return embeddings,embeddingsCounter

def createFaissIndexing(data):
  """Given a list of texts, create their embeddings, index them with FAISS and return the index."""
  vectors, vectorIds = createEmbeddings(data)
  builtIndex = createFaissIndexing_internal(vectors, vectorIds)
  return builtIndex



In [6]:
#@title normalize_text function { form-width: "20%" }
def normalize_text(s,isLower=True):
  """Clean a string for embedding.

  Steps, in order: optional lowercasing (when `isLower` is true), removal of
  characters that are not in `printable`, replacement of slash/control/hyphen
  characters by spaces (removeSlashTags), and whitespace collapsing
  (white_space_fix). Articles and punctuation are NOT removed here.
  """
  cased = s.lower() if isLower else s
  printableOnly = remonveNonPrintables(cased)
  withoutTags = removeSlashTags(printableOnly)
  return white_space_fix(withoutTags)


def removeSlashTags(text):
  """Replace CR, LF, backslash, tab, slash, backspace and hyphen with a space each,
  then delete every double underscore."""
  replace_characters = ['\r','\n','\\','\t','/','\b',"-"]
  # One translation pass maps each listed character to a single space.
  toSpace = str.maketrans(dict.fromkeys(replace_characters, " "))
  spaced = text.translate(toSpace)
  return spaced.replace("__","")

def remove_articles(text):
    """Replace each standalone 'a', 'an' or 'the' in `text` with a single space.

    Matching is case-sensitive and limited to whole words (word boundaries on
    both sides). Surrounding whitespace is left as is, so callers usually
    follow up with white_space_fix.
    """
    # Relies on the top-level `import re`; without it this function raises a
    # NameError on first use.
    return re.sub(r"\b(a|an|the)\b", " ", text, flags=re.UNICODE)

def white_space_fix(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    tokens = text.split()
    collapsed = " ".join(tokens)
    return collapsed

def remove_punc(text):
    """Return `text` with every character of string.punctuation deleted."""
    # A translation table with a delete-set drops all punctuation in one pass.
    dropPunctuation = str.maketrans("", "", string.punctuation)
    return text.translate(dropPunctuation)

def remonveNonPrintables(text):
  """Return `text` keeping only the characters that are members of the module-level `printable`."""
  keptChars = [ch for ch in text if ch in printable]
  return "".join(keptChars)

def normalizeTextForList(strList):
  """Apply normalize_text (with its default lowercasing) to every item and return the results as a list."""
  return [normalize_text(eachText) for eachText in strList]

In [7]:
#@title IndexToText
def IndexToText(topIndexes,allQuestions):
  """Look up `allQuestions[i]` for every i in `topIndexes`, preserving order, and return the list."""
  return [allQuestions[position] for position in topIndexes]

In [8]:
#@title squad data reading function 
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-style json file into a single dataframe.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1

    returns: dataframe that joins, on the question id, the columns
    'question' and 'context' with the fields of the answer records, plus a
    'c_id' column holding an integer code per distinct context.
    NOTE(review): the join assumes at most one answer per question id —
    confirm before using it on files with several answers per question.
    """
    if verbose:
        print("Reading the json file")
    # Context manager closes the handle; JSON text is read as UTF-8 rather
    # than with the platform's default encoding.
    with open(input_file_path, encoding="utf-8") as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")
    # Newer pandas exposes json_normalize at top level and no longer provides
    # pd.io.json.json_normalize; fall back to the old location on old versions.
    json_normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize
    # parsing different level's in the json file (record_path is only sliced, never mutated)
    js = json_normalize(file, record_path)
    m = json_normalize(file, record_path[:-1])
    r = json_normalize(file, record_path[:-2])

    #combining it into single dataframe
    # repeat each context once per question, and each question id once per answer
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx = np.repeat(m['id'].values, m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    # axis is passed by keyword: newer pandas rejects it as a positional argument
    main = pd.concat(
        [m[['id','question','context']].set_index('id'), js.set_index('q_idx')],
        axis=1, sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

# **MAIN**

In [None]:
#@title model loading 
# Load the fastText model from MODEL_PATH. If the model file has not been downloaded
# yet, download it first (the commented-out line below); here it is already downloaded,
# so only its location is passed to load_model.

# ftu.download_model('en', if_exists='ignore')  # English
model = ft.load_model(MODEL_PATH)

In [None]:
#@title reading data
df = squad_json_to_dataframe_train(SQUAD_TRAINING_DATA_PATH)
# Normalize every question text; the result is a plain Python list.
questionsList = normalizeTextForList(df["question"])
len(questionsList)

In [None]:
#@title creating faiss index
# Embed every normalized question and add the vectors to a FAISS index; each vector's
# id is its position in questionsList.
faissIndex = createFaissIndexing(questionsList)

In [None]:
#@title finding similar questions { form-width: "20%" }

# Choose the query: an already-indexed question, addressed by its position in questionsList.
myIndex = 110
query = questionsList[myIndex]
# query = "any query"

print("Question : ",query)

# Ask the index for the nearest neighbours: scores and ids come back in ranked order.
similarityScores,topIndexes = searchQueryInFaiss(query,faissIndex,FAISS_TOPK)
# Translate the returned ids back into question texts.
similarTexts = IndexToText(topIndexes,questionsList)

# Ranked list, score -> lower the better. The entry for the query itself is skipped.
for index,score,text in zip(topIndexes,similarityScores,similarTexts):
  if index==myIndex:
    continue
  print(index ," -> ", score ,"->",text)