## Import Libraries

In [1]:
# Imports
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
# import mlflow
import os
import pickle

from frequently_requested_docs.docs_helper import getModel, getSaveName, loadEmbeddings, getEmbeddingPath, test_sentence
from frequently_requested_docs.docs_config import TOP_K, MODEL_NAMES, DATA_CSV_PATH

## Model Selection and Initialization

In [2]:
# Models optimized for semantic textual similarity; the full list lives at:
# https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M/edit#gid=0
model_name = (
    # 'nli-*' names
    ['nli-mpnet-base-v2', 'nli-roberta-base-v2']
    # 'princeton-nlp/*-simcse-*' names
    + ['princeton-nlp/sup-simcse-roberta-large', 'princeton-nlp/unsup-simcse-roberta-large']
    # 'stsb-*' names
    + [
        'stsb-distilroberta-base-v2',
        'stsb-mpnet-base-v2',
        'stsb-roberta-base',
        'stsb-roberta-base-v2',
        'stsb-roberta-large',
    ]
)

# Index into model_name of the entry to use
m = 0
        

In [3]:
# More model names to test are listed in MODEL_NAMES
# (imported from frequently_requested_docs.docs_config).
selected_name = model_name[m]

# The save name derived from the model name is passed to getModel alongside it.
save_name = getSaveName(selected_name)
model = getModel(selected_name, save_name)

Loading model from disc


## Retrieve Top K most similar docs from freqdoc dataset given a request

In [4]:
# Read the source CSV. DataFrame.reset_index() returns a new frame rather than
# modifying `data` in place, so a bare call to it has no effect and is omitted.
data = pd.read_csv(DATA_CSV_PATH)

# Corpus entries are whole rows (pandas Series), not just the title text.
# Rows whose 'Document' value is not a string (e.g. empty cells that pandas
# reads as NaN) are skipped.
corpus_docs = [
    row
    for _, row in data.iterrows()
    if isinstance(row['Document'], str)
]

# Load corpus embeddings if they exist, otherwise encode them
# (behavior of loadEmbeddings as described by the notebook; helper not shown here).
embedding_path = getEmbeddingPath(save_name)
corpus_docs, corpus_embeddings = loadEmbeddings(model, embedding_path, corpus_docs)
        

Loading pre-computed embeddings from disc


In [5]:
# Request sentences available for testing; add one or more here.
examples = [
    'I am searching for the Detention Facility Reviews for the Randall County Jail in Amarillo, Texas',
    'Statements made by former georgia senator david perdue about visas.',
    "All documents regarding the TSA’s throughput data for August 2017",
]

# Position in `examples` of the sentence to test
i = 2


In [6]:
# Take the example selected by `i` and report its TOP_K most similar corpus entries.
sentence = examples[i]

test_sentence(
    sentence,
    model,
    corpus_docs,
    corpus_embeddings,
    TOP_K,
)

Sentence: All documents regarding the TSA’s throughput data for August 2017 

Top 25 most similar sentences in corpus:
TSA Throughput Data August 6, 2017 to August 12, 2017 (Score: 0.7647)
TSA Throughput Data August 13, 2017 to August 19, 2017 (Score: 0.7618)
TSA Throughput Data July 2, 2017 to July 8, 2017 (Score: 0.7550)
TSA Throughput Data July 30, 2017 to August 5, 2017 (Score: 0.7524)
TSA Throughput Data August 27, 2017 to September  02, 2017 (Score: 0.7513)
TSA Throughput Data August 20, 2017 to August 26, 2017 (Score: 0.7479)
TSA Throughput Data August 5, 2018 to August 11, 2018 (Score: 0.7421)
TSA Throughput Data October 08, 2017 to October 14, 2017 (Score: 0.7338)
TSA Throughput Data June 4, 2017 to June 10, 2017 (Score: 0.7249)
TSA Throughput Data July 16, 2017 to July 22, 2017 (Score: 0.7185)
TSA Throughput Data July 29, 2018 to August 4, 2018 (Score: 0.7155)
TSA Throughput Data July 9, 2017 to July 15, 2017 (Score: 0.7153)
TSA Throughput Data August 12, 2018 to August 18, 2