# Semantic Search App
## Author : Evergreen Technologies
### This script loads a pretrained SBERT model and uses its sentence embeddings to perform semantic search on a news dataset


#### The "A Million News Headlines" dataset can be found at:
#### https://www.kaggle.com/therohk/million-headlines

In [1]:
from sentence_transformers import SentenceTransformer
import scipy
import os
import pandas as pd

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# LOAD BERT SENTENCE MODEL

In [2]:
# Load the pretrained Sentence-BERT model whose `encode` method is used to embed
# both the corpus and the queries.
# Other pretrained models are listed in the sentence-transformers docs:
#   - trained on Natural Language Inference (NLI):
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
#   - trained on Semantic Textual Similarity (STS):
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md
# NOTE(review): the checkpoint name suggests BERT-base, NLI-trained, mean-token
# pooling - confirm against the model list above.

model = SentenceTransformer('bert-base-nli-mean-tokens')

In [3]:
def read_csv(filepath):
    """Read a CSV file, trying several separators and encodings in turn.

    Every (separator, encoding) pair is attempted in order and the result of
    the first ``pd.read_csv`` call that does not raise is returned.

    Args:
        filepath: Path of the file to read. Only files with a ``.csv``
            extension (in any letter case) are read.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when ``filepath`` does
        not have a ``.csv`` extension.

    Raises:
        ValueError: If no (separator, encoding) pair could read the file.
            The last underlying error is chained as ``__cause__`` so the real
            reason (missing file, decode error, ...) is not lost.
    """
    # Compare case-insensitively so 'DATA.CSV' is treated like 'data.csv'.
    if os.path.splitext(filepath)[1].lower() != '.csv':
        return None

    seps = [',', ';', '\t']                    # ',' is the pandas default
    encodings = [None, 'utf-8', 'ISO-8859-1', 'utf-16', 'ascii']  # None is default

    # NOTE(review): a parse with the "wrong" separator usually still succeeds
    # (yielding a single wide column), so later separators are only reached
    # when the earlier ones raise for every encoding.
    last_error = None
    for sep in seps:
        for encoding in encodings:
            try:
                return pd.read_csv(filepath, encoding=encoding, sep=sep)
            except Exception as exc:  # best-effort probing; remember why it failed
                last_error = exc

    raise ValueError("{!r} has no encoding in {} or separator in {}"
                     .format(filepath, encodings, seps)) from last_error

# Set up corpus

In [5]:
# The corpus is a plain list of sentences (documents already split by sentence).
# Location of the original million-news dataset, kept for reference:
#BASE_DIR = '/Volumes/My Passport for Mac/data'
#TEXT_DATA_DIR = os.path.join(BASE_DIR, 'million-news-dataset')
#NEWS_FILE_NAME = "abcnews-date-text.csv"

# Load the CSV and keep only its first 20000 rows.
input_df = read_csv(
    'E:/Semantic Search/bertSemantic Search/nlp-master/EnglishTransOnlyTicktDesc.csv'
).head(20000)
print(input_df.head(20))

# The 'English' column supplies the sentences that get embedded.
sentences = input_df['English'].values.tolist()

# Small hand-written corpus that can be swapped in for quick experiments:
#sentences = ['aba decides against community broadcasting licence', 
#             'act fire witnesses must be aware of defamation',
#             'a g calls for infrastructure protection summit',
#             'air nz staff in aust strike for pay rise',
#             'air nz strike to affect australian travellers',
#             'ambitious olsson wins triple jump',
#             'antic delighted with record breaking barca',
#             'aussie qualifier stosur wastes four memphis match',
#             'aust addresses un security council over iraq',
#             'australia is locked into war timetable opp',
#             'australia to contribute 10 million in aid to iraq']

# Encode every sentence as one fixed-length 1-D vector. The length is printed
# below (expected to be 768 for a BERT-base model - confirm from the output).
sentence_embeddings = model.encode(sentences)

first_embedding = sentence_embeddings[0]
print('Sample BERT embedding vector - length', len(first_embedding))
print('Sample BERT embedding vector - note includes negative values', first_embedding)

                                              English
0           23495 Question about the client installer
1                             23494Partner interested
2                            23493 Follow-up favorite
3                                  23492Queue display
4                              23491 Inquiry 25 users
5   23490 Troubleshooting Server service does not ...
6                            23489 Bookmarks adjusted
7                                   23488 test system
8          23487 Permissions to create blank document
9                              23486 Viewer Customize
10                                 23484 Resubmission
11                 23483 Independent upgrade to 3.6.1
12                                 23482XML archiving
13  23481 Scanned documents are not separated by b...
14                23480TWAIN ERROR On terminal server
15                    23479bitfarm server unreachable
16                           23478 Asks for call back
17                 23477 Tak

# PERFORM SEMANTIC SEARCH

In [6]:
#@title Semantic Search Form

# code adapted from https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py

# Import the distance function explicitly: on older SciPy releases a bare
# `import scipy` does not load subpackages, so `scipy.spatial.distance` is only
# reachable if some other import happened to load it first.
from scipy.spatial.distance import cdist

#query = 'bad weather' #@param {type: 'string'}
#query = 'employee stop working' #@param {type: 'string'}
#query = "moderate lift in economy"
#query = 'global warming impact'
query = 'selectall is not working'

queries = [query]
query_embeddings = model.encode(queries)

# Number of closest corpus sentences to report for each query.
number_top_matches = 3 #@param {type: "number"}

print("Semantic Search Results")

for query, query_embedding in zip(queries, query_embeddings):
    # cdist yields a 1 x len(corpus) matrix; row 0 is the cosine distance from
    # this query to every corpus sentence.
    distances = cdist([query_embedding], sentence_embeddings, "cosine")[0]

    # Pair each corpus index with its distance and rank closest-first.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the number actually shown instead of a hard-coded "5".
    print("\nTop %d most similar sentences in corpus:" % number_top_matches)

    for idx, distance in results[:number_top_matches]:
        # Cosine similarity is 1 minus the cosine distance.
        print(sentences[idx].strip(), "(Cosine Score: %.4f)" % (1 - distance))

Semantic Search Results




Query: selectall is not working

Top 5 most similar sentences in corpus:
15348searchpattern does not work (Cosine Score: 0.9522)
5019 Scanning does not work (Cosine Score: 0.9479)
15993Workflow is not working (Cosine Score: 0.9457)
