# Semantic Search App
## Author : Evergreen Technologies
### This script computes SBERT sentence embeddings and uses them to perform semantic search on a news headlines dataset


#### The "A Million News Headlines" dataset can be found at:
#### https://www.kaggle.com/therohk/million-headlines

In [15]:
from sentence_transformers import SentenceTransformer
import scipy
import os
import pandas as pd

# LOAD BERT SENTENCE MODEL

In [2]:
# Load the sentence-embedding BERT model by name.
# Other available models are listed here:
#   - trained on Natural Language Inference (NLI):
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
#   - trained on Semantic Textual Similarity (STS):
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md

model = SentenceTransformer('bert-base-nli-mean-tokens')

In [23]:
def read_csv(filepath):
    """Read a CSV file, trying several separators and encodings in turn.

    Every (separator, encoding) combination is passed to ``pd.read_csv``
    and the first one that loads without raising is returned.

    Args:
        filepath: Path to the file. Only paths whose extension is ``.csv``
            (compared case-insensitively) are read.

    Returns:
        The ``pandas.DataFrame`` produced by the first successful
        combination, or ``None`` when *filepath* does not have a ``.csv``
        extension.

    Raises:
        ValueError: If the file cannot be opened, or if no combination of
            separator and encoding loads it. The underlying exception is
            attached as ``__cause__`` so the real reason stays visible.
    """
    if os.path.splitext(filepath)[1].lower() != '.csv':
        return None

    seps = [',', ';', '\t']                    # ',' is the pandas default
    encodings = [None, 'utf-8', 'ISO-8859-1', 'utf-16', 'ascii']  # None is the pandas default
    # NOTE(review): ISO-8859-1 can decode any byte sequence, so the encodings
    # listed after it are only reached when parsing itself fails, and ','
    # may "succeed" on a file that uses another delimiter by yielding a
    # single column. The order is kept as-is for backward compatibility.
    failure_message = "{!r} is has no encoding in {} or seperator in {}".format(
        filepath, encodings, seps)

    last_error = None
    for sep in seps:
        for encoding in encodings:
            try:
                return pd.read_csv(filepath, encoding=encoding, sep=sep)
            except OSError as err:
                # The file itself is unreadable (missing, permissions, ...);
                # no other separator/encoding can change that, so stop here.
                raise ValueError(failure_message) from err
            except Exception as err:
                # Decoding/parsing failed for this combination; remember why
                # and move on to the next one.
                last_error = err

    raise ValueError(failure_message) from last_error

# Set up corpus

In [25]:
# The corpus is a flat list of sentences; each news headline is one entry.
BASE_DIR = '/Volumes/My Passport for Mac/data'
NEWS_FILE_NAME = "abcnews-date-text.csv"
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'million-news-dataset')

# Load the headlines file and keep only its first 20000 rows.
input_df = read_csv(os.path.join(TEXT_DATA_DIR, NEWS_FILE_NAME)).head(20000)
print(input_df.head(20))

sentences = input_df['headline_text'].values.tolist()

# Small hand-written corpus, handy for a quick run without the dataset:
#sentences = ['aba decides against community broadcasting licence', 
#             'act fire witnesses must be aware of defamation',
#             'a g calls for infrastructure protection summit',
#             'air nz staff in aust strike for pay rise',
#             'air nz strike to affect australian travellers',
#             'ambitious olsson wins triple jump',
#             'antic delighted with record breaking barca',
#             'aussie qualifier stosur wastes four memphis match',
#             'aust addresses un security council over iraq',
#             'australia is locked into war timetable opp',
#             'australia to contribute 10 million in aid to iraq']

# Encode every headline into one embedding vector; the vector length is
# printed below.
# NOTE(review): an earlier comment said "78 columns" - presumably a typo for
# the model's real embedding size; confirm against the printed length.
sentence_embeddings = model.encode(sentences)

print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

    publish_date                                      headline_text
0       20030219  aba decides against community broadcasting lic...
1       20030219     act fire witnesses must be aware of defamation
2       20030219     a g calls for infrastructure protection summit
3       20030219           air nz staff in aust strike for pay rise
4       20030219      air nz strike to affect australian travellers
5       20030219                  ambitious olsson wins triple jump
6       20030219         antic delighted with record breaking barca
7       20030219  aussie qualifier stosur wastes four memphis match
8       20030219       aust addresses un security council over iraq
9       20030219         australia is locked into war timetable opp
10      20030219  australia to contribute 10 million in aid to iraq
11      20030219  barca take record as robson celebrates birthda...
12      20030219                         bathhouse plans move ahead
13      20030219      big hopes for launceston c

# PERFORM SEMANTIC SEARCH

In [38]:
#@title Semantic Search Form

# code adapted from https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py

# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.spatial.distance` is loaded on every SciPy version.
import scipy.spatial.distance

#query = 'bad weather' #@param {type: 'string'}
#query = 'employee stop working' #@param {type: 'string'}
#query = "moderate lift in economy"
#query = 'global warming impact'
query = 'wildfires in australia'

# Embed the query with the same model that produced `sentence_embeddings`.
queries = [query]
query_embeddings = model.encode(queries)

# How many of the closest corpus sentences to report for each query,
# ranked by cosine similarity.
number_top_matches = 3 #@param {type: "number"}

print("Semantic Search Results")

for query, query_embedding in zip(queries, query_embeddings):
    # cdist returns one row of cosine distances per query vector; a single
    # query is passed, so keep only row 0.
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    # Pair each corpus index with its distance and rank from the smallest
    # distance (most similar) to the largest.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the configured number of matches rather than a hard-coded "5".
    print("\nTop %d most similar sentences in corpus:" % number_top_matches)

    for idx, distance in results[0:number_top_matches]:
        # Cosine similarity is 1 minus the cosine distance.
        print(sentences[idx].strip(), "(Cosine Score: %.4f)" % (1-distance))

Semantic Search Results




Query: wildfires in australia

Top 5 most similar sentences in corpus:
vic bushfires inquiry begins (Cosine Score: 0.7667)
massive vic bushfire contained (Cosine Score: 0.7602)
mp visits fire impacted gippsland (Cosine Score: 0.7439)
