In [None]:
import pandas as pd
# Display options: keep long headlines readable and cap the number of columns shown.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 10)

# Load the headlines and drop the two leftover index columns from earlier CSV
# exports. Built as one chain (no inplace mutation) so re-running this cell on
# a live kernel always gives the same frame.
headlines_df = (
    pd.read_csv('data/300_stock_headlines.csv')
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
    .reset_index(drop=True)
)
headlines_df.head(5)


In [None]:
#Calculate financial sentiment for each headline

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
# Tokenizer and 3-label classification head both come from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
)
nlp = pipeline(task="sentiment-analysis", model=finbert, tokenizer=tokenizer)

# Score every headline in a single pipeline call.
headlines = headlines_df["headline"].tolist()
results = nlp(headlines)

# Peek at the output for the first 2 headlines.
print(results[:2])

In [None]:
#Put headlines and financial sentiment in 1 dataframe
sentiment_df = pd.DataFrame(results)

# One sentiment result is expected per headline; fail loudly if that is not the case.
assert len(sentiment_df) == len(headlines_df), "expected one sentiment result per headline"

# concat(axis=1) pairs rows by index label, so give both frames the same fresh
# 0..n-1 index. (A bare reset_index() call returns a new frame and changes
# nothing unless its result is used.)
result_df = pd.concat(
    [headlines_df.reset_index(drop=True), sentiment_df.reset_index(drop=True)],
    axis=1,
)
#show the first 10 rows
result_df.head(10)

In [None]:
#Generate embeddings (vectors) for each headline
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')
# Encode all headlines in one batched call instead of one model call per
# sentence. list(...) splits the resulting 2-D array back into one 1-D vector
# per headline, so later cells can keep indexing headline_vectors[i].
headline_vectors = list(model.encode(result_df['headline'].tolist()))
#check how many dimensions in a single vector
headline_vectors[0].shape

In [None]:
#connect to redis

from redis import Redis
import redisearch

host = 'vecsim'
port = 6379
redis_conn = Redis(host = host, port = port)
# Constructing the client does not contact the server; ping() forces a round
# trip so the message below is only printed when Redis is actually reachable.
redis_conn.ping()
print ('Connected to redis')

In [None]:
#load articles into redis hash
import numpy as np
def load_vectors(client: "Redis", headlines_df, vector_data, vector_field_name):
    """Store every article as a Redis hash named ``article:<index label>``.

    Each hash holds the row's ``headline``, ``url``, ``publisher``, ``label``
    and ``score`` columns plus the article's embedding, serialized as raw
    float32 bytes under ``vector_field_name``.

    Parameters
    ----------
    client : Redis
        Connection used for the flush and the pipelined writes.
    headlines_df : pandas.DataFrame
        One row per article; must contain the columns listed above.
    vector_data : sequence of array-like
        Embeddings in the same order as the rows of ``headlines_df``.
    vector_field_name : str
        Name of the hash field that receives the serialized vector.

    Raises
    ------
    ValueError
        If there are fewer vectors than rows.

    Notes
    -----
    The database is flushed (``FLUSHALL``) before anything is written.
    """
    if len(vector_data) < len(headlines_df):
        raise ValueError(
            f"got {len(vector_data)} vectors for {len(headlines_df)} rows"
        )

    #start from fresh
    client.flushall()
    #pipeline all the articles in one go
    p = client.pipeline(transaction=False)
    # Pair rows and vectors by position: the DataFrame index label is only used
    # to build the key, so a non 0..n-1 index no longer picks the wrong vector.
    for (index, row), vector in zip(headlines_df.iterrows(), vector_data):
        headline_data_mapping = {
            'headline': row['headline'],
            'url': row['url'],
            'publisher': row['publisher'],
            'label': row['label'],
            'score': row['score'],
            vector_field_name: np.asarray(vector).astype(np.float32).tobytes(),
        }
        p.hset('article:' + str(index), mapping=headline_data_mapping)
    p.execute()
    

In [None]:
load_vectors(redis_conn, result_df, headline_vectors,'headline_vector')

In [None]:
#Utility Functions to Create Indexes on Vector field

def create_bf_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2'):
    """Create a "BF" vector index named ``index_name`` and return its search client.

    Sends ``FT.CREATE`` with a single FLOAT32 ``VECTOR`` field
    (``vector_field_name``) of ``vector_dimensions`` dimensions, using
    ``distance_metric`` and an ``INITIAL_CAP`` of ``number_of_vectors``.
    """
    search_client = redisearch.Client(index_name, conn=redis_conn)
    create_args = [
        "FT.CREATE", index_name,
        "SCHEMA", vector_field_name,
        "VECTOR", "FLOAT32", vector_dimensions, distance_metric,
        "BF", "INITIAL_CAP", number_of_vectors,
    ]
    search_client.redis.execute_command(*create_args)
    return search_client

def create_hnsw_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2',M=40,EF=200):
    """Create an "HNSW" vector index named ``index_name`` and return its search client.

    Sends ``FT.CREATE`` with a single FLOAT32 ``VECTOR`` field
    (``vector_field_name``) of ``vector_dimensions`` dimensions, using
    ``distance_metric``, an ``INITIAL_CAP`` of ``number_of_vectors`` and the
    HNSW parameters ``M`` and ``EF``.
    """
    search_client = redisearch.Client(index_name, conn=redis_conn)
    create_args = [
        "FT.CREATE", index_name,
        "SCHEMA", vector_field_name,
        "VECTOR", "FLOAT32", vector_dimensions, distance_metric,
        "HNSW", "INITIAL_CAP", number_of_vectors,
        "M", M,
        "EF", EF,
    ]
    search_client.redis.execute_command(*create_args)
    return search_client

def delete_index(vector_index):
    """Hand the Redis connection behind ``vector_index`` to ``delete_data``.

    NOTE(review): ``delete_data`` is not defined in the visible part of this
    notebook. Confirm it is defined or imported before this is called,
    otherwise the call raises NameError.
    """
    delete_data(vector_index.redis)
    

In [None]:
#Utility Functions to Perform Similarity Search

def find_similar_bf(headline_q, query_encoder, vector_index,vector_field_name, topK=5):
    """Return the ``topK`` stored articles most similar to ``headline_q``.

    Parameters
    ----------
    headline_q : str
        Query headline.
    query_encoder
        Object whose ``encode`` method turns the headline into a vector.
    vector_index
        Search client whose ``search`` method runs the query.
    vector_field_name : str
        Name of the indexed vector field.
    topK : int
        Number of hits to request.

    Returns
    -------
    The result object produced by ``vector_index.search``. Each hit is asked
    to carry ``<vector_field_name>_score``, ``headline``, ``score`` and
    ``label``.
    """
    #vectorize the query
    query_vector = query_encoder.encode(headline_q).astype(np.float32).tobytes()
    #prepare the query
    score_field = f'{vector_field_name}_score'
    q = (
        redisearch.Query(f'@{vector_field_name}:[$vec_param TOPK {topK}]')
        .sort_by(score_field)
        .paging(0, topK)
        # Each field is requested once, in the same order as find_similar_hnsw
        # ('headline' used to be listed twice here).
        .return_fields(score_field, 'headline', 'score', 'label')
    )
    #Execute the query
    results = vector_index.search(q, query_params = {'vec_param': query_vector})
    return results

def find_similar_hnsw(headline_q, query_encoder, vector_index,vector_field_name, topK=5,EF=5):
    """Run a ``topK`` similarity query for ``headline_q`` with ``$EFRUNTIME`` set to ``EF``.

    The headline is encoded with ``query_encoder.encode``, serialized as
    float32 bytes and passed as the ``vec_param`` query parameter. Hits are
    sorted by ``<vector_field_name>_score`` and carry that field plus
    ``headline``, ``score`` and ``label``. Returns whatever
    ``vector_index.search`` returns.
    """
    # Encode the query headline and serialize it the same way stored vectors are.
    encoded = query_encoder.encode(headline_q)
    query_vector = encoded.astype(np.float32).tobytes()

    # Build the query step by step.
    score_field = f'{vector_field_name}_score'
    query_text = f'@{vector_field_name}:[$vec_param TOPK {topK}]  => {{$EFRUNTIME : {EF}}}'
    q = redisearch.Query(query_text)
    q = q.sort_by(score_field)
    q = q.paging(0, topK)
    q = q.return_fields(score_field, 'headline', 'score', 'label')

    # Run it with the vector bound to $vec_param.
    return vector_index.search(q, query_params = {'vec_param': query_vector})

In [None]:
#Brute-Force - Load and Index Article Data
NUMBER_ARTICLES = len(result_df)
VECTOR_FIELD_NAME = 'headline_vector'
# Clear leftovers (data and any index) from a previous run.
redis_conn.flushall()
# Load first, create the index second: load_vectors starts with FLUSHALL, so
# an index created before the load would be wiped together with the data.
# FT.CREATE is expected to pick up the hashes that already exist.
load_vectors(redis_conn,result_df,headline_vectors,VECTOR_FIELD_NAME)
bf_index = create_bf_index(redis_conn,'bf_index',VECTOR_FIELD_NAME,NUMBER_ARTICLES,768,'L2')

In [None]:
#query for similarity

new_headline='major panic as market turmoil in the european markets'

results = find_similar_bf (new_headline,model,bf_index,VECTOR_FIELD_NAME,5)
for doc in results.docs:
    print ('***************Product  found ************')
    # Print the real headline field when the hit carries one. The headline has
    # been observed to come back under the "<vector field>_score" attribute
    # instead, so fall back to that field rather than failing.
    headline = getattr(doc, 'headline', None)
    if headline is None:
        headline = doc.headline_vector_score
    print ('headline = ' + headline)
    print ('label = ' + doc.label)
    

In [None]:
# Inspect the fourth hit (index 3) of the most recent search; needs at least four hits.
results.docs[3]