# 1. Get 300 Financial News Headlines


In [None]:
import pandas as pd

# Display options: keep long headlines readable and cap the number of columns shown.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 10)

# Load the headlines, discard the two 'Unnamed' columns (presumably leftover
# saved indexes -- confirm against the CSV), and make sure the frame carries a
# clean 0..n-1 index. The result of reset_index() has to be kept: the call
# returns a new frame and does nothing when its return value is discarded.
headlines_df = (
    pd.read_csv('data/300_stock_headlines.csv')
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
    .reset_index(drop=True)
)
headlines_df.head(5)


# 2. Calculate Financial Sentiment for each headline
Using a pre-trained model fine-tuned on financial news/report data

In [None]:
# Calculate financial sentiment for each headline with the
# 'yiyanghkust/finbert-tone' checkpoint (3 output labels).

from transformers import BertForSequenceClassification, BertTokenizer, pipeline

finbert_checkpoint = 'yiyanghkust/finbert-tone'

# Tokenizer and classifier are loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)
finbert = BertForSequenceClassification.from_pretrained(finbert_checkpoint, num_labels=3)
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

# Run the sentiment pipeline over every headline in one call.
headlines = headlines_df["headline"].to_list()
results = nlp(headlines)

# Show the results for the first 2 headlines.
print(results[:2])

# 3. Join Financial Sentiment and Headline into a single Dataframe

In [None]:
# Put headlines and financial sentiment in one dataframe.
sentiment_df = pd.DataFrame(results)

# pd.concat(axis=1) pairs rows by index label, not by position, so both frames
# are given a fresh 0..n-1 index first; row i of the headlines then lines up
# with sentiment result i. (reset_index() returns a new frame, so its result
# must be used rather than discarded.)
result_df = pd.concat(
    [headlines_df.reset_index(drop=True), sentiment_df.reset_index(drop=True)],
    axis=1,
)
# Show the first 5 rows.
result_df.head(5)

# 4. Generate Embeddings for each headline
Using a HuggingFace Sentence Embedder

In [None]:
# Generate embeddings (vectors) for each headline.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

# Encode all headlines in one batched call instead of one encode() call per
# sentence. The result is a 2-D array with one row per headline, so
# headline_vectors[i] is still the vector of headline i.
headline_vectors = model.encode(result_df['headline'].tolist())

# Check how many dimensions a single vector has.
headline_vectors[0].shape

# 5. Connect to Redis

In [None]:
# Connect to Redis.

from redis import Redis
import redisearch

host = 'vecsim'
port = 6379
redis_conn = Redis(host = host, port = port)

# Constructing the client does not talk to the server yet; ping() forces a real
# round trip so the message below is only printed when the server is reachable
# (it raises a connection error otherwise).
redis_conn.ping()
print ('Connected to redis')

# 6. Utility functions to load data into Redis
We'll be loading into a "hash" structure (a table-like structure)

In [None]:
#load articles into redis hash
import numpy as np
def load_vectors(client:Redis, headlines_df, vector_data,vector_field_name):
    """Write one Redis hash per headline row, including its embedding.

    Parameters
    ----------
    client : Redis connection used to open the pipeline.
    headlines_df : DataFrame with the columns 'headline', 'url', 'publisher',
        'label' and 'score'.
    vector_data : sequence of vectors, one per row of ``headlines_df`` in the
        same order (``vector_data[i]`` belongs to the i-th row).
    vector_field_name : name of the hash field that receives the vector bytes.

    Each hash is stored under the key ``article:<index label>``. All HSET
    commands are queued on a non-transactional pipeline and sent with a single
    ``execute()`` call. The 'date' column is not written to the hash.
    """
    p = client.pipeline(transaction=False)
    # Vectors are looked up by row *position*, not by index label, so a frame
    # whose index is not 0..n-1 (e.g. after filtering) still gets each row
    # paired with its own vector.
    for position, (index, row) in enumerate(headlines_df.iterrows()):
        # Stored as raw float32 bytes.
        headline_vector = np.asarray(vector_data[position], dtype=np.float32).tobytes()
        headline_data_mapping = {
            'headline': row['headline'],
            'url': row['url'],
            'publisher': row['publisher'],
            'label': row['label'],
            'score': row['score'],
            vector_field_name: headline_vector,
        }
        p.hset('article:' + str(index), mapping=headline_data_mapping)
    p.execute()
    

# 7. More Utility Functions to Define vector indexes 

In [None]:
#Utility Functions to Create Indexes on Vector field

def create_bf_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2'):
    """Create a FLAT vector index and return its redisearch client.

    Sends ``FT.CREATE <index_name> SCHEMA <vector_field_name> VECTOR FLAT ...``
    with FLOAT32 vectors of ``vector_dimensions`` dimensions, the given
    ``distance_metric`` and ``number_of_vectors`` as INITIAL_CAP.

    Returns the ``redisearch.Client`` bound to ``index_name`` on ``redis_conn``.
    """
    # Attribute name/value pairs of the vector field; the command carries their
    # count (8) right after the algorithm name.
    vector_attributes = [
        "TYPE", "FLOAT32",
        "DIM", vector_dimensions,
        "DISTANCE_METRIC", distance_metric,
        "INITIAL_CAP", number_of_vectors,
    ]
    client = redisearch.Client(index_name, conn=redis_conn)
    client.redis.execute_command(
        "FT.CREATE", index_name,
        "SCHEMA", vector_field_name, "VECTOR", "FLAT", str(len(vector_attributes)),
        *vector_attributes,
    )
    return client

def create_hnsw_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2',M=40,EF=200):
    """Create an HNSW vector index and return its redisearch client.

    Sends ``FT.CREATE <index_name> SCHEMA <vector_field_name> VECTOR HNSW ...``
    with FLOAT32 vectors of ``vector_dimensions`` dimensions, the given
    ``distance_metric``, ``number_of_vectors`` as INITIAL_CAP, and the ``M`` and
    ``EF`` values passed through as given.

    Returns the ``redisearch.Client`` bound to ``index_name`` on ``redis_conn``.
    """
    # Attribute name/value pairs of the vector field; the command carries their
    # count (12) right after the algorithm name.
    vector_attributes = [
        "TYPE", "FLOAT32",
        "DIM", vector_dimensions,
        "DISTANCE_METRIC", distance_metric,
        "INITIAL_CAP", number_of_vectors,
        "M", M,
        "EF", EF,
    ]
    client = redisearch.Client(index_name, conn=redis_conn)
    client.redis.execute_command(
        "FT.CREATE", index_name,
        "SCHEMA", vector_field_name, "VECTOR", "HNSW", str(len(vector_attributes)),
        *vector_attributes,
    )
    return client

def delete_index(vector_index):
    """Hand the index's underlying Redis connection to ``delete_data``.

    NOTE(review): ``delete_data`` is not defined in the visible part of this
    notebook -- confirm it exists before calling this function, otherwise the
    call raises NameError.
    """
    connection = vector_index.redis
    delete_data(connection)
    

# 8. Utility Functions to Query a vector index in Redis
One function to query brute-force index and another to query an HNSW index

In [None]:
#Utility Functions to Perform Similarity Search

def find_similar_bf(headline_q, query_encoder, vector_index,vector_field_name, topK=5):
    """Run a TOP_K vector query for ``headline_q`` against ``vector_index``.

    Parameters
    ----------
    headline_q : text to search with.
    query_encoder : object whose ``encode`` turns the text into a vector.
    vector_index : redisearch client used to run the search.
    vector_field_name : name of the indexed vector field.
    topK : number of results requested (also used as the page size).

    Returns whatever ``vector_index.search`` returns; each hit is asked to
    carry the fields 'vector_score', 'score', 'headline' and 'label'.
    """
    # The query vector travels as raw float32 bytes in the $BLOB parameter.
    blob = query_encoder.encode(headline_q).astype(np.float32).tobytes()

    knn_clause = f'(*)=>[TOP_K $K @{vector_field_name} $BLOB AS vector_score]'
    query = redisearch.Query(knn_clause)
    query = query.sort_by('vector_score')
    query = query.paging(0, topK)
    query = query.return_fields('vector_score', 'score', 'headline', 'label')

    return vector_index.search(query, query_params={'BLOB': blob, 'K': topK})

def find_similar_hnsw(headline_q, query_encoder, vector_index,vector_field_name, topK=5,EF=5):
    """Run a TOP_K vector query with an EF_RUNTIME setting against ``vector_index``.

    Parameters
    ----------
    headline_q : text to search with.
    query_encoder : object whose ``encode`` turns the text into a vector.
    vector_index : redisearch client used to run the search.
    vector_field_name : name of the indexed vector field.
    topK : number of results requested (also used as the page size).
    EF : value bound to the query's ``$EF`` (EF_RUNTIME) parameter.

    Returns whatever ``vector_index.search`` returns; each hit is asked to
    carry the fields 'vector_score', 'score', 'headline' and 'label'.
    """
    # The query vector travels as raw float32 bytes in the $BLOB parameter.
    blob = query_encoder.encode(headline_q).astype(np.float32).tobytes()

    # Prepare the query.
    knn_clause = f'(*)=>[TOP_K $K @{vector_field_name} $BLOB EF_RUNTIME $EF AS vector_score]'
    query = redisearch.Query(knn_clause)
    query = query.sort_by('vector_score')
    query = query.paging(0, topK)
    query = query.return_fields('vector_score', 'score', 'headline', 'label')

    # Execute the query.
    bound_params = {'BLOB': blob, 'K': topK, 'EF': EF}
    return vector_index.search(query, query_params=bound_params)

# 9. Finally Load the Data into Redis

In [None]:
#Brute-Force - Load and Index article & vector Data
VECTOR_FIELD_NAME = 'headline_vector'

# Take the index sizing from the data itself rather than hard-coding it, so the
# index definition cannot drift from the CSV row count or from the embedding
# model's output width.
NUMBER_ARTICLES = len(result_df)
VECTOR_DIMENSIONS = len(headline_vectors[0])

# WARNING: flushall() deletes every key on the Redis server, not only the keys
# written by this notebook. Only run this against a disposable instance.
redis_conn.flushall()

# The index is created first, then the hashes are written.
bf_index = create_bf_index(redis_conn,'bf_index',VECTOR_FIELD_NAME,NUMBER_ARTICLES,VECTOR_DIMENSIONS,'L2')
load_vectors(bf_index.redis,result_df,headline_vectors,VECTOR_FIELD_NAME)

# 10. Query for similarity on the Brute-force index
Get the top 5 most semantically similar headlines to a given sentence

In [None]:
#query for similarity

new_headline='bearish conditions ahead'

# NOTE: this rebinds `results`, which section 2 used for the sentiment pipeline
# output. Re-run section 2 before re-running section 3 after this cell.
results = find_similar_bf (new_headline,model,bf_index,VECTOR_FIELD_NAME,5)
for doc in results.docs:
    # The hits are news headlines, so the banner says so.
    print ('***************Headline found ************')
    print ('headline = ' + doc.headline)
    print ('label = ' + doc.label)
    
    print ('id = ' + doc.id)
    

In [None]:
# Display the fifth hit (position 4) of the most recent similarity search.
fifth_match = results.docs[4]
fifth_match