# 1. Get 300 Financial News Headlines


In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 10)

headlines_df = pd.read_csv('data/300_stock_headlines.csv')
headlines_df.drop('Unnamed: 0.1', axis=1, inplace=True)
headlines_df.drop('Unnamed: 0', axis=1, inplace=True)
headlines_df.reset_index()
headlines_df.head(5)


Unnamed: 0,headline,url,publisher,date,stock
0,Agilent Technologies Announces Pricing of $5…… Million of Senior Notes,http://www.gurufocus.com/news/1153187/agilent-technologies-announces-pricing-of-500-million-of-s...,GuruFocus,2020-06-01 00:00:00,A
1,Agilent (A) Gears Up for Q2 Earnings: What's in the Cards?,http://www.zacks.com/stock/news/931205/agilent-a-gears-up-for-q2-earnings-whats-in-the-cards?cid...,Zacks,2020-05-18 00:00:00,A
2,J.P. Morgan Asset Management Announces Liquidation of Six Exchange-Traded Funds,http://www.gurufocus.com/news/1138923/jp-morgan-asset-management-announces-liquidation-of-six-ex...,GuruFocus,2020-05-15 00:00:00,A
3,"Pershing Square Capital Management, L.P. Buys Agilent Technologies Inc, The Howard Hughes Corp, ...",http://www.gurufocus.com/news/1138704/pershing-square-capital-management-lp-buys-agilent-technol...,GuruFocus,2020-05-15 00:00:00,A
4,Agilent Awards Trilogy Sciences with a Golden Ticket at LabCentral,http://www.gurufocus.com/news/1134012/agilent-awards-trilogy-sciences-with-a-golden-ticket-at-la...,GuruFocus,2020-05-12 00:00:00,A


# 2. Calculate Financial Sentiment for each headline
Using a pre-trained model fine-tuned on financial news/report data

In [2]:
#Calculate financial sentiment for each headline

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
headlines = headlines_df["headline"].tolist()

#get financial sentiment for all headlines
results = nlp(headlines)

#show results for first 2 headlines
print(results[:2])

2022-02-10 16:56:34.963982: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-10 16:56:34.964040: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Downloading:   0%|          | 0.00/533 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/221k [00:00<?, ?B/s]

[{'label': 'neutral', 'score': 0.9999771118164062}, {'label': 'neutral', 'score': 0.999295711517334}]


# 3. Join Financial Sentiment and Headline into a single Dataframe

In [3]:
#Put headlines and financial sentiment in 1 dataframe
sentiment_df = pd.DataFrame(results)
sentiment_df.reset_index()

result_df = pd.concat([headlines_df, sentiment_df],axis=1)
#show the first 10 rows 
result_df.head(5)

Unnamed: 0,headline,url,publisher,date,stock,label,score
0,Agilent Technologies Announces Pricing of $5…… Million of Senior Notes,http://www.gurufocus.com/news/1153187/agilent-technologies-announces-pricing-of-500-million-of-s...,GuruFocus,2020-06-01 00:00:00,A,neutral,0.999977
1,Agilent (A) Gears Up for Q2 Earnings: What's in the Cards?,http://www.zacks.com/stock/news/931205/agilent-a-gears-up-for-q2-earnings-whats-in-the-cards?cid...,Zacks,2020-05-18 00:00:00,A,neutral,0.999296
2,J.P. Morgan Asset Management Announces Liquidation of Six Exchange-Traded Funds,http://www.gurufocus.com/news/1138923/jp-morgan-asset-management-announces-liquidation-of-six-ex...,GuruFocus,2020-05-15 00:00:00,A,neutral,0.999535
3,"Pershing Square Capital Management, L.P. Buys Agilent Technologies Inc, The Howard Hughes Corp, ...",http://www.gurufocus.com/news/1138704/pershing-square-capital-management-lp-buys-agilent-technol...,GuruFocus,2020-05-15 00:00:00,A,neutral,0.999965
4,Agilent Awards Trilogy Sciences with a Golden Ticket at LabCentral,http://www.gurufocus.com/news/1134012/agilent-awards-trilogy-sciences-with-a-golden-ticket-at-la...,GuruFocus,2020-05-12 00:00:00,A,positive,0.971711


# 4. Generate Embeddings for each headline
Using a HuggingFace Sentence Embedder

In [4]:
#Generate embeddings (vectors) for each headline
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')
headline_vectors = [ model.encode(sentence) for sentence in result_df['headline']]
#check how many dimensions in a single vector 
headline_vectors[0].shape

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

(768,)

# 5. Connect to Redis

In [5]:
#connect to redis

from redis import Redis
import redisearch

host = 'vecsim'
port = 6379
redis_conn = Redis(host = host, port = port)
print ('Connected to redis')

Connected to redis


# 6 Utility functions to load data into Redis 
We'll be loading into a "hash" structure (a table-like structure)

In [6]:
#load articles into redis hash
import numpy as np
def load_vectors(client:Redis, headlines_df, vector_data,vector_field_name):
    #pipeline the 300 articles in one go
    p = client.pipeline(transaction=False)
    for index, row in headlines_df.iterrows():    
        #hash key
        key='article:'+ str(index)
        #hash fields
        headline=row['headline']
        url=row['url']
        publisher=row['publisher']
        date=row['date']
        label=row['label']
        score=row['score']
        headline_vector = vector_data[index].astype(np.float32).tobytes()
        headline_data_mapping ={'headline':headline,'url':url,'publisher':publisher,'label':label,'score':score, vector_field_name:headline_vector}
        
        p.hset(key,mapping=headline_data_mapping)
    p.execute()
    

# 7. More Utility Functions to Define vector indexes 

In [13]:
#Utility Functions to Create Indexes on Vector field

def create_bf_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2'):
    bf_index = redisearch.Client(index_name, conn=redis_conn)
    bf_index.redis.execute_command("FT.CREATE", index_name, "SCHEMA",vector_field_name, "VECTOR", "FLAT", "8", "TYPE", "FLOAT32", "DIM", vector_dimensions, "DISTANCE_METRIC", distance_metric, "INITIAL_CAP", number_of_vectors)
    return bf_index

def create_hnsw_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2',M=40,EF=200):
    hnsw_index = redisearch.Client(index_name, conn=redis_conn)
    hnsw_index.redis.execute_command("FT.CREATE", index_name, "SCHEMA", vector_field_name, "VECTOR", "HNSW", "12", "TYPE", "FLOAT32", "DIM", vector_dimensions, "DISTANCE_METRIC", distance_metric,  "INITIAL_CAP", number_of_vectors, "M", M, "EF", EF)
    return hnsw_index

def delete_index(vector_index):
    delete_data(vector_index.redis)
    

# 8. Utility Functions to Query a vector index in Redis
One function to query brute-force index and another to query an HNSW index

In [19]:
#Utility Functions to Perform Similarity Search

def find_similar_bf(headline_q, query_encoder, vector_index,vector_field_name, topK=5):
    #vectorize the query
    query_vector = query_encoder.encode(headline_q).astype(np.float32).tobytes()
    #prepare the query
    q = redisearch.Query(f'(*)=>[TOP_K $K @{vector_field_name} $BLOB AS vector_score]').sort_by('vector_score').paging(0,topK).return_fields('vector_score','score','headline','label')
    #Execute the query
    results = vector_index.search(q, query_params = {'BLOB': query_vector, 'K':topK})
    return results 

def find_similar_hnsw(headline_q, query_encoder, vector_index,vector_field_name, topK=5,EF=5):
    #vectorize the query
    query_vector = query_encoder.encode(headline_q).astype(np.float32).tobytes()
    #prepare the query
    q = redisearch.Query(f'(*)=>[TOP_K $K @{vector_field_name} $BLOB EF_RUNTIME $EF AS vector_score]').sort_by('vector_score').paging(0,topK).return_fields('vector_score','score','headline','label')    #Execute the query
    results = vector_index.search(q, query_params = {'BLOB': query_vector, 'K':topK, 'EF':EF})
    return results 

# 9. Finally Load the Data into Redis

In [15]:
#Brute-Force - Load and Index article & vector Data
NUMBER_ARTICLES = 300
VECTOR_FIELD_NAME = 'headline_vector'
redis_conn.flushall()
bf_index = create_bf_index(redis_conn,'bf_index',VECTOR_FIELD_NAME,NUMBER_ARTICLES,768,'L2')
load_vectors(bf_index.redis,result_df,headline_vectors,VECTOR_FIELD_NAME)

# 10. Query for similarity on the Brute-force index
Get Top5 most semantically similar headlines to a given sentence

In [20]:
#query for similarity

new_headline='bearish conditions ahead'

results = find_similar_bf (new_headline,model,bf_index,VECTOR_FIELD_NAME,5)
for doc in results.docs:
    print ('***************Product  found ************')
    #this must be a bug, I get the the headline by retrieving the "vector score" field!?
    print ('headline = ' + doc.headline_vector_score)
    print ('label = ' + doc.label)
    
    print ('id = ' + doc.id)
    

***************Product  found ************


AttributeError: 'Document' object has no attribute 'headline_vector_score'

In [None]:
results.docs[4]