In [1]:
import pandas
# import yfinance as yf
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams
from sentence_transformers import SentenceTransformer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import uuid
import json
import os
import datetime
from tqdm import tqdm
from llama_index.core import Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, Document, StorageContext
import torch
from llama_index.llms.ollama import Ollama
from pydantic import BaseModel
from llama_index.core.node_parser import SentenceSplitter

  from tqdm.autonotebook import tqdm, trange
2024-11-23 15:41:26.675178: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vishwaasnarasinh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [48]:
!pip install llama-index-embeddings-openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting llama-index-embeddings-openai
  Downloading llama_index_embeddings_openai-0.3.0-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-core<0.13.0,>=0.12.0 (from llama-index-embeddings-openai)
  Downloading llama_index_core-0.12.1-py3-none-any.whl.metadata (2.5 kB)
Collecting openai>=1.1.0 (from llama-index-embeddings-openai)
  Downloading openai-1.55.0-py3-none-any.whl.metadata (24 kB)
Collecting filetype<2.0.0,>=1.2.0 (from llama-index-core<0.13.0,>=0.12.0->llama-index-embeddings-openai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting distro<2,>=1.7.0 (from openai>=1.1.0->llama-index-embeddings-openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai>=1.1.0->llama-index-embeddings-openai)
  Downloading jiter-0.7.1-cp310-cp310-macosx_10_12_x86_64.whl.metadata (5.2 kB)
Downloading llama_index_embeddings_openai-0.3.0-py3-none-any.whl (6.1 kB)
Downloading llama_index_core-0.12.1-py3-none-

In [2]:
Settings.embed_model = HuggingFaceEmbedding(
    model_name='llmrails/ember-v1'
)


In [43]:
Settings.chunk_size = 2048

# The below command works only on Mac and Linux
!export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
def initialize_qdrant():
    # Connect to Qdrant
    qdrant_client = QdrantClient(host="localhost", port=6333)
    
    # # Create the collection (specify vector size and distance metric)
    # vector_params = VectorParams(size=1024, distance="Cosine")  # Example: 768-dimensional vectors (e.g., BERT)
    
    # qdrant_client.recreate_collection(
    #     collection_name="finance_collection", 
    #     vectors_config=vector_params
    # )

    return qdrant_client


In [36]:
# Delete old collection if necessary
# client.delete_collection('finance_collection')

In [39]:
def get_locations(json_data):
    locations_list = json_data['entities']['locations']
    locations = []
    for ld in locations_list:
        if ('name' in ld) and ld['name'].strip()!='':
            locations.append(ld['name'])    

    if locations:
        return ', '.join(locations)

    return ''

def json_2_text(json_data):
    title = json_data['title']
    location = get_locations(json_data)
    
    title_template = f'The title of the news article is {title}.'
    if location:
        location_template = f'The locations relavant to the article are: {location}.'
    else:
        location_template = ''
    published = json_data['published'][:10]
    date_template = f'This article was published on {published}'
    article = json_data['text']
    article_template = f'Article: {article}'

    item = {'title': title, 'location': location, 'published':  published, 'article': article}
    text = ' '.join([title_template, location_template, date_template, article_template])
    return text, item
    
    
def extract_data(qdrant_client, number_of_folders=4, number_of_files_per_folder=10000, collection_name='finance_collection'):
    base_folder = 'data/US_Financial_News_Articles'
    c=0
    llama_documents = []
    for folder_name in os.listdir(base_folder)[:number_of_folders]:
        folder_path = os.path.join(base_folder, folder_name)
        if not os.path.isdir(folder_path):
            continue
        for file_name in tqdm(os.listdir(folder_path)[:number_of_files_per_folder]):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r') as f:
                json_data = json.load(f)
                text, item = json_2_text(json_data)
                point_id = str(uuid.uuid4())
                vector = Settings.embed_model.get_text_embedding(text)
                point = {
                "id": point_id,
                "vector": vector,
                "payload": {
                    "title": item["title"],
                    "location": item["location"],
                    "published": item["published"],
                    "article": item["article"],
                }
                }
            c+=1

            document = Document(metadata=item, text=text)
            llama_documents.append(document)
            # qdrant_client.upsert(collection_name=collection_name, points=[point])
    return llama_documents

def index_data(qdrant_client, llama_documents, collection_name='finance_collection'):
    vector_store = QdrantVectorStore(client=qdrant_client, collection_name=collection_name)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # print(llama_documents)
    index = VectorStoreIndex([]) 
    for doc in tqdm(llama_documents[:]):
        try:
            index.insert(doc)
        except ValueError as v:
            print(f'Skipped: {v}')

        
    # index = VectorStoreIndex.from_documents(llama_documents[:40000], storage_context=storage_context)




In [41]:
client = initialize_qdrant() 
number_of_folders = 2 # max 4 possible
number_of_files_per_folder = 10000
llama_documents = extract_data(client, 1, 100)


100%|█████████████████████████████████████████| 100/100 [01:02<00:00,  1.60it/s]


In [34]:
index_data(client, llama_documents)

  3%|█▎                                         | 6/200 [00:30<22:23,  6.92s/it]

Skipped Metadata length (1166) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Skipped Metadata length (1528) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


  5%|██                                        | 10/200 [00:34<09:02,  2.86s/it]

Skipped Metadata length (4630) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 28%|███████████▌                              | 55/200 [02:50<03:27,  1.43s/it]

Skipped Metadata length (1095) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 33%|█████████████▊                            | 66/200 [03:09<02:34,  1.15s/it]

Skipped Metadata length (1219) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 40%|█████████████████                         | 81/200 [05:02<12:29,  6.30s/it]

Skipped Metadata length (1045) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 51%|████████████████████▉                    | 102/200 [06:19<04:01,  2.46s/it]

Skipped Metadata length (7425) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 67%|███████████████████████████▍             | 134/200 [08:55<15:06, 13.74s/it]

Skipped Metadata length (1434) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 74%|██████████████████████████████▎          | 148/200 [09:11<00:54,  1.05s/it]

Skipped Metadata length (1920) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 75%|██████████████████████████████▊          | 150/200 [09:12<00:38,  1.31it/s]

Skipped Metadata length (1025) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 82%|█████████████████████████████████▌       | 164/200 [09:20<00:22,  1.60it/s]

Skipped Metadata length (4348) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Skipped Metadata length (2041) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 85%|██████████████████████████████████▊      | 170/200 [09:22<00:14,  2.08it/s]

Skipped Metadata length (1193) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 88%|████████████████████████████████████     | 176/200 [09:56<01:26,  3.61s/it]

Skipped Metadata length (1172) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 95%|██████████████████████████████████████▉  | 190/200 [11:06<00:30,  3.02s/it]

Skipped Metadata length (3129) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 96%|███████████████████████████████████████▌ | 193/200 [11:09<00:13,  1.94s/it]

Skipped Metadata length (2438) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


 99%|████████████████████████████████████████▌| 198/200 [11:23<00:04,  2.28s/it]

Skipped Metadata length (1618) is longer than chunk size (1024). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


100%|█████████████████████████████████████████| 200/200 [11:26<00:00,  3.43s/it]


In [44]:
class CustomQdrantClient:
    def __init__(self, client) -> None:
        self._client: QdrantClient = client

    def collection_exists(self, c):
        return self._client.collection_exists(c)

    def search(self, collection_name, query_vector, limit, query_filter):
        return self._client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
            query_filter=query_filter,
            score_threshold = 0.3
        )

class QdrantService:
    def __init__(self, client, collection_name='finance_collection'):
        self.collection_name = collection_name
        self.custom_qdrant_client = CustomQdrantClient(client)

    def get_vector_store_index(self):

        if not self.custom_qdrant_client.collection_exists(c=self.collection_name):
            self.logger.warning(f"Collection {self.collection_name} does not exist !")
            return None

        vector_store = QdrantVectorStore(
            client=self.custom_qdrant_client,
            collection_name=self.collection_name,
            parallel=1,
        )
        index: VectorStoreIndex = VectorStoreIndex.from_vector_store(vector_store=vector_store)
        query_engine: BaseQueryEngine = index.as_query_engine(
            similarity_top_k=5,
            verbose=False, streaming=False
        )
        return query_engine


In [45]:
class Message(BaseModel):
    role: str
    content: str

In [47]:
class FinanceQA:

    def __init__(self, client):
        self.llm_model: CustomLLM = self.get_model_instance()
        self.set_defaults()

        self.qdrant_ser: QdrantService = QdrantService(client)
        self.vector_store_index: BaseQueryEngine = self.qdrant_ser.get_vector_store_index()
    
    def get_model_instance(self):
        model_instance = Ollama(
        model = 'phi3:3.8b-mini-128k-instruct-q8_0',
        request_timeout = 480,
        temperature = 0.3,
        tokenizer_mode = "slow",
        context_window = 3000,
        additional_kwargs = {
                'num_thread': 8,
                'num_ctx': 2500,
                'num_predict': 650
        },
        base_url = 'http://localhost:11434')
        return model_instance

    def convert_to_format(self, messages):
        chat_messages = []
        for message in messages:
            chat_message = ChatMessage(
                role=message.role,  # Convert string to MessageRole enum
                content=message.content
            )
            chat_messages.append(chat_message)
        return chat_messages
    
    


    def set_defaults(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        Settings.embed_model = HuggingFaceEmbedding(
            model_name = 'llmrails/ember-v1', device=device
        )
        Settings.llm = self.llm_model

    def llm_request(self, messages):
        response = None
        if self.vector_store_index:
            response = self.vector_store_index.query(messages[-1].content)
        if (response is None) or len(response.source_nodes) <= 0:
            # self.logger.debug("vector index store response: {0} \n".format(response))
            messages: List[ChatMessage] = self.convert_to_format(messages)
            response = self.llm_model.chat(messages)
            # self.logger.debug("llm response: {0} \n".format(response))
        extra_info: str = "is_LLM"
        return {"response": response, "extra_info": extra_info}

    def async_health_check(self):
        self.qdrant_ser.async_health_check()


In [18]:
messages = [Message(role='user', content='How has Trump affected Financial markets?')]

In [48]:
# client = initialize_qdrant() 
finance_qa = FinanceQA(client)

In [35]:
finance_qa.llm_request(messages)

{'response': Response(response="Trump has had mixed effects on financial markets initially causing concern and uncertainty among investors, which resulted in significant drops across major indices such as the Dow Jones Industrial Average, S&P 500, and Nasdaq Composite Index due to fears of a potential global trade war. However, his subsequent flexibility regarding NAFTA negotiations indicated that he might back off from tariffs if an agreement is reached which could potentially alleviate some concerns among investors. Furthermore, Trump's comments on Mexico’s role in stopping drugs entering the U.S also added to market uncertainty with potential retaliation threats by Canada and Mexico against tariffs further adding tension within financial markets. Despite these initial impacts, certain sectors like utilities saw positive movements as investors shifted towards safer assets amidst trade war fears.", source_nodes=[NodeWithScore(node=TextNode(id_='a90d5187-5387-444c-9804-e8453df00e1f', e