In [1]:
import firebase_admin
from firebase_admin import credentials, firestore

# Use a service account
cred = credentials.Certificate('./key.json')
firebase_admin.initialize_app(cred)

<firebase_admin.App at 0x108a61d30>

In [3]:
import subprocess

command = "mkdir data"
subprocess.run(command, shell=True, executable="/bin/bash")

CompletedProcess(args='mkdir data', returncode=0)

In [4]:
import random
def get_all_documents(date=None):
    db = firestore.client()
    ref = db.collection('toi_news_2024-10-22')
    docs = ref.get()
    for doc in docs:
        if random.uniform(0,1) < 0.9:
            continue
        with open(f"./data/{doc.id}.txt", 'w') as file:
            file.write(doc.to_dict()['content'])
            
get_all_documents()

In [5]:
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
loader = DirectoryLoader('./data/', glob="*.txt", loader_cls=TextLoader)
news_articles = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 25
)

#Create a split of the document using the text splitter
news_articles_split = text_splitter.split_documents(news_articles)

In [9]:
print(f"Total news documents before split: {len(news_articles)}")
print(f"Total news documents after split: {len(news_articles_split)}")
print(f"Type of document: {type(news_articles_split[0])}")

Total news documents before split: 48
Total news documents after split: 648
Type of document: <class 'langchain_core.documents.base.Document'>


In [11]:
from langchain_ollama import OllamaEmbeddings

In [16]:
embedding_model = OllamaEmbeddings(model="mistral")

In [18]:
index_len = len(embedding_model.embed_query("hello world"))

In [19]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [20]:
index = faiss.IndexFlatL2(index_len)

vector_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [23]:
from uuid import uuid4
from tqdm.auto import tqdm

In [None]:
for doc in tqdm(news_articles_split):
    uuid = str(uuid4())
    vector_store.add_documents(documents=[doc], ids=[uuid])

In [None]:
vector_store.save_local("faiss_index")

In [77]:
# load the vector_store
new_vector_store = FAISS.load_local(
    "../../../Downloads/news_2024-10-22/", embedding_model, allow_dangerous_deserialization=True
)

docs = new_vector_store.similarity_search("What is happening in the Indian business market?", k=10)

In [87]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

prompt = PromptTemplate(
    template="""Act as an experienced stock trader with a strong understanding of the \
Indian stock market. You know how to use news to determine whether to buy or sell a stock. 
I will provide you with all the relevant news for a specific day from India. Based on the news provided, \
I need you to perform the following tasks: 

1. Identify the best potential stock to buy (BEST_STOCK) – it should be a company that is present in the Indian stock market and performing \
very well based on the news I provided below. Provide a brief reason for selecting the stock, explaining why it's a good investment opportunity. 

2. Identify the best potential stock to sell (WORST_STOCK) – it should be a company that is present in the Indian stock market and expected to face challenges \
or underperform due to the news. Provide a brief reason for selecting the stock, explaining why selling or avoiding this \
stock would be wise.

The output should be a json with BEST_STOCK and WORST_STOCK key.

News articles: {documents}
""",
    input_variables=["documents"],
)

In [88]:
from langchain_ollama import OllamaLLM
from langchain.chains import LLMChain

In [89]:
ollama = OllamaLLM(model="mistral")
llm_chain = LLMChain(llm=ollama, prompt=prompt)

In [90]:
response = llm_chain.run({"documents": docs})

In [91]:
print(response)

 {
      "BEST_STOCK": "Reliance Industries Limited (RIL)",
      "REASON": "The news about RBI's commitment to active and adaptable liquidity management suggests a positive outlook for banking sector stocks, especially those with strong financial status like RIL. The company has been performing well financially, and its diversified business portfolio including retail, petrochemicals, and telecommunications makes it less vulnerable to market fluctuations. Additionally, the upcoming textile exhibition may provide further growth opportunities for RIL's textiles segment.",

      "WORST_STOCK": "Predatory Pricing Industry Players",
      "REASON": "The news about predatory pricing casting a shadow over the industry indicates potential legal and financial risks for companies involved in such practices. Engaging in such behavior could lead to fines or investigations, which would negatively impact the stock performance of these companies. Avoiding stocks from this sector might be a wise choi

In [92]:
import json

In [93]:
json_response = json.loads(response)

In [94]:
json_response

{'BEST_STOCK': 'Reliance Industries Limited (RIL)',
 'REASON': 'The news about predatory pricing casting a shadow over the industry indicates potential legal and financial risks for companies involved in such practices. Engaging in such behavior could lead to fines or investigations, which would negatively impact the stock performance of these companies. Avoiding stocks from this sector might be a wise choice until the situation becomes clearer.',
 'WORST_STOCK': 'Predatory Pricing Industry Players'}

In [95]:
json_response['BEST_STOCK']

'Reliance Industries Limited (RIL)'

In [96]:
import pandas as pd

In [102]:
data = pd.read_csv("./fetched-data/stock-sectors.csv")

In [103]:
data

Unnamed: 0,sector,description,total_companies,link
0,abrasives,The one that scraps away all inconsistencies,3,/abrasives
1,advertising_media,The one that creates and distributes media con...,11,/advertising-media
2,agriculture,The one that gives us vegetables and fruits to...,49,/agriculture
3,air_conditioners,The one thats keeps it cool with no noise,6,/air-conditioners
4,airlines,The one that takes us around the world in the ...,6,/airlines
...,...,...,...,...
129,travel_services,The one that takes you to your favourite holidays,17,/travel-services
130,tv_broadcasting_software_production,The one that brings you your favourite TV shows,32,/tv-broadcasting-software-production
131,tyres_allied,The one that makes vehicles move,12,/tyres-allied
132,watches_accessories,The one that makes your life more functional,2,/watches-accessories


In [119]:
data[data.sector == "transmission_towers_equipments"]

Unnamed: 0,sector,description,total_companies,link
128,transmission_towers_equipments,The one that aids the electricity power lines,2,/transmission-towers-equipments


In [104]:
data.total_companies.sum()

4912

In [120]:
stocks = pd.read_csv("./fetched-data/stock-names.csv")

In [141]:
stocks

Unnamed: 0,sector,ticker_symbol,company_name,market_cap,company_link
0,abrasives,WENDT,Wendt India,3029.00,/company/WENDT
1,abrasives,GRINDWELL,Grindwell Norton,23492.57,/company/GRINDWELL
2,abrasives,CARBORUNIV,Carborundum Univer.,26272.53,/company/CARBORUNIV
3,advertising_media,SCRIP-300010,Navoday Enterprises,7.21,/company/SCRIP-300010
4,advertising_media,SIGNPOST,Signpost India,1229.35,/company/SIGNPOST
...,...,...,...,...,...
4913,wood_wood_products,SCRIP-304681,Alkosign,86.59,/company/SCRIP-304681
4914,wood_wood_products,SCRIP-116003,Duroply Industries,249.88,/company/SCRIP-116003
4915,wood_wood_products,SCRIP-126225,Bloom Dekor,10.41,/company/SCRIP-126225
4916,wood_wood_products,SCRIP-123888,VR Woodart,7.15,/company/SCRIP-123888


In [149]:
def convert_to_float(value):
    # Remove commas and convert to float
    return float(value.replace(',', ''))

# Apply the function to the 'values' column
stocks['market_cap'] = stocks['market_cap'].apply(convert_to_float)

In [151]:
stocks.market_cap.sum()

44617202.92

In [136]:
keys = stocks.sector.value_counts().index
values = stocks.sector.value_counts().values

In [140]:
for i, key in enumerate(keys):
    if data[data.sector == key].total_companies.values[0] != values[i]:
        print(f"Sector: {key}")
        print(f"Expected value: {data[data.sector == key].total_companies.values[0]} --- Actual value: {values[i]}")

Sector: chemicals
Expected value: 177 --- Actual value: 178
Sector: auto_ancillary
Expected value: 97 --- Actual value: 98
Sector: finance_stock_broking
Expected value: 64 --- Actual value: 65
Sector: sugar
Expected value: 43 --- Actual value: 44
Sector: miscellaneous
Expected value: 29 --- Actual value: 31
