In [2]:
# Test importing libraries
import yfinance as yf
import pandas as pd
import requests
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, RagTokenForGeneration
from datasets import Dataset
from bs4 import BeautifulSoup
import torch
from textblob import TextBlob


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
company_tickers = {
    'HDFC Bank Ltd.': 'HDFCBANK.NS',
    'Reliance Industries Ltd.': 'RELIANCE.NS',
    'ICICI Bank Ltd.': 'ICICIBANK.NS',
    'Infosys Ltd.': 'INFY.NS',
    'Larsen & Toubro Ltd.': 'LT.NS',
    'Tata Consultancy Services Ltd.': 'TCS.NS',
    'ITC Ltd.': 'ITC.NS',
    'Bharti Airtel Ltd.': 'BHARTIARTL.NS',
    'Axis Bank Ltd.': 'AXISBANK.NS',
    'State Bank of India': 'SBIN.NS',
    'Mahindra & Mahindra Ltd.': 'M&M.NS',
    'Kotak Mahindra Bank Ltd.': 'KOTAKBANK.NS',
    'Hindustan Unilever Ltd.': 'HINDUNILVR.NS',
    'Bajaj Finance Ltd.': 'BAJFINANCE.NS',
    'NTPC Ltd.': 'NTPC.NS',
    'Tata Motors Ltd.': 'TATAMOTORS.NS',
    'Sun Pharmaceutical Industries Ltd.': 'SUNPHARMA.NS',
    'Maruti Suzuki India Ltd.': 'MARUTI.NS',
    'HCL Technologies Ltd.': 'HCLTECH.NS',
    'Power Grid Corporation of India Ltd.': 'POWERGRID.NS',
    'Tata Steel Ltd.': 'TATASTEEL.NS',
    'Titan Company Ltd.': 'TITAN.NS',
    'UltraTech Cement Ltd.': 'ULTRACEMCO.NS',
    'Asian Paints Ltd.': 'ASIANPAINT.NS',
    'Adani Ports and Special Economic Zone Ltd.': 'ADANIPORTS.NS',
    'Coal India Ltd.': 'COALINDIA.NS',
    'Oil & Natural Gas Corporation Ltd.': 'ONGC.NS',
    'Bajaj Auto Ltd.': 'BAJAJ-AUTO.NS',
    'Hindalco Industries Ltd.': 'HINDALCO.NS',
    'Grasim Industries Ltd.': 'GRASIM.NS',
    'IndusInd Bank Ltd.': 'INDUSINDBK.NS',
    'Nestle India Ltd.': 'NESTLEIND.NS',
    'Tech Mahindra Ltd.': 'TECHM.NS',
    'JSW Steel Ltd.': 'JSWSTEEL.NS',
    'Bajaj Finserv Ltd.': 'BAJAJFINSV.NS',
    'Adani Enterprises Ltd.': 'ADANIENT.NS',
    'Shriram Finance Ltd.': 'SHRIRAMFIN.NS',
    'Cipla Ltd.': 'CIPLA.NS',
    'Dr. Reddy\'s Laboratories Ltd.': 'DRREDDY.NS',
    'Hero MotoCorp Ltd.': 'HEROMOTOCO.NS',
    'Wipro Ltd.': 'WIPRO.NS',
    'Tata Consumer Products Ltd.': 'TATACONSUM.NS',
    'SBI Life Insurance Company Ltd.': 'SBILIFE.NS',
    'Britannia Industries Ltd.': 'BRITANNIA.NS',
    'Eicher Motors Ltd.': 'EICHERMOT.NS',
    'Apollo Hospitals Enterprise Ltd.': 'APOLLOHOSP.NS',
    'HDFC Life Insurance Company Ltd.': 'HDFCLIFE.NS',
    'Bharat Petroleum Corporation Ltd.': 'BPCL.NS',
    'Divi\'s Laboratories Ltd.': 'DIVISLAB.NS',
    'LTIMindtree Ltd.': 'LTIM.NS'
}


In [4]:
def get_news_from_yahoo_finance(ticker):
    url = f'https://finance.yahoo.com/quote/{ticker}/news/'
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to retrieve data for {ticker}. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all div elements with class "content yf-1044anq"
        news_items = soup.find_all('div', class_='content yf-1e4au4k')

        news_data = []
        for item in news_items:
            title = item.find('h3', class_='clamp yf-1e4au4k')
            news = item.find('p', class_='clamp yf-1e4au4k')

            if title and news:
                news_data.append({
                    'title': title.get_text(strip=True),
                    'news': news.get_text(strip=True)
                })

        return news_data

    except Exception as e:
        print(f"An error occurred for {ticker}: {str(e)}")
        return []

In [5]:
def analyze_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity


In [6]:
news_dataset = []

for company_name, ticker in company_tickers.items():
    print(f"Collecting news for {company_name} ({ticker})...")
    news_data = get_news_from_yahoo_finance(ticker)
    
    for news_item in news_data:
        title = news_item['title']
        news = news_item['news']
        sentiment = analyze_sentiment(news)
        
        news_dataset.append({
            'title': title,
            'news': news,
            'sentiment': 'Positive' if sentiment > 0 else 'Negative' if sentiment < 0 else 'Neutral'
        })

Collecting news for HDFC Bank Ltd. (HDFCBANK.NS)...
Collecting news for Reliance Industries Ltd. (RELIANCE.NS)...
Collecting news for ICICI Bank Ltd. (ICICIBANK.NS)...
Collecting news for Infosys Ltd. (INFY.NS)...
Collecting news for Larsen & Toubro Ltd. (LT.NS)...
Collecting news for Tata Consultancy Services Ltd. (TCS.NS)...
Collecting news for ITC Ltd. (ITC.NS)...
Collecting news for Bharti Airtel Ltd. (BHARTIARTL.NS)...
Collecting news for Axis Bank Ltd. (AXISBANK.NS)...
Collecting news for State Bank of India (SBIN.NS)...
Collecting news for Mahindra & Mahindra Ltd. (M&M.NS)...
Collecting news for Kotak Mahindra Bank Ltd. (KOTAKBANK.NS)...
Collecting news for Hindustan Unilever Ltd. (HINDUNILVR.NS)...
Collecting news for Bajaj Finance Ltd. (BAJFINANCE.NS)...
Collecting news for NTPC Ltd. (NTPC.NS)...
Collecting news for Tata Motors Ltd. (TATAMOTORS.NS)...
Collecting news for Sun Pharmaceutical Industries Ltd. (SUNPHARMA.NS)...
Collecting news for Maruti Suzuki India Ltd. (MARUTI.N

In [7]:
print(news_dataset)

[{'title': 'HDFC Bank Ltd (NYSE:HDB) A Bull Case Theory', 'news': 'We came across a bullish thesis on HDFC Bank Ltd (HDB) on ValueInvestorsClub by Par03. In this article we will summarize the bulls’ thesis on HDB. HDFC Bank shares were trading at $59.02 when this thesis was published, vs. closing price of $61.00 on Aug 29. HDFC Bank, the largest private sector bank in India, […]', 'sentiment': 'Neutral'}, {'title': 'HDFC Bank gets MSCI boost, expects $2.5 bln inflows', 'news': "MSCI on August 13,\xa0 announced an adjustment in the Foreign Inclusion Factor (FIF) for  HDFC Bank Ltd  (NS:HDBK) as part of its August index review.  This adjustment marks a two-stage increase in the bank's weight within the MSCI indices, a move expected to attract substantial passive inflows.  “MSCI further reported that the FIF might see further upward revision to 1.0 in its November review, provided the foreign room continues to be atleast 20% at the time of the next review,” analysts at Bernstein said in a

In [8]:
import pandas as pd
from datasets import Dataset

df = pd.DataFrame(news_dataset)

# Rename 'news' column to 'text'
df.rename(columns={'news': 'text'}, inplace=True)

# Check if DataFrame is created correctly
if df.empty:
    print("DataFrame is empty. Please check the input data.")
else:
    print("DataFrame created successfully.")

# Create the dataset from the DataFrame
dataset = Dataset.from_pandas(df)


DataFrame created successfully.


In [9]:
df.to_csv("news.csv")

In [8]:
# Check if Dataset is created correctly
if dataset is None:
    print("Dataset creation failed.")
else:
    print("Dataset created successfully.")

Dataset created successfully.


In [9]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast
import torch

# Load the context encoder and tokenizer
ctx_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

# Function to compute embeddings (using the 'text' field) with max length set
def embed_passages(batch):
    inputs = ctx_tokenizer(batch['text'], truncation=True, padding='longest', max_length=512, return_tensors='pt')
    with torch.no_grad():
        embeddings = ctx_encoder(**inputs).pooler_output
    batch['embeddings'] = embeddings.cpu().numpy()
    return batch

# Apply the function to the dataset
dataset = dataset.map(embed_passages, batched=True, batch_size=10)


Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

In [10]:
# Define the paths
passages_path = 'E:/Data Science/Capstone/nifty_passages'
index_path = 'E:/Data Science/Capstone/nifty_index'

# Check if dataset is valid before proceeding
if dataset is not None:
    # Remove existing FAISS index
    if 'embeddings' in dataset.list_indexes():
        dataset.drop_index('embeddings')
        print("Index 'embeddings' removed.")

    # Save the dataset to disk
    dataset.save_to_disk(passages_path)
    print(f"Dataset saved to {passages_path}.")

    # Reload dataset and add FAISS index
    dataset = Dataset.load_from_disk(passages_path)
    dataset.add_faiss_index(column='embeddings')

    # Save the FAISS index separately
    faiss_index = dataset.get_index('embeddings')
    if faiss_index is not None:
        faiss_index.save(index_path)
        print(f"Index saved to {index_path}.")
    else:
        print("FAISS index could not be retrieved or saved.")
else:
    print("Dataset is not valid.")


Saving the dataset (1/1 shards): 100%|██████████| 607/607 [00:00<00:00, 43366.93 examples/s]


Dataset saved to E:/Data Science/Capstone/nifty_passages.


100%|██████████| 1/1 [00:00<00:00, 166.67it/s]

Index saved to E:/Data Science/Capstone/nifty_index.





: 

In [11]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import torch

# Use CPU or GPU (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

test_input = "top 10 stocks with high returns"

# Tokenize the input using RagTokenizer
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
inputs = tokenizer(test_input, return_tensors="pt").to(device)

# Load the retriever and model
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="custom",
    passages_path='E:/Data Science/Capstone/nifty_passages',
    index_path='E:/Data Science/Capstone/nifty_index'
)

# Initialize the model and move to appropriate device (CPU/GPU)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to(device)

# Generate a response
with torch.no_grad():
    generated_ids = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

print(f"Model Response: {response}")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [16]:
import yfinance as yf

# Correct method to create a Ticker object
nstl = yf.Ticker('NESTLEIND.BO')

# Now you can use the nstl object to get information about Nestle India
print(nstl.news())


TypeError: 'list' object is not callable