<a href="https://colab.research.google.com/github/PranavSuresh525/AI-ML-Projects/blob/main/AI_Integration_in_Finance/Finance_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [82]:
!pip install -q -U --no-warn-conflicts \
    langchain-huggingface \
    langchain-google-genai \
    langgraph \
    yfinance \
    gnews \
    transformers \
    accelerate \
    duckduckgo-search \
    langchain-text-splitters \
    langchain-chroma \
    langchain-community \
    ddgs
import numpy as np
import os
import re
import json
import time
import operator
from datetime import datetime, timedelta
from typing import TypedDict, List, Annotated, Dict, Any, Optional
import yfinance as yf
from gnews import GNews
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_core.messages import HumanMessage, BaseMessage
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langgraph.graph import StateGraph, END, START
from IPython.display import HTML, Markdown

In [346]:
## the local llm whixh avoids unecessary calls from gemini as there is a API limit
local_pipe = pipeline("text2text-generation", model="google/flan-t5-large", max_new_tokens=256, device_map="auto")
local_llm = HuggingFacePipeline(pipeline=local_pipe)
os.environ["GOOGLE_API_KEY"] = "AIzaSyDSAtgeZUOp2o1D2s_XeP-OY5Bn7Fy-LL0"
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-exp",
    temperature=0.2,
    google_api_key=os.environ["GOOGLE_API_KEY"]
)
# WEB SEARCH: Used by the local node to find tickers
search_tool = DuckDuckGoSearchRun()
_ticker_cache={}

Device set to use cpu


In [347]:
def smart_llm_invoke(prompt):
  """Try Gemini first, fallback to local LLM if it fails"""
  try:
      response = smart_llm_invoke([HumanMessage(content=prompt)])
      return response.content.strip()
  except Exception as e:
      print(f"Using Local llm...")
      response = local_llm.invoke(prompt)  # Remove the [HumanMessage(...)] wrapper
      return response.strip()

In [348]:
# a basic function to get all the info related to a stock
def get_stock_price(ticker: str):
  try:
    stock = yf.Ticker(ticker)
    info = stock.info
    history = stock.history(period='1y')

    if not info or not info.get('symbol') or history.empty:
      return {"error": f"No data found or invalid ticker for {ticker}"}

    return{
            'ticker': ticker,
            'current_price': info.get('currentPrice', 0),
            'previous_close': info.get('previousClose', 0),
            'day_high': info.get('dayHigh', 0),
            'day_low': info.get('dayLow', 0),
            'volume': info.get('volume', 0),
            'market_cap': info.get('marketCap', 0),
            'company_name': info.get('longName', ticker),
            'pe_ratio': info.get('trailingPE', 0),
            'dividend_yield': info.get('dividendYield', 0),
            'target_mean_price': info.get('targetMeanPrice', 0),
            'recommendation_key': info.get('recommendationKey', 'N/A'),
            '50_day_average': history['Close'].rolling(window=50).mean().iloc[-1] if len(history) >= 50 else 0,
            '200_day_average': history['Close'].rolling(window=200).mean().iloc[-1] if len(history) >= 200 else 0,
            'price_history': history['Close'].tail(30).to_list()
        }
  except Exception as e:
        return {"error": f"Failed to fetch data for {ticker}: {str(e)}"}

In [349]:
# a function which accesses a pandas data frame and gets the latest stock price of the company-'item' here
def get_recent_data(df, items):
  if df.empty:
    return {}
  recent_data={}
  for item in items:
    try:
      if item in df.index:
          value = df.loc[item].iloc[0]
          recent_data[item] = float(value) if value is not None else 0
      else:
          recent_data[item] = 0
    except:
      recent_data[item] = 0


In [350]:
# a function which basically gets the latest info (listed below) from yahoo finance, uses the period here to know how long to look for
def fetch_financial_statements(ticker: str, period: str)->dict:
  try:
    stock=yf.Ticker(ticker)
    if not stock.info or not stock.info.get('symbol'):
      return {'error': f'Invalid or no data for ticker {ticker}'}

    if period=='quaterly':
      balance_sheet=stock.quarterly_balance_sheet
      income_statement=stock.quarterly_income_stmt
      cash_flow=stock.quarterly_cashflow
    else:
      balance_sheet=stock.balance_sheet
      income_statement=stock.income_stmt
      cash_flow=stock.cashflow
    return{
        'balance_sheet': get_recent_data(balance_sheet, ['Total Cash', 'Total Debt']),
        'income_statement': get_recent_data(income_statement, ['Total Revenue', 'Gross Profit']),
        'cash_flow': get_recent_data(cash_flow, ['Net Cash Flow']),
        'period': period
    }
  except Exception as e:
    return {"error": f"Failed to fetch data for {ticker}: {str(e)}"}

In [351]:
# a simple validate function that cross checks with yf to see of the ticker exist
def validate_ticker(potential_ticker: str):
  try:
    stock = yf.Ticker(potential_ticker)
    info = stock.info
    if info and 'symbol' in info and info.get('symbol'):
      return info['symbol']
    return None
  except Exception as e:
    return None

In [352]:
def analyze_sentiment(text: str):
    if not text:
        return 0.0

    prompt = f"""Analyze the sentiment of this financial news text.
    Return ONLY a number between -1 (very negative) and 1 (very positive).

    Text: {text}
    sentiment:"""

    try:
        response = smart_llm_invoke([HumanMessage(content=prompt)])
        sentiment_score = float(response.strip())
        return max(-1, min(1, sentiment_score))
    except:
        pass

    # Fallback to keyword-based analysis
    positive_words = [
        'gain', 'gains', 'up', 'rise', 'surge', 'rally', 'jump', 'soar',
        'climb', 'boost', 'profit', 'growth', 'outperform', 'beat', 'strong',
        'bullish', 'optimistic', 'confidence', 'upgrade', 'buy', 'positive'
    ]

    negative_words = [
        'loss', 'down', 'drop', 'fall', 'decline', 'plunge', 'crash',
        'tumble', 'slump', 'miss', 'disappoint', 'weak', 'bearish',
        'pessimistic', 'concern', 'worry', 'fear', 'downgrade', 'sell'
    ]

    intensifiers = ['very', 'extremely', 'highly', 'significantly']
    negations = ['not', 'no', 'never', "don't", "doesn't", "won't"]

    words = text.lower().split()
    positive_count = 0
    negative_count = 0

    for i, word in enumerate(words):
        # Check for intensifiers
        multiplier = 1.5 if i > 0 and words[i-1] in intensifiers else 1.0

        # Check for negations (flip sentiment)
        is_negated = i > 0 and words[i-1] in negations

        if word in positive_words:
            if is_negated:
                negative_count += multiplier
            else:
                positive_count += multiplier
        elif word in negative_words:
            if is_negated:
                positive_count += multiplier
            else:
                negative_count += multiplier

    if positive_count + negative_count == 0:
        return 0.0

    return (positive_count - negative_count) / (positive_count + negative_count)

In [353]:
# the heart of the model that connects the model through a RAG pipeline
class NewsRAG:
  def __init__(self, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    self.embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model
    )
    self.text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )
    self.vectorstore = None
  def index_news(self, news_articles: List[Dict]):
    """
    news_articles example:
    {
        "title": "...",
        "content": "...",
        "link": "...",
        "date": "...",
        "source": "..."
    }
    """
    documents = []
    for article in news_articles:
        content = article.get("content", "") or article.get("title", "")
        header=f"Source: {article.get('source')}| Date: {article.get('date')}"
        metadata = {
            "title": article.get("title", ""),
            "link": article.get("link", ""),
            "date": article.get("date", ""),
            "source": article.get("source", "")
        }

        documents.append(
            Document(
                page_content=content,
                metadata=metadata
            )
        )
    if documents:
      splits = self.text_splitter.split_documents(documents)
      self.vectorstore = Chroma.from_documents(
          documents=splits,
          embedding=self.embeddings
      )

  def retrieve_context(self, query: str, k: int = 5) -> List[Dict]:
    if self.vectorstore is None:
      return []
    docs = self.vectorstore.similarity_search(query, k=k)
    return self.__distill_context(query, docs)

  def __distill_context(self, query: str, docs: List[Document]) -> List[Dict]:
    raw_text="\n--\n".join([d.page_content for d in docs])
    distil_prompts=f"""
        Extract ONLY the facts from the news snippets below that directly answer the query: "{query}"
        If the snippets are irrelevant, return "No relevant news found."

        Snippets:
        {raw_text}

        Key Facts:
        """
    response = smart_llm_invoke(distil_prompts)
    return response.strip().split("\n--\n")

In [354]:
class AgentState(TypedDict):
    query: str
    ticker: str
    intent: str
    price_data: dict
    financial_data: dict
    news_articles: list  # Remove Annotated
    news_context: list   # Remove Annotated
    sentiment_score: float
    analysis: str
    recommendation: str
    messages: list       # Remove Annotated

In [355]:
def intent_classifier(state: AgentState):
    query = state['query']
    query_lower = query.lower()

    # Check for specific milestone questions (like "when did X hit Y")
    if any(word in query_lower for word in ['when did', 'when was']) and any(word in query_lower for word in ['hit', 'reach', 'achieve', 'cross']):
        intent = 'milestone_query'
    elif any(word in query_lower for word in ['why', 'reason', 'cause']):
        intent = 'reason_query'
    elif any(word in query_lower for word in ['when', 'trend', 'history']):
        intent = 'trend_analysis'
    elif any(word in query_lower for word in ['compare', 'vs', 'versus']):
        intent = 'comparison'
    elif any(word in query_lower for word in ['price', 'cost', 'trading at']):
        intent = 'price_query'
    else:
        intent = 'general'

    state['intent'] = intent
    state['messages'].append(f"[Intent Classifier] Intent: {intent}")
    return state

In [356]:
def ticker_extractor(state: AgentState):
  query=state['query']
  ticker_pattern= r'\b[A-Z]{1,5}\b'
  matches=re.findall(ticker_pattern, query)

  for match in matches:
    if validate_ticker(match):
      state['ticker']=match
      state['messages'].append(f"[Ticker Extractor] Ticker: {match}")
      return state

  prompt = f"""Extract the company name or stock ticker from this query.
    Return ONLY the ticker symbol (e.g., AAPL, MSFT) or company name.
    Query: {query}
    Answer:"""

  response=smart_llm_invoke([HumanMessage(content=prompt)])
  potential_ticker = response.strip().upper()

  if len(potential_ticker) > 5:
    search_query = f"{potential_ticker} stock ticker symbol"
    search_result = search_tool.run(search_query)
    ticker_match = re.search(r'\(([A-Z]{1,5})\)', search_result)
    if ticker_match:
        potential_ticker = ticker_match.group(1)

  ticker = validate_ticker(potential_ticker)
  if ticker:
    state['ticker']=ticker
    state['messages'].append(f"[Ticker Extractor] Ticker: {ticker}")
  else:
    state['ticker'] = ""
    state['messages'].append(f"[Ticker Extractor] Could not find valid ticker")
  return state

In [357]:
def price_fetcher(state: AgentState):
  ticker=state['ticker']
  if ticker:
    price_data=get_stock_price(ticker)
    state['price_data']=price_data
    state['messages'].append(f"[Price Fetcher] Price Data: {price_data}")
  else:
    state['price_data']={}
    state['messages'].append(f"[Price Fetcher] No price data found")
  return state

In [358]:
def financial_fetcher(state: AgentState):
  ticker=state['ticker']
  intent=state['intent']
  if intent in ['general', 'comparison'] and ticker:
    financial_data=fetch_financial_statements(ticker, 'annual')
    state['financial_data']=financial_data
    state['messages'].append(f"[Financial Fetcher] Financial Data: {financial_data}")
  else:
    state['financial_data']={}
    state['messages'].append(f"[Financial Fetcher] No financial data found")
  return state

In [359]:
def news_fetcher(state: AgentState):
    ticker = state['ticker']
    if not ticker:
        return state

    try:
        company_name = state['price_data'].get('company_name', ticker)
        google_news = GNews(language='en', period='7d', max_results=10)
        news = google_news.get_news(f"{company_name} stock")

        articles = []
        for item in news:
            articles.append({
                'title': item.get('title', ''),
                'content': item.get('description', ''),  # Changed from 'desc'
                'link': item.get('url', ''),
                'date': item.get('published date', ''),
                'source': item.get('publisher', {}).get('title', '') if isinstance(item.get('publisher'), dict) else item.get('publisher', '')
            })

        state['news_articles'] = articles
        state['messages'].append(f"[News Fetcher] Found {len(articles)} articles")

    except Exception as e:
        state['news_articles'] = []
        state['messages'].append(f"[News Fetcher] Error: {str(e)}")

    return state

In [360]:
def news_analyzer(state: AgentState):
  news_articles=state['news_articles']
  if not news_articles:
    state['news_context']=[]
    state['messages'].append(f"[News Analyzer] No news articles to analyze")
    return state
  rag=NewsRAG()
  rag.index_news(news_articles)

  context=state['news_context']
  if not context:
    context=rag.retrieve_context(state['query'])
  state['news_context']=context

  sentiment=analyze_sentiment("\n--\n".join(context))
  state['sentiment_score']=sentiment

  state['messages'].append(f"[News Analyzer] Sentiment Score: {sentiment}")
  return state

In [361]:
def response_generator(state: AgentState):
    """Generate final response based on all gathered data"""
    query = state['query']
    ticker = state['ticker']
    intent = state['intent']
    price_data = state['price_data']
    news_context = state['news_context']
    news_articles = state['news_articles']
    sentiment = state['sentiment_score']

    # Build context for LLM
    context_parts = []

    if price_data and 'error' not in price_data:
        context_parts.append(f"""
Stock Data for {ticker}:
- Current Price: ${price_data.get('current_price', 0):.2f}
- Previous Close: ${price_data.get('previous_close', 0):.2f}
- Day High/Low: ${price_data.get('day_high', 0):.2f} / ${price_data.get('day_low', 0):.2f}
- 50-day Average: ${price_data.get('50_day_average', 0):.2f}
- Recommendation: {price_data.get('recommendation_key', 'N/A')}
""")

    if news_context:
        context_parts.append(f"Recent News Context:\n" + "\n".join(news_context))

    context_parts.append(f"Overall Sentiment Score: {sentiment:.2f} (-1 = very negative, 1 = very positive)")

    full_context = "\n\n".join(context_parts)

    # Create prompt based on intent
    if intent == 'reason_query':
        prompt = f"""You are a financial analyst. Based on the data below, explain WHY {ticker} stock moved recently.

{full_context}

User Question: {query}

Instructions:
- Write in clear, professional prose (not bullet points)
- Cite specific events or news
- Be concise (2-3 sentences max)
- Do NOT copy/paste news headlines

Answer:"""

    elif intent == 'trend_analysis':
        prompt = f"""You are a financial analyst. Based on the data below, analyze WHEN and HOW {ticker} has been trending.

{full_context}

User Question: {query}

Instructions:
- Answer the specific timing question asked
- Reference price movements and dates if available
- Be concise (2-3 sentences max)
- Write in clear prose, not headlines

Answer:"""

    elif intent == 'price_query':
        prompt = f"""Based on the stock data below, answer this price question about {ticker}.

{full_context}

User Question: {query}

Instructions:
- Give the specific price information requested
- Be direct and accurate
- Keep it brief (1-2 sentences)

Answer:"""

    elif intent == 'milestone_query':
      prompt = f"""Answer this question about {ticker} stock using the data below.

Current market cap: ${price_data.get('market_cap', 0):,.0f}

Question: {query}

News headlines:
{chr(10).join([f"- {a.get('title', '')}" for a in news_articles[:5]])}

Answer the question directly. If the milestone isn't mentioned in the news, say "No recent news about this milestone."

Answer:"""

    else:
        prompt = f"""Based on the stock data and news below, answer this question about {ticker}.

{full_context}

User Question: {query}

Instructions:
- Provide a helpful, accurate response
- Write in clear prose
- Be concise

Answer:"""

    # Get response from LLM
    response_text = smart_llm_invoke(prompt)

    # Add sources if news articles exist
    if news_articles and len(news_articles) > 0:
        response_text += "\n\n**Sources:**"
        for i, article in enumerate(news_articles[:5], 1):
            response_text += f"\n{i}. [{article.get('title', 'No title')}]({article.get('link', '#')}) - {article.get('source', 'Unknown')} ({article.get('date', 'No date')})"

    state['analysis'] = response_text

    return state

In [362]:
def build_workflow():
    workflow = StateGraph(AgentState)


    workflow.add_node("intent_classifier", intent_classifier)
    workflow.add_node("ticker_extractor", ticker_extractor)
    workflow.add_node("price_fetcher", price_fetcher)
    workflow.add_node("financial_fetcher", financial_fetcher)
    workflow.add_node("news_fetcher", news_fetcher)
    workflow.add_node("news_analyzer", news_analyzer)
    workflow.add_node("response_generator", response_generator)

    workflow.add_edge(START, "intent_classifier")
    workflow.add_edge("intent_classifier", "ticker_extractor")
    workflow.add_edge("ticker_extractor", "price_fetcher")
    workflow.add_edge("price_fetcher", "financial_fetcher")
    workflow.add_edge("financial_fetcher", "news_fetcher")
    workflow.add_edge("news_fetcher", "news_analyzer")
    workflow.add_edge("news_analyzer", "response_generator")
    workflow.add_edge("response_generator", END)

    return workflow.compile()
agent = build_workflow()

In [363]:
def query_stocks(user_query: str):
    initial_state = {
        'query': user_query,
        'ticker': '',
        'intent': '',
        'price_data': {},
        'financial_data': {},
        'news_articles': [],  # Start empty
        'news_context': [],
        'sentiment_score': 0.0,
        'analysis': '',
        'recommendation': '',
        'messages': []
    }

    result = agent.invoke(initial_state)

    return {
        'answer': result['analysis'],
        'ticker': result['ticker'],
        'sentiment': result['sentiment_score'],
        'debug_messages': result['messages']
    }

In [364]:
result1 = query_stocks("When did Microsoft go up?" )
print(f"Query: When did Microsoft go up?")
print(f"Answer: {result1['answer']}\n")

Using Local llm...
Using Local llm...
Using Local llm...
Using Local llm...
Using Local llm...
Using Local llm...

=== WORKFLOW DEBUG ===
[Intent Classifier] Intent: trend_analysis
[Ticker Extractor] Ticker: MSFT
[Price Fetcher] Price Data: {'ticker': 'MSFT', 'current_price': 465.95, 'previous_close': 450.865, 'day_high': 471.1, 'day_low': 450.6, 'volume': 37533469, 'market_cap': 3463484014592, 'company_name': 'Microsoft Corporation', 'pe_ratio': 33.11656, 'dividend_yield': 0.78, 'target_mean_price': 617.8565, 'recommendation_key': 'strong_buy', '50_day_average': np.float64(480.8758209228516), '200_day_average': np.float64(482.8081169128418), 'price_history': [478.55999755859375, 483.4700012207031, 478.5299987792969, 474.82000732421875, 476.3900146484375, 476.1199951171875, 483.9800109863281, 485.9200134277344, 484.9200134277344, 486.8500061035156, 488.0199890136719, 487.7099914550781, 487.1000061035156, 487.4800109863281, 483.6199951171875, 472.94000244140625, 472.8500061035156, 478.5