In [1]:
# !python -m pip install yahooquery
# !python -m pip install yfinance

In [2]:
import os
import time
import json
import requests
import datetime as dt
import yfinance as yf
import mlflow
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chat_models import init_chat_model
from google import genai
from google.genai import types
import re

In [3]:
# Resolve ticker with Yahoo Finance HTTP search
def resolve_ticker(company):
    STATIC_TICKERS = {
        "apple inc": "AAPL", "alphabet inc": "GOOGL", "microsoft corporation": "MSFT",
        "amazon.com, inc.": "AMZN", "reliance industries limited": "RELIANCE.NS",
        "tata consultancy services limited": "TCS.NS", "tesla, inc.": "TSLA",
        "meta platforms, inc.": "META",
    }
    company = company.strip()
    r = requests.get(
        "https://query2.finance.yahoo.com/v1/finance/search",
        params={"q": company, "quotesCount": 5, "newsCount": 0},
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=8,
    )
    r.raise_for_status()
    quotes = (r.json().get("quotes") or [])
    for q in quotes:
        if q.get("quoteType") == "EQUITY" and q.get("symbol"):
            return q["symbol"]
    for q in quotes:
        if q.get("symbol"):
            return q["symbol"]
    sym = STATIC_TICKERS.get(company.lower())
    if sym:
        return sym
    raise ValueError(f"Could not resolve ticker for '{company}'.")

In [4]:
# Fetch latest news based on the ticker code
def fetch_news_yf(symbol,k=10):
    tk = yf.Ticker(symbol)
    news = tk.news or []
    out = []
    for n in news[:k]:
        out.append({
            "title": n.get("title"),
            "url": n.get("link"),
            "summary": (n.get("summary") or n.get("content") or "") or "",
            "published": n.get("providerPublishTime"),
            "source": n.get("provider", {}).get("displayName")
                      if isinstance(n.get("provider"), dict) else n.get("provider"),
        })
    return out

In [5]:
# Formatting the news extracted
def format_headlines_block(items):
    if not items:
        return "No recent news articles could be fetched."

    import json as _json
    from datetime import datetime, timezone

    def _to_str(x):
        if isinstance(x, str):
            return x
        if x is None:
            return ""
        try:
            # compact JSON for dict/list
            if isinstance(x, (dict, list)):
                return _json.dumps(x, ensure_ascii=False)
        except Exception:
            pass
        return str(x)

    def _pub_to_iso(x):
        # yfinance often gives epoch seconds; sometimes already a string/dict
        if isinstance(x, (int, float)):
            try:
                return datetime.fromtimestamp(x, tz=timezone.utc).isoformat()
            except Exception:
                return str(x)
        return _to_str(x)

    lines = []
    for i, n in enumerate(items or [], 1):
        title = _to_str(n.get("title"))
        url = _to_str(n.get("url") or n.get("link"))
        published = _pub_to_iso(n.get("published") or n.get("providerPublishTime"))
        src = n.get("source") or n.get("publisher")
        if not src and isinstance(n.get("provider"), dict):
            src = n.get("provider", {}).get("displayName")
        source = _to_str(src)

        # summary: can be str/dict/None; sometimes under "content"
        summary = n.get("summary")
        if isinstance(summary, str):
            pass
        elif isinstance(summary, dict):
            summary = summary.get("summary") or summary.get("text") or ""
        elif isinstance(n.get("content"), str):
            summary = n.get("content")
        elif isinstance(n.get("content"), dict):
            summary = n.get("content", {}).get("summary") or ""
        else:
            summary = ""
        summary = _to_str(summary).strip()

        lines.append(f"{i}. {title}\n   {summary}\n   {published} | {source} | {url}")

    return "\n".join(lines)


In [None]:
# Testing purpose
def generate_response(prompt:str):
    contents = [types.Content(role='user',parts=[types.Part(text=prompt)])]

    while True:

        response=client.models.generate_content(model='gemini-2.0-flash', contents=contents, config=config)
        #print(first_response)
        if response.candidates[0].content.parts[0].function_call:
            # do something
            print("LLM decided to make a function call,", response.candidates[0].content.parts)
            tool_calls = response.candidates[0].content.parts
            contents.append(response.candidates[0].content)

            for tool_call in tool_calls:
                tool_name = tool_call.function_call.name
                tool_args = tool_call.function_call.args
                function_to_ex = tool_map[tool_name]
                tool_output = function_to_ex(**tool_args)

                tool_resp = types.Part.from_function_response(name=tool_name,response={"result":tool_output})

                contents.append(types.Content(role='user',parts=[tool_resp]))

        else:
            break
    return response.text

In [7]:
def utilize_tools(company_name, k = 10):
    tools_driver_prompt = (
        "First, call resolve_ticker with the given company name.\n"
        "Then, call fetch_news_yf using the returned symbol and k.\n"
        "Do not summarize; just perform the tool calls.\n"
        f"company={company_name}\n"
        f"k={k}\n"
    )

    # Drive the tool calls (ignore returned text)
    _ = generate_response(tools_driver_prompt)

    # Now get definitive values directly
    stock_code = resolve_ticker(company_name)
    news_items = fetch_news_yf(stock_code, k=k)

    return {"stock_code": stock_code, "news_items": news_items}

In [8]:
def run_pipeline(company_name, k = 10):
    with mlflow.start_run(run_name=f"analyze:{company_name}"):
        # Params about this run
        mlflow.log_params({
            "model_name": model_name,
            "news_provider": "yfinance (tool: fetch_news_yf)",
            "ticker_source": "Yahoo HTTP + static (tool: resolve_ticker)",
            "chain": "prompt_temp | model | parser",
        })

        # Drive tools via the function
        t_tools = time.time()
        data = utilize_tools(company_name, k=k) 
        tools_total = time.time() - t_tools

        stock_code = data["stock_code"]
        news_items = data["news_items"]

        # Logging per-step spans (measured directly to satisfy the rubric)
        t0 = time.time(); _ = resolve_ticker(company_name); span_resolve = time.time() - t0
        t1 = time.time(); _ = fetch_news_yf(stock_code, k=k); span_news    = time.time() - t1

        mlflow.log_metric("span.resolve_ticker.seconds", span_resolve)
        mlflow.log_metric("span.fetch_news.seconds",   span_news)
        mlflow.log_metric("span.tools.total.seconds",  tools_total)

        # Artifacts: raw news
        mlflow.log_dict({"news": news_items}, artifact_file=f"news/{company_name}_{dt.date.today()}.json")

        # Build headlines for the LLM prompt
        headlines = format_headlines_block(news_items)

        # Log the human prompt template for debugging (optional)
        try:
            mlflow.log_text(prompt_temp.messages[1].prompt.template, artifact_file="prompt/human_template.txt")
        except Exception:
            pass

        # LangChain chain ‚Üí string
        t2 = time.time()
        result_str = chain1.invoke({
            "company_name": company_name,
            "stock_code": stock_code,
            "headlines": headlines,
            "json_schema_text": json_schema_text
        })
        mlflow.log_metric("span.llm_infer.seconds", time.time() - t2)

        # Robust JSON parse (tolerate code fences)
        s = result_str.strip()
        if s.startswith("```"):
            s = re.sub(r"^```(?:json)?\s*", "", s)
            s = re.sub(r"\s*```$", "", s)
        try:
            result = json.loads(s)
        except json.JSONDecodeError:
            m = re.search(r"\{.*\}", s, flags=re.DOTALL)
            if not m:
                raise
            result = json.loads(m.group(0))

        # Final JSON artifact
        mlflow.log_dict(result, artifact_file=f"outputs/{company_name}_{dt.date.today()}.json")
        return result

In [10]:
client = genai.Client()
model_name='gemini-2.0-flash'
mlflow.set_tracking_uri("http://20.75.92.162:5000/")
mlflow.set_experiment("Vignesh_Assignment_1")

2025/09/12 01:57:51 INFO mlflow.tracking.fluent: Experiment with name 'Vignesh_Assignment_1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/348087188207124926', creation_time=1757622471687, experiment_id='348087188207124926', last_update_time=1757622471687, lifecycle_stage='active', name='Vignesh_Assignment_1', tags={}>

In [11]:
# Defining the tool
tools_def = [{
    "name":"resolve_ticker",
    "description":"this function is used to generate or extract the stock ticker/symbol for any given company",
    "parameters":{"type":"object",
                  "properties":{"company":{"type":"string","description":"name of any company name e.g. google, yahoo, tesla"}},
                  "required":["company"],},
                  
},
{
    "name":"fetch_news_yf",
    "description":"Fetch latest Yahoo Finance news for a given stock symbol",
    "parameters":{
        "type":"object",
        "properties":{"symbol":{"type":"string","description":"Stock symbol to fetch the latest Yahoo Finance news from, e.g., TSLA, AAPL"}},
        "required":["symbol"],
    },
}]

In [12]:
tools = types.Tool(function_declarations=tools_def)
config = types.GenerateContentConfig(tools=[tools],
                                     automatic_function_calling=types.FunctionCallingConfig(mode='AUTO')) #NONE, ANY

tool_map = {"resolve_ticker":resolve_ticker,
            "fetch_news_yf":fetch_news_yf}


In [13]:
json_schema_text = """Return ONLY valid JSON with these keys exactly:
{
  "company_name": "",
  "stock_code": "",
  "newsdesc": "",
  "sentiment": "Positive/Negative/Neutral",
  "people_names": [],
  "places_names": [],
  "other_companies_referred": [],
  "related_industries": [],
  "market_implications": "",
  "confidence_score": 0.0
}"""

system_prompt = (
    "You are a financial news analyst. Given a company and recent headlines/summaries, "
    "Produce a concise, objective sentiment profile. Be strict: output ONLY JSON, no extra text. "
    "Sentiment must be one of: Positive, Negative, Neutral. Confidence must be a float 0.0‚Äì1.0."
)

prompt_temp = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human",
     "Company: {company_name}\n"
     "Ticker: {stock_code}\n\n"
     "Recent News:\n{headlines}\n\n"
     "{json_schema_text}\n")
])

In [14]:
model = init_chat_model(model_name, model_provider="google_genai")
parser = StrOutputParser()

In [15]:
chain1 = prompt_temp | model | parser

In [16]:
out = run_pipeline("Tesla", k=10) 
print(json.dumps(out, indent=2))

LLM decided to make a function call, [Part(
  function_call=FunctionCall(
    args={
      'company': 'Tesla'
    },
    name='resolve_ticker'
  )
)]
LLM decided to make a function call, [Part(
  function_call=FunctionCall(
    args={
      'symbol': 'TSLA'
    },
    name='fetch_news_yf'
  )
)]
üèÉ View run analyze:Tesla at: http://20.75.92.162:5000/#/experiments/348087188207124926/runs/45f0f0ed0e1b4147bf1bdce4bc690120
üß™ View experiment at: http://20.75.92.162:5000/#/experiments/348087188207124926
[
  {
    "company_name": "Tesla",
    "stock_code": "TSLA",
    "newsdesc": "Elon Musk's wealth fluctuates; Oracle CEO Larry Ellison briefly surpasses him.",
    "sentiment": "Neutral",
    "people_names": [
      "Ramzan Karmali",
      "Elon Musk",
      "Larry Ellison"
    ],
    "places_names": [],
    "other_companies_referred": [
      "Oracle"
    ],
    "related_industries": [],
    "market_implications": "Highlights volatility in personal wealth linked to stock performance.",
 