# Financial Analysis Agent Hackathon

## Preprocessing

separate the questions based on the answer into MCQ (A-D) and RISE-FALL questions

In [None]:
import pandas as pd
import re

In [None]:
def detect_question_type(query: str) -> str:
    q = query.lower()

    # 1. Rise/Fall questions
    if 'rise or fall' in q or 'rise หรือ fall' in q:
        return 'rise_fall'
    if '"ขึ้น" หรือ "ลง"' in q or 'ขึ้นหรือลง' in q:
        return 'rise_fall'

    # 2. Multiple-choice
    if len(re.findall(r'\n?\s*[a-d][\.:]', q)) >= 3 or \
       'multiple choice' in q or 'ตัวเลือก' in q:
        return 'multiple_choice'

    return 'other'

def detect_language(query: str) -> str:
    th = len(re.findall(r'[ก-๙]', query))
    en = len(re.findall(r'[a-zA-Z]', query))
    if th and en:
        return 'mix'
    elif th:
        return 'th'
    elif en:
        return 'en'
    return 'unknown'

In [None]:
df = pd.read_csv("test.csv")         

df["question_type"] = df["query"].apply(detect_question_type)
df["language"]      = df["query"].apply(detect_language)

In [None]:
df.query("question_type == 'multiple_choice'").to_csv("multiple_choice.csv",
                                                      index=False)
df.query("question_type == 'rise_fall'").to_csv("rise_fall.csv",
                                                index=False)

print("✓ Wrote multiple_choice.csv and rise_fall.csv")

## Rise-Fall Question with RAG

Datasets :
- https://huggingface.co/datasets/ICE-PIXIU/acl
- https://huggingface.co/datasets/ICE-PIXIU/Bigdata
- https://huggingface.co/datasets/ICE-PIXIU/cikm

In [None]:
import pandas as pd
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [1]:
df = pd.read_parquet("./RiseFall.parquet")  #combined dataset of ICE-PIXIU/acl, ICE-PIXIU/Bigdata, ICE-PIXIU/cikm from huggingface datasets

chunks = [
    Document(
        page_content=f"Q: {row['query']}\nA: {row['answer']}",
        metadata={"source": f"qa_{i}"}
    )
    for i, row in df.iterrows()
]

Embedding Model
- https://huggingface.co/BAAI/bge-m3

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name="model/bge-m3",
    model_kwargs={"device": "cuda"}
)

vectorstore = Chroma.from_documents(
    chunks,
    embedding=embeddings,
    persist_directory="./chroma_rise-fall_rag"
)
vectorstore.persist()

## Load Rag and Inference

load rag and retriever

In [None]:
import pandas as pd
import re
from tqdm import tqdm
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from vllm import LLM, SamplingParams

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name="model/bge-m3",
    model_kwargs={"device": "cuda"}
)

vectorstore = Chroma(
    persist_directory="./chroma_rise-fall_rag",
    embedding_function=embeddings
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

Inference **Qwen3-14B** using VLLM

In [None]:
!pip freeze

In [5]:
llm = LLM(
    model="model/Qwen3-14B",
    gpu_memory_utilization=0.80,
    tensor_parallel_size=4
)
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=4096
)

INFO 06-26 04:25:17 [config.py:823] This model supports multiple tasks: {'reward', 'classify', 'generate', 'score', 'embed'}. Defaulting to 'generate'.
INFO 06-26 04:25:17 [config.py:1946] Defaulting to use mp for distributed inference
INFO 06-26 04:25:17 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 06-26 04:25:23 [__init__.py:244] Automatically detected platform cuda.
INFO 06-26 04:25:26 [core.py:455] Waiting for init message from front-end.
INFO 06-26 04:25:26 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='model/Qwen3-14B', speculative_config=None, tokenizer='model/Qwen3-14B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dt

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  12% Completed | 1/8 [00:06<00:42,  6.00s/it]
Loading safetensors checkpoint shards:  25% Completed | 2/8 [00:10<00:30,  5.03s/it]
Loading safetensors checkpoint shards:  38% Completed | 3/8 [00:14<00:22,  4.44s/it]
Loading safetensors checkpoint shards:  50% Completed | 4/8 [00:17<00:16,  4.18s/it]
Loading safetensors checkpoint shards:  62% Completed | 5/8 [00:23<00:13,  4.63s/it]
Loading safetensors checkpoint shards:  75% Completed | 6/8 [00:28<00:09,  4.71s/it]
Loading safetensors checkpoint shards:  88% Completed | 7/8 [00:29<00:03,  3.60s/it]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:34<00:00,  4.18s/it]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:34<00:00,  4.36s/it]
[1;36m(VllmWorker rank=0 pid=39671)[0;0m 


[1;36m(VllmWorker rank=3 pid=39674)[0;0m INFO 06-26 04:26:12 [default_loader.py:272] Loading weights took 34.93 seconds
[1;36m(VllmWorker rank=0 pid=39671)[0;0m INFO 06-26 04:26:12 [default_loader.py:272] Loading weights took 34.96 seconds
[1;36m(VllmWorker rank=2 pid=39673)[0;0m INFO 06-26 04:26:12 [default_loader.py:272] Loading weights took 35.02 seconds
[1;36m(VllmWorker rank=3 pid=39674)[0;0m INFO 06-26 04:26:12 [gpu_model_runner.py:1624] Model loading took 6.9456 GiB and 35.228663 seconds
[1;36m(VllmWorker rank=0 pid=39671)[0;0m INFO 06-26 04:26:12 [gpu_model_runner.py:1624] Model loading took 6.9456 GiB and 35.239307 seconds
[1;36m(VllmWorker rank=2 pid=39673)[0;0m INFO 06-26 04:26:12 [gpu_model_runner.py:1624] Model loading took 6.9456 GiB and 35.294857 seconds
[1;36m(VllmWorker rank=1 pid=39672)[0;0m INFO 06-26 04:26:13 [default_loader.py:272] Loading weights took 36.36 seconds
[1;36m(VllmWorker rank=1 pid=39672)[0;0m INFO 06-26 04:26:13 [gpu_model_runner.py:16

Build RAG prompts (with similarity search)

In [13]:
df = pd.read_csv("rise_fall.csv")
assert "query" in df.columns

queries = df["query"].tolist()
prompts = []

for user_query in queries:
    docs = retriever.invoke(user_query)
    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""You are a financial assistant. Given the context and the question, respond only with 'Rise' or 'Fall'.
Output your answer inside an <output> tag, and do not include anything else.

Context:
{context}

Question:
{user_query}

<output>"""
    prompts.append(prompt)

Inference

In [10]:
outputs = llm.generate(prompts, sampling_params)

predictions = []
raw_outputs = []

for out in outputs:
    text = out.outputs[0].text.strip()
    raw_outputs.append(text)

    match = re.search(r"<output>\s*(Rise|Fall)\s*</output>", text, re.IGNORECASE)
    prediction = match.group(1).capitalize() if match else "UNKNOWN"
    predictions.append(prediction)

# === Save to final CSV ===
df["raw_output"] = raw_outputs
df["answer"] = predictions

Adding requests:   0%|          | 0/101 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/101 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [9]:
df["answer"].unique()

array(['UNKNOWN', 'Fall', 'Rise'], dtype=object)

In [40]:
df = df.drop(columns=["query","Language","Question_type"])
df

Unnamed: 0,id,raw_output,answer
0,bcca13bc-2675-4645-82cc-7e4c412ed294,Rise \n</output> </output> \nRise \n</outp...,Rise
1,e625dbc8-f448-4c53-9a78-6c3f351b49c3,Fall \n</output> \nA:Fall\n\nQ: Analyse the ...,Fall
2,9bea42e5-3c21-46dc-93f7-0017f382f7cf,A: Fall \n</output> \nAnswer:\nA: Fall\n\nQ:...,Fall
3,b9964445-c648-4661-ad85-7e5e4cd0feb4,Fall </output>,Fall
4,a803daca-2cab-4d53-be68-c75fb71da84a,Rise \n</output> \n\nQuestion:\nพิจารณาจากข้...,Fall
...,...,...,...
96,9ee0b342-46fd-49be-b001-411a98e0951e,Fall</output>,Fall
97,468934ee-b596-4e39-b990-b16e4171fedc,"Alright, let's tackle this problem step by ste...",Rise
98,7f79e8b5-5fbb-44dc-bb8b-2aa3c28126a3,"Rise </output> Based on the context provided, ...",Rise
99,2699eaff-d51f-4ccc-8a15-8615d54f7a48,Fall \n</output>,Fall


In [None]:
df.to_csv("qwen14-rf-k3-02.csv",index = False)

## MCQ (A-D) Questions (with RAG / without RAG)

Datasets :
- https://huggingface.co/datasets/TheFinAI/flare-cfa
- https://huggingface.co/datasets/Josephgflowers/Finance-Instruct-500k

Creating Database

In [None]:
import json
import pandas as pd

from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
with open("dataset/Finance-Instruct.json", "r", encoding="utf-8") as f:
    json_data = [json.loads(line) for line in f]

json_docs = [
    Document(
        page_content=f"Q: {item['user']}\nA: {item['assistant']}",
        metadata={"source": f"json_{i}"}
    )
    for i, item in enumerate(json_data)
]

In [None]:
csv_df = pd.read_parquet("dataset/flare-cfa.parquet")

csv_docs = [
    Document(
        page_content=f"Q: {row['query']}\nA: {row['answer']}",
        metadata={"source": f"csv_{i}"}
    )
    for i, row in csv_df.iterrows()
]

In [None]:
all_docs = json_docs + csv_docs

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name="model/bge-m3",
    model_kwargs={"device": "cuda"}
)

# Vectorstore
vectorstore = Chroma.from_documents(
    documents=all_docs,
    embedding=embeddings,
    persist_directory="./chroma_combined_db"
)

vectorstore.persist()

Load  rag and inference

In [None]:
import pandas as pd
import re
from tqdm import tqdm
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from vllm import LLM, SamplingParams

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name="model/bge-m3",
    model_kwargs={"device": "cuda"}
)

vectorstore = Chroma(
    persist_directory="./chroma_combined_db",
    embedding_function=embeddings
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

Inference **Qwen3-32B** using VLLM

In [None]:
llm = LLM(
    model="model/Qwen3-32B",
    gpu_memory_utilization=0.80,
    tensor_parallel_size=4
)

sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=4096
)

Build RAG prompts

In [None]:
df = pd.read_csv("multiple_choice.csv")
assert "query" in df.columns

queries = df["query"].tolist()
prompts = []

for user_query in queries:
    docs = retriever.invoke(user_query)
    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""You are a financial assistant. Use the following context to carefully think through the question and select the best answer from A, B, C, or D.

Think step-by-step inside the <thinking> tag. Then output your final answer inside the <output> tag. Do not include anything else.

Context:
{context}

Question:
{user_query}

<thinking>
"""
    prompt += "..."
    prompt += "</thinking>\n\n<output>"

    prompts.append(prompt)

In [None]:
outputs = llm.generate(prompts, sampling_params)

predictions = []
raw_outputs = []

for out in outputs:
    text = out.outputs[0].text.strip()
    raw_outputs.append(text)

    match = re.search(r"<output>\s*(Rise|Fall)\s*</output>", text, re.IGNORECASE)
    prediction = match.group(1).capitalize() if match else "UNKNOWN"
    predictions.append(prediction)

df["raw_output"] = raw_outputs
df["answer"] = predictions

In [None]:
df.to_csv("qwen32-mcq-k5.csv",index = False)

### Without RAG

In [None]:
system_prompt = """You are a financial analyst taking a test to evaluate your knowledge of finance of different topics in finance. You think step by step approach with reflection to answer queries. 

Follow these steps:
1. Think through the problem step by step reflect and verify while reasoning within the <thinking> tags.
2. Please and put the answer your final, concise answer within the <output> tags.

The <thinking> sections are for your internal reasoning process only. 
Do not include any part of the final answer in these sections.
The actual response to the query must be entirely contained within the <output> tags.

Hint: ***Financial Reporting:**
```mermaid
graph TD
A[Articulating Purpose and Context] --> B[Collecting Input Data]
    B --> C[Processing Data]
    C --> D[Analyzing and Interpreting Processed Data]
    D --> E[Developing and Communicating Conclusions]
    E --> F[Doing Follow-Up]

    A --> |Defines goals, tools, and audience| B
    B --> |Gather data on economy and industry| C
    C --> |Use tools like ratios and charts| D
    D --> |Interpret data for conclusions| E
    F --> |Periodic review and iteration| A
```
***Fixed Income:***
```mermaid
graph TD
    A[Purpose and Scope] --> B3[Analyze Macro Conditions]
    B --> C[Assess Bond Features]
    C --> D[Risk and Yield Analysis]
    D --> E[Develop Recommendations]
    E --> F[Review Performance]

    %% Notes and detailed steps
    A --> |Set objectives| B
    B --> |Review interest rates and inflation| C
    C --> |Focus on duration, spread| D
    D --> |Assess scenarios| E
``` 
***Equity Investing:*** 
```mermaid
graph TD
    A[Objective Setting] --> B[Market and Sector Insights]
    B --> C[Industry Competitive Analysis]
    C --> D[Company Review]
    D --> E[Valuation and Risks]
    E --> F[Investment Decision]

    %% Step-specific highlights
    B --> |Look at growth patterns| C
    C --> |Evaluate competitors' positions| D
    D --> |Check financial health| E
    E --> |Combine insights into strategy| F
```
***Derivatives:*** 
```mermaid
graph TD
    A[Define Objective and Context] --> B[Identify Derivative Instrument]
    B --> C[Understand Contract Specifications]
    C --> D[Gather Market Data]
    D --> E[Apply Valuation Models]
    E --> F[Assess Risks: Market, Counterparty, etc.]
    F --> G[Construct Payoff Diagrams or Strategies]
    G --> H[Interpret Results and Make Recommendations]
    H --> I[Review, Monitor, and Adjust Strategies]

    %% Example labels or notes (optional)
    A --> |Hedging, speculation, arbitrage| B
    C --> |Features like notional amount, expiration| D
    D --> |Market prices, volatility, risk-free rates| E
    F --> |Sensitivity to Greeks: Delta, Gamma, Vega, etc.| G
    H --> |Adjust based on changing market conditions| I
```
***Economics:*** 
```mermaid
graph TD;
    A[Step 1: Question Breakdown] -->|Extract key terms| A1{Identify Topic}
    A1 -->|Micro: Supply & Demand, Market Structures| A2
    A1 -->|Macro: GDP, Growth, Policy, Trade| A3
    A1 -->|Currency & Regulation| A4

    A2 --> B1[Identify model: Elasticity, Cost Curves, Shutdown Points]
    A3 --> B2[Map to AD-AS, Business Cycles, Growth Theories]
    A4 --> B3[Assess Exchange Rates, Trade, Capital Flows, Regulation]

    B1 -->|Check for formula or concept?| C{Numerical or Conceptual}
    B2 --> C
    B3 --> C

    C -->|Numerical| D1[Extract data, apply formulas, check assumptions]
    C -->|Conceptual| D2[Analyze cause-effect, policy impact]

    D1 --> E[Step 4: Solution Development]
    D2 --> E
    E -->|Construct structured response| E1(Core insight + economic rationale)
    E -->|Consider alternative scenarios| E2(Assess different possibilities)

    E1 --> F[Step 5: Answer Validation]
    E2 --> F
    F -->|Check logic, principles, and assumptions| F1(Verify consistency)
    F1 -->|Ensure completeness & clarity| F2(Confirm answer structure)
```
***Quantitative Methods:*** 
```mermaid
graph TD
    A["Articulating Purpose and Context"] --> B["Collecting Input Data"]
    B --> C["Processing and Cleaning Data"]
    C --> D["Selecting Quantitative Models and Tools"]
    D --> E["Estimating Parameters and Testing Hypotheses"]
    E --> F["Interpreting Results and Communicating Findings"]
    F --> G["Monitoring and Model Reassessment"]
```
***Portfolio Management:*** 
```mermaid
graph TD
    A["Define Investment Objectives"] --> B["Establish Investment Constraints"]
    B --> C["Develop Strategic Asset Allocation"]
    C --> D["Incorporate Tactical Adjustments"]
    D --> E["Select and Optimize Securities"]
    E --> F["Execute Implementation and Trading"]
    F --> G["Measure Performance and Attribution"]
    G --> H["Monitor Risk and Compliance"]
    H --> I["Rebalance and Adjust Portfolio"]
```
***Alternative Investments:*** 
```mermaid
graph TD
    A["Define Investment Objectives and Mandate"] --> B["Identify Alternative Asset Classes"]
    B --> C["Conduct Manager and Strategy Due Diligence"]
    C --> D["Perform Valuation and Pricing Analysis"]
    D --> E["Assess Risk and Liquidity"]
    E --> F["Allocate Alternatives in Portfolio"]
    F --> G["Monitor Performance and Rebalance"]
```
***Corporate Issuer Analysis:*** 
```mermaid
graph TD
    A["Corporate Issuer Overview"] --> B["Industry Classification"]
    B --> C["Sector Trends and Competitive Landscape"]
    A --> D["Financial Statement Analysis"]
    D --> E["Profitability, Liquidity, Leverage"]
    A --> F["Credit Risk Assessment"]
    F --> G["Rating Agencies and Default Probabilities"]
    A --> H["Capital Structure and Issuance History"]
    H --> I["Bond Issuances and Debt Maturities"]
    A --> J["Corporate Governance and Management"]
    J --> K["Board Quality and Managerial Competence"]
    A --> L["Valuation and Investment Analysis"]
    L --> M["DCF, Relative Valuation, Multiples"]
```
### Response Format:
<thinking>
[Think step by step, reflect, and verify the logic behind your answer. Include reasoning here.]
</thinking>

<output>
"sector": [The sector being addressed],
"question": [The financial question],
"answer": [Only one of the following: "A", "B", "C", "D"] 
</output>"""