RAG documents (source CSV files)
- Finance-Instruct-500k.csv
- LONGCOT-Refine-500k.csv


In [None]:
from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
from tqdm import tqdm
import torch

# Load the two RAG source datasets from their local CSV exports
dataset1 = pd.read_csv("./RAG_dataset/Finance-Instruct-500k.csv")
dataset2 = pd.read_csv("./RAG_dataset/LONGCOT-Refine-500k.csv")

# Prepare datasets
# dataset1 names its columns "user"/"assistant"; map them onto the
# "prompt"/"response" schema that is selected from dataset2 below.
df1 = dataset1.rename(columns={"user": "prompt", "assistant": "response"})
df1 = df1[["prompt", "response"]]

df2 = dataset2[["prompt", "response"]]

# Combine vertically
combined_df = pd.concat([df1, df2], ignore_index=True)
print(f"Combined dataset size: {combined_df.shape}")

# Clean the data
combined_df["prompt"] = combined_df["prompt"].str.strip()
combined_df["response"] = combined_df["response"].str.strip()

# `.str.strip()` leaves missing values as NaN. Such rows carry no usable text,
# so drop them (and whitespace-only rows) before anything is embedded, then
# renumber so positional row numbers stay contiguous for the index built later.
has_text = (
    combined_df["prompt"].fillna("").ne("")
    & combined_df["response"].fillna("").ne("")
)
combined_df = combined_df[has_text].reset_index(drop=True)
print(f"Rows kept after dropping empty prompt/response: {len(combined_df)}")


In [None]:
# 1. Load embedding model
embedder = SentenceTransformer('./all-MiniLM-L6-v2')  # lightweight example

# 2. Encode your prompts as document embeddings
documents = combined_df['prompt'].tolist()  # or combined_df['prompt'] + " " + combined_df['response']
doc_embeddings = embedder.encode(documents, convert_to_numpy=True)

# Item ids in the index are positional row numbers of combined_df, so there
# must be exactly one embedding per row.
assert len(doc_embeddings) == len(combined_df), "expected one embedding per row"

# Build Annoy index
# Read the dimension off the embeddings instead of hard-coding it, so swapping
# the embedding model does not leave a stale value behind.
dim = doc_embeddings.shape[1]
index = AnnoyIndex(dim, 'angular')

for i, vec in enumerate(doc_embeddings):
    index.add_item(i, vec)

index.build(10)  # 10 trees
index.save('annoy_index.ann')



INFERENCE Fin-o1-8B with RAG

In [None]:
# Re-open the index file that the index-building cell saved to disk.
# `dim` is only defined in that earlier cell (in the visible notebook), so it
# has to have been run in this kernel before this cell.
index = AnnoyIndex(dim, 'angular')
index.load('annoy_index.ann')

In [None]:

def retrieve_documents_annoy(query, k=3, threshold=0.7):
    """Return retrieved Q/A pairs for ``query`` as one context string.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level Annoy ``index``. Of the ``k`` nearest neighbours, only
    those whose reported distance is strictly below ``threshold`` are kept.

    Args:
        query: Text to look up.
        k: Number of nearest neighbours requested from the index.
        threshold: Upper bound (exclusive) on the neighbour distance.

    Returns:
        The kept rows of ``combined_df`` rendered as ``"Q: ...\\nA: ..."``
        entries separated by blank lines; an empty string when nothing passes
        the threshold.
    """
    query_vector = embedder.encode([query])[0]
    neighbour_ids, neighbour_dists = index.get_nns_by_vector(
        query_vector, k, include_distances=True
    )

    # Neighbour ids are positional row numbers of combined_df.
    kept_rows = [
        row_id
        for row_id, dist in zip(neighbour_ids, neighbour_dists)
        if dist < threshold
    ]
    hits = combined_df.iloc[kept_rows][['prompt', 'response']]

    return "\n\n".join(
        f"Q: {hit['prompt']}\nA: {hit['response']}"
        for _, hit in hits.iterrows()
    )

In [None]:
def build_rag_prompt(query):
    """Build the retrieval-augmented prompt for a single question.

    Retrieved context (from ``retrieve_documents_annoy`` with ``k=3``) is
    placed under a ``Context:`` heading, between the fixed English/Thai
    answer-format instructions and the whitespace-stripped question.

    Args:
        query: The question text.

    Returns:
        The full prompt string, ending in ``"Answer:"``.
    """
    context = retrieve_documents_annoy(query, k=3)

    pieces = [
        # English instructions
        "🚨 SYSTEM: You are a financial and general knowledge expert.\n",
        "You MUST respond with ONE WORD ONLY: A, B, C, D, E, Rise, or Fall.\n",
        "NO explanations. NO reasoning. NO punctuation. NO greetings. NO repetition. NO commentary.\n",
        "➡️ If the question has options A–E, respond with only one of those letters.\n",
        "➡️ If the question has no options, respond with Rise or Fall based on the data.\n",
        "➡️ Use tweet sentiment (60%) and price data (40%) to decide.\n",
        "If tweet data is missing, use price data only.\n",
        "⚠️ Your response MUST be ONE of: A, B, C, D, E, Rise, Fall.\n",
        "⚠️ ANY other response is incorrect.\n",
        "❌ Do NOT explain. Do NOT generate full sentences. Do NOT elaborate.\n",
        "\n",
        # Thai instructions
        "คุณเป็นผู้เชี่ยวชาญด้านการเงินและความรู้ทั่วไป\n",
        "คุณต้องตอบด้วยคำเดียวเท่านั้น: A, B, C, D, E, Rise หรือ Fall\n",
        "❌ ห้ามเขียนคำอธิบาย ห้ามเขียนประโยค ห้ามใส่เครื่องหมายวรรคตอน ห้ามพูดคุย ห้ามอธิบายเพิ่ม\n",
        "ตอบเฉพาะ A, B, C, D, E, Rise หรือ Fall เท่านั้น\n",
        "\n",
        # Retrieved context, then the question itself
        "Context:\n",
        f"{context}\n\n",
        f"Question: {query.strip()}\n\n",
        "Answer:",
    ]

    return "".join(pieces)

In [None]:
def build_prompt(query):
    """Build the retrieval-free prompt for a single question.

    The prompt consists of the fixed English/Thai answer-format instructions
    followed by the whitespace-stripped question.

    Args:
        query: The question text.

    Returns:
        The full prompt string, ending in ``"Answer:"``.
    """
    pieces = [
        # English instructions
        "🚨 SYSTEM: You are a financial and general knowledge expert.\n",
        "You MUST respond with ONE WORD ONLY: A, B, C, D, E, Rise, or Fall.\n",
        "NO explanations. NO reasoning. NO punctuation. NO greetings. NO repetition. NO commentary.\n",
        "➡️ If the question has options A–E, respond with only one of those letters.\n",
        "➡️ If the question has no options, respond with Rise or Fall based on the data.\n",
        "➡️ Use tweet sentiment (60%) and price data (40%) to decide.\n",
        "If tweet data is missing, use price data only.\n",
        "⚠️ Your response MUST be ONE of: A, B, C, D, E, Rise, Fall.\n",
        "⚠️ ANY other response is incorrect.\n",
        "❌ Do NOT explain. Do NOT generate full sentences. Do NOT elaborate.\n",
        "\n",
        # Thai instructions
        "คุณเป็นผู้เชี่ยวชาญด้านการเงินและความรู้ทั่วไป\n",
        "คุณต้องตอบด้วยคำเดียวเท่านั้น: A, B, C, D, E, Rise หรือ Fall\n",
        "❌ ห้ามเขียนคำอธิบาย ห้ามเขียนประโยค ห้ามใส่เครื่องหมายวรรคตอน ห้ามพูดคุย ห้ามอธิบายเพิ่ม\n",
        "ตอบเฉพาะ A, B, C, D, E, Rise หรือ Fall เท่านั้น\n",
        "\n",
        # The question itself
        f"Question: {query.strip()}\n\n",
        "Answer:",
    ]

    return "".join(pieces)

In [None]:
class ChatPipeline:
    """Wrapper around a causal LM and its tokenizer for greedy, single-prompt generation."""

    def __init__(self, model_name: str):
        """Load the model and tokenizer.

        Args:
            model_name: Model identifier or local path passed to ``from_pretrained``.
        """
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto"
        )
        # NOTE(review): trust_remote_code=True allows tokenizer code shipped with
        # the checkpoint to run; only point this at checkpoints you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )

    def _generate(self, user_prompt: str, max_new_tokens: int, with_scores: bool):
        """Tokenize the prompt, generate greedily, and decode only the new tokens.

        Returns:
            ``(response, scores)`` where ``response`` is the stripped decoded
            continuation and ``scores`` is the per-step score tuple reported by
            ``generate`` when ``with_scores`` is true, otherwise ``None``.
        """
        inputs = self.tokenizer(user_prompt, return_tensors="pt").to(self.model.device)

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=self.tokenizer.eos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            output_scores=with_scores,
            return_dict_in_generate=True
        )

        # The returned sequence starts with the prompt tokens; slice them off.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_ids = outputs.sequences[0][prompt_length:]
        response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        return response.strip(), (outputs.scores if with_scores else None)

    def chat(self, user_prompt: str, max_new_tokens=128) -> tuple[str, float]:
        """Generate a response and a confidence value for it.

        Args:
            user_prompt: Full prompt text fed to the model as-is.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            ``(response, confidence)``. ``confidence`` is the largest softmax
            probability over the vocabulary at the first generated position,
            or ``0.0`` when no scores are available.
        """
        response, scores = self._generate(user_prompt, max_new_tokens, with_scores=True)

        # Confidence comes from the first generated token (most important for single-word answers)
        if scores:
            # Upcast before the softmax so a half-precision model does not yield
            # a coarsely rounded probability; a no-op if already float32.
            first_token_logits = scores[0][0].float()  # first step, first batch item
            confidence = torch.softmax(first_token_logits, dim=-1).max().item()
        else:
            confidence = 0.0

        return response, confidence

    def chat_rag(self, user_prompt: str, max_new_tokens=128) -> str:
        """Generate a response without computing a confidence value.

        Args:
            user_prompt: Full prompt text fed to the model as-is.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The stripped decoded continuation.
        """
        response, _ = self._generate(user_prompt, max_new_tokens, with_scores=False)
        return response

In [None]:
def extract_answer(text: str) -> str:
    """Pull the first answer token out of a model response.

    The first standalone word equal to A, B, C, D, E, Rise or Fall (matched
    case-insensitively) is returned in canonical form: option letters are
    upper-cased, and Rise/Fall are capitalised regardless of how the model
    cased them.

    Note that the first match wins, so a standalone lowercase "a" (e.g. the
    English article) is read as option A.

    Args:
        text: Raw model response.

    Returns:
        One of ``'A'``, ``'B'``, ``'C'``, ``'D'``, ``'E'``, ``'Rise'``,
        ``'Fall'``, or ``'X'`` when no answer token is found.
    """
    match = re.search(r'\b(A|B|C|D|E|Rise|Fall)\b', text.strip(), re.IGNORECASE)
    if match is None:
        return 'X'

    token = match.group(1)
    if len(token) == 1:
        return token.upper()
    # The match is case-insensitive, so normalise "rise"/"FALL"/... here
    # instead of requiring the exact spelling.
    return token.capitalize()

In [None]:
# ======== Load Model ========
model_path = "./Fin-o1-8B"

pipeline = ChatPipeline(model_path)

# ======== Load Dataset ========
test_df = pd.read_csv("./test.csv")

results = []

# First-token probability above which the direct (no-retrieval) answer is
# trusted; at or below it the question is retried with retrieved context.
CONFIDENCE_THRESHOLD = 0.7

# ======== Loop Through Test Set ========
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    qid = row['id']
    query = row['query']

    # First pass: ask the model directly, without retrieved context.
    prompt = build_prompt(query)
    print("\n--- Prompt ---")
    print(prompt)

    response, confidence = pipeline.chat(prompt, max_new_tokens=64)
    print("\n--- Raw Response ---")
    print(repr(response))
    print(f"Confidence: {confidence:.4f}")

    direct_answer = extract_answer(response)

    if confidence > CONFIDENCE_THRESHOLD:
        print("Confidence is high, using direct response.")
        answer = direct_answer
        print("answer:", answer)
    else:
        print("Confidence is low, using RAG pipeline.")
        prompt = build_rag_prompt(query)
        response, confidence = pipeline.chat(prompt, max_new_tokens=64)
        answer = extract_answer(response)
        if answer == 'X':
            # 'X' means no answer token could be parsed and is never a valid
            # label, so keep whatever the direct response yielded instead.
            answer = direct_answer
        print("Final answer:", answer)

    results.append({"id": qid, "Answer": answer})


# ======== Save Results ========
submission_df = pd.DataFrame(results)
submission_df.to_csv("submission.csv", index=False)
print("✅ submission.csv saved.")

