In [None]:
import os
import PyPDF2
import openai
import requests

from library.exportation import export_prompt_response, export_article

In [None]:
def load_pdf_text(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) contribute nothing to the returned string.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract every page before leaving the ``with`` block, i.e. while the
        # underlying file handle is still open.
        page_texts = [pdf_page.extract_text() or "" for pdf_page in pdf_reader.pages]
    return "".join(page_texts)

def generate_answer(context, question, api_key, timeout=120):
    """Ask the chat-completions endpoint ``question`` about ``context``.

    Args:
        context: Text inserted after ``Context:`` in the user message.
        question: Text inserted after ``Question:`` in the user message.
        api_key: Bearer token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.post`` may block indefinitely on a stalled connection.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    model = 'gpt-4o-mini'
    url = "https://api.ohmygpt.com/v1/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a financial assistant."},
            {"role": "user", "content": f"Context: {context} Question: {question}"}
        ],
        "temperature": 0.7
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Surface HTTP failures here; otherwise an error payload without a
    # "choices" key would only show up as an opaque KeyError below.
    response.raise_for_status()
    response_json = response.json()
    return response_json["choices"][0]["message"]["content"]

def generate_summary(answers, api_key, timeout=120):
    """Ask the chat-completions endpoint to write a report from ``answers``.

    Args:
        answers: Text appended to the summary prompt as its context.
        api_key: Bearer token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.post`` may block indefinitely on a stalled connection.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    model = 'gpt-4o-mini'
    url = "https://api.ohmygpt.com/v1/chat/completions"

    summary_prompt = f"""Based on the questions and answers provided. please write a financial report. \n\nThe context is as below \n{answers}"""
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a article writer."},
            {"role": "user", "content": summary_prompt}
        ],
        "temperature": 0.7
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Surface HTTP failures here; otherwise an error payload without a
    # "choices" key would only show up as an opaque KeyError below.
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


In [None]:
# The key is read from the environment so it never has to be written into the
# notebook. When OHMYGPT_API_KEY is unset this falls back to the empty string.
api_key = os.environ.get("OHMYGPT_API_KEY", "")
file_name = '6055.HK'
folder_name = '3_data'
# The PDF is expected at <folder_name>/<file_name>.pdf relative to the
# working directory.
pdf_file_path = os.path.join(folder_name, f'{file_name}.pdf')
pdf_text = load_pdf_text(pdf_file_path)

In [None]:
# NOTE(review): `prompt` is neither defined nor imported in any cell visible
# in this notebook; confirm where it comes from (a fresh kernel raises
# NameError here if it is missing).
# The result is stored under its own name instead of rebinding `prompt`, so
# re-running this cell does not try to call the previously returned value.
question_prompt = prompt()
answers = generate_answer(pdf_text, question_prompt, api_key)
# `aritcle` (sic) is kept because the export cell reads that exact name.
aritcle = generate_summary(answers, api_key)

In [None]:
# Pass the generated report and `file_name` to the imported export helper.
# NOTE(review): `aritcle` (sic) matches the variable assigned in the previous
# cell; rename both together if the typo is fixed.
export_article(file_name, aritcle)

report has been exported to 2_article_log\6055.HK_20241122_0016.txt


In [None]:
# rag_pipeline.py

import requests
from prompt_template import prompt_template
import langchain as lc
from langchain.document_loaders import LocalLoader
from langchain.retrievers import DenseRetriever
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize document loader
# NOTE(review): `loader` is only consumed by `retriever` below, and neither is
# referenced by any function visible in this file — confirm they are needed.
# TODO confirm that `LocalLoader` and `DenseRetriever` exist in the installed
# langchain version. The `path` value is a placeholder.
loader = LocalLoader(
    path="path/to/your/documents",
    file_extensions=[".txt", ".pdf", ".docx"]
)

# Initialize retriever
# NOTE(review): this names a different model ("all-mpnet-base-v2") than the
# `embedding_model` loaded below ("paraphrase-MiniLM-L6-v2").
retriever = DenseRetriever(
    loader=loader,
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Load the pre-trained sentence transformer model
# This module-level instance is the one `embed_text_chunks` and
# `get_most_relevant_chunk` call `.encode` on.
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def chunk_text(text, chunk_size=512, overlap=50):
    """Split ``text`` into character windows of ``chunk_size`` that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of its predecessor.
    Trailing chunks may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared between neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of substrings of ``text`` in order; empty when ``text`` is
        empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Either would make the window never
            advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    step = chunk_size - overlap
    if step <= 0:
        # A non-positive step would never move past the start of the text.
        raise ValueError("overlap must be smaller than chunk_size")
    # Slicing clamps at len(text), so no explicit min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

def embed_text_chunks(chunks):
    """Encode ``chunks`` with the module-level ``embedding_model``.

    The encoder is called with ``convert_to_tensor=True`` and its result is
    returned unchanged.
    """
    return embedding_model.encode(chunks, convert_to_tensor=True)

def get_most_relevant_chunk(question, chunks, embeddings):
    """Return the chunk whose embedding is most similar to ``question``.

    The question is encoded with the module-level ``embedding_model`` and
    compared against ``embeddings`` by cosine similarity.

    Args:
        question: The query string.
        chunks: Sequence of text chunks, in the same order as ``embeddings``.
        embeddings: One embedding row per chunk; either an array-like or a
            tensor-like object exposing ``detach().cpu().numpy()``.

    Returns:
        The element of ``chunks`` with the highest similarity score.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if len(chunks) == 0:
        raise ValueError("chunks must not be empty")

    def _as_numpy(vectors):
        # scikit-learn works on host arrays; tensor-like inputs (possibly on
        # an accelerator device) are copied to a NumPy array first.
        if hasattr(vectors, "detach"):
            vectors = vectors.detach().cpu().numpy()
        return np.asarray(vectors)

    question_embedding = embedding_model.encode([question], convert_to_tensor=True)
    similarity_scores = cosine_similarity(
        _as_numpy(question_embedding), _as_numpy(embeddings)
    )
    # A single query row means the flat argmax is the chunk index.
    most_relevant_idx = int(np.argmax(similarity_scores))
    return chunks[most_relevant_idx]

def generate_response(context, question, api_key, timeout=120):
    """Send a templated prompt to the chat-completions endpoint.

    The user message is built by formatting the imported ``prompt_template``
    with ``context`` and ``question``.

    Args:
        context: Text substituted for ``{context}`` in the template.
        question: Text substituted for ``{question}`` in the template.
        api_key: Bearer token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.post`` may block indefinitely on a stalled connection.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # NOTE(review): confirm the endpoint URL and model identifier against the
    # provider's API before relying on this call.
    model = 'ollama'
    url = "https://api.ollama.ai/v1/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    prompt = prompt_template.format(context=context, question=question)

    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a knowledgeable assistant."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Surface HTTP failures here; otherwise an error payload without a
    # "choices" key would only show up as an opaque KeyError below.
    response.raise_for_status()
    response_json = response.json()
    return response_json["choices"][0]["message"]["content"]

# Main function to load PDF, chunk text, embed chunks, and generate response
def main(pdf_path, question, api_key):
    """Answer ``question`` from the PDF at ``pdf_path`` and print the reply.

    The PDF text is loaded with ``load_pdf_text``, split by ``chunk_text``,
    embedded by ``embed_text_chunks``, and the chunk picked by
    ``get_most_relevant_chunk`` is sent to ``generate_response`` as context.
    Nothing is returned; the answer is written to standard output.
    """
    pieces = chunk_text(load_pdf_text(pdf_path))
    piece_vectors = embed_text_chunks(pieces)
    best_piece = get_most_relevant_chunk(question, pieces, piece_vectors)
    print(generate_response(best_piece, question, api_key))

# Example usage
# Runs the pipeline once with placeholder values; replace the path, question
# and key before executing.
# NOTE(review): in a notebook kernel `__name__` is "__main__", so this guard
# passes when the cell runs and the assignments below rebind the module-level
# `api_key` set in the earlier configuration cell — confirm that is intended.
if __name__ == "__main__":
    pdf_path = "path/to/your/pdf_file.pdf"
    question = "What are the recent advancements in AI research?"
    api_key = "your_api_key"
    
    main(pdf_path, question, api_key)
