In [57]:
import os
import json
from datetime import datetime
import warnings
warnings.simplefilter('ignore')
import requests

import torch
import torchvision
import openai
import PyPDF2
import spacy
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
import transformers
# summarizer = transformers.pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", revision="a4f8f3e")
from library.exportation import export_prompt_response, export_article
from questions import questions
from get_embedding_function import get_embedding_function


In [18]:
def load_pdf_text(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields a falsy value (``None`` or
    an empty string) contribute nothing to the result.
    """
    with open(pdf_path, "rb") as pdf_handle:
        pdf_pages = PyPDF2.PdfReader(pdf_handle).pages
        # Extraction must happen while the file handle is still open.
        return "".join(page.extract_text() or "" for page in pdf_pages)


def embedding(chunks, nlp):
    """Turn each text chunk into a vector with the given spaCy pipeline.

    ``nlp`` is called once per chunk and the ``.vector`` attribute of the
    resulting document is collected. The returned list has one entry per
    chunk, in the same order as ``chunks``.
    """
    return [nlp(piece).vector for piece in chunks]


def summarize_text(text, max_length=100):
    """Summarize ``text`` with a Hugging Face ``summarization`` pipeline.

    The pipeline is created on the first call and cached on the function
    object, so later calls reuse it instead of rebuilding it each time.

    Args:
        text: The text to summarize.
        max_length: Forwarded to the pipeline as ``max_length``.

    Returns:
        The ``summary_text`` field of the first result returned by the
        pipeline.
    """
    summarizer = getattr(summarize_text, "_summarizer", None)
    if summarizer is None:
        # Building the pipeline is the expensive step; do it only once.
        summarizer = transformers.pipeline("summarization")
        summarize_text._summarizer = summarizer
    summary = summarizer(text, max_length=max_length, min_length=0, do_sample=False)
    return summary[0]['summary_text']



In [16]:
# Source report location: <folder_name>/<file_name>.pdf
folder_name = '3_data'
file_name = '6055.HK'
pdf_file_path = os.path.join(folder_name, file_name + '.pdf')

# Full text of the report, all pages concatenated.
pdf_text = load_pdf_text(pdf_file_path)

# Chunking and Embedding for PDF

In [68]:
# spaCy model used to embed both the PDF chunks and, later, the questions.
nlp = spacy.load("en_core_web_md")

# Splitter producing 1000-character chunks with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,  # author's note: without this the kernel died
)

# Break the PDF text into chunks, then embed every chunk.
text_chunks = text_splitter.split_text(pdf_text)
encoded_chunks = embedding(text_chunks, nlp)

# Top-N most similar chunks for each question


In [69]:
# Number of chunks retrieved per question.
top_N = 3
# Maps each question to its top_N most similar PDF chunks, best match first.
top_N_chunks = {}

for q in questions:
    # Embed the question with the same spaCy model used for the chunks.
    q_emb = nlp(q).vector

    # Cosine similarity between the question and every chunk (1-D array).
    similarities = cosine_similarity([q_emb], encoded_chunks).flatten()

    # argsort is ascending, so reverse it before taking the first top_N.
    top_N_idx = similarities.argsort()[::-1][:top_N]

    top_N_text = [text_chunks[chunk_idx] for chunk_idx in top_N_idx]

    top_N_chunks[q] = top_N_text

# Define prompts and the LLM call


In [113]:
def ask_question_prompt(content, question):
    """Build the question-answering prompt sent to the LLM.

    The prompt consists of fixed instructions, then the retrieved
    ``content``, then the ``question``.
    """
    instructions = (
        "You are an financial analyst.\n"
        "Use the following pieces of retrieved context to answer the question. \n"
        "If you don't know the answer, only say 'I don't know.'. Don't repeat the question.\n"
        "Use 3 sentences maximum for question and keep the answer concise.\n"
        "\n"
        "\n"
    )
    context_section = f"Retrieved context:\n{content}\n\n\n"
    question_section = f"Question:\n{question}\n"
    return instructions + context_section + question_section


def write_report_prompt(content):
    """Build the prompt asking the LLM to write a financial analysis report.

    Args:
        content: Text to be combined into the report; it is inserted at
            the end of the prompt.

    Returns:
        The complete prompt string.
    """
    system_prompt = f"""You are a financial report writer. 
    Please combine the text provided and generate a financial analysis report. 
    The report should be about 15 to 20 paragraphs. The report should include the following sections:
    1. Company Overview
    2. Revenue Structure
    3. Profit
    4. Valuation
    5. Summary
    6. Future Outlook
    7. and other details provided.
    
    content is as below:
    {content}
    """
    # Without this return the function yielded None instead of the prompt.
    return system_prompt
def generate_answer(prompt, api_key, model='gpt-4o-mini',
                    url="https://api.ohmygpt.com/v1/chat/completions",
                    timeout=60):
    """Send ``prompt`` to a chat-completions endpoint and return the reply text.

    Args:
        prompt: Text sent as the user message.
        api_key: Bearer token placed in the ``Authorization`` header.
        model: Model name sent in the request body.
        url: Chat-completions endpoint to POST to.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If the JSON body has no ``choices``/``message``/``content`` key.
        IndexError: If ``choices`` is an empty list.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a financial assistant asking questions."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Surface HTTP errors (bad key, rate limit, ...) directly rather than as
    # an opaque KeyError when the error body has no "choices" entry.
    response.raise_for_status()
    response_json = response.json()
    return response_json["choices"][0]["message"]["content"]

In [104]:
# prompt to llm to ask question
all_prompts = []

# combine chunks and questions to string
for question, top_chunks in top_N_chunks.items():
    top_chunks_combined = '.'.join(top_chunks)
    system_prompt = ask_question_prompt(top_chunks_combined, question)
    all_prompts.append((question, system_prompt))

In [114]:
# Only a small slice of the prompts is sent to the API.
sample_prompt = all_prompts[5:7]
sample_prompt_response = []

# Read the API key from the environment so the secret is never stored in the
# notebook. The key that was previously hard-coded here should be treated as
# leaked and revoked.
api_key = os.environ.get("OHMYGPT_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OHMYGPT_API_KEY environment variable before running this cell."
    )

# Collect (question, answer) pairs in the same order as the prompts.
for q, p in sample_prompt:
    response = generate_answer(p, api_key)
    sample_prompt_response.append((q, response))

In [115]:
sample_prompt_response

[('How did the gross profit margin change compared to the previous period?',
  "I don't know."),
 ('What were the operating expenses, and how do they compare to revenue?',
  "I don't know.")]

# Send the summarization prompt to the LLM

In [None]:
summarization_prompt = """
    based on the questions and answers,\n
    generate a analytics report for me.\n
    Please seperate into 5 to 10 paragraphs. Each part should follow the topic below.\n
    1.Company structure and operations.\n
    2.Business segments and their roles.\n
    3.Import/export models and financial performance.\n
    4.Revenue contributions and growth rates.\n
    5.Profit margin analysis and valuation metrics."""

# The prompt refers to "the questions and answers", so they have to be sent
# along with it; otherwise the model has nothing to base the report on.
qa_text = "\n\n".join(
    f"Question: {asked}\nAnswer: {answered}"
    for asked, answered in sample_prompt_response
)

# NOTE(review): `chat` and `ChatResponse` are not imported anywhere in the
# visible notebook -- presumably they come from the `ollama` package; confirm
# and add the import to the top import cell.
response2: ChatResponse = chat(model='llama3', messages=[
  {
    'role': 'user',
    'content': f"{summarization_prompt}\n\nQuestions and answers:\n{qa_text}",
  },
])

In [None]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.bedrock import BedrockEmbeddings


def get_embedding_function():
    """Return the embeddings backend used by this notebook.

    Builds a ``BedrockEmbeddings`` instance using the "default" credentials
    profile in region "us-east-1".
    """
    # Alternative backend kept for reference:
    #   OllamaEmbeddings(model="nomic-embed-text")
    return BedrockEmbeddings(
        credentials_profile_name="default",
        region_name="us-east-1",
    )

    