In [3]:
import os
import json
from datetime import datetime
import warnings
warnings.simplefilter('ignore')
import requests
from tqdm import tqdm

# import torch
# import torchvision
# import openai
import PyPDF2
import spacy
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from langchain.llms import OpenAI

from library.exportation import export_prompt_response, export_article
from get_embedding_function import get_embedding_function
from prompt_template import define_company_prompt, ask_question_prompt, write_report_prompt, generate_response
from question_bank import question_1, question_3306, question_6055, question_778, question_916
# Read the API key from the environment so the real secret never has to be
# stored in the notebook; the original placeholder remains the fallback.
key = os.environ.get("API_KEY", "API_KEY")

In [4]:
def load_pdf_text(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Pages for which extraction yields nothing contribute an empty string, and
    every newline character is removed from the combined result.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The concatenated page text without newline characters.
    """
    with open(pdf_path, "rb") as pdf_handle:
        pdf_reader = PyPDF2.PdfReader(pdf_handle)
        # Extraction happens while the file is still open.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts).replace('\n', '')


def embedding(chunks, nlp):
    """Return one document vector per text chunk.

    Args:
        chunks: Iterable of text strings to embed.
        nlp: Callable pipeline; ``nlp(text).vector`` is taken as the embedding.

    Returns:
        list: The ``.vector`` of each processed chunk, in input order.
    """
    return [nlp(piece).vector for piece in chunks]

In [5]:
# Locate the PDF for the selected ticker inside the data folder and read its text.
file_name = '916.HK'
folder_name = '3_data'
pdf_file_path = os.path.join(folder_name, file_name + '.pdf')
pdf_text = load_pdf_text(pdf_file_path)

# Chunking and Embedding for PDF

In [22]:
# Break the PDF text into overlapping character windows.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,  # kept on purpose: the kernel died without it
)
text_chunks = text_splitter.split_text(pdf_text)

# spaCy pipeline whose document vectors serve as the chunk embeddings.
nlp = spacy.load("en_core_web_md")

# One vector per chunk, in the same order as text_chunks.
encoded_chunks = embedding(text_chunks, nlp)

In [5]:
# def define_company_prompt(content):
#     system_prompt = f"""Analyze the following text chunks and identify the company. 
#     Just provide the company and the stock code only.
#     Chunks:
#     {content}
#     """
#     return system_prompt


# def generate_company_name(prompt, api_key, system_content='you are a assistant'):
#     model = 'gpt-4o-mini'
#     url = "https://api.ohmygpt.com/v1/chat/completions"
    
#     headers = {
#         "Content-Type": "application/json",
#         "Authorization": f"Bearer {api_key}"
#     }
    
#     data = {
#         "model": model,
#         "messages": [
#             {"role": "system", "content": system_content},
#             {"role": "user", "content": prompt}
#         ],
#         "temperature": 0.7
#     }
#     response = requests.post(url, headers=headers, json=data)
#     response_json = response.json()
#     return response_json["choices"][0]["message"]["content"]

In [23]:
def get_company_question(company_name):
    """Select the question bank that matches a company description.

    The text is checked against each supported company in a fixed priority
    order, by name keyword or by stock code. Stock codes are compared against
    whole digit runs with leading zeros removed, so ``0916.HK`` selects the
    916 bank instead of being mistaken for stock code ``1`` (a plain substring
    test for ``'1'`` matches any text that merely contains that digit).

    If the text identifies no company, the user is prompted for a stock code.

    Args:
        company_name: Text naming the company and/or its stock code.

    Returns:
        The matching ``question_*`` object imported from ``question_bank``.

    Raises:
        ValueError: If neither the text nor the manually entered stock code
            identifies a supported company.
    """
    import re  # local import so the function does not rely on another cell

    # (name keyword, stock code, question bank) in the original priority order.
    companies = (
        ('Tobacco', '6055', question_6055),
        ('Hutchison', '1', question_1),
        ('JNBY', '3306', question_3306),
        ('Fortune REIT', '778', question_778),
        ('Longyua', '916', question_916),
    )

    def _stock_codes(text):
        # Whole digit runs only, normalised by dropping leading zeros.
        return {digits.lstrip('0') for digits in re.findall(r'[0-9]+', text)}

    description = company_name or ''
    codes_in_text = _stock_codes(description)
    for keyword, code, bank in companies:
        if keyword in description or code in codes_in_text:
            return bank

    # Nothing recognised automatically: fall back to asking the user.
    entered = input('Input the stock code: ')
    entered_codes = _stock_codes(entered)
    for _, code, bank in companies:
        if code in entered_codes:
            return bank

    supported = ', '.join(code for _, code, _ in companies)
    raise ValueError(
        f'Unsupported stock code {entered!r}; expected one of: {supported}'
    )

# The company is identified from the first chunk of the PDF text, so an empty
# chunk list (no extractable text) is reported explicitly instead of
# surfacing as a bare IndexError.
if not text_chunks:
    raise ValueError('No text chunks available: the PDF produced no extractable text.')

first_chunk = text_chunks[0]
name_prompt = define_company_prompt(first_chunk)
company_name = generate_response(name_prompt, key)

# Pick the question bank that corresponds to the identified company.
questions = get_company_question(company_name)

# Top-N most similar chunks per question


In [24]:
top_N = 3
top_N_chunks = {}

# For each question, score every PDF chunk by cosine similarity between the
# question's spaCy vector and the chunk embeddings, then keep the best top_N.
for q in questions:
    q_emb = nlp(q).vector
    similarities = cosine_similarity([q_emb], encoded_chunks).flatten()

    # argsort is ascending, so reverse it to put the highest scores first.
    top_N_idx = similarities.argsort()[::-1][:top_N]

    # A distinct index name avoids the previous reuse of `i` for both the
    # (unused) enumerate counter and the comprehension variable.
    top_N_text = [text_chunks[chunk_idx] for chunk_idx in top_N_idx]

    top_N_chunks[q] = top_N_text

# define prompts


In [25]:
# def ask_question_prompt(content, question):
#     system_prompt = f"""You are an financial analyst.
#     Use the following pieces of retrieved context to answer the question. 
#     Use 3 sentences maximum for question and keep the answer concise.


#     Retrieved context:
#     {content}\n

#     Question:
#     {question}
#     """
#     return system_prompt


# def write_report_prompt(content):
#     system_prompt = f"""You are a financial report writer. 
#     Please combine the text provided and generate a financial analysis report. 
#     The report should be about 15 to 20 paragraphs. The report should include the following sections:
#     1. Company Overview
#     2. Revenue Structure
#     3. Profit
#     4. Valuation
#     5. Summary
#     6. Future Outlook
#     7. and other details provided.
    
#     content is as below:
#     {content}
#     """
#     return system_prompt





# def generate_answer(prompt, api_key):
#     model = 'gpt-4o-mini'
#     url = "https://api.ohmygpt.com/v1/chat/completions"
    
#     headers = {
#         "Content-Type": "application/json",
#         "Authorization": f"Bearer {api_key}"
#     }
    
#     data = {
#         "model": model,
#         "messages": [
#             {"role": "system", "content": "You are a financial assistant asking questions."},
#             {"role": "user", "content": prompt}
#         ],
#         "temperature": 0.7
#     }
#     response = requests.post(url, headers=headers, json=data)
#     response_json = response.json()
#     return response_json["choices"][0]["message"]["content"]


# def generate_summary(prompt, api_key):
#     model = 'gpt-4o-mini'
#     url = "https://api.ohmygpt.com/v1/chat/completions"
    
#     headers = {
#         "Content-Type": "application/json",
#         "Authorization": f"Bearer {api_key}"
#     }
    
#     data = {
#         "model": model,
#         "messages": [
#             {"role": "system", "content": "You are a writer."},
#             {"role": "user", "content": prompt}
#         ],
#         "temperature": 0.7
#     }
#     response = requests.post(url, headers=headers, json=data)
#     response_json = response.json()
#     return response_json["choices"][0]["message"]["content"]

In [None]:
# prompt to llm to ask question
# Build one (question, prompt) pair per question; the retrieved chunks are
# joined with '. ' to form the context handed to ask_question_prompt.
all_prompts = [
    (question, ask_question_prompt('. '.join(top_chunks), question))
    for question, top_chunks in top_N_chunks.items()
]

In [None]:
# Send every prompt to the LLM and collect (question, response) pairs in order.
# tqdm reads the total from the list itself, so neither enumerate nor an
# explicit total is needed; the unused counter `c` has been dropped.
prompt_response = []
for q, p in tqdm(all_prompts, desc="Processing: "):
    response = generate_response(p, key)
    prompt_response.append((q, response))

Processing: 100%|██████████| 100/100 [06:43<00:00,  4.03s/it]


In [29]:
# Map each question to its response (a later duplicate question overwrites an
# earlier one), then hand the mapping to export_prompt_response.
prompt_hist = dict(prompt_response)

export_prompt_response(file_name, prompt_hist)

JSON data has been exported to 1_prompt_log\916.HK_20241127_0009.json


In [None]:
# Concatenate every question/answer pair, one per line, as the source
# material for the report prompt.
content = ''.join(f'{q} {r}\n' for q, r in prompt_response)
summary_prompt = write_report_prompt(content)
report = generate_response(summary_prompt, key)

# Export the generated report itself. The previous call passed prompt_hist,
# so the report text was computed but never written anywhere.
# NOTE(review): assumes export_article accepts the report string — confirm
# against library.exportation.
export_article(file_name, report)

report has been exported to 2_report_log\916.HK_20241127_0013.txt
