In [1]:
import json
import pandas as pd
import os
from datetime import datetime
import warnings
warnings.simplefilter('ignore')
import requests
from tqdm import tqdm
from openpyxl import load_workbook
# import torch
# import torchvision
# import openai
import PyPDF2
import spacy
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from langchain.llms import OpenAI
from library.exportation import export_prompt_response, export_article

from prompt_template import define_company_prompt, ask_question_prompt, write_report_prompt, generate_response
# from question_bank import question_1, question_3306, question_6055, question_778, question_916
# The API key is read from the OPENAI_API_KEY environment variable so that the
# secret is never stored in the notebook itself. Set the variable before
# starting the kernel. When it is missing, `key` is an empty string and a
# notice is printed (warnings are silenced above, so print is used instead).
# NOTE(review): any key that was previously committed here should be revoked.
key = os.environ.get("OPENAI_API_KEY", "")
if not key:
    print("WARNING: OPENAI_API_KEY is not set; `key` is empty.")

In [2]:
def load_pdf_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are read in order with ``PyPDF2.PdfReader``. A page whose
    ``extract_text()`` result is falsy contributes an empty string. Every
    newline character is then removed without inserting a replacement, so
    text from consecutive lines is joined directly.
    """
    with open(pdf_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        page_texts = [page.extract_text() or "" for page in pages]
    return "".join(page_texts).replace('\n', '')


def embedding(chunks, nlp):
    """Return one document vector per text chunk, in input order.

    ``nlp`` is called on each chunk and the ``.vector`` attribute of the
    result is collected into a list.
    """
    return [nlp(chunk).vector for chunk in chunks]

In [3]:
# Filing to analyse. Other file names that have been used here:
#   '1.HK', '778.HK', '916.HK', '3306.HK'
file_name = '6055.HK'

# The PDF is expected at <folder_name>/<file_name>.pdf
folder_name = '3_data'
pdf_file_path = os.path.join(folder_name, file_name + '.pdf')

# Full document text as a single string with newlines removed.
pdf_text = load_pdf_text(pdf_file_path)

# Chunking & embedding

In [4]:
# Splitter configuration: chunk_size=1000 with chunk_overlap=200.
# Original author's note: the kernel dies unless add_start_index=True.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

# Break the document text into a list of chunk strings.
text_chunks = text_splitter.split_text(pdf_text)

# spaCy pipeline whose document vectors are used as embeddings.
nlp = spacy.load("en_core_web_md")

# One vector per chunk, in the same order as text_chunks.
encoded_chunks = embedding(text_chunks, nlp)

# get questions

In [5]:
def get_company_code(company_name):
    """Map a company name (or text containing its stock code) to a stock code.

    The known companies are checked in a fixed order. A company matches when
    its name keyword occurs in ``company_name`` or when its stock code occurs
    as a whole number. Leading zeros are accepted (``'0001'`` matches code
    ``'1'``), but digits inside a longer number are not (``'2015'`` does not
    match code ``'1'``).

    Parameters
    ----------
    company_name : str
        Free text naming the company, e.g. a model response or a file name.

    Returns
    -------
    str
        One of ``'6055'``, ``'3306'``, ``'778'``, ``'916'``, ``'1'``. When
        nothing matches, the user is prompted and the typed value is returned.
    """
    import re  # local import so the cell does not depend on the import cell

    # (name keyword, stock code), in the order the checks must be applied.
    known_companies = (
        ('Tobacco', '6055'),
        ('JNBY', '3306'),
        ('Fortune REIT', '778'),
        ('Longyua', '916'),
        ('Hutchison', '1'),
    )

    for keyword, code in known_companies:
        if keyword in company_name:
            return code
        # Whole-number match: no digit directly before or after the code,
        # with optional leading zeros. A plain substring test would let '1'
        # match any text that merely contains the digit 1.
        if re.search(rf'(?<!\d)0*{code}(?!\d)', company_name):
            return code

    # Unknown company: fall back to asking the user.
    return input('Input the stock code: ')

# Build the company-identification prompt from the first chunk of the
# document and obtain the response for it.
first_chunk = text_chunks[0]
name_prompt = define_company_prompt(first_chunk)
company_name = generate_response(name_prompt, key)

# Translate the response into the stock code, which is also the sheet name.
company_code = get_company_code(company_name)
print('company code: ', company_code)

# Questions are the non-null entries of the last column of that sheet.
question_sheet = pd.read_excel('questions.xlsx', sheet_name=company_code)
questions = question_sheet.iloc[:, -1].dropna()

company code:  6055


# Top N most similar chunks


In [7]:
top_N = 3
top_N_chunks = {}

for question in questions:
    # Embed the question with the same spaCy pipeline used for the chunks.
    question_vector = nlp(question).vector

    # Cosine similarity between the question and every chunk vector.
    scores = cosine_similarity([question_vector], encoded_chunks).flatten()

    # Indices of the top_N highest-scoring chunks, best first.
    best_indices = scores.argsort()[::-1][:top_N]

    # Map the question text to its best-matching chunk texts.
    top_N_chunks[question] = [text_chunks[idx] for idx in best_indices]

# define prompts


In [8]:
# prompt to llm to ask question
all_prompts = []

# combine chunks and questions to string
for question, top_chunks in top_N_chunks.items():
    top_chunks_combined = '. '.join(top_chunks)
    system_prompt = ask_question_prompt(top_chunks_combined, question)
    all_prompts.append((question, system_prompt))

sum(len(j) for _, j in all_prompts)

167404

In [9]:
# Get a response for every prompt and keep (question, response) pairs in the
# same order as all_prompts. The progress bar shows one step per prompt.
prompt_response = []
for question, prompt in tqdm(all_prompts,
                             total=len(all_prompts),
                             desc="Processing: "):
    prompt_response.append((question, generate_response(prompt, key)))

Processing: 100%|██████████| 50/50 [03:21<00:00,  4.02s/it]


# save version of response

In [10]:
file_path = 'response.xlsx'
version = 'ver2'
update_sheet = company_code

# Read the company's existing sheet and add this run's responses as a column
# named after `version` (an existing column with that name is overwritten).
target_company = pd.read_excel(file_path, sheet_name=update_sheet)

# Column assignment aligns on the row index, so response k lands on row k.
# NOTE(review): responses beyond the sheet's row count are dropped silently
# and missing ones become NaN — confirm the lengths match.
target_company[version] = pd.Series(
    [response for _, response in prompt_response], dtype=object
)

# Append mode keeps the other sheets; only `update_sheet` is replaced.
with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    target_company.to_excel(writer, sheet_name=update_sheet, index=False)

In [22]:
code_list = ['1', '778', '916', '3306', '6055']
report_list = []

for code in code_list:
    # Non-null entries of the last column of each company's sheet.
    q = pd.read_excel('questions.xlsx', sheet_name=code).iloc[:, -1].dropna()
    r = pd.read_excel('response.xlsx', sheet_name=code).iloc[:, -1].dropna()

    # Each question is followed directly by its response and a blank line;
    # pairs are formed positionally and stop at the shorter of the two.
    content = ''.join(question + answer + '\n\n' for question, answer in zip(q, r))

    # Build the report prompt from the combined text and get the response.
    summary_prompt = write_report_prompt(content)
    report = generate_response(summary_prompt, key)

    # One single-cell frame per company, under the 'ver2' column.
    report_list.append(pd.DataFrame([report], columns=['ver2']))

In [24]:
# Unpack the five reports; this fails if report_list does not hold exactly five.
report_1, report_778, report_916, report_3306, report_6055 = report_list

# Sheet name -> report frame, in the order the sheets are written.
sheet_reports = (
    ('1', report_1),
    ('778', report_778),
    ('916', report_916),
    ('3306', report_3306),
    ('6055', report_6055),
)

# Write every report to its own sheet of reports.xlsx, without the index.
with pd.ExcelWriter('reports.xlsx') as writer:
    for sheet_name, report_frame in sheet_reports:
        report_frame.to_excel(writer, sheet_name=sheet_name, index=False)