In [17]:
import os
import requests
import PyPDF2
import pandas as pd
import pickle
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings.base import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import tiktoken
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Paths
DATA_PATH = "data/handcrafted"
REPORTS_JSON_PATH = 'data/reports.json'
VECTOR_DB_NAME = "vector_db.pkl"
# When True, create_vector_database() rebuilds a database even if one already exists.
REMAKE_ALL = False

# Load Data
df = pd.read_json(REPORTS_JSON_PATH)

# Filter df to "dataset":"handcrafted". Take an explicit copy so that the column
# assignment below operates on an independent frame instead of a slice of the
# original (this is what triggers pandas' SettingWithCopyWarning otherwise).
df = df[df['dataset'] == 'handcrafted'].copy()

# some df['company_name'] values have leading/trailing whitespaces, remove them
df['company_name'] = df['company_name'].str.strip()

# Compute the distinct names once and reuse them for both the count and the listing.
company_names = df['company_name'].unique()
print(f"Loading {len(company_names)} companies. Names: {company_names}")

# Create directory for data if it doesn't exist
os.makedirs(DATA_PATH, exist_ok=True)

# Function to download reports
def download_reports(df: pd.DataFrame, company_name: str, save_dir: str, *, timeout: float = 60.0) -> None:
    """Download every PDF listed in ``df['pdf_url']`` into ``save_dir/company_name``.

    Files that already exist on disk are not downloaded again. Download
    failures are reported on stdout and the affected file is skipped, so one
    bad URL does not abort the remaining downloads.

    Args:
        df: Frame with a ``pdf_url`` column; every row is downloaded.
        company_name: Name of the sub-directory (created if missing) that
            receives the files; also used in the progress message.
        save_dir: Parent directory of the per-company sub-directory.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block the run indefinitely.
    """
    from urllib.parse import urlsplit

    company_dir = os.path.join(save_dir, company_name)
    os.makedirs(company_dir, exist_ok=True)

    for url in df['pdf_url']:
        # Derive the file name from the URL *path* only. Taking the basename
        # of the raw URL first would pick up text from the query string
        # whenever the query itself contains a "/".
        pdf_filename = os.path.basename(urlsplit(url).path) if isinstance(url, str) else ""
        if not pdf_filename:
            print(f"ERROR: Skipping URL {url!r}: cannot derive a file name from it")
            continue

        pdf_path = os.path.join(company_dir, pdf_filename)
        if os.path.exists(pdf_path):
            # Already downloaded on an earlier run.
            continue

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Ensure the request was successful
        except requests.exceptions.RequestException as e:
            print(f"ERROR: Skipping file {pdf_filename} due to download error: {e}")
            continue

        with open(pdf_path, 'wb') as file:
            file.write(response.content)
    print(f"Reports for {company_name} downloaded successfully (if not already present).")

# Function to create vector database from PDF reports
def create_vector_database(files_path: str) -> None:
    """Build a FAISS vector database from the PDF files in ``files_path``.

    The text of each PDF is extracted with PyPDF2, split into chunks
    (``chunk_size=3000``, ``chunk_overlap=300``), embedded with
    ``OpenAIEmbeddings`` and indexed with FAISS. The serialized index is
    pickled to ``files_path/VECTOR_DB_NAME``.

    Nothing is done when that file already exists and ``REMAKE_ALL`` is
    false. Files that cannot be read are reported on stdout and skipped
    (for example, AES-encrypted PDFs fail when PyCryptodome is missing).

    Args:
        files_path: Directory that holds the PDF files and receives the
            database file.
    """
    db_path = os.path.join(files_path, VECTOR_DB_NAME)
    if os.path.exists(db_path) and not REMAKE_ALL:
        print(f"Vector database already exists at {db_path}, skipping creation.")
        return

    documents = []
    # os.listdir() returns entries in arbitrary order; sort so that repeated
    # builds process the files in the same order.
    for file in sorted(os.listdir(files_path)):
        if file == VECTOR_DB_NAME:
            # A previous database living next to the PDFs is not an input file.
            continue

        _, file_extension = os.path.splitext(file)
        if file_extension.lower() != ".pdf":
            print(f"Unsupported file extension: {file_extension}")
            continue

        try:
            with open(os.path.join(files_path, file), 'rb') as f:
                reader = PyPDF2.PdfReader(f, strict=False)
                # One newline after every page; a page without extractable
                # text contributes only that newline.
                text = "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole build.
            print(f"ERROR: Skipping file {file} due to error: {e}")
            continue

        # The per-page newlines make `text` non-empty for any PDF with pages,
        # so test for real content rather than plain truthiness.
        if text.strip():
            documents.append(Document(page_content=text, metadata={"source": file}))
        else:
            print(f"WARNING: No text extracted from {file}")

    if not documents:
        print(f"WARNING: No usable PDF text found in {files_path}, vector database not created.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=300, separators=["\n\n", "\n"])
    texts = text_splitter.split_documents(documents)

    # Count tokens before embedding: it needs only the chunks, and a tokenizer
    # problem should surface before the embedding API is called rather than
    # after, when it would prevent the finished index from being saved.
    # disallowed_special=() makes special-token look-alikes in the report
    # text encode as ordinary text instead of raising.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    build_token_count = sum(
        len(tokenizer.encode(doc.page_content, disallowed_special=())) for doc in texts
    )
    print(f"Token count: {build_token_count}")

    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    db = FAISS.from_documents(texts, embeddings)

    # NOTE: the file is a pickle wrapping the FAISS byte serialization;
    # readers must only unpickle database files they trust.
    with open(db_path, "wb") as f:
        pickle.dump(db.serialize_to_bytes(), f)
    print(f"Vector database created and saved at {db_path}")


# Run the two pipeline stages for every distinct company, one company at a time
for company_name in df['company_name'].unique():
    # Directory that holds this company's PDFs and its vector database
    company_dir = os.path.join(DATA_PATH, company_name)
    # Rows of the report table that belong to this company
    company_df = df.loc[df['company_name'] == company_name]

    # Stage 1: fetch any reports that are not on disk yet
    download_reports(company_df, company_name, DATA_PATH)

    # Stage 2: build the per-company vector database from the downloaded PDFs
    create_vector_database(company_dir)

print("All companies processed successfully.")

Loading 8 companies. Names: ['Walmart' 'Amazon' 'Saudi Aramco' 'Apple' 'Volkswagen' 'Google' 'BP'
 'H&M']
Reports for Walmart downloaded successfully (if not already present).
Token count: 86274
Vector database created and saved at data/handcrafted\Walmart\vector_db.pkl
Reports for Amazon downloaded successfully (if not already present).
Token count: 55293
Vector database created and saved at data/handcrafted\Amazon\vector_db.pkl
Reports for Saudi Aramco downloaded successfully (if not already present).
Token count: 139626
Vector database created and saved at data/handcrafted\Saudi Aramco\vector_db.pkl
Reports for Apple downloaded successfully (if not already present).
Token count: 202599
Vector database created and saved at data/handcrafted\Apple\vector_db.pkl
Reports for Volkswagen downloaded successfully (if not already present).
ERROR: Skipping file 2023_Volkswagen_Group_Sustainability_Report.pdf due to error: PyCryptodome is required for AES algorithm
ERROR: Skipping file Nonfinan