# Financial Data Extraction Notebook
This notebook performs the following:
- Reads a PDF file from AWS S3
- Extracts text using `pdfplumber`
- Sends the extracted text to a language model, which identifies:
  - Revenue
  - Margin
  - IT investment direction
- Saves results to a Parquet file
- Mocks sending extracted data to Salesforce

In [None]:
# Install required packages
!pip install boto3 pdfplumber openai pandas pyarrow

In [None]:
import boto3
import pdfplumber
import openai
import pandas as pd
from io import BytesIO
import json

# S3 location of the PDF to process -- replace both placeholders.
bucket_name = 'your-bucket-name'
pdf_key = 'path/to/your/file.pdf'

# No credentials are passed explicitly, so boto3 resolves them on its own
# (for example from environment variables); configure them before running.
s3 = boto3.client('s3')

In [None]:
# Fetch the PDF object from S3 and hold its full contents in memory.
response = s3.get_object(Key=pdf_key, Bucket=bucket_name)
pdf_payload = response['Body'].read()

# Wrap the raw bytes in a file-like object for the PDF reader.
pdf_bytes = BytesIO(pdf_payload)

In [None]:
# Extract text from every page of the PDF using pdfplumber.
# Each page's text is followed by a newline, in page order.
with pdfplumber.open(pdf_bytes) as pdf:
    # extract_text() may return None for a page with no text layer (e.g. a
    # scanned image); fall back to '' so such pages do not raise a TypeError.
    # str.join also avoids rebuilding the string on every page.
    all_text = ''.join(
        f"{page.extract_text() or ''}\n" for page in pdf.pages
    )

In [None]:
# Send the extracted text to an OpenAI chat model (put your API key here).
# NOTE: this uses the client interface of openai>=1.0; the legacy
# `openai.ChatCompletion` entry point was removed in that release, which is
# what an unpinned `pip install openai` provides.
openai.api_key = 'your-openai-api-key'
client = openai.OpenAI(api_key=openai.api_key)

response = client.chat.completions.create(
    model='gpt-4',
    messages=[
        # The reply is decoded with json.loads in the next cell, so the model
        # must be told explicitly to answer with JSON only.
        {
            'role': 'system',
            'content': (
                'Extract financial data from the text. '
                'Respond with a single JSON object with the keys "Revenue", '
                '"Margin" and "IT investment direction", and no other text.'
            ),
        },
        {
            'role': 'user',
            'content': f"Extract Revenue, Margin, and IT investment direction from the following text:\n{all_text}",
        },
    ],
)
# In openai>=1.0 the response is a typed object, not a dict.
llm_output = response.choices[0].message.content

In [None]:
# Parse the LLM reply into a dictionary, then persist it as a one-row table.
# Chat models often wrap JSON in a Markdown code fence or add surrounding
# prose, so isolate the outermost {...} span before decoding.
json_start = llm_output.find('{')
json_end = llm_output.rfind('}')
if json_start == -1 or json_end < json_start:
    raise ValueError(
        f"LLM output does not contain a JSON object: {llm_output!r}"
    )
result_dict = json.loads(llm_output[json_start:json_end + 1])

# One row whose columns are the keys of the decoded object.
df = pd.DataFrame([result_dict])
df.to_parquet('extracted_financials.parquet')
df

In [None]:
# Stand-in for the Salesforce upload: print the record that would be sent.
print("Mock sending to Salesforce:")
records = df.to_dict(orient='records')
first_record = records[0]
print(first_record)