# Extracting the "Item 1. Business" section from Amazon's 10-K

In [25]:
import openai
import os
import re

# Load the OpenAI API key from the OPENAI_API_KEY environment variable rather than
# hard-coding it, so the secret is never stored in (or shared with) the notebook file.
# If the variable is unset this is None and the API call will fail with an auth error.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Function to process files for a specific year
def process_year(year, base_dir, ciks_of_interest):
    """Collect the "Item 1. Business" text of selected companies' 10-K files for one year.

    Looks in ``<base_dir>/<year>/QTR1`` ... ``QTR4`` (quarters without a directory are
    skipped) for ``.txt`` files whose name contains ``10-K`` but not ``10-K-A``, reads
    the CIK from the ``_edgar_data_<digits>`` part of the file name, and calls
    ``extract_business_section_with_openai`` on every file whose CIK was requested.

    CIKs are compared with leading zeros ignored, so the zero-padded form
    (``'0000320193'``) and the bare form (``'320193'``) select the same files.

    Args:
        year: Name of the year directory (a string such as ``'2023'``); copied
            unchanged into each result.
        base_dir: Directory containing the per-year sub-directories.
        ciks_of_interest: Iterable of CIKs, zero-padded or not.

    Returns:
        A list of dicts with keys ``'Year'``, ``'CIK'`` (the digits exactly as they
        appear in the file name), ``'Business'`` (the extracted text) and ``'Path'``.
    """
    # Normalise once: file names carry the CIK without leading zeros, while callers
    # often pass the 10-digit zero-padded form. `or '0'` keeps an all-zero CIK non-empty.
    wanted_ciks = {str(cik).lstrip('0') or '0' for cik in ciks_of_interest}

    year_data = []
    for quarter in ['QTR1', 'QTR2', 'QTR3', 'QTR4']:
        qtr_path = os.path.join(base_dir, year, quarter)
        if not os.path.isdir(qtr_path):
            continue
        print(f"Processing directory: {qtr_path}")
        # os.listdir returns entries in arbitrary order; sort for reproducible results
        for file_name in sorted(os.listdir(qtr_path)):
            # Filter relevant 10-K files (amendments, named 10-K-A, are excluded)
            if '10-K' not in file_name or not file_name.endswith('.txt') or '10-K-A' in file_name:
                continue
            # Extract CIK from filename using regex
            cik_match = re.search(r'_edgar_data_(\d+)', file_name)
            if not cik_match:
                continue
            cik = cik_match.group(1)
            if (cik.lstrip('0') or '0') not in wanted_ciks:
                continue
            full_path = os.path.join(qtr_path, file_name)
            print(f"Processing file: {full_path}")
            business_section = extract_business_section_with_openai(full_path)
            year_data.append({
                'Year': year,
                'CIK': cik,
                'Business': business_section,
                'Path': full_path
            })
    return year_data

# Function to extract the "Item 1. Business" section using OpenAI
def extract_business_section_with_openai(file_path):
    """Ask the OpenAI chat model for the "Item 1. Business" section of one filing.

    The file at ``file_path`` is read in full as UTF-8 (bytes that cannot be decoded
    are dropped), embedded in an instruction prompt, and sent to ``gpt-4o-mini``
    together with a fixed system message.

    Returns:
        The ``content`` string of the first choice in the API response, unmodified.
    """
    # Read the whole filing into memory
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as filing:
        text = filing.read()

    # Instructions plus the document itself, sent as the single user turn
    prompt = f"""
    You are analyzing a 10-K filing document. Extract the full text of the "Item 1. Business" section from Business to "Item 1A Risk Factors",
    and exclude all other sections.
    If the section is not found, return "Section not found".
    Here is the document:
    ----
    {text}
    ----
    """

    system_message = {"role": "system", "content": "You are an expert in financial document analysis, and an NLP expert."}
    user_message = {"role": "user", "content": prompt}

    # Single chat-completion request; only the first choice is used
    completion = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

# Example usage: run the extraction for the 2023 filings
# Folder that holds one sub-folder per year
base_dir = '/Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k'
# Replace with actual CIKs of the tech companies you're interested in
ciks_of_interest = [
    '1018724',
    '0000320193',
]
year_data = process_year(year='2023', base_dir=base_dir, ciks_of_interest=ciks_of_interest)

# Display the results: one header line, then the extracted text, per filing
for data in year_data:
    print(f"Year: {data['Year']}, CIK: {data['CIK']}")
    print(f"Business Section:\n{data['Business']}\n")


Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1
Processing file: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1/20230203_10-K_edgar_data_1018724_0001018724-23-000004.txt
Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR2
Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR3
Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR4
Year: 2023, CIK: 1018724
Business Section:
Here is the extracted text for "Item 1. Business":

---

Item 1. Business

This Annual Report on Form 10-K and the documents incorporated herein by reference contain forward-looking statements based on expectations, estimates, and projections as of the date of this filing. Actual results and outcomes may differ materially from those expressed in forward-looking statements. See Item 1A of Part I Risk F

# Extracting the Business section for more firms

In [23]:
# CIKs for 10 major tech firms, keyed by company name (10-digit, zero-padded form)
TECH_FIRM_CIKS = {
    'Apple Inc.': '0000320193',
    'Microsoft Corp.': '0000789019',
    'Alphabet Inc. (Google)': '0001652044',
    'Amazon.com Inc.': '0001018724',
    'Meta Platforms, Inc. (Facebook)': '0001326801',
    'NVIDIA Corporation': '0001045810',
    'Tesla, Inc.': '0001318605',
    'Intel Corporation': '0000050863',
    'Adobe Inc.': '0000796343',
    'Cisco Systems, Inc.': '0000858877',
}

# Plain list of the CIK strings, in the order declared above
ciks_of_interest = list(TECH_FIRM_CIKS.values())

# Function to process files for multiple companies
def process_multiple_companies(year, base_dir, ciks_of_interest):
    """Collect the "Item 1. Business" text of selected companies' 10-K files in one folder.

    Examines only the entries directly inside ``base_dir`` (sub-directories are not
    descended into). A file is used when its name contains ``10-K`` but not
    ``10-K-A``, ends in ``.txt``, and carries a requested CIK in its
    ``_edgar_data_<digits>`` part; each such file is passed to
    ``extract_business_section_with_openai``.

    CIKs are compared with leading zeros ignored, so the zero-padded form
    (``'0001018724'``) and the bare form (``'1018724'``) select the same files.

    Args:
        year: Label copied unchanged into each result; it is not used to locate files.
        base_dir: Directory whose files are scanned.
        ciks_of_interest: Iterable of CIKs, zero-padded or not.

    Returns:
        A list of dicts with keys ``'Year'``, ``'CIK'`` (the digits exactly as they
        appear in the file name), ``'Business'`` (the extracted text) and ``'Path'``.
    """
    # Normalise once: file names carry the CIK without leading zeros, while the
    # requested CIKs may be zero-padded. `or '0'` keeps an all-zero CIK non-empty.
    wanted_ciks = {str(cik).lstrip('0') or '0' for cik in ciks_of_interest}

    year_data = []
    # Process files in the base directory, ignoring quarters.
    # os.listdir returns entries in arbitrary order; sort for reproducible results.
    for file_name in sorted(os.listdir(base_dir)):
        if '10-K' not in file_name or not file_name.endswith('.txt') or '10-K-A' in file_name:
            continue
        cik_match = re.search(r'_edgar_data_(\d+)', file_name)
        if not cik_match:
            continue
        cik = cik_match.group(1)
        if (cik.lstrip('0') or '0') not in wanted_ciks:
            continue
        full_path = os.path.join(base_dir, file_name)
        print(f"Processing file: {full_path}")  # Debugging line
        business_section = extract_business_section_with_openai(full_path)
        year_data.append({
            'Year': year,
            'CIK': cik,
            'Business': business_section,
            'Path': full_path
        })
    return year_data

# Function to extract the "Item 1. Business" section using OpenAI
def extract_business_section_with_openai(file_path):
    """Send one filing to the OpenAI chat model and return its "Item 1. Business" answer.

    Reads ``file_path`` as UTF-8 (ignoring undecodable bytes), wraps the text in an
    extraction prompt, calls ``gpt-4o-mini``, prints the raw API response for
    debugging, and returns the first choice's message content.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as source:
        text = source.read()

    prompt = f"""
    You are analyzing a 10-K filing document. Extract the full text of the "Item 1. Business" section from Business to "Item 1A Risk Factors",
    and exclude all other sections.
    If the section is not found, return "Section not found".
    Here is the document:
    ----
    {text}
    ----
    """

    # Fixed system persona followed by the extraction request
    chat_messages = [
        {"role": "system", "content": "You are an expert in financial document analysis, and an NLP expert."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(model="gpt-4o-mini", messages=chat_messages)
    print(f"OpenAI response: {response}")  # Debugging line

    answer = response['choices'][0]['message']
    return answer['content']

# Example usage: scan the 2023 folder for the firms listed in ciks_of_interest
# Adjust the base directory path
base_dir = '/Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023'
year_data = process_multiple_companies(
    year='2023',
    base_dir=base_dir,
    ciks_of_interest=ciks_of_interest,
)

# Display the results: one header line, then the extracted text, per filing
for data in year_data:
    print(f"Year: {data['Year']}, CIK: {data['CIK']}")
    print(f"Business Section:\n{data['Business']}\n")