# Extracting APPLE, AMZN, Microsoft Corp., Alphabet Inc.


In [None]:

import openai
import os
import re

# Load OpenAI API key securely (you should set this as an environment variable or store it securely)
openai.api_key = 'key'

# Function to process files for a specific year
def process_year(year, base_dir, ciks_of_interest):
    year_data = []
    for quarter in ['QTR1', 'QTR2', 'QTR3', 'QTR4']:
        qtr_path = os.path.join(base_dir, year, quarter)
        if os.path.exists(qtr_path):
            print(f"Processing directory: {qtr_path}")
            for file_name in os.listdir(qtr_path):
                # Filter relevant 10-K files
                if '10-K' in file_name and file_name.endswith('.txt') and '10-K-A' not in file_name:
                    # Extract CIK from filename using regex
                    cik_match = re.search(r'_edgar_data_(\d+)', file_name)
                    if cik_match:
                        cik = cik_match.group(1)
                        if cik in ciks_of_interest:
                            full_path = os.path.join(qtr_path, file_name)
                            print(f"Processing file: {full_path}")
                            business_section = extract_business_section_with_openai(full_path)
                            year_data.append({
                                'Year': year,
                                'CIK': cik,
                                'Business': business_section,
                                'Path': full_path
                            })
    return year_data

# Function to extract the "Item 1. Business" section using OpenAI
def extract_business_section_with_openai(file_path):
    # Read file content
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

    # Prepare the prompt for OpenAI API
    prompt = f"""
    You are analyzing a 10-K filing document. Extract the full text of the "Item 1. Business" section from Business to "Item 1A Risk Factors",
    and exclude all other sections.
    If the section is not found, return "Section not found".
    Here is the document:
    ----
    {text}
    ----
    """
    
    # Call OpenAI to extract the business section
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  # Use the appropriate model
        messages=[
            {"role": "system", "content": "You are an expert in financial document analysis, and an NLP expert."},
            {"role": "user", "content": prompt}
        ]
    )
    return response['choices'][0]['message']['content']

# Example usage
base_dir = '/Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k'
ciks_of_interest = [
    '320193',  # Apple Inc.
    '789019',  # Microsoft Corp.
    '1652044',  # Alphabet Inc. (Google)
    '1018724',  # Amazon.com Inc.
]
year_data = process_year('2023', base_dir, ciks_of_interest)



# Display the results
for data in year_data:
    print(f"Year: {data['Year']}, CIK: {data['CIK']}")
    print(f"Business Section:\n{data['Business']}\n")


Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1
Processing file: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1/20230203_10-K_edgar_data_1018724_0001018724-23-000004.txt
Processing file: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1/20230203_10-K_edgar_data_1652044_0001652044-23-000016.txt
Processing file: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1/20230727_10-K_edgar_data_789019_0000950170-23-035122.txt
Processing file: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1/20231103_10-K_edgar_data_320193_0000320193-23-000106.txt
Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR2
Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR3
Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/

# Extracting Meta Platforms Inc. (Facebook), NVIDIA Corporation


In [21]:
import openai
import os
import re


# Function to process files for a specific year
def process_year(year, base_dir, ciks_of_interest):
    year_data = []
    for quarter in ['QTR1', 'QTR2', 'QTR3', 'QTR4']:
        qtr_path = os.path.join(base_dir, year, quarter)
        if os.path.exists(qtr_path):
            print(f"Processing directory: {qtr_path}")
            for file_name in os.listdir(qtr_path):
                # Filter relevant 10-K files
                if '10-K' in file_name and file_name.endswith('.txt') and '10-K-A' not in file_name:
                    # Extract CIK from filename using regex
                    cik_match = re.search(r'_edgar_data_(\d+)', file_name)
                    if cik_match:
                        cik = cik_match.group(1)
                        if cik in ciks_of_interest:
                            full_path = os.path.join(qtr_path, file_name)
                            print(f"Processing file: {full_path}")
                            business_section = extract_business_section_with_openai(full_path)
                            year_data.append({
                                'Year': year,
                                'CIK': cik,
                                'Business': business_section,
                                'Path': full_path
                            })
    return year_data

# Function to extract the "Item 1. Business" section using OpenAI
def extract_business_section_with_openai(file_path):
    # Read file content
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

    # Prepare the prompt for OpenAI API
    prompt = f"""
    You are analyzing a 10-K filing document. Extract the full text of the "Item 1. Business" section from Business to "Item 1A Risk Factors",
    and exclude all other sections.
    If the section is not found, return "Section not found".
    Here is the document:
    ----
    {text}
    ----
    """
    
    # Call OpenAI to extract the business section
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  # Use the appropriate model
        messages=[
            {"role": "system", "content": "You are an expert in financial document analysis, and an NLP expert."},
            {"role": "user", "content": prompt}
        ]
    )
    return response['choices'][0]['message']['content']

# Example usage
base_dir = '/Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k'
ciks_of_interest = [
    '1326801',  # Meta Platforms Inc. (Facebook)
    '1045810',  # NVIDIA Corporation
]
year_data2 = process_year('2023', base_dir, ciks_of_interest)



# Display the results
for data in year_data:
    print(f"Year: {data['Year']}, CIK: {data['CIK']}")
    print(f"Business Section:\n{data['Business']}\n")


Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1
Processing file: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1/20230202_10-K_edgar_data_1326801_0001326801-23-000013.txt
Processing file: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1/20230224_10-K_edgar_data_1045810_0001045810-23-000017.txt
Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR2
Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR3
Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR4
Year: 2023, CIK: 1018724
Business Section:
Here is the full text of the "Item 1. Business" section from the document:

---

Item 1. Business  
This Annual Report on Form 10-K and the documents incorporated herein by reference contain forward-looking statements based on expectations, estimates,

# Extracting Adobe, Cisco


In [27]:
import openai
import os
import re


# Function to process files for a specific year
def process_year(year, base_dir, ciks_of_interest):
    year_data = []
    for quarter in ['QTR1', 'QTR2', 'QTR3', 'QTR4']:
        qtr_path = os.path.join(base_dir, year, quarter)
        if os.path.exists(qtr_path):
            print(f"Processing directory: {qtr_path}")
            for file_name in os.listdir(qtr_path):
                # Filter relevant 10-K files
                if '10-K' in file_name and file_name.endswith('.txt') and '10-K-A' not in file_name:
                    # Extract CIK from filename using regex
                    cik_match = re.search(r'_edgar_data_(\d+)', file_name)
                    if cik_match:
                        cik = cik_match.group(1)
                        if cik in ciks_of_interest:
                            full_path = os.path.join(qtr_path, file_name)
                            print(f"Processing file: {full_path}")
                            business_section = extract_business_section_with_openai(full_path)
                            year_data.append({
                                'Year': year,
                                'CIK': cik,
                                'Business': business_section,
                                'Path': full_path
                            })
    return year_data

# Function to extract the "Item 1. Business" section using OpenAI
def extract_business_section_with_openai(file_path):
    # Read file content
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

    # Prepare the prompt for OpenAI API
    prompt = f"""
    You are analyzing a 10-K filing document. Extract the full text of the "Item 1. Business" section from Business to "Item 1A Risk Factors",
    and exclude all other sections.
    If the section is not found, return "Section not found".
    Here is the document:
    ----
    {text}
    ----
    """
    
    # Call OpenAI to extract the business section
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  # Use the appropriate model
        messages=[
            {"role": "system", "content": "You are an expert in financial document analysis, and an NLP expert."},
            {"role": "user", "content": prompt}
        ]
    )
    return response['choices'][0]['message']['content']

# Example usage
base_dir = '/Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k'
ciks_of_interest = [
    '796343',  # Adobe Inc.
    '858877'   # Cisco Systems, Inc.
]
year_data3 = process_year('2023', base_dir, ciks_of_interest)



# Display the results
for data in year_data:
    print(f"Year: {data['Year']}, CIK: {data['CIK']}")
    print(f"Business Section:\n{data['Business']}\n")


Processing directory: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1
Processing file: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1/20230117_10-K_edgar_data_796343_0000796343-23-000007.txt
Processing file: /Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k/2023/QTR1/20230907_10-K_edgar_data_858877_0000858877-23-000023.txt


RateLimitError: Request too large for gpt-4o-mini in organization org-7HuOcWfAGnGBT20Y9lXL9GD5 on tokens per min (TPM): Limit 200000, Requested 242920. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.

# Extracting Intel Corporation, Tesla, Inc.

In [None]:
import openai
import os
import re


# Function to process files for a specific year
def process_year(year, base_dir, ciks_of_interest):
    year_data = []
    for quarter in ['QTR1', 'QTR2', 'QTR3', 'QTR4']:
        qtr_path = os.path.join(base_dir, year, quarter)
        if os.path.exists(qtr_path):
            print(f"Processing directory: {qtr_path}")
            for file_name in os.listdir(qtr_path):
                # Filter relevant 10-K files
                if '10-K' in file_name and file_name.endswith('.txt') and '10-K-A' not in file_name:
                    # Extract CIK from filename using regex
                    cik_match = re.search(r'_edgar_data_(\d+)', file_name)
                    if cik_match:
                        cik = cik_match.group(1)
                        if cik in ciks_of_interest:
                            full_path = os.path.join(qtr_path, file_name)
                            print(f"Processing file: {full_path}")
                            business_section = extract_business_section_with_openai(full_path)
                            year_data.append({
                                'Year': year,
                                'CIK': cik,
                                'Business': business_section,
                                'Path': full_path
                            })
    return year_data

# Function to extract the "Item 1. Business" section using OpenAI
def extract_business_section_with_openai(file_path):
    # Read file content
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

    # Prepare the prompt for OpenAI API
    prompt = f"""
    You are analyzing a 10-K filing document. Extract the full text of the "Item 1. Business" section from Business to "Item 1A Risk Factors",
    and exclude all other sections.
    If the section is not found, return "Section not found".
    Here is the document:
    ----
    {text}
    ----
    """
    
    # Call OpenAI to extract the business section
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  # Use the appropriate model
        messages=[
            {"role": "system", "content": "You are an expert in financial document analysis, and an NLP expert."},
            {"role": "user", "content": prompt}
        ]
    )
    return response['choices'][0]['message']['content']

# Example usage
base_dir = '/Users/shpetimtafili/Desktop/Market Microstructure/assignment 2/10k'
ciks_of_interest = [
    '50863',  # Intel Corporation
    '1318605',  # Tesla, Inc.
]
year_data4 = process_year('2023', base_dir, ciks_of_interest)



# Display the results
for data in year_data:
    print(f"Year: {data['Year']}, CIK: {data['CIK']}")
    print(f"Business Section:\n{data['Business']}\n")


# COSINE SIMILARITY ANALYSIS

In [28]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assuming `year_data` is the list of dictionaries from your extraction step:
# e.g. [{'Year': '2023', 'CIK': '1018724', 'Business': '...'}, ...]

# 1. Convert to a DataFrame for convenience
df = pd.DataFrame(year_data)

# 2. Filter out any rows where extraction might have failed
df = df[df['Business'] != "Section not found"]

# 3. Vectorize the “Item 1. Business” text using TF–IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['Business'])

# 4. Compute pairwise cosine similarity for each document’s vector
cosine_sim = cosine_similarity(tfidf_matrix)

# 5. ‘cosine_sim’ is now a matrix showing similarities between each pair of 10-K “Business” sections
print("Cosine Similarity Matrix:\n", cosine_sim)

# (Optional) You can also attach the similarity matrix back to your DataFrame or store it in a file.


Cosine Similarity Matrix:
 [[1.         0.20177203 0.16203503 0.25780041]
 [0.20177203 1.         0.26206667 0.18331304]
 [0.16203503 0.26206667 1.         0.15296673]
 [0.25780041 0.18331304 0.15296673 1.        ]]
