In [None]:
import json
import subprocess
import os

def download_10k_filings(tickers, start_year):
    """
    Downloads and extracts 10-K filings for the specified tickers and starting year until the current year.
    This function will clone the EDGAR crawler repository, download the requirements.txt if it's not present,
    set up configurations, and run the download and extract scripts.

    Args:
        tickers (list): The list of stock ticker symbols for the companies (e.g., ["AAPL", "GOOG"]).
        start_year (int): The year from which to start downloading filings.
    """
    # Get the current year
    from datetime import datetime
    current_year = datetime.now().year

    # Clone the repository
    repo_url = "https://github.com/nlpaueb/edgar-crawler.git"
    repo_dir = "edgar-crawler"

    # Clone the repository only if it doesn't exist
    if not os.path.exists(repo_dir):
        print(f"Cloning the repository from {repo_url}...")
        subprocess.run(["git", "clone", repo_url], check=True)

    # Navigate to the repository directory
    os.chdir(repo_dir)

    # Download the requirements.txt only if it doesn't exist
    requirements_file = "requirements.txt"
    if not os.path.exists(requirements_file):
        print(f"Downloading {requirements_file}...")
        subprocess.run(["curl", "-O", "https://raw.githubusercontent.com/nlpaueb/edgar-crawler/master/requirements.txt"], check=True)

    # Install the required dependencies if not already installed
    print("Installing required dependencies...")
    subprocess.run(["pip", "install", "-r", requirements_file], check=True)

    for ticker in tickers:
        # Create the configuration dictionary
        config = {
            "download_filings": {
                "start_year": start_year,
                "end_year": current_year,
                "quarters": [1, 2, 3, 4],
                "filing_types": ["10-K"],
                "cik_tickers": [ticker],  # Dynamic ticker
                "user_agent": "Your Name (your-email@example.com)",  # Update with your information
                "raw_filings_folder": "RAW_FILINGS",
                "indices_folder": "INDICES",
                "filings_metadata_file": "FILINGS_METADATA.csv",
                "skip_present_indices": True
            },
            "extract_items": {
                "raw_filings_folder": "RAW_FILINGS",
                "extracted_filings_folder": "EXTRACTED_FILINGS",
                "filings_metadata_file": "FILINGS_METADATA.csv",
                "filing_types": ["10-K"],
                "include_signature": False,
                "items_to_extract": [],
                "remove_tables": True,
                "skip_extracted_filings": True
            }
        }

        # Write the config to a file
        with open('config.json', 'w') as f:
            json.dump(config, f, indent=4)

        # Run the download and extract scripts
        try:
            print(f"Downloading filings for {ticker} from {start_year} to {current_year}...")
            subprocess.run(["python", "download_filings.py"], check=True)

            print(f"Extracting items from filings for {ticker}...")
            subprocess.run(["python", "extract_items.py"], check=True)

            print(f"Process completed successfully for {ticker}!")
        except subprocess.CalledProcessError as e:
            print(f"An error occurred for {ticker}: {e}")
        finally:
            # Navigate back to the original directory
            os.chdir('..')
            os.chdir(repo_dir)  # Go back to repo_dir for the next ticker

    # Navigate back to the original directory after processing all tickers
    os.chdir('..')

# Example usage
n = 2  # Number of companies to fetch data for
tickers = ["AAPL", "GOOG"]  # Replace with the desired tickers for the companies
start_year = 2023  # Replace with the desired starting year

if len(tickers) != n:
    raise ValueError("The number of tickers provided does not match the specified number of companies (n).")

download_10k_filings(tickers, start_year)

Cloning the repository from https://github.com/nlpaueb/edgar-crawler.git...
Installing required dependencies...
Downloading filings for AAPL from 2023 to 2025...
Extracting items from filings for AAPL...
Process completed successfully for AAPL!
Downloading filings for GOOG from 2023 to 2025...
Extracting items from filings for GOOG...
Process completed successfully for GOOG!


In [None]:
import os
import json

def extract_all_json_content(folder_path):
    """
    Extracts all content from JSON files in the specified folder, using only the first three parts of the filename.

    Args:
        folder_path (str): Path to the folder containing the JSON files.

    Returns:
        list: A list of dictionaries containing the content of each JSON file.
    """
    extracted_content = []

    # Ensure the folder exists
    if not os.path.exists(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return extracted_content

    # Iterate through all files in the folder
    for file_name in os.listdir(folder_path):
        # Process only JSON files
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)

            try:
                # Extract the first three components from the filename
                parts = file_name.replace(".json", "").split("_")[:3]
                if len(parts) < 3:
                    print(f"Skipping invalid filename: {file_name}")
                    continue

                cik, filing_type, year = parts

                # Load the JSON content
                with open(file_path, 'r') as f:
                    content = json.load(f)

                # Add metadata to the content
                content["cik"] = cik
                content["filing_type"] = filing_type
                content["year"] = year

                # Append the content to the list
                extracted_content.append(content)
                print(f"Successfully extracted data from {file_name}")
            except Exception as e:
                print(f"Error reading {file_name}: {e}")

    return extracted_content


# Example usage
folder_path = "EXTRACTED_FILINGS"  # Replace with your folder path
data = extract_all_json_content("/content/edgar-crawler/datasets/EXTRACTED_FILINGS/10-K")

# Print summary of extracted content
print(f"\nExtracted data from {len(data)} files.")
for i, record in enumerate(data[:3]):  # Print a preview of the first 3 records
    print(f"\nRecord {i + 1}:")
    print(json.dumps(record, indent=4))


Successfully extracted data from 320193_10K_2024_0000320193-24-000123.json
Successfully extracted data from 1652044_10K_2022_0001652044-23-000016.json
Successfully extracted data from 1652044_10K_2023_0001652044-24-000022.json
Successfully extracted data from 1652044_10K_2024_0001652044-25-000014.json
Successfully extracted data from 320193_10K_2023_0000320193-23-000106.json

Extracted data from 5 files.

Record 1:
{
    "cik": "320193",
    "company": "Apple Inc.",
    "filing_type": "10K",
    "filing_date": "2024-11-01",
    "period_of_report": "2024-09-28",
    "sic": "3571",
    "state_of_inc": "CA",
    "state_location": "CA",
    "fiscal_year_end": "0928",
    "filing_html_index": "https://www.sec.gov/Archives/edgar/data/320193/0000320193-24-000123-index.html",
    "htm_filing_link": "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm",
    "complete_text_filing_link": "https://www.sec.gov/Archives/edgar/data/320193/0000320193-24-000123.txt",
   

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Load FinBERT model and tokenizer
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def analyze_sentiment_with_rating(text):
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    logits = outputs.logits.detach().numpy()[0]
    exp_logits = np.exp(logits)
    probabilities = exp_logits / np.sum(exp_logits)

    # Assign a rating based on probabilities
    # Weighted average for the rating: [Neutral, Positive, Negative] → [3, 5, 1]
    rating_weights = [3, 5, 1]
    rating = sum(prob * weight for prob, weight in zip(probabilities, rating_weights))

    return rating, probabilities

# List of dictionaries containing 10-K report data
data=data
# Initialize an empty DataFrame to hold the results
columns = [f'item_{i}' for i in range(1, 17)]  # Adjust range as needed for your items
results_df = pd.DataFrame(columns=['company'] + columns)

# Process each dictionary in the list
for report in data:
    company_name = report['company']
    row = {'company': company_name}

    for item in columns:
        if item in report:
            text = report[item]
            rating, _ = analyze_sentiment_with_rating(text)
            row[item] = rating
        else:
            row[item] = None  # Handle missing items

    results_df = results_df.append(row, ignore_index=True)

# Display the results as a table
print(results_df)

# Optionally, save the results to a CSV file
results_df.to_csv('10k_report_sentiment_scores.csv', index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


HBox(children=(FloatProgress(value=0.0, description='config.json', max=533.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='vocab.txt', max=226122.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='pytorch_model.bin', max=439101405.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='model.safetensors', max=439044180.0, style=ProgressStyle(…




  results_df = results_df.append(row, ignore_index=True)
  results_df = results_df.append(row, ignore_index=True)
  results_df = results_df.append(row, ignore_index=True)
  results_df = results_df.append(row, ignore_index=True)


         company    item_1    item_2    item_3    item_4    item_5    item_6  \
0     Apple Inc.  3.000026  3.036482  2.916076  2.979752  3.000030  2.946783   
1  Alphabet Inc.  4.994903  3.023447  2.716503  2.846893  3.000016  2.794634   
2  Alphabet Inc.  4.999897  3.036171  2.716503  2.846893  3.000013  2.961737   
3  Alphabet Inc.  4.999888  3.036171  2.716503  2.937676  3.000012  2.968993   
4     Apple Inc.  3.000038  3.031542  2.984476  2.970101  3.000014  2.952803   

     item_7    item_8  item_9   item_10   item_11   item_12   item_13  \
0  3.000322  3.000006  1.9892  3.000130  2.997840  2.995343  2.998478   
1  4.498440  3.000011  1.9892  2.999486  2.999809  2.999714  2.999930   
2  3.685588  3.000021  1.9892  2.999564  2.999859  2.999765  2.999945   
3  3.427395  3.000022  1.9892  2.999997  2.999854  2.999747  2.999938   
4  3.000205  3.000006  1.9892  2.999910  2.997852  2.996360  2.998415   

    item_14   item_15   item_16  
0  2.999835  3.000001  2.951547  
1  2.999757 

  results_df = results_df.append(row, ignore_index=True)


In [None]:
results_df

Unnamed: 0,company,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_13,item_14,item_15,item_16
0,Apple Inc.,3.000026,3.036482,2.916076,2.979752,3.00003,2.946783,3.000322,3.000006,1.9892,3.00013,2.99784,2.995343,2.998478,2.999835,3.000001,2.951547
1,Alphabet Inc.,4.994903,3.023447,2.716503,2.846893,3.000016,2.794634,4.49844,3.000011,1.9892,2.999486,2.999809,2.999714,2.99993,2.999757,3.000001,2.943011
2,Alphabet Inc.,4.999897,3.036171,2.716503,2.846893,3.000013,2.961737,3.685588,3.000021,1.9892,2.999564,2.999859,2.999765,2.999945,2.999889,3.000001,2.992582
3,Alphabet Inc.,4.999888,3.036171,2.716503,2.937676,3.000012,2.968993,3.427395,3.000022,1.9892,2.999997,2.999854,2.999747,2.999938,2.999897,3.000001,2.983649
4,Apple Inc.,3.000038,3.031542,2.984476,2.970101,3.000014,2.952803,3.000205,3.000006,1.9892,2.99991,2.997852,2.99636,2.998415,2.999873,3.000001,2.931669


In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Load FinBERT model and tokenizer
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def analyze_sentiment_with_rating(text):
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    logits = outputs.logits.detach().numpy()[0]
    exp_logits = np.exp(logits)
    probabilities = exp_logits / np.sum(exp_logits)

    # Assign a rating based on probabilities
    # Weighted average for the rating: [Neutral, Positive, Negative] → [3, 5, 1]
    rating_weights = [3, 5, 1]
    rating = sum(prob * weight for prob, weight in zip(probabilities, rating_weights))

    return rating, probabilities

# List of dictionaries containing 10-K report data
data=data

# Initialize a dictionary to hold DataFrames for each company
company_dfs = {}

# Process each dictionary in the list
for report in data:
    company_name = report['company']
    report_year = report['year']

    if company_name not in company_dfs:
        # Initialize a DataFrame for the company if it doesn't exist
        columns = [f'item_{i}' for i in range(1, 17)]
        company_dfs[company_name] = pd.DataFrame(columns=['year'] + columns)

    row = {'year': report_year}

    for item in columns:
        if item in report:
            text = report[item]
            rating, _ = analyze_sentiment_with_rating(text)
            row[item] = rating
        else:
            row[item] = None  # Handle missing items

    company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)

# Display the results as tables for each company
for company, df in company_dfs.items():
    print(f"Sentiment Scores for {company}:")
    print(df)
    # Optionally, save the results to CSV files
    df.to_csv(f'{company}_10k_report_sentiment_scores.csv', index=False)
    print("\n")


  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)
  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)
  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)
  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)


Sentiment Scores for Apple Inc.:
   year    item_1    item_2    item_3    item_4    item_5    item_6    item_7  \
0  2024  3.000026  3.036482  2.916076  2.979752  3.000030  2.946783  3.000322   
1  2023  3.000038  3.031542  2.984476  2.970101  3.000014  2.952803  3.000205   

     item_8  item_9  item_10   item_11   item_12   item_13   item_14  \
0  3.000006  1.9892  3.00013  2.997840  2.995343  2.998478  2.999835   
1  3.000006  1.9892  2.99991  2.997852  2.996360  2.998415  2.999873   

    item_15   item_16  
0  3.000001  2.951547  
1  3.000001  2.931669  


Sentiment Scores for Alphabet Inc.:
   year    item_1    item_2    item_3    item_4    item_5    item_6    item_7  \
0  2022  4.994903  3.023447  2.716503  2.846893  3.000016  2.794634  4.498440   
1  2023  4.999897  3.036171  2.716503  2.846893  3.000013  2.961737  3.685588   
2  2024  4.999888  3.036171  2.716503  2.937676  3.000012  2.968993  3.427395   

     item_8  item_9   item_10   item_11   item_12   item_13   item_14  

  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)


In [None]:
for company, df in company_dfs.items():
    print(f"\n{company}'s Sentiment Scores:")
    display(df)


Apple Inc.'s Sentiment Scores:


Unnamed: 0,year,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_13,item_14,item_15,item_16
0,2024,3.000026,3.036482,2.916076,2.979752,3.00003,2.946783,3.000322,3.000006,1.9892,3.00013,2.99784,2.995343,2.998478,2.999835,3.000001,2.951547
1,2023,3.000038,3.031542,2.984476,2.970101,3.000014,2.952803,3.000205,3.000006,1.9892,2.99991,2.997852,2.99636,2.998415,2.999873,3.000001,2.931669



Alphabet Inc.'s Sentiment Scores:


Unnamed: 0,year,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_13,item_14,item_15,item_16
0,2022,4.994903,3.023447,2.716503,2.846893,3.000016,2.794634,4.49844,3.000011,1.9892,2.999486,2.999809,2.999714,2.99993,2.999757,3.000001,2.943011
1,2023,4.999897,3.036171,2.716503,2.846893,3.000013,2.961737,3.685588,3.000021,1.9892,2.999564,2.999859,2.999765,2.999945,2.999889,3.000001,2.992582
2,2024,4.999888,3.036171,2.716503,2.937676,3.000012,2.968993,3.427395,3.000022,1.9892,2.999997,2.999854,2.999747,2.999938,2.999897,3.000001,2.983649
