In [None]:
import json
import subprocess
import os

def download_10k_filings(ticker, start_year):
    """
    Downloads and extracts 10-K filings for the specified ticker and starting year until the current year.
    This function will clone the EDGAR crawler repository, download the requirements.txt if it's not present,
    set up configurations, and run the download and extract scripts.

    Args:
        ticker (str): The stock ticker symbol for the company (e.g., "AAPL").
        start_year (int): The year from which to start downloading filings.
    """
    # Get the current year
    from datetime import datetime
    current_year = datetime.now().year

    # Clone the repository
    repo_url = "https://github.com/nlpaueb/edgar-crawler.git"
    repo_dir = "edgar-crawler"

    # Clone the repository only if it doesn't exist
    if not os.path.exists(repo_dir):
        print(f"Cloning the repository from {repo_url}...")
        subprocess.run(["git", "clone", repo_url], check=True)

    # Navigate to the repository directory
    os.chdir(repo_dir)

    # Download the requirements.txt only if it doesn't exist
    requirements_file = "requirements.txt"
    if not os.path.exists(requirements_file):
        print(f"Downloading {requirements_file}...")
        subprocess.run(["curl", "-O", "https://raw.githubusercontent.com/nlpaueb/edgar-crawler/master/requirements.txt"], check=True)

    # Install the required dependencies if not already installed
    print("Installing required dependencies...")
    subprocess.run(["pip", "install", "-r", requirements_file], check=True)

    # Create the configuration dictionary
    config = {
        "download_filings": {
            "start_year": start_year,
            "end_year": current_year,
            "quarters": [1, 2, 3, 4],
            "filing_types": ["10-K"],
            "cik_tickers": [ticker],  # Dynamic ticker
            "user_agent": "Your Name (your-email@example.com)",  # Update with your information
            "raw_filings_folder": "RAW_FILINGS",
            "indices_folder": "INDICES",
            "filings_metadata_file": "FILINGS_METADATA.csv",
            "skip_present_indices": True
        },
        "extract_items": {
            "raw_filings_folder": "RAW_FILINGS",
            "extracted_filings_folder": "EXTRACTED_FILINGS",
            "filings_metadata_file": "FILINGS_METADATA.csv",
            "filing_types": ["10-K"],
            "include_signature": False,
            "items_to_extract": [],
            "remove_tables": True,
            "skip_extracted_filings": True
        }
    }

    # Write the config to a file
    with open('config.json', 'w') as f:
        json.dump(config, f, indent=4)

    # Run the download and extract scripts
    try:
        print(f"Downloading filings for {ticker} from {start_year} to {current_year}...")
        subprocess.run(["python", "download_filings.py"], check=True)

        print(f"Extracting items from filings for {ticker}...")
        subprocess.run(["python", "extract_items.py"], check=True)

        print("Process completed successfully!")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")
    finally:
        # Navigate back to the original directory
        os.chdir('..')




# Example usage
ticker = "GOOG"  # Replace with the desired ticker
start_year = 2019  # Replace with the desired starting year
download_10k_filings(ticker,start_year)

Cloning the repository from https://github.com/nlpaueb/edgar-crawler.git...
Installing required dependencies...
Downloading filings for GOOG from 2019 to 2025...
Extracting items from filings for GOOG...
Process completed successfully!


In [None]:
import os
import json

def extract_all_json_content(folder_path):
    """
    Extracts all content from JSON files in the specified folder, using only the first three parts of the filename.

    Args:
        folder_path (str): Path to the folder containing the JSON files.

    Returns:
        list: A list of dictionaries containing the content of each JSON file.
    """
    extracted_content = []

    # Ensure the folder exists
    if not os.path.exists(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return extracted_content

    # Iterate through all files in the folder
    for file_name in os.listdir(folder_path):
        # Process only JSON files
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)

            try:
                # Extract the first three components from the filename
                parts = file_name.replace(".json", "").split("_")[:3]
                if len(parts) < 3:
                    print(f"Skipping invalid filename: {file_name}")
                    continue

                cik, filing_type, year = parts

                # Load the JSON content
                with open(file_path, 'r') as f:
                    content = json.load(f)

                # Add metadata to the content
                content["cik"] = cik
                content["filing_type"] = filing_type
                content["year"] = year

                # Append the content to the list
                extracted_content.append(content)
                print(f"Successfully extracted data from {file_name}")
            except Exception as e:
                print(f"Error reading {file_name}: {e}")

    return extracted_content


# Example usage
folder_path = "EXTRACTED_FILINGS"  # Replace with your folder path
data = extract_all_json_content("/content/edgar-crawler/datasets/EXTRACTED_FILINGS/10-K")

# Print summary of extracted content
print(f"\nExtracted data from {len(data)} files.")
for i, record in enumerate(data[:3]):  # Print a preview of the first 3 records
    print(f"\nRecord {i + 1}:")
    print(json.dumps(record, indent=4))


Successfully extracted data from 1652044_10K_2019_0001652044-20-000008.json
Successfully extracted data from 1652044_10K_2022_0001652044-23-000016.json
Successfully extracted data from 1652044_10K_2023_0001652044-24-000022.json
Successfully extracted data from 1652044_10K_2020_0001652044-21-000010.json
Successfully extracted data from 1652044_10K_2024_0001652044-25-000014.json
Successfully extracted data from 1652044_10K_2018_0001652044-19-000004.json
Successfully extracted data from 1652044_10K_2021_0001652044-22-000019.json

Extracted data from 7 files.

Record 1:
{
    "cik": "1652044",
    "company": "Alphabet Inc.",
    "filing_type": "10K",
    "filing_date": "2020-02-04",
    "period_of_report": "2019-12-31",
    "sic": "7370",
    "state_of_inc": "DE",
    "state_location": "CA",
    "fiscal_year_end": "1231",
    "filing_html_index": "https://www.sec.gov/Archives/edgar/data/1652044/0001652044-20-000008-index.html",
    "htm_filing_link": "https://www.sec.gov/Archives/edgar/dat

In [None]:
data[0].keys()

dict_keys(['cik', 'company', 'filing_type', 'filing_date', 'period_of_report', 'sic', 'state_of_inc', 'state_location', 'fiscal_year_end', 'filing_html_index', 'htm_filing_link', 'complete_text_filing_link', 'filename', 'item_1', 'item_1A', 'item_1B', 'item_1C', 'item_2', 'item_3', 'item_4', 'item_5', 'item_6', 'item_7', 'item_7A', 'item_8', 'item_9', 'item_9A', 'item_9B', 'item_9C', 'item_10', 'item_11', 'item_12', 'item_13', 'item_14', 'item_15', 'item_16', 'year'])

In [None]:
compname=data[0]["company"]

In [None]:
compname

'Alphabet Inc.'

In [None]:
data[0]["item_2"]

'ITEM 2.\nPROPERTIES\nOur headquarters are located in Mountain View, California. We also own and lease office and building space in the surrounding areas near our headquarters, which we believe is sufficient to accommodate anticipated future growth. In addition, we own and lease office/building space and research and development sites around the world, primarily in North America, Europe, South America, and Asia. We own and operate data centers in the U.S., Europe, South America, and Asia. We believe our existing facilities, both owned and leased, are in good condition and suitable for the conduct of our business.'

In [None]:
data[0]["filing_date"]

'2020-02-04'

In [None]:
result_string=""

In [None]:

result_string = " ".join(data[4].values())

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Load FinBERT model and tokenizer
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def analyze_sentiment_with_rating(text):
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    logits = outputs.logits.detach().numpy()[0]
    exp_logits = np.exp(logits)
    probabilities = exp_logits / np.sum(exp_logits)

    # Assign a rating based on probabilities
    # Weighted average for the rating: [Neutral, Positive, Negative] → [3, 5, 1]
    rating_weights = [3, 5, 1]
    rating = sum(prob * weight for prob, weight in zip(probabilities, rating_weights))

    return rating, probabilities

# List of dictionaries containing 10-K report data
data=data

# Initialize a dictionary to hold DataFrames for each company
company_dfs = {}

# Process each dictionary in the list
for report in data:
    company_name = report['company']
    report_year = report['year']

    if company_name not in company_dfs:
        # Initialize a DataFrame for the company if it doesn't exist
        columns = [f'item_{i}' for i in range(1, 17)]
        company_dfs[company_name] = pd.DataFrame(columns=['year'] + columns)

    row = {'year': report_year}

    for item in columns:
        if item in report:
            text = report[item]
            rating, _ = analyze_sentiment_with_rating(text)
            row[item] = rating
        else:
            row[item] = None  # Handle missing items

    company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)

# Display the results as tables for each company
for company, df in company_dfs.items():
    print(f"Sentiment Scores for {company}:")
    print(df)
    # Optionally, save the results to CSV files
    df.to_csv(f'{company}_10k_report_sentiment_scores.csv', index=False)
    print("\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


HBox(children=(FloatProgress(value=0.0, description='config.json', max=533.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='vocab.txt', max=226122.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='pytorch_model.bin', max=439101405.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='model.safetensors', max=439044180.0, style=ProgressStyle(…




  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)
  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)
  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)
  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)
  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)
  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)


Sentiment Scores for Alphabet Inc.:
   year    item_1    item_2    item_3    item_4    item_5    item_6    item_7  \
0  2019  4.987228  3.008968  2.853617  2.916617  3.000019  3.000088  3.034284   
1  2022  4.994903  3.023447  2.716503  2.846893  3.000016  2.794634  4.498440   
2  2023  4.999897  3.036171  2.716503  2.846893  3.000013  2.961737  3.685588   
3  2020  4.940339  3.008968  2.692213  2.916617  3.000019  3.000088  3.055421   
4  2024  4.999888  3.036171  2.716503  2.937676  3.000012  2.968993  3.427395   
5  2018  3.193768  2.999986  2.859544  2.916617  3.000020  3.000088  3.723905   
6  2021  4.943851  3.003889  2.716503  2.916617  3.000013  2.794634  3.132443   

     item_8  item_9   item_10   item_11   item_12   item_13   item_14  \
0  3.000017  1.9892  2.999915  2.999822  2.999699  2.999945  2.999754   
1  3.000011  1.9892  2.999486  2.999809  2.999714  2.999930  2.999757   
2  3.000021  1.9892  2.999564  2.999859  2.999765  2.999945  2.999889   
3  3.000000  1.9892  2.

  company_dfs[company_name] = company_dfs[company_name].append(row, ignore_index=True)


In [None]:
for company, df in company_dfs.items():
    print(f"\n{company}'s Sentiment Scores:")
    display(df)


Alphabet Inc.'s Sentiment Scores:


Unnamed: 0,year,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_13,item_14,item_15,item_16
0,2019,4.987228,3.008968,2.853617,2.916617,3.000019,3.000088,3.034284,3.000017,1.9892,2.999915,2.999822,2.999699,2.999945,2.999754,3.0,2.943011
1,2022,4.994903,3.023447,2.716503,2.846893,3.000016,2.794634,4.49844,3.000011,1.9892,2.999486,2.999809,2.999714,2.99993,2.999757,3.000001,2.943011
2,2023,4.999897,3.036171,2.716503,2.846893,3.000013,2.961737,3.685588,3.000021,1.9892,2.999564,2.999859,2.999765,2.999945,2.999889,3.000001,2.992582
3,2020,4.940339,3.008968,2.692213,2.916617,3.000019,3.000088,3.055421,3.0,1.9892,2.999701,2.999803,2.999682,2.99996,2.999751,3.000001,2.943011
4,2024,4.999888,3.036171,2.716503,2.937676,3.000012,2.968993,3.427395,3.000022,1.9892,2.999997,2.999854,2.999747,2.999938,2.999897,3.000001,2.983649
5,2018,3.193768,2.999986,2.859544,2.916617,3.00002,3.000088,3.723905,3.000018,1.9892,2.999562,2.9998,2.99986,2.999942,2.999756,3.0,2.943011
6,2021,4.943851,3.003889,2.716503,2.916617,3.000013,2.794634,3.132443,3.00001,1.9892,2.999517,2.999784,2.99968,2.99993,2.999748,3.000001,2.943011


In [None]:
averages = company_dfs["Alphabet Inc."].mean()

# Convert the averages to a DataFrame and transpose it
averages_df = pd.DataFrame(averages).transpose()

# Add a label for the averages row
averages_df.index = ['Average']
# Concatenate the original DataFrame with the averages DataFrame
df_with_averages = pd.concat([df, averages_df])
df_with_averages['row_average'] = df_with_averages.mean(axis=1)

  df_with_averages['row_average'] = df_with_averages.mean(axis=1)


In [None]:
df_with_averages

Unnamed: 0,year,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_13,item_14,item_15,item_16,row_average
0,2019.0,4.987228,3.008968,2.853617,2.916617,3.000019,3.000088,3.034284,3.000017,1.9892,2.999915,2.999822,2.999699,2.999945,2.999754,3.0,2.943011,3.045762
1,2022.0,4.994903,3.023447,2.716503,2.846893,3.000016,2.794634,4.49844,3.000011,1.9892,2.999486,2.999809,2.999714,2.99993,2.999757,3.000001,2.943011,3.11286
2,2023.0,4.999897,3.036171,2.716503,2.846893,3.000013,2.961737,3.685588,3.000021,1.9892,2.999564,2.999859,2.999765,2.999945,2.999889,3.000001,2.992582,3.076727
3,2020.0,4.940339,3.008968,2.692213,2.916617,3.000019,3.000088,3.055421,3.0,1.9892,2.999701,2.999803,2.999682,2.99996,2.999751,3.000001,2.943011,3.034048
4,2024.0,4.999888,3.036171,2.716503,2.937676,3.000012,2.968993,3.427395,3.000022,1.9892,2.999997,2.999854,2.999747,2.999938,2.999897,3.000001,2.983649,3.066184
5,2018.0,3.193768,2.999986,2.859544,2.916617,3.00002,3.000088,3.723905,3.000018,1.9892,2.999562,2.9998,2.99986,2.999942,2.999756,3.0,2.943011,2.976567
6,2021.0,4.943851,3.003889,2.716503,2.916617,3.000013,2.794634,3.132443,3.00001,1.9892,2.999517,2.999784,2.99968,2.99993,2.999748,3.000001,2.943011,3.027427
Average,2.884574600331457e+26,4.722839,3.0168,2.753055,2.899705,3.000016,2.931466,3.508211,3.000014,1.9892,2.999677,2.999819,2.999735,2.999941,2.999793,3.000001,2.955898,3.048511


In [None]:
dfplot=company_dfs["Alphabet Inc."]

In [None]:
item_descriptions = {
    "item_1": "Business Overview",
    "item_1A": "Risk Factors",
    "item_1B": "Staff Comments",
    "item_1C": "Mine Safety",
    "item_2": "Company Properties",
    "item_3": "Legal Proceedings",
    "item_4": "Mine Disclosures",
    "item_5": "Equity Market",
    "item_6": "Financial Summary",
    "item_7": "Management Analysis",
    "item_7A": "Market Risks",
    "item_8": "Financial Statements",
    "item_9": "Accountant Changes",
    "item_9A": "Control Procedures",
    "item_9B": "Other Information",
    "item_9C": "Jurisdiction Limits",
    "item_10": "Corporate Governance",
    "item_11": "Executive Pay",
    "item_12": "Shareholder Info",
    "item_13": "Related Parties",
    "item_14": "Audit Fees",
    "item_15": "Supporting Docs",
    "item_16": "10-K Summary"
}

In [None]:
import pandas as pd
import plotly.express as px

df = pd.DataFrame(dfplot)

df_long = df.melt(id_vars='year', var_name='Item', value_name='Score')

# Map items to their descriptions
df_long['Item Description'] = df_long['Item'].map(item_descriptions)

# Create the interactive plot
fig = px.line(df_long, x='year', y='Score', color='Item Description', title='Sentiment Scores Over Years for Different Items')

# Show the plot
fig.show()

In [None]:
df_with_averages.rename(columns={"row_average": "Year_Score"}, inplace=True)

In [None]:
df_with_averages['Year_Score']

Unnamed: 0,Year_Score
0,3.045762
1,3.11286
2,3.076727
3,3.034048
4,3.066184
5,2.976567
6,3.027427
Average,3.048511


In [None]:
import plotly.express as px

# Assuming your DataFrame is named df_with_averages and has a 'Year_Score' column
fig = px.line(
    df_with_averages,
    y="Year_Score",
    title="Line Graph of Year Score",
    labels={"Year_Score": "Score"},  # Label for y-axis
    markers=True  # Add markers to the line
)

# Customize layout
fig.update_layout(
    xaxis_title="Index",  # Default x-axis if no column is specified
    yaxis_title="Year Score",
    template="plotly_white",
    title_font_size=18
)

# Show the plot
fig.show()


In [None]:
item7_text = data[0]["item_7"]

In [None]:
import re
def extract_revenue(text):
    # Look for revenue values in the text
    revenue_match = re.search(r"Revenues were \$([\d\.]+)\s*(billion|million|trillion)", text, re.IGNORECASE)

    if revenue_match:
        value = float(revenue_match.group(1))
        unit = revenue_match.group(2).lower()

        # Convert revenue to consistent unit (defaulting to billions)
        if unit == "million":
            value /= 1000  # Convert million to billion
        elif unit == "trillion":
            value *= 1000  # Convert trillion to billion

        return f"Revenue: ${value} billion"
    else:
        return "Revenue not found."
for i in range(5):
    revenue_text = data[i]["item_7"]
    revenue_result = extract_revenue(revenue_text)
    print(f"{revenue_result}")

In [None]:
!pip install py-readability-metrics
from readability import Readability

# Function to compute readability metrics for a given text
def compute_readability(text):
    r = Readability(text)
    try:
        flesch = r.flesch().score
    except:
        flesch = None

    try:
        fog = r.gunning_fog().score
    except:
        fog = None

    try:
        smog = r.smog().score
    except:
        smog = None

    return {"Flesch Reading Ease": flesch, "Gunning Fog Index": fog, "SMOG Index": smog}

# List of item keys (assuming 'item_1' to 'item_15')
item_keys = [f"item_{i}" for i in range(1, 16)]

# Function to compute readability for a single company
def compute_company_readability(company_data):
    scores = {"Flesch Reading Ease": [], "Gunning Fog Index": [], "SMOG Index": []}

    for item in item_keys:
        if item in company_data and company_data[item]:  # Check if item exists and is not empty
            readability = compute_readability(company_data[item])
            for key in scores:
                if readability[key] is not None:
                    scores[key].append(readability[key])

    # Compute average readability scores for the company
    avg_scores = {key: sum(values) / len(values) if values else None for key, values in scores.items()}
    return avg_scores

# Assuming `data` is a list of dictionaries containing all companies' data

# Compute readability for the first 5 companies
company_readability_scores = [compute_company_readability(data[i]) for i in range(5)]

# Print results
for i, scores in enumerate(company_readability_scores):
    print(f"Company {i+1} Readability Scores:")
    print(scores)
    print("-" * 50)



In [None]:
!pip install readability-lxml

In [None]:
import re



# Regular expressions for company and person names
company_pattern = r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+Company\b"
people_pattern = r"([A-Z][a-z]+(?:\s[A-Z]\.)?(?:\s[A-Z][a-z]+)+)"
people_matches = re.findall(people_pattern, result_string)

# Filter people matches to exclude irrelevant capitalized text
# Assuming people names appear with clear separation and context
filtered_people = [name for name in people_matches if len(name.split()) >= 2]

# Deduplicate the results
company_names = set(company_pattern)
people_names = set(filtered_people)

# Print the results
print("Company Name:")
for company in company_names:
    print(company)

print("\nPeople Names:")
for person in sorted(people_names):
    print(person)
companies = list(company_names)
people = list(people_names)
s=''
for i in people:
  s+=i+" "

In [None]:
import requests

# TextRazor API Key
API_KEY = "068ac1f6ee66ff8b078bd47d186336f9ae76ce919712ab203f287401"  # Replace with your actual API key

# Input text for NER
text = s

# TextRazor API Endpoint
endpoint = "https://api.textrazor.com/"

# Request Headers
headers = {
    "x-textrazor-key": API_KEY,
    "Content-Type": "application/x-www-form-urlencoded"
}

# Request Data
data = {
    "text": text,
    "extractors": "entities"  # Specify that we want to extract entities
}

# Send the POST request
response = requests.post(endpoint, headers=headers, data=data)

# Check the response
if response.status_code == 200:
    response_data = response.json()
    print("TextRazor NER Results:")

    # Extract entities of type 'Person'
    entities = response_data.get("response", {}).get("entities", [])
    people = [entity["entityId"] for entity in entities if "type" in entity and "Person" in entity["type"]]
    print("People Detected:", people)
else:
    print("Error:", response.status_code, response.text)

In [None]:
import requests

def get_top_news(query, api_key):
    """Fetch top news articles based on the query using the News API.

    Args:
        query (str): Search query.
        api_key (str): Your News API key.

    Returns:
        list: A list of URLs of articles related to the query.
    """
    url = "https://newsapi.org/v2/everything"
    params = {
        'q': query,
        'sortBy': 'relevancy',
        'apiKey': api_key
    }

    response = requests.get(url, params=params)

    if response.status_code == 200:
        data = response.json()
        return [article["url"] for article in data.get("articles", [])]
    else:
        print(f"Error: Unable to fetch news for query '{query}'. HTTP Status Code: {response.status_code}")
        return []

if __name__ == "__main__":
    # Replace 'YOUR_API_KEY' with your actual News API key
    API_KEY = "dcc1378340a64ab6b76ba8c6ded498d1"

    # Define the company name
    company= compname
  # Change this to any company name as needed

    # Fetch news articles about the company
    print(f"\nFetching news articles for {company}...\n")
    company_query = f'"{company}"'
    company_urls = get_top_news(company_query, API_KEY)

    # Output the results for the company
    print(f"News articles for {company}:\n")
    if company_urls:
        for url in company_urls:
            print(url)
    else:
        print(f"No news articles found for {company}.")

    # Input a list of names
    print(f"\nFetching news articles for {company} and specific individuals...\n")
    names = people  # Ensure you define the `people` variable with the list of names

    # Dictionary to store URLs for each person
    results = {}

    # Fetch news articles for each person and the company
    for person_name in names:
        query = f'"{person_name}" AND "{company}"'
        urls = get_top_news(query, API_KEY)
        results[person_name] = urls

    # Output the results for individuals
    for person, urls in results.items():
        print(f"\nNews articles for {person} and {company}:\n")
        if urls:
            for url in urls:
                print(url)
        else:
            print(f"No news articles found for {person} and {company}.")

