
## Task 1.1


In [1]:
!pip install sec-edgar-downloader



Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.2-py3-none-any.whl.metadata (11 kB)
Collecting pyrate-limiter>=3.1.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.6.1-py3-none-any.whl.metadata (24 kB)
Downloading sec_edgar_downloader-5.0.2-py3-none-any.whl (14 kB)
Downloading pyrate_limiter-3.6.1-py3-none-any.whl (26 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.6.1 sec-edgar-downloader-5.0.2


In [2]:
from sec_edgar_downloader import Downloader
import os


In [3]:
def download_10k(ticker, start_year=1995, end_year=2023):

  
  for year in range(start_year, end_year + 1):
    dl = Downloader(download_folder=f'/kaggle/working/dataset/{year}',email_address="xyz@yahoo.com",company_name=ticker) 
    try:
      dl.get("10-K", ticker, after=f"{year}-01-01",before=f"{year+1}-01-01")  
      print(f"Downloaded 10-K for {ticker} - {year}")
    except Exception as e:
      print(f"Error downloading {ticker} - {year}: {e}")     

In [None]:
# Here since we're running on Kaggle Kernel the stock name has been hardcded

# we can use input() function to take input from user

In [4]:
ticker = "TSLA"

In [5]:
download_10k(ticker)

Downloaded 10-K for TSLA - 1995
Downloaded 10-K for TSLA - 1996
Downloaded 10-K for TSLA - 1997
Downloaded 10-K for TSLA - 1998
Downloaded 10-K for TSLA - 1999
Downloaded 10-K for TSLA - 2000
Downloaded 10-K for TSLA - 2001
Downloaded 10-K for TSLA - 2002
Downloaded 10-K for TSLA - 2003
Downloaded 10-K for TSLA - 2004
Downloaded 10-K for TSLA - 2005
Downloaded 10-K for TSLA - 2006
Downloaded 10-K for TSLA - 2007
Downloaded 10-K for TSLA - 2008
Downloaded 10-K for TSLA - 2009
Downloaded 10-K for TSLA - 2010
Downloaded 10-K for TSLA - 2011
Downloaded 10-K for TSLA - 2012
Downloaded 10-K for TSLA - 2013
Downloaded 10-K for TSLA - 2014
Downloaded 10-K for TSLA - 2015
Downloaded 10-K for TSLA - 2016
Downloaded 10-K for TSLA - 2017
Downloaded 10-K for TSLA - 2018
Downloaded 10-K for TSLA - 2019
Downloaded 10-K for TSLA - 2020
Downloaded 10-K for TSLA - 2021
Downloaded 10-K for TSLA - 2022
Downloaded 10-K for TSLA - 2023


In [None]:
#Task 1.2 Convert to json object base

In [6]:
import os
import pandas as pd
import spacy
from spacy.matcher import Matcher

import json

In [7]:
# Load spaCy model for English (adjust language if needed)
nlp = spacy.load("en_core_web_sm")

# Define document path (replace with your actual path)
#doc_path = f"/content/sample_data/dataset/{year}/sec-edgar-filings/{ticker}/10-K"

# Define output path for knowledge base (replace as needed)
kb_path = "knowledge_base.json"



In [8]:
def preprocess_text(text):

  # Convert to lowercase and remove special characters
  text = text.lower()
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
  # Sentence segmentation
  sentences = [sent.text.strip() for sent in nlp(text).sents]
  return sentences



In [9]:
def extract_entities(sentences):
    entities_list = []
    for sentence in sentences:
        doc = nlp(sentence)
        matcher = Matcher(nlp.vocab)
        # Define entity patterns (adjust as needed)
        patterns = [
            # Company names
            [{"POS": "PROPN"}]
        ]
        matcher.add("Company", patterns)
        matches = matcher(doc)
        entities = {}
        for match_id, start, end in matches:
            span = doc[start:end]
            entities[span.text] = span.label_
        entities_list.append(entities)
    return entities_list


In [10]:
def process_filing(filepath):
  """Processes a 10-K filing and extracts information.

  Args:
      filepath: The path to the 10-K filing document.

  Returns:
      A dictionary containing extracted information from the filing.
  """
  extracted_data = []

  with open(filepath, "r") as f:
      for line in f:
        line=preprocess_text(line)
        entities = extract_entities(line)
        extracted_data.append({"text": line, "entities": entities})
      
  return extracted_data



In [11]:
import os
import json
import re


def create_knowledge_base(kb_dir):
    """Creates a knowledge base from all text files in a directory.

    Args:
        doc_path: The path to the directory containing text files.
        kb_dir: The directory to save the knowledge base JSON files.
    """
    # Ensure that the directory to save knowledge base files exists
    os.makedirs(kb_dir, exist_ok=True)
    for year in range(1995, 2024):
      doc_path = f"/kaggle/working/dataset/{year}/sec-edgar-filings/{ticker}/10-K"
      if os.path.isdir(doc_path):
        # Loop through each subfolder in the specified directory
       try:
          for subfolder in os.listdir(doc_path):
              subfolder_path = os.path.join(doc_path, subfolder)
              # Check if the item is a directory
              if os.path.isdir(subfolder_path):
                  # List all files in the subfolder
                  files = os.listdir(subfolder_path)
                  # Check if there is exactly one file in the subfolder and it is a text file
                  if len(files) == 1 and files[0].endswith(".txt"):
                      file_path = os.path.join(subfolder_path, files[0])
                      # Preprocess the data from the file
                      print(file_path)
                      
                      filing_data = process_filing(file_path)
                      
                      # Save knowledge base as JSON
                      kb_filename = files[0].replace(".txt", f"{year}_knowledge_base.json")
                      kb_path = os.path.join(kb_dir, kb_filename)
                      with open(kb_path, "w") as f:
                          json.dump({"filename": files[0], "data": filing_data}, f, indent=2)
                      print(f"Knowledge base created and saved to: {kb_path}")
       except FileNotFoundError:
            print(f"Skipping year {year}: Folder not found")            
      else:
        print(f"Skipping year {year}: Year folder not found")
         



In [12]:
# Example usage:
kb_dir = "knowledge_bases"  # Directory to save knowledge base JSON files
create_knowledge_base(kb_dir)


Skipping year 1995: Year folder not found
Skipping year 1996: Year folder not found
Skipping year 1997: Year folder not found
Skipping year 1998: Year folder not found
Skipping year 1999: Year folder not found
Skipping year 2000: Year folder not found
Skipping year 2001: Year folder not found
Skipping year 2002: Year folder not found
Skipping year 2003: Year folder not found
Skipping year 2004: Year folder not found
Skipping year 2005: Year folder not found
Skipping year 2006: Year folder not found
Skipping year 2007: Year folder not found
Skipping year 2008: Year folder not found
Skipping year 2009: Year folder not found
Skipping year 2010: Year folder not found
/kaggle/working/dataset/2011/sec-edgar-filings/TSLA/10-K/0001193125-11-054847/full-submission.txt
Knowledge base created and saved to: knowledge_bases/full-submission2011_knowledge_base.json
/kaggle/working/dataset/2012/sec-edgar-filings/TSLA/10-K/0001193125-12-081990/full-submission.txt
Knowledge base created and saved to: kn