In [2]:
import pandas as pd

# Read the holdings export; malformed rows are dropped instead of aborting the load
russell_unclean_df = pd.read_csv("split_russel.csv", on_bad_lines='skip')

# 'Market Value' is handled as text here: strip the comma separators, then
# coerce to a numeric dtype (cells that still fail to parse become NaN)
russell_unclean_df['Market Value'] = pd.to_numeric(
    russell_unclean_df['Market Value'].str.replace(',', ''),
    errors='coerce',
)

# Largest market value first
sorted_df = russell_unclean_df.sort_values('Market Value', ascending=False)
print(sorted_df.head(5))

  Ticker                            Name                   Sector  \
0   INSM                      INSMED INC              Health Care   
1   FTAI               FTAI AVIATION LTD              Industrials   
2  XTSLA  BLK CSH FND TREASURY SL AGENCY  Cash and/or Derivatives   
3     FN                        FABRINET   Information Technology   
4   PCVX                     VAXCYTE INC              Health Care   

    Asset Class  Market Value  Weight (%)  Notional Value          Shares  \
0        Equity  3.026505e+08        0.46  302,650,483.35    3,945,385.00   
1        Equity  2.479806e+08        0.38  247,980,623.80    2,537,665.00   
2  Money Market  2.355547e+08        0.36  235,554,705.69  235,554,706.00   
3        Equity  2.329875e+08        0.36  232,987,503.36      926,318.00   
4        Equity  2.290238e+08        0.35  229,023,767.28    2,779,752.00   

    Price       Location                      Exchange Currency  FX Rate  \
0   76.71  United States                      

In [None]:
import os

# Read the sec-api key from the SEC_API_KEY environment variable so the real
# credential never has to be stored in the notebook; the 'XXXX' placeholder is
# kept as the fallback when the variable is not set.
API_KEY = os.environ.get('SEC_API_KEY', 'XXXX')

# create batches of tickers: [[A,B,C], [D,E,F], ...]
# a single batch has a maximum of max_length_of_batch tickers
def create_batches(tickers=None, max_length_of_batch=100):
    """Split ``tickers`` into consecutive batches of at most ``max_length_of_batch``.

    Example: ``create_batches(['A', 'B', 'C'], 2)`` returns ``[['A', 'B'], ['C']]``.

    Args:
        tickers: Iterable of ticker symbols; ``None`` is treated as empty.
        max_length_of_batch: Maximum number of tickers per batch (must be >= 1).

    Returns:
        A list of lists that preserves the input order. Empty input yields
        ``[]`` (no batches), so callers looping over the result never receive
        an empty batch.

    Raises:
        ValueError: If ``max_length_of_batch`` is smaller than 1.
    """
    if max_length_of_batch < 1:
        raise ValueError('max_length_of_batch must be at least 1')

    # Materialise once so any iterable (list, tuple, Series, generator) can be sliced.
    ticker_list = [] if tickers is None else list(tickers)

    return [
        ticker_list[start:start + max_length_of_batch]
        for start in range(0, len(ticker_list), max_length_of_batch)
    ]

# Group every ticker of the sorted holdings table into batches and show them
batches = create_batches([symbol for symbol in sorted_df['Ticker']])
print(batches)

[['JNJ', 'MCD', 'RTX']]


In [5]:
from sec_api import QueryApi
import json
import time

# Module-level sec-api query client, constructed with API_KEY; download_10K_metadata
# below calls its get_filings() method.
queryApi = QueryApi(api_key=API_KEY)

def download_10K_metadata(tickers=None, start_year=2015, end_year=2017,
                          output_path='additional_10k.json', request_delay=5):
    """Download 10-K filing metadata for ``tickers`` and keep only complete tickers.

    The tickers are split into batches with ``create_batches``; for every batch
    one query per year is sent through the module-level ``queryApi`` client.
    A ticker's filings are kept only when at least one filing was returned for
    every year from ``start_year`` to ``end_year`` (inclusive).

    Args:
        tickers: Iterable of ticker symbols; ``None`` is treated as empty.
        start_year: First filing year to query (inclusive).
        end_year: Last filing year to query (inclusive).
        output_path: JSON file the kept metadata is written to. It is only
            written when at least one filing was kept.
        request_delay: Seconds to sleep after each API request.

    Returns:
        A list of metadata dicts with the keys ``ticker``, ``companyName``,
        ``cik``, ``formType``, ``year``, ``filedAt`` and ``filingUrl``. The
        list is empty when no ticker had filings for every requested year.
    """
    print('✅ Starting downloading metadata for years {} to {}'.format(start_year, end_year))

    ticker_list = [] if tickers is None else list(tickers)
    # A ticker is kept only if it has filings for exactly this set of years.
    required_years = set(range(start_year, end_year + 1))
    all_metadata = []

    for batch in create_batches(ticker_list):
        if not batch:
            # An empty batch would produce the query 'ticker:()'; skip it.
            continue

        # The ticker clause is the same for every year of this batch.
        ticker_query = 'ticker:({})'.format(', '.join(batch))
        ticker_data = {}

        for year in range(start_year, end_year + 1):
            query_string = (
                '{ticker_query} AND filedAt:[{start_year}-01-01 TO {end_year}-12-31] '
                'AND formType:"10-K" AND NOT formType:"10-K/A" AND NOT formType:NT'
            ).format(ticker_query=ticker_query, start_year=year, end_year=year)

            query = {
                "query": query_string,
                "from": "0",
                "size": "200",
                "sort": [{"filedAt": {"order": "desc"}}]
            }

            response = queryApi.get_filings(query)

            # Group the returned filings by the ticker reported in each filing.
            for filing in response['filings']:
                ticker = filing['ticker']
                ticker_data.setdefault(ticker, []).append({
                    'ticker': ticker,
                    'companyName': filing['companyName'],
                    'cik': filing['cik'],
                    'formType': filing['formType'],
                    'year': year,
                    'filedAt': filing['filedAt'],
                    'filingUrl': filing['linkToFilingDetails'],
                })

            time.sleep(request_delay)  # Delay before the next request

        # Keep only tickers that have filings for all requested years.
        for data in ticker_data.values():
            if {entry['year'] for entry in data} == required_years:
                all_metadata.extend(data)

        print('✅ Downloaded and filtered metadata for batch', batch)

    if not all_metadata:
        print('❌ No complete metadata found for the specified tickers and years.')
        # Return the (empty) list so the return type is the same on every path.
        return all_metadata

    with open(output_path, 'w') as json_file:
        json.dump(all_metadata, json_file, indent=4)
    print('✅ Download completed. Metadata downloaded for {} filings.'.format(len(all_metadata)))
    return all_metadata


# --------- Run this: fetch 2015-2017 10-K metadata for every ticker in sorted_df ---------
tickers = [symbol for symbol in sorted_df['Ticker']]
metadata = download_10K_metadata(tickers, 2015, 2017)

✅ Starting downloading metadata for years 2015 to 2017
✅ Downloaded and filtered metadata for batch ['JNJ', 'MCD', 'RTX']
✅ Download completed. Metadata downloaded for 9 filings.


In [18]:
#json stats
import json

def analyze_json_file(file_path):
    """Print and return the record count and distinct-ticker count of a JSON file.

    Args:
        file_path: Path to a JSON file holding a list of records, each of
            which has a ``'ticker'`` key.

    Returns:
        A ``(total_rows, number_of_unique_tickers)`` tuple.
    """
    with open(file_path, 'r') as handle:
        records = json.load(handle)

    # One row per record in the top-level JSON list
    row_count = len(records)

    # Collapse repeated tickers by collecting them into a set
    ticker_count = len(set(record['ticker'] for record in records))

    print(f"Total number of rows: {row_count}")
    print(f"Number of unique tickers: {ticker_count}")

    return row_count, ticker_count

# Example usage: summarise 'russel_10k.json'. The call is the cell's last
# expression, so its (total_rows, unique_tickers) tuple is displayed as output.
# NOTE(review): no cell visible here writes 'russel_10k.json' (the download cell
# writes 'additional_10k.json') — presumably it comes from an earlier run; confirm.
json_file_path = 'russel_10k.json'
analyze_json_file(json_file_path)

Total number of rows: 658
Number of unique tickers: 215


(658, 215)

In [6]:
from sec_api import ExtractorApi 

# Read the saved filing metadata and view it as a table (one row per filing)
with open("additional_10k.json", 'r') as json_file:
    data = json.load(json_file)
    ticker_df = pd.DataFrame(data)

# Section-extraction client, constructed with the same API key
extractorApi = ExtractorApi(api_key=API_KEY)

# Extracted texts, collected in the same order as the rows of ticker_df
fls_data, item7_data = [], []

for _, filing in ticker_df.iterrows():
    filing_url = filing["filingUrl"]

    # Section "1" of the filing; stored under the 'FLS' column below
    fls_data.append(
        extractorApi.get_section(filing_url=filing_url, section="1", return_type="text")
    )

    # Section "7" of the filing; stored under the 'item7' column below
    item7_data.append(
        extractorApi.get_section(filing_url=filing_url, section="7", return_type="text")
    )

    print(f"Processed {filing['ticker']} for year {filing['year']}")

    time.sleep(5)  # pause 5 seconds after each filing

# Attach both extracted sections as new columns
ticker_df = ticker_df.assign(FLS=fls_data, item7=item7_data)

# Back to a list of per-filing dicts for serialisation
updated_data = ticker_df.to_dict(orient='records')

# Persist the enriched records to a separate file
with open("additional_10k_extracted.json", 'w') as json_file:
    json.dump(updated_data, json_file, indent=4)

print("✅ JSON file successfully updated and saved as 'additional_10k_extracted.json'")


Processed MCD for year 2015
Processed MCD for year 2016
Processed MCD for year 2017
Processed JNJ for year 2015
Processed JNJ for year 2016
Processed JNJ for year 2017
Processed RTX for year 2015
Processed RTX for year 2016
Processed RTX for year 2017
✅ JSON file successfully updated and saved as 'additional_10k_extracted.json'


In [3]:
#GRABBING TICKER AND EXCHANGE CODE DATA
import pandas as pd
import json

# Load the JSON data (a list of records with 'ticker' and 'companyName' keys)
with open('russel_10k.json', 'r') as json_file:
    data = json.load(json_file)

# Load the CSV data into a DataFrame; its 'Ticker' and 'Exchange' columns are used
csv_df = pd.read_csv('split_russel.csv')

# Build the ticker -> exchange lookup once instead of scanning the whole frame
# for every JSON entry. keep='first' means the first CSV row of a ticker
# supplies its exchange.
exchange_by_ticker = (
    csv_df.drop_duplicates(subset='Ticker', keep='first')
    .set_index('Ticker')['Exchange']
    .to_dict()
)

# One output record per ticker, keyed by ticker; the first JSON entry of a
# ticker supplies the company name.
unique_tickers = {}

for entry in data:
    ticker = entry['ticker']

    # Skip tickers already recorded and tickers that are absent from the CSV
    if ticker in unique_tickers or ticker not in exchange_by_ticker:
        continue

    exchange = exchange_by_ticker[ticker]

    unique_tickers[ticker] = {
        "ticker": ticker,
        "companyName": entry['companyName'],
        # A missing exchange is NaN in pandas; json.dump would write it as the
        # non-standard token NaN, so store None (JSON null) instead.
        "exchange": None if pd.isna(exchange) else exchange
    }

# Convert the dictionary values to a list
filtered_data = list(unique_tickers.values())

# Write the filtered data to a new JSON file
with open('filtered_data.json', 'w') as outfile:
    json.dump(filtered_data, outfile, indent=4)

print("Filtered JSON file with unique tickers created successfully.")


Filtered JSON file with unique tickers created successfully.
