# Local Machine (Oscar) Specific Setup

In [1]:
from pathlib import Path
from pprint  import pprint
import sqlite3
import csv
import os
import random

# Set Up Paths for Writing Files

In [2]:
current_directory = os.getcwd()
print(current_directory)
conn = sqlite3.connect(current_directory + '/sqlite/nport.db')
cursor = conn.cursor()
# Set cache size (in pages, where each page is usually 4KB; negative values mean in KB)
cursor.execute("PRAGMA cache_size = -262144;")  # Example: limits cache to 1 GB
# Set temp storage to use file-based storage rather than RAM
cursor.execute("PRAGMA temp_store = 1;")  # 1 = file-based storage, 2 = memory (default)
raw_data_folder = os.getcwd()+'/data/2019q4_nport'
print(raw_data_folder)

d:\GithubRepos\PIMCO-Text2SQL
d:\GithubRepos\PIMCO-Text2SQL/data/2019q4_nport


# Create Table for FUND_REPORTED HOLDING

## Note:
- FUND_REPORTED_HOLDING contains ISSUER_CUSIP
- DEBT_SECURITY_REF_INSTRUMENT, DESC_REF_INDEX_COMPONENT, and DESC_REF_OTHER all contain CUSIP
- I retrieved the unique cusip values for each table to see which had the broadest range that I could sample from.


---



*   Unique cusip count from FUND_REPORTED_HOLDING:  356276
*   Unique cusip count from DEBT_SECURITY_REF_INSTRUMENT:  552
*   Unique cusip count from DESC_REF_INDEX_COMPONENT:  2890
*   Unique cusip count from DESC_REF_OTHER:  3307

I have chosen to sample from FUND_REPORTED_HOLDING.

In [3]:
# Create table from .tsv file
def create_table(filename):
    if filename.endswith('.tsv'):
        table_name = 'FUND_REPORTED_HOLDING_RAW'
        file_path = raw_data_folder +'/'+ filename

        with open(file_path, 'r', newline='') as file:
            reader = csv.reader(file, delimiter='\t')
            column_names = next(reader)  # Get the first row as column names

            create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join([f'{col} TEXT' for col in column_names])});"
            cursor.execute(create_table_query)

            # Insert data into the table
            for row in reader:
                insert_query = f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES ({', '.join(['?'] * len(column_names))});"
                cursor.execute(insert_query, row)

In [4]:
create_table("FUND_REPORTED_HOLDING.tsv")

# Randomly sample 1000 CUSIPS

In [5]:
# Randomly sample 1000 distinct CUSIPS as table SAMPLE_CUSIPS
cursor.execute('''CREATE TABLE SAMPLED_CUSIPS AS
                SELECT DISTINCT ISSUER_CUSIP
                FROM FUND_REPORTED_HOLDING_RAW
                WHERE ISSUER_CUSIP != 'N/A'
                ORDER BY RANDOM()
                LIMIT 1000;''')

<sqlite3.Cursor at 0x2272d3658c0>

# Filter FUND_REPORTED_HOLDING with Sampled Cusips

In [6]:
cursor.execute('''
                CREATE TABLE FUND_REPORTED_HOLDING AS
                SELECT *
                FROM FUND_REPORTED_HOLDING_RAW frhr
                JOIN SAMPLED_CUSIPS s ON frhr.ISSUER_CUSIP = s.ISSUER_CUSIP
''')

<sqlite3.Cursor at 0x2272d3658c0>

# Create tables only containing rows that align with sampled cusips

In [7]:
def create_sampled_table(filename, primary_key):
    if filename.endswith('.tsv'):
        table_name = os.path.splitext(filename)[0]
        file_path = raw_data_folder + '/'+filename
        if not os.path.isfile(file_path):
            return f"Exiting early because '{filename}'.tsv does not exist in '{raw_data_folder}"
        # Retrieve primary keys from the fund_reported_holdings table
        primary_key_query = f"SELECT {primary_key} FROM FUND_REPORTED_HOLDING;"
        cursor.execute(primary_key_query)
        primary_keys = {row[0] for row in cursor.fetchall()}  # Store keys in a set for faster lookup

        # After fetching primary keys
        # print("Primary Keys from DATABASE:", primary_keys)
        # print("Number of Primary Keys:", len(primary_keys))
        # unique_primary_keys = set(primary_keys)
        # print("Number of Unique Primary Keys:", len(unique_primary_keys))
        
        with open(file_path, 'r', newline='') as file:
            reader = csv.reader(file, delimiter='\t')
            column_names = next(reader)  # Get the first row as column names

            create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join([f'{col} TEXT' for col in column_names])});"
            cursor.execute(create_table_query)

            # Insert only rows that match the primary keys
            for row in reader:
                # Retrieve primary key value from the current row
                row_primary_key = row[column_names.index(primary_key)]

                # Check if the row's primary key exists in the fetched primary keys
                if row_primary_key in primary_keys:
                    insert_query = f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES ({', '.join(['?'] * len(column_names))});"
                    cursor.execute(insert_query, row)

In [8]:
ac_tables = [
    "SUBMISSION",
    "REGISTRANT",
    "FUND_REPORTED_INFO",
    "INTEREST_RATE_RISK",
    "BORROWER",
    "BORROW_AGGREGATE",
    "MONTHLY_TOTAL_RETURN",
    "MONTHLY_RETURN_CAT_INSTRUMENT",
    "FUND_VAR_INFO",
    "EXPLANATORY_NOTE"
]

hid_tables = [
    "IDENTIFIERS",
    "DEBT_SECURITY_REF_INSTRUMENT",
    "CONVERTIBLE_SECURITY_CURRENCY",
    "REPURCHASE_AGREEMENT",
    "REPURCHASE_COUNTERPARTY",
    "REPURCHASE_COLLATERAL",
    "DERIVATIVE_COUNTERPARTY",
    "SWAPTION_OPTION_WARNT_DERIV",
    "DESC_REF_INDEX_BASKET",
    "DESC_REF_INDEX_COMPONENT",
    "DESC_REF_OTHER",
    "FUT_FWD_NONFOREIGNCUR_CONTRACT",
    "FWD_FOREIGNCUR_CONTRACT_SWAP",
    "NONFOREIGN_EXCHANGE_SWAP",
    "FLOATING_RATE_RESET_TENOR",
    "OTHER_DERIV",
    "OTHER_DERIV_NOTIONAL_AMOUNT",
    "SECURITIES_LENDING",
]

for table in ac_tables:
    filename = f"{table}.tsv"
    create_sampled_table(filename, "ACCESSION_NUMBER")

for table in hid_tables:
    filename = f"{table}.tsv"
    create_sampled_table(filename, "HOLDING_ID")

In [9]:
# Drop irrelevant tables
cursor.execute("DROP TABLE FUND_REPORTED_HOLDING_RAW;")
cursor.execute("DROP TABLE SAMPLED_CUSIPS;")

<sqlite3.Cursor at 0x2272d3658c0>

# Drop columns missing more than 85% values

In [10]:
def drop_columns(table_name):
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,))
    result = cursor.fetchone()

    # Check if the result is not None
    if not result:
        return

    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    total_rows = cursor.fetchone()[0]

    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = cursor.fetchall()

    columns_to_drop = []
    columns_to_keep = []

    for column in columns:
        column_name = column[1]

        # Skip columns that end with ':1'
        if column_name.endswith(':1'):
            print(f"Skipping column {column_name} in table {table_name}")
            continue

        cursor.execute(f"SELECT COUNT({column_name}) FROM {table_name} WHERE {column_name} = ''")
        null_count = cursor.fetchone()[0]

        if null_count > 0.85 * total_rows:
            columns_to_drop.append(column_name)
        else:
            columns_to_keep.append(column_name)

    if columns_to_drop:
        # Create new table with remaining columns
        new_table_name = f"{table_name}_new"
        columns_definition = ', '.join(columns_to_keep)
        cursor.execute(f"CREATE TABLE {new_table_name} ({columns_definition})")

        # Copy columns to new table
        cursor.execute(f"INSERT INTO {new_table_name} ({columns_definition}) SELECT {columns_definition} FROM {table_name}")

        # Drop old table
        cursor.execute(f"DROP TABLE {table_name}")

        # Rename new table to original table name
        cursor.execute(f"ALTER TABLE {new_table_name} RENAME TO {table_name}")

        print(f"Dropped columns {', '.join(columns_to_drop)} from table {table_name}")
    else:
        print(f"No columns to drop from table {table_name}")




In [11]:
tables = ['FUND_REPORTED_HOLDING','REGISTRANT', 'SUBMISSION', 'FUND_REPORTED_INFO', 'INTEREST_RATE_RISK', 'BORROWER', 'BORROW_AGGREGATE', 'MONTHLY_TOTAL_RETURN', 'MONTHLY_RETURN_CAT_INSTRUMENT', 'FUND_VAR_INFO', 'EXPLANATORY_NOTE', 'IDENTIFIERS', 'DEBT_SECURITY_REF_INSTRUMENT', 'CONVERTIBLE_SECURITY_CURRENCY', 'REPURCHASE_AGREEMENT', 'REPURCHASE_COUNTERPARTY', 'REPURCHASE_COLLATERAL', 'DERIVATIVE_COUNTERPARTY', 'SWAPTION_OPTION_WARNT_DERIV', 'DESC_REF_INDEX_BASKET', 'DESC_REF_INDEX_COMPONENT', 'DESC_REF_OTHER', 'FUT_FWD_NONFOREIGNCUR_CONTRACT', 'FWD_FOREIGNCUR_CONTRACT_SWAP', 'NONFOREIGN_EXCHANGE_SWAP', 'FLOATING_RATE_RESET_TENOR', 'OTHER_DERIV', 'OTHER_DERIV_NOTIONAL_AMOUNT', 'SECURITIES_LENDING']

for table in tables:
    drop_columns(table)

Skipping column ISSUER_CUSIP:1 in table FUND_REPORTED_HOLDING
Dropped columns OTHER_UNIT_DESC, EXCHANGE_RATE, OTHER_ASSET, OTHER_ISSUER, DERIVATIVE_CAT from table FUND_REPORTED_HOLDING
No columns to drop from table REGISTRANT
Dropped columns FILE_NUM from table SUBMISSION
No columns to drop from table FUND_REPORTED_INFO
No columns to drop from table INTEREST_RATE_RISK
No columns to drop from table BORROWER
Dropped columns OTHER_DESC from table BORROW_AGGREGATE
No columns to drop from table MONTHLY_TOTAL_RETURN
No columns to drop from table MONTHLY_RETURN_CAT_INSTRUMENT
No columns to drop from table EXPLANATORY_NOTE
Dropped columns OTHER_IDENTIFIER, OTHER_IDENTIFIER_DESC from table IDENTIFIERS
No columns to drop from table DEBT_SECURITY_REF_INSTRUMENT
No columns to drop from table CONVERTIBLE_SECURITY_CURRENCY
No columns to drop from table REPURCHASE_AGREEMENT
No columns to drop from table REPURCHASE_COUNTERPARTY
No columns to drop from table REPURCHASE_COLLATERAL
No columns to drop fro

In [12]:
# View tables currently in database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
tables = [table[0] for table in tables]
print(len(tables))
print(tables)

28
['REGISTRANT', 'FUND_REPORTED_INFO', 'INTEREST_RATE_RISK', 'BORROWER', 'MONTHLY_TOTAL_RETURN', 'MONTHLY_RETURN_CAT_INSTRUMENT', 'EXPLANATORY_NOTE', 'DEBT_SECURITY_REF_INSTRUMENT', 'CONVERTIBLE_SECURITY_CURRENCY', 'REPURCHASE_AGREEMENT', 'REPURCHASE_COUNTERPARTY', 'REPURCHASE_COLLATERAL', 'DERIVATIVE_COUNTERPARTY', 'SWAPTION_OPTION_WARNT_DERIV', 'DESC_REF_INDEX_BASKET', 'DESC_REF_INDEX_COMPONENT', 'FUT_FWD_NONFOREIGNCUR_CONTRACT', 'FWD_FOREIGNCUR_CONTRACT_SWAP', 'FLOATING_RATE_RESET_TENOR', 'OTHER_DERIV', 'OTHER_DERIV_NOTIONAL_AMOUNT', 'FUND_REPORTED_HOLDING', 'SUBMISSION', 'BORROW_AGGREGATE', 'IDENTIFIERS', 'DESC_REF_OTHER', 'NONFOREIGN_EXCHANGE_SWAP', 'SECURITIES_LENDING']


In [13]:
def add_quarter_column(raw_data_folder, table_name):
    # Extract the folder name from the path
    
    folder_name = raw_data_folder.split('/')[-1].split('_')[0]
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = [column_info[1] for column_info in cursor.fetchall()]
    
    if "QUARTER" not in columns:

        # Add the 'quarter' column to the table
        cursor.execute(f'ALTER TABLE "{table_name}" ADD COLUMN QUARTER TEXT')

    # Update all rows in the new 'quarter' column with the folder name
    cursor.execute(f'UPDATE "{table_name}" SET QUARTER = ?', (folder_name,))

    print(f"Added column 'QUARTER' to table '{table_name}' with value '{folder_name}' for all rows.")



In [14]:
for table in tables:
  add_quarter_column(raw_data_folder, table)

Added column 'QUARTER' to table 'REGISTRANT' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'FUND_REPORTED_INFO' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'INTEREST_RATE_RISK' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'BORROWER' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'MONTHLY_TOTAL_RETURN' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'MONTHLY_RETURN_CAT_INSTRUMENT' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'EXPLANATORY_NOTE' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'DEBT_SECURITY_REF_INSTRUMENT' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'CONVERTIBLE_SECURITY_CURRENCY' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'REPURCHASE_AGREEMENT' with value '2019q4' for all rows.
Added column 'QUARTER' to table 'REPURCHASE_COUNTERPARTY' with value '2019q4' for all rows.
Added column 'QUARTER'

In [15]:
def create_sampled_table_and_join(filename, raw_data_folder, primary_key):
        table_name = os.path.splitext(filename)[0]  # Extract table name without extension
        file_path = raw_data_folder +'/'+ filename  # Construct full file path
        if not os.path.isfile(file_path):
            return f"Exiting early because '{filename}'.tsv does not exist in '{raw_data_folder}"
        # Retrieve primary keys from the FUND_REPORTED_HOLDING table
        primary_key_query = f"SELECT {primary_key} FROM FUND_REPORTED_HOLDING;"
        cursor.execute(primary_key_query)
        primary_keys = {row[0] for row in cursor.fetchall()}  # Store keys in a set for faster lookup

        with open(file_path, 'r', newline='') as file:
            reader = csv.reader(file, delimiter='\t')
            column_names = next(reader)  # Get the first row as column names

            # Create the new table with "_NEW" suffix
            create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name}_NEW ({', '.join([f'{col} TEXT' for col in column_names])});"
            cursor.execute(create_table_query)

            # Insert only rows that match the primary keys
            for row in reader:
                row_primary_key = row[column_names.index(primary_key)]  # Retrieve primary key value from the current row

                if row_primary_key in primary_keys:  # Check if the row's primary key exists
                    insert_query = f"INSERT INTO {table_name}_NEW ({', '.join(column_names)}) VALUES ({', '.join(['?'] * len(column_names))});"
                    cursor.execute(insert_query, row)

        # Drop columns with more than 85% missing values
        drop_columns(f"{table_name}_NEW")

        # Add a 'quarter' column with corresponding quarter
        add_quarter_column(raw_data_folder, f"{table_name}_NEW")

        # Join new data into original table
        join_query = f"INSERT INTO {table_name} SELECT * FROM {table_name}_NEW;"
        cursor.execute(join_query)

        # Drop the temporary new table
        drop_table_query = f"DROP TABLE {table_name}_NEW;"
        cursor.execute(drop_table_query)

        print(f"Successfully processed '{filename}' and updated '{table_name}'.")


In [16]:
directory_path = "./data"

# List all folders in the directory
folder_names = [name for name in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, name))]

for folder in folder_names:

    raw_data_folder = os.getcwd()+f"/data/'{folder}'"

    # Separate query because ac_tables does not contain FUND_REPORTED_HOLDING
    create_sampled_table_and_join(f"FUND_REPORTED_HOLDING.tsv", raw_data_folder, "ACCESSION_NUMBER")
    for table in ac_tables:
        create_sampled_table_and_join(f"{table}.tsv", raw_data_folder, "ACCESSION_NUMBER")

    for table in hid_tables:
        create_sampled_table_and_join(f"{table}.tsv", raw_data_folder, "HOLDING_ID")

In [17]:
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")

# Fetch all results and print each table name
lalalalala = cursor.fetchall()
print("Tables in the database:")
for table in lalalalala:
    print(table[0])


Tables in the database:
REGISTRANT
FUND_REPORTED_INFO
INTEREST_RATE_RISK
BORROWER
MONTHLY_TOTAL_RETURN
MONTHLY_RETURN_CAT_INSTRUMENT
EXPLANATORY_NOTE
DEBT_SECURITY_REF_INSTRUMENT
CONVERTIBLE_SECURITY_CURRENCY
REPURCHASE_AGREEMENT
REPURCHASE_COUNTERPARTY
REPURCHASE_COLLATERAL
DERIVATIVE_COUNTERPARTY
SWAPTION_OPTION_WARNT_DERIV
DESC_REF_INDEX_BASKET
DESC_REF_INDEX_COMPONENT
FUT_FWD_NONFOREIGNCUR_CONTRACT
FWD_FOREIGNCUR_CONTRACT_SWAP
FLOATING_RATE_RESET_TENOR
OTHER_DERIV
OTHER_DERIV_NOTIONAL_AMOUNT
FUND_REPORTED_HOLDING
SUBMISSION
BORROW_AGGREGATE
IDENTIFIERS
DESC_REF_OTHER
NONFOREIGN_EXCHANGE_SWAP
SECURITIES_LENDING


In [18]:
cursor.execute("PRAGMA table_info(REGISTRANT);")

# Fetch all results and print column names
columns = cursor.fetchall()
print("Column names in 'REGISTRANT' table:")
for column in columns:
    print(column[1])  # column[1] contains the name of each column


cursor.execute("SELECT * FROM REGISTRANT LIMIT 10;")

# Fetch the first 10 rows
rows = cursor.fetchall()

# Print the rows
print("First 10 rows of 'REGISTRANT' table:")
for row in rows:
    print(row)


Column names in 'REGISTRANT' table:
ACCESSION_NUMBER
CIK
REGISTRANT_NAME
FILE_NUM
LEI
ADDRESS1
ADDRESS2
CITY
STATE
COUNTRY
ZIP
PHONE
QUARTER
First 10 rows of 'REGISTRANT' table:
('0001145549-19-041255', '0001145022', 'HOTCHKIS & WILEY FUNDS /DE/', '811-10487', '549300DFU8YF1PWZ5A57', '725 South Figueroa Street', '39th Floor', 'Los Angeles', 'US-CA', 'US', '90017', '213-430-1000', '2019q4')
('0001752724-19-157564', '0001257927', 'WEITZ FUNDS', '811-21410', '5493001DCTX72JLZFE90', '1125 SOUTH 103 ST', '', 'OMAHA', 'US-NE', 'US', '68124', '402-391-1980', '2019q4')
('0001752724-19-159709', '0001066602', 'Voya Funds Trust', '811-08895', '5493002OZLWDCTJEFB81', '7337 East Doubletree Ranch Road', 'Suite 100', 'Scottsdale', 'US-AZ', 'US', '85258', '1-800-992-0180', '2019q4')
('0001145549-19-042355', '0001747688', 'Pacific Global ETF Trust', '811-23376', '549300HTE2GS8CHIQ361', '840 Newport Center Drive', '7th Floor', 'Newport Beach', 'US-CA', 'US', '92660', '949-219-3391', '2019q4')
('00017527

In [19]:
conn.close()
