## Objective: Create an sampled database via 1000 unique CUSIPS for one NPORT quarter (2024q3)

In [1]:
import os
import sqlite3
import csv

### Set up directory

In [2]:
# Get current directory
curr_dir = os.getcwd()
print(curr_dir)

# Get parent directory
os.chdir("..")
parent_dir = os.getcwd()
print(parent_dir)

d:\GithubRepos\PIMCO-Text2SQL\setup
d:\GithubRepos\PIMCO-Text2SQL


### Establish connection to db file

In [3]:
# Set up connection
conn = sqlite3.connect(parent_dir + '/sqlite/nport.db')
cursor = conn.cursor()

### Get raw data folder

In [4]:
raw_data_folder = parent_dir + '/data/2024q3_nport'
print(raw_data_folder)

d:\GithubRepos\PIMCO-Text2SQL/data/2024q3_nport


### Sampling methodology:
- FUND_REPORTED_HOLDING contains ISSUER_CUSIP
- DEBT_SECURITY_REF_INSTRUMENT, DESC_REF_INDEX_COMPONENT, and DESC_REF_OTHER all contain CUSIP
- We retrieved the unique cusip values for each table to see which had the broadest range that we could sample from.

---

*   Unique cusip count from FUND_REPORTED_HOLDING:  356276
*   Unique cusip count from DEBT_SECURITY_REF_INSTRUMENT:  552
*   Unique cusip count from DESC_REF_INDEX_COMPONENT:  2890
*   Unique cusip count from DESC_REF_OTHER:  3307

We have chosen to sample from FUND_REPORTED_HOLDING.

### Create Complete Fund Reported Holdings Table

In [None]:
# Create Fund Reported Holdings Table

file_path = raw_data_folder +'/'+ 'FUND_REPORTED_HOLDING.tsv'

with open(file_path, 'r', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    column_names = next(reader)  # Get the first row as column names

    create_table_query = f"CREATE TABLE IF NOT EXISTS {'FUND_REPORTED_HOLDING_RAW'} ({', '.join([f'{col} TEXT' for col in column_names])});"
    cursor.execute(create_table_query)

    # Insert data into the table
    for row in reader:
        insert_query = f"INSERT INTO {'FUND_REPORTED_HOLDING_RAW'} ({', '.join(column_names)}) VALUES ({', '.join(['?'] * len(column_names))});"
        cursor.execute(insert_query, row)

In [6]:
# View first 10 rows of Fund Reported Holdings Table
cursor.execute("SELECT * FROM FUND_REPORTED_HOLDING_RAW LIMIT 10")

# Retrieve column names
column_names = [description[0] for description in cursor.description]
print(column_names)

# Retrieve table data
data = cursor.fetchall()
for inst in data:
    print(inst)

['ACCESSION_NUMBER', 'HOLDING_ID', 'ISSUER_NAME', 'ISSUER_LEI', 'ISSUER_TITLE', 'ISSUER_CUSIP', 'BALANCE', 'UNIT', 'OTHER_UNIT_DESC', 'CURRENCY_CODE', 'CURRENCY_VALUE', 'EXCHANGE_RATE', 'PERCENTAGE', 'PAYOFF_PROFILE', 'ASSET_CAT', 'OTHER_ASSET', 'ISSUER_TYPE', 'OTHER_ISSUER', 'INVESTMENT_COUNTRY', 'IS_RESTRICTED_SECURITY', 'FAIR_VALUE_LEVEL', 'DERIVATIVE_CAT']
('0001752724-24-224070', '128653995', 'LAM RESEARCH CORPORATION', '549300I4GMO6D34U1T02', 'LAM RESEARCH CORP COMMON STOCK', '512807108', '74668', 'NS', '', 'USD', '68787148.32', '', '.836025601461', 'Long', 'EC', '', 'OTHER', 'N/A', 'US', 'N', '1', '')
('0001752724-24-224070', '128654030', 'GLOBALFOUNDRIES INC.', '549300BA76VK784VMX48', 'GLOBALFOUNDRIES INC COMMON STOCK', 'G39387108', '307204', 'NS', '', 'USD', '15670476.04', '', '.190455913299', 'Long', 'EC', '', 'OTHER', 'N/A', 'US', 'N', '1', '')
('0001752724-24-224070', '128654068', 'AMAZON.COM, INC.', 'ZXTILKJKG63JELOEG630', 'AMAZON.COM INC COMMON STOCK', '023135106', '22881

### Randomly sample 1000 CUSIPS

In [7]:
# Randomly sample 1000 distinct CUSIPS as table SAMPLED_CUSIPS
cursor.execute('''CREATE TABLE SAMPLED_CUSIPS AS
                SELECT DISTINCT ISSUER_CUSIP
                FROM FUND_REPORTED_HOLDING_RAW
                WHERE ISSUER_CUSIP != 'N/A'
                ORDER BY RANDOM()
                LIMIT 1000;''')

<sqlite3.Cursor at 0x2538a29edc0>

In [8]:
## View first 10 sampled cusips
cursor.execute("select * from SAMPLED_CUSIPS")

data = cursor.fetchall()
print("Sampled CUSIPS: ")
for inst in data[:10]:
    print(inst)

# Print number of sampled cusips
cursor.execute("select count(*) from SAMPLED_CUSIPS")
print("Number of CUSIPS: ", cursor.fetchone()[0])

Sampled CUSIPS: 
('812643UZ5',)
('139365GD6',)
('ACI0HH5S3',)
('64831TAE7',)
('100853UX6',)
('3137AAYH0',)
('13051AGJ1',)
('41151PAP0',)
('066878ML0',)
('702528LF5',)
Number of CUSIPS:  1000


### Create filtered table FUND_REPORTED_HOLDING

In [9]:
cursor.execute('''
                CREATE TABLE FUND_REPORTED_HOLDING AS
                SELECT *
                FROM FUND_REPORTED_HOLDING_RAW frhr
                JOIN SAMPLED_CUSIPS s ON frhr.ISSUER_CUSIP = s.ISSUER_CUSIP
''')

<sqlite3.Cursor at 0x2538a29edc0>

In [10]:
# View first 10 rows of filtered Fund Reported Holdings Table
cursor.execute("SELECT * FROM FUND_REPORTED_HOLDING LIMIT 10")

# Retrieve column names
column_names = [description[0] for description in cursor.description]
print(column_names)

# Retrieve table data
data = cursor.fetchall()
for inst in data:
    print(inst)

# Print number of rows
cursor.execute("select count(*) from FUND_REPORTED_HOLDING")
print("Number of Rows: ", cursor.fetchone()[0])

['ACCESSION_NUMBER', 'HOLDING_ID', 'ISSUER_NAME', 'ISSUER_LEI', 'ISSUER_TITLE', 'ISSUER_CUSIP', 'BALANCE', 'UNIT', 'OTHER_UNIT_DESC', 'CURRENCY_CODE', 'CURRENCY_VALUE', 'EXCHANGE_RATE', 'PERCENTAGE', 'PAYOFF_PROFILE', 'ASSET_CAT', 'OTHER_ASSET', 'ISSUER_TYPE', 'OTHER_ISSUER', 'INVESTMENT_COUNTRY', 'IS_RESTRICTED_SECURITY', 'FAIR_VALUE_LEVEL', 'DERIVATIVE_CAT', 'ISSUER_CUSIP:1']
('0001752724-24-223537', '128640724', 'Indiana (State of) Finance Authority (United States Steel Corp.)', '549300PS0PAS7NDSSI20', 'Indiana (State of) Finance Authority (United States Steel Corp.), Series 2021 A, Ref. RB', '455054BE5', '500000', 'PA', '', 'USD', '502242.2', '', '1.634062931123', 'Long', 'DBT', '', 'MUN', '', 'US', 'N', '2', '', '455054BE5')
('0001145549-24-062678', '128666648', 'Matterhorn Re Ltd.', '549300QRZ77PO1BYWT39', 'Matterhorn Re Argon 2022-1 Class A', '577092AR0', '1000000', 'PA', '', 'USD', '997927.07', '', '.0324112951', 'Long', 'DBT', '', 'CORP', '', 'BM', 'Y', '2', '', '577092AR0')
(

### Create all other filtered tables

In [11]:
def create_sampled_table(filename, primary_key):
    if filename.endswith('.tsv'): 
        table_name = os.path.splitext(filename)[0]
        file_path = raw_data_folder + '/' + filename

        if not os.path.isfile(file_path):
            return f"Exiting early because '{filename}'.tsv does not exist in '{raw_data_folder}"
        
        # Retrieve primary keys from FUND_REPORTED_HOLDING table
        primary_key_query = f"SELECT {primary_key} FROM FUND_REPORTED_HOLDING;"
        cursor.execute(primary_key_query)
        primary_keys = {row[0] for row in cursor.fetchall()}
        
        with open(file_path, 'r', newline='') as file:
            reader = csv.reader(file, delimiter='\t')
            column_names = next(reader)  # Get the first row as column names

            create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join([f'{col} TEXT' for col in column_names])});"
            cursor.execute(create_table_query)

            # Insert only rows that match the primary keys
            for row in reader:
                # Retrieve primary key value from the current row
                row_primary_key = row[column_names.index(primary_key)]

                # Check if the row's primary key exists in the fetched primary keys
                if row_primary_key in primary_keys:
                    insert_query = f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES ({', '.join(['?'] * len(column_names))});"
                    cursor.execute(insert_query, row)

In [12]:
ac_tables = [
    "SUBMISSION",
    "REGISTRANT",
    "FUND_REPORTED_INFO",
    "INTEREST_RATE_RISK",
    "BORROWER",
    "BORROW_AGGREGATE",
    "MONTHLY_TOTAL_RETURN",
    "MONTHLY_RETURN_CAT_INSTRUMENT",
    "FUND_VAR_INFO",
    "EXPLANATORY_NOTE"
]

hid_tables = [
    "IDENTIFIERS",
    "DEBT_SECURITY",
    "DEBT_SECURITY_REF_INSTRUMENT",
    "CONVERTIBLE_SECURITY_CURRENCY",
    "REPURCHASE_AGREEMENT",
    "REPURCHASE_COUNTERPARTY",
    "REPURCHASE_COLLATERAL",
    "DERIVATIVE_COUNTERPARTY",
    "SWAPTION_OPTION_WARNT_DERIV",
    "DESC_REF_INDEX_BASKET",
    "DESC_REF_INDEX_COMPONENT",
    "DESC_REF_OTHER",
    "FUT_FWD_NONFOREIGNCUR_CONTRACT",
    "FWD_FOREIGNCUR_CONTRACT_SWAP",
    "NONFOREIGN_EXCHANGE_SWAP",
    "FLOATING_RATE_RESET_TENOR",
    "OTHER_DERIV",
    "OTHER_DERIV_NOTIONAL_AMOUNT",
    "SECURITIES_LENDING",
]

for table in ac_tables:
    filename = f"{table}.tsv"
    create_sampled_table(filename, "ACCESSION_NUMBER")
    print("Created sampled table using ACCESSION_NUMBER for", filename)

for table in hid_tables:
    filename = f"{table}.tsv"
    create_sampled_table(filename, "HOLDING_ID")
    print("Created sampled table using HOLDING_ID for", filename)


Created sampled table using ACCESSION_NUMBER for SUBMISSION.tsv
Created sampled table using ACCESSION_NUMBER for REGISTRANT.tsv
Created sampled table using ACCESSION_NUMBER for FUND_REPORTED_INFO.tsv
Created sampled table using ACCESSION_NUMBER for INTEREST_RATE_RISK.tsv
Created sampled table using ACCESSION_NUMBER for BORROWER.tsv
Created sampled table using ACCESSION_NUMBER for BORROW_AGGREGATE.tsv
Created sampled table using ACCESSION_NUMBER for MONTHLY_TOTAL_RETURN.tsv
Created sampled table using ACCESSION_NUMBER for MONTHLY_RETURN_CAT_INSTRUMENT.tsv
Created sampled table using ACCESSION_NUMBER for FUND_VAR_INFO.tsv
Created sampled table using ACCESSION_NUMBER for EXPLANATORY_NOTE.tsv
Created sampled table using HOLDING_ID for IDENTIFIERS.tsv
Created sampled table using HOLDING_ID for DEBT_SECURITY.tsv
Created sampled table using HOLDING_ID for DEBT_SECURITY_REF_INSTRUMENT.tsv
Created sampled table using HOLDING_ID for CONVERTIBLE_SECURITY_CURRENCY.tsv
Created sampled table using H

In [13]:
# Print number of rows for each table

for table in ac_tables:
    cursor.execute(f"SELECT COUNT(*) FROM {table}")
    print(f"Number of Rows in {table}:", cursor.fetchone()[0])

for table in hid_tables:
    cursor.execute(f"SELECT COUNT(*) FROM {table}")
    print(f"Number of Rows in {table}:", cursor.fetchone()[0])

Number of Rows in SUBMISSION: 4084
Number of Rows in REGISTRANT: 4084
Number of Rows in FUND_REPORTED_INFO: 4084
Number of Rows in INTEREST_RATE_RISK: 5294
Number of Rows in BORROWER: 16952
Number of Rows in BORROW_AGGREGATE: 583
Number of Rows in MONTHLY_TOTAL_RETURN: 10476
Number of Rows in MONTHLY_RETURN_CAT_INSTRUMENT: 93702
Number of Rows in FUND_VAR_INFO: 1771
Number of Rows in EXPLANATORY_NOTE: 4506
Number of Rows in IDENTIFIERS: 12312
Number of Rows in DEBT_SECURITY: 5421
Number of Rows in DEBT_SECURITY_REF_INSTRUMENT: 0
Number of Rows in CONVERTIBLE_SECURITY_CURRENCY: 0
Number of Rows in REPURCHASE_AGREEMENT: 0
Number of Rows in REPURCHASE_COUNTERPARTY: 0
Number of Rows in REPURCHASE_COLLATERAL: 0
Number of Rows in DERIVATIVE_COUNTERPARTY: 27
Number of Rows in SWAPTION_OPTION_WARNT_DERIV: 3
Number of Rows in DESC_REF_INDEX_BASKET: 0
Number of Rows in DESC_REF_INDEX_COMPONENT: 0
Number of Rows in DESC_REF_OTHER: 3
Number of Rows in FUT_FWD_NONFOREIGNCUR_CONTRACT: 0
Number of Ro

In [14]:
# View tables currently in database

cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
tables = [table[0] for table in tables]
print("Number of Tables:", len(tables))
print(tables)

Number of Tables: 32
['FUND_REPORTED_HOLDING_RAW', 'SAMPLED_CUSIPS', 'FUND_REPORTED_HOLDING', 'SUBMISSION', 'REGISTRANT', 'FUND_REPORTED_INFO', 'INTEREST_RATE_RISK', 'BORROWER', 'BORROW_AGGREGATE', 'MONTHLY_TOTAL_RETURN', 'MONTHLY_RETURN_CAT_INSTRUMENT', 'FUND_VAR_INFO', 'EXPLANATORY_NOTE', 'IDENTIFIERS', 'DEBT_SECURITY', 'DEBT_SECURITY_REF_INSTRUMENT', 'CONVERTIBLE_SECURITY_CURRENCY', 'REPURCHASE_AGREEMENT', 'REPURCHASE_COUNTERPARTY', 'REPURCHASE_COLLATERAL', 'DERIVATIVE_COUNTERPARTY', 'SWAPTION_OPTION_WARNT_DERIV', 'DESC_REF_INDEX_BASKET', 'DESC_REF_INDEX_COMPONENT', 'DESC_REF_OTHER', 'FUT_FWD_NONFOREIGNCUR_CONTRACT', 'FWD_FOREIGNCUR_CONTRACT_SWAP', 'NONFOREIGN_EXCHANGE_SWAP', 'FLOATING_RATE_RESET_TENOR', 'OTHER_DERIV', 'OTHER_DERIV_NOTIONAL_AMOUNT', 'SECURITIES_LENDING']


### Drop Unnecessary Tables from Database

In [15]:
# Drop irrelevant tables
cursor.execute("DROP TABLE FUND_REPORTED_HOLDING_RAW;")
cursor.execute("DROP TABLE SAMPLED_CUSIPS;")

<sqlite3.Cursor at 0x2538a29edc0>

### Drop Columns Missing more than 85% of Values

In [16]:
def drop_columns(table_name):
    temp_cursor= conn.cursor()

    temp_cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,))
    result = temp_cursor.fetchone()

    # Check if the result is not None
    if not result:
        return("There are currently no tables in the database.")

    temp_cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    total_rows = temp_cursor.fetchone()[0]

    temp_cursor.execute(f"PRAGMA table_info({table_name})")
    columns = temp_cursor.fetchall()

    columns_to_drop = []
    columns_to_keep = []


    for column in columns:
        column_name = column[1]

        # Skip columns that end with ':1'
        if column_name.endswith(':1'):
            print(f"Skipping column {column_name} in table {table_name}")
            continue

        temp_cursor.execute(f"SELECT COUNT({column_name}) FROM {table_name} WHERE {column_name} = ''")
        null_count = temp_cursor.fetchone()[0]

        if null_count > 0.85 * total_rows:
            columns_to_drop.append(column_name)
        else:
            columns_to_keep.append(column_name)

    if columns_to_drop:
        # Create new table with remaining columns
        new_table_name = f"{table_name}_new"
        columns_definition = ', '.join(columns_to_keep)
        temp_cursor.execute(f"CREATE TABLE {new_table_name} ({columns_definition})")

        # Copy columns to new table
        temp_cursor.execute(f"INSERT INTO {new_table_name} ({columns_definition}) SELECT {columns_definition} FROM {table_name}")

        # Drop old table
        temp_cursor.execute(f"DROP TABLE {table_name}")

        # Rename new table to original table name
        temp_cursor.execute(f"ALTER TABLE {new_table_name} RENAME TO {table_name}")

        print(f"Dropped columns {', '.join(columns_to_drop)} from table {table_name}")
    else:
        print(f"No columns to drop from table {table_name}")
    
    temp_cursor.close()
    return columns_to_drop, columns_to_keep

In [17]:
tables = ['FUND_REPORTED_HOLDING','REGISTRANT', 'SUBMISSION', 'FUND_REPORTED_INFO', 'INTEREST_RATE_RISK', 'BORROWER', 'BORROW_AGGREGATE', 'MONTHLY_TOTAL_RETURN', 'MONTHLY_RETURN_CAT_INSTRUMENT', 'FUND_VAR_INFO', 'EXPLANATORY_NOTE', 'IDENTIFIERS', 'DEBT_SECURITY_REF_INSTRUMENT', 'CONVERTIBLE_SECURITY_CURRENCY', 'REPURCHASE_AGREEMENT', 'REPURCHASE_COUNTERPARTY', 'REPURCHASE_COLLATERAL', 'DERIVATIVE_COUNTERPARTY', 'SWAPTION_OPTION_WARNT_DERIV', 'DESC_REF_INDEX_BASKET', 'DESC_REF_INDEX_COMPONENT', 'DESC_REF_OTHER', 'FUT_FWD_NONFOREIGNCUR_CONTRACT', 'FWD_FOREIGNCUR_CONTRACT_SWAP', 'NONFOREIGN_EXCHANGE_SWAP', 'FLOATING_RATE_RESET_TENOR', 'OTHER_DERIV', 'OTHER_DERIV_NOTIONAL_AMOUNT', 'SECURITIES_LENDING']
columns_to_drop = {}
columns_to_keep = {}
for table in tables:
    columns_to_drop[table],columns_to_keep[table] = drop_columns(table)
print("Kept columns: ", columns_to_drop)
print("Dropped columns: ", columns_to_keep)

Skipping column ISSUER_CUSIP:1 in table FUND_REPORTED_HOLDING
Dropped columns OTHER_UNIT_DESC, EXCHANGE_RATE, OTHER_ASSET, OTHER_ISSUER, DERIVATIVE_CAT from table FUND_REPORTED_HOLDING
No columns to drop from table REGISTRANT
Dropped columns FILE_NUM from table SUBMISSION
No columns to drop from table FUND_REPORTED_INFO
No columns to drop from table INTEREST_RATE_RISK
No columns to drop from table BORROWER
Dropped columns OTHER_DESC from table BORROW_AGGREGATE
No columns to drop from table MONTHLY_TOTAL_RETURN
No columns to drop from table MONTHLY_RETURN_CAT_INSTRUMENT
No columns to drop from table FUND_VAR_INFO
No columns to drop from table EXPLANATORY_NOTE
Dropped columns IDENTIFIER_TICKER, OTHER_IDENTIFIER, OTHER_IDENTIFIER_DESC from table IDENTIFIERS
No columns to drop from table DEBT_SECURITY_REF_INSTRUMENT
No columns to drop from table CONVERTIBLE_SECURITY_CURRENCY
No columns to drop from table REPURCHASE_AGREEMENT
No columns to drop from table REPURCHASE_COUNTERPARTY
No columns 

In [18]:
# View tables currently in database

cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
tables = [table[0] for table in tables]
print("Number of Tables:", len(tables))
print(tables)

for table in tables:
    cursor.execute(f"SELECT COUNT(*) FROM {table}")
    print(f"Number of Rows in {table}:", cursor.fetchone()[0])

Number of Tables: 30
['REGISTRANT', 'FUND_REPORTED_INFO', 'INTEREST_RATE_RISK', 'BORROWER', 'MONTHLY_TOTAL_RETURN', 'MONTHLY_RETURN_CAT_INSTRUMENT', 'FUND_VAR_INFO', 'EXPLANATORY_NOTE', 'DEBT_SECURITY', 'DEBT_SECURITY_REF_INSTRUMENT', 'CONVERTIBLE_SECURITY_CURRENCY', 'REPURCHASE_AGREEMENT', 'REPURCHASE_COUNTERPARTY', 'REPURCHASE_COLLATERAL', 'DERIVATIVE_COUNTERPARTY', 'DESC_REF_INDEX_BASKET', 'DESC_REF_INDEX_COMPONENT', 'FUT_FWD_NONFOREIGNCUR_CONTRACT', 'FWD_FOREIGNCUR_CONTRACT_SWAP', 'FLOATING_RATE_RESET_TENOR', 'OTHER_DERIV', 'OTHER_DERIV_NOTIONAL_AMOUNT', 'FUND_REPORTED_HOLDING', 'SUBMISSION', 'BORROW_AGGREGATE', 'IDENTIFIERS', 'SWAPTION_OPTION_WARNT_DERIV', 'DESC_REF_OTHER', 'NONFOREIGN_EXCHANGE_SWAP', 'SECURITIES_LENDING']
Number of Rows in REGISTRANT: 4084
Number of Rows in FUND_REPORTED_INFO: 4084
Number of Rows in INTEREST_RATE_RISK: 5294
Number of Rows in BORROWER: 16952
Number of Rows in MONTHLY_TOTAL_RETURN: 10476
Number of Rows in MONTHLY_RETURN_CAT_INSTRUMENT: 93702
Number

In [19]:
cursor.close()
conn.commit()
conn.close()