## Objective: Create a full database for one NPORT quarter (2024q3)

In [1]:
import os
import sqlite3
import csv

### Set up directory

In [2]:
# Get current directory
curr_dir = os.getcwd()
print(curr_dir)

# Get parent directory
os.chdir("..")
parent_dir = os.getcwd()
print(parent_dir)

d:\GithubRepos\PIMCO-Text2SQL\setup
d:\GithubRepos\PIMCO-Text2SQL


### Establish connection to db file

In [3]:
# Set up connection
conn = sqlite3.connect(parent_dir + '/sqlite/nport.db')
cursor = conn.cursor()

### Get raw data folder

In [4]:
raw_data_folder = parent_dir + '/data/2024q3_nport'
print(raw_data_folder)

d:\GithubRepos\PIMCO-Text2SQL/data/2024q3_nport


### Create Tables

In [5]:
# Create table from .tsv file
def create_table(filename):
    if filename.endswith('.tsv'):
        table_name = os.path.splitext(filename)[0]
        file_path = raw_data_folder +'/'+ filename
        print(f"Starting initialization of {table_name}")
        with open(file_path, 'r', newline='') as file:
            reader = csv.reader(file, delimiter='\t')
            column_names = next(reader)  # Get the first row as column names

            create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join([f'{col} TEXT' for col in column_names])});"
            cursor.execute(create_table_query)

            # Insert data into the table
            for row in reader:
                insert_query = f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES ({', '.join(['?'] * len(column_names))});"
                cursor.execute(insert_query, row)

        print(f"Finished creating table {table_name}")

In [6]:
for filename in os.listdir(raw_data_folder):
    create_table(filename)

Starting initialization of BORROWER
Finished creating table BORROWER
Starting initialization of BORROW_AGGREGATE
Finished creating table BORROW_AGGREGATE
Starting initialization of CONVERTIBLE_SECURITY_CURRENCY
Finished creating table CONVERTIBLE_SECURITY_CURRENCY
Starting initialization of DEBT_SECURITY
Finished creating table DEBT_SECURITY
Starting initialization of DEBT_SECURITY_REF_INSTRUMENT
Finished creating table DEBT_SECURITY_REF_INSTRUMENT
Starting initialization of DERIVATIVE_COUNTERPARTY
Finished creating table DERIVATIVE_COUNTERPARTY
Starting initialization of DESC_REF_INDEX_BASKET
Finished creating table DESC_REF_INDEX_BASKET
Starting initialization of DESC_REF_INDEX_COMPONENT
Finished creating table DESC_REF_INDEX_COMPONENT
Starting initialization of DESC_REF_OTHER
Finished creating table DESC_REF_OTHER
Starting initialization of EXPLANATORY_NOTE
Finished creating table EXPLANATORY_NOTE
Starting initialization of FLOATING_RATE_RESET_TENOR
Finished creating table FLOATING_

### Drop Columns Missing more than 85% of Values

In [7]:
def drop_columns(table_name):
    temp_cursor= conn.cursor()

    temp_cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,))
    result = temp_cursor.fetchone()

    # Check if the result is not None
    if not result:
        return("There are currently no tables in the database.")

    temp_cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    total_rows = temp_cursor.fetchone()[0]

    temp_cursor.execute(f"PRAGMA table_info({table_name})")
    columns = temp_cursor.fetchall()

    columns_to_drop = []
    columns_to_keep = []


    for column in columns:
        column_name = column[1]

        # Skip columns that end with ':1'
        if column_name.endswith(':1'):
            print(f"Skipping column {column_name} in table {table_name}")
            continue

        temp_cursor.execute(f"SELECT COUNT({column_name}) FROM {table_name} WHERE {column_name} = ''")
        null_count = temp_cursor.fetchone()[0]

        if null_count > 0.85 * total_rows:
            columns_to_drop.append(column_name)
        else:
            columns_to_keep.append(column_name)

    if columns_to_drop:
        # Create new table with remaining columns
        new_table_name = f"{table_name}_new"
        columns_definition = ', '.join(columns_to_keep)
        temp_cursor.execute(f"CREATE TABLE {new_table_name} ({columns_definition})")

        # Copy columns to new table
        temp_cursor.execute(f"INSERT INTO {new_table_name} ({columns_definition}) SELECT {columns_definition} FROM {table_name}")

        # Drop old table
        temp_cursor.execute(f"DROP TABLE {table_name}")

        # Rename new table to original table name
        temp_cursor.execute(f"ALTER TABLE {new_table_name} RENAME TO {table_name}")

        print(f"Dropped columns {', '.join(columns_to_drop)} from table {table_name}")
    else:
        print(f"No columns to drop from table {table_name}")
    
    temp_cursor.close()
    return columns_to_drop, columns_to_keep

In [8]:
columns_to_drop = {}
columns_to_keep = {}
for table in tables:
    columns_to_drop[table],columns_to_keep[table] = drop_columns(table)
print("Kept columns: ", columns_to_keep)
print("Dropped columns: ", columns_to_drop)

NameError: name 'tables' is not defined

In [None]:
empty_count = 0

for table, kept_columns in columns_to_keep.items():
    if not kept_columns:  # Checks if the list is empty
        print(f"Table '{table}' is empty.")
        empty_count += 1
    else:
        print(f"Table '{table}' has kept columns: {kept_columns}")

print("Empty Tables:", empty_count)

In [None]:
# View tables currently in database

cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
tables = [table[0] for table in tables]
print("Number of Tables:", len(tables))
print("Tables:", tables)

empty_count = 0

for table in tables:
    cursor.execute(f"SELECT COUNT(*) FROM {table}")
    count = cursor.fetchone()[0]
    print(f"Number of Rows in {table}:", count)

    if count == 0:
        empty_count += 1

print("Number of Empty Tables:", empty_count)
