# Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import sqlite3

# Formulas

In [2]:
def zip_filler(x):
    try:
        if len(x) > 5:
            return x.zfill(9)[:5]
        elif len(x) > 0:
            return x.zfill(5)
    except:
        return np.nan

# Create SQLite DBs from Hopteam and NPPES

## Taxonomy Detail

In [3]:
tax_detail = pd.read_csv('../data/NPPES/nucc_taxonomy_220.csv')
tax_detail

Unnamed: 0,Code,Grouping,Classification,Specialization,Definition,Notes,Display Name,Section
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Multi-Specialty Group,Individual
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Single Specialty Group,Individual
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician,Individual
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,Definition to come...,,Allergy Physician,Individual
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,Definition to come...,,Clinical & Laboratory Immunology (Allergy & Im...,Individual
...,...,...,...,...,...,...,...,...
863,343800000X,Transportation Services,Secured Medical Transport (VAN),,A public or privately owned transportation ser...,,Secured Medical Transport (VAN),Non-Individual
864,344600000X,Transportation Services,Taxi,,A land commercial vehicle used for the transpo...,,Taxi,Non-Individual
865,347D00000X,Transportation Services,Train,,An organization or business licensed to provid...,,Train,Non-Individual
866,347E00000X,Transportation Services,Transportation Broker,,An organization that provides transportation f...,Source: Section 6083 of the Deficit Reduction ...,Transportation Broker,Non-Individual


## npi
    - 'NPI'
    - 'Entity Type Code'
    - 'Provider Organization Name (Legal Business Name)'
    - 'Provider Last Name (Legal Name)'
    - 'Provider First Name'
    - 'Provider Middle Name'
    - 'Provider Name Prefix Text'
    - 'Provider Name Suffix Text'
    - 'Provider Credential Text'
    - 'Provider First Line Business Practice Location Address'
    - 'Provider Second Line Business Practice Location Address'
    - 'Provider Business Practice Location Address City Name'
    - 'Provider Business Practice Location Address State Name'
    - 'Provider Business Practice Location Address Postal Code'

In [4]:
npi = pd.read_csv('../data/NPPES/npidata.csv', chunksize = 1000)
npi_chunk1 = next(npi)

In [5]:
taxes = npi_chunk1.filter(regex = '^NPI$|(Taxonomy (Code|Switch))')

taxes_code_long = taxes.melt(id_vars = [col for col in taxes.columns if 'NPI' in col or 'Taxonomy Switch' in col],
           var_name = 'Taxonomy Code Number',
           value_name = 'Taxonomy Code'
          )
taxes_long = taxes_code_long.melt(id_vars = [col for col in taxes_code_long.columns if 'NPI' in col or 'Taxonomy Code' in col],
                 var_name = 'Taxonomy Switch Number',
                 value_name = 'Taxonomy Switch'
            )

taxes_long['Taxonomy Code Number'] = taxes_long['Taxonomy Code Number'].str.extract('(\d+)')
taxes_long['Taxonomy Switch Number'] = taxes_long['Taxonomy Switch Number'].str.extract('(\d+)')

primary_taxes = (taxes_long[(taxes_long['Taxonomy Code Number'] == taxes_long['Taxonomy Switch Number']) 
                & 
                (taxes_long['Taxonomy Switch'] == 'Y')]
                .drop(columns = ['Taxonomy Code Number','Taxonomy Switch Number'])
            )

primary_taxes

Unnamed: 0,NPI,Taxonomy Code,Taxonomy Switch
0,1679576722,207X00000X,Y
1,1588667638,207RC0000X,Y
2,1497758544,251G00000X,Y
5,1023011178,251G00000X,Y
7,1841293990,231H00000X,Y
...,...,...,...
96426,1699778829,207X00000X,Y
112942,1811990047,333600000X,Y
128714,1164425336,3336H0001X,Y
128964,1619970845,3336H0001X,Y


In [6]:
 prof_cols = ['NPI',
                 'Entity Type Code',
                 'Provider Organization Name (Legal Business Name)',
                 'Provider Last Name (Legal Name)',
                 'Provider First Name',
                 'Provider Middle Name',
                 'Provider Name Prefix Text',
                 'Provider Name Suffix Text',
                 'Provider Credential Text',
                 'Provider First Line Business Practice Location Address',
                 'Provider Second Line Business Practice Location Address',
                 'Provider Business Practice Location Address City Name',
                 'Provider Business Practice Location Address State Name',
                 'Provider Business Practice Location Address Postal Code']
prof = npi_chunk1.filter(items = prof_cols)

In [7]:
prof['Provider Business Practice Location Address Postal Code'] = (prof['Provider Business Practice Location Address Postal Code']
                                                                      .astype(str)
                                                                      .str.replace('.0', '', regex = True)
                                                                      .apply(zip_filler)
                                                                     )
prof

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,Provider Second Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,68847
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,00324
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,00284
3,1306849450,,,,,,,,,,,,,00nan
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,71243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1417950635,1.0,,MARINO,CHRIS,J,DR.,,M.D.,12670 WHITEHALL DR,,FORT MYERS,FL,00337
996,1326041542,1.0,,TAFEL,ALLEN,C,DR.,,M.D.,2531 CLEVELAND AVE,STE 1,FT MYERS,FL,33140
997,1235132457,1.0,,ADAMSON,BRENT,E,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,68847
998,1144223363,1.0,,KATZ,MARC,A,DR.,,DPM,2919 W SWANN AVE,STE 203,TAMPA,FL,33938


In [8]:
# Save these cols for chunking later
npi_cols = prof.columns.tolist() + taxes.columns[1:].tolist()

In [9]:
profile_chunk = pd.merge(prof, primary_taxes)
profile_chunk.columns = [x.lower().replace(' ', '_') for x in profile_chunk.columns]
profile_chunk

Unnamed: 0,npi,entity_type_code,provider_organization_name_(legal_business_name),provider_last_name_(legal_name),provider_first_name,provider_middle_name,provider_name_prefix_text,provider_name_suffix_text,provider_credential_text,provider_first_line_business_practice_location_address,provider_second_line_business_practice_location_address,provider_business_practice_location_address_city_name,provider_business_practice_location_address_state_name,provider_business_practice_location_address_postal_code,taxonomy_code,taxonomy_switch
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,68847,207X00000X,Y
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,00324,207RC0000X,Y
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,00284,251G00000X,Y
3,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,71243,207RH0003X,Y
4,1023011178,2.0,COLLABRIA CARE,,,,,,,414 S JEFFERSON ST,,NAPA,CA,94559,251G00000X,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
927,1417950635,1.0,,MARINO,CHRIS,J,DR.,,M.D.,12670 WHITEHALL DR,,FORT MYERS,FL,00337,2084N0400X,Y
928,1326041542,1.0,,TAFEL,ALLEN,C,DR.,,M.D.,2531 CLEVELAND AVE,STE 1,FT MYERS,FL,33140,2081P2900X,Y
929,1235132457,1.0,,ADAMSON,BRENT,E,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,68847,207X00000X,Y
930,1144223363,1.0,,KATZ,MARC,A,DR.,,DPM,2919 W SWANN AVE,STE 203,TAMPA,FL,33938,213E00000X,Y


## hopteam

In [10]:
ht = pd.read_csv('../data/hop_team/DocGraph_Hop_Teaming.csv', chunksize = 1000)
ht_chunk = next(ht)

In [11]:
ht_chunk

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508062167,1730166109,350,370,53.922,72.612
1,1508065640,1730166109,25,25,49.800,55.006
2,1508052093,1730166109,16,16,109.500,70.593
3,1508172545,1730166109,14,14,103.357,75.483
4,1508285131,1730166109,20,21,89.952,89.880
...,...,...,...,...,...,...
995,1508868506,1730177452,13,13,131.154,118.476
996,1508864257,1730177452,17,18,61.222,53.114
997,1508875980,1730177452,15,16,107.500,80.602
998,1508870163,1730177478,18,18,68.667,53.639


In [12]:
ht_chunk = ht_chunk.reset_index().rename(columns = {'index': 'referral_id'})
from_npi = ht_chunk[['referral_id', 'from_npi']]
to_npi = ht_chunk[['referral_id', 'to_npi']]
referrals = ht_chunk[['referral_id', 
                      'patient_count',
                      'transaction_count',
                      'average_day_wait',
                      'std_day_wait']]

In [13]:
referrals

Unnamed: 0,referral_id,patient_count,transaction_count,average_day_wait,std_day_wait
0,0,350,370,53.922,72.612
1,1,25,25,49.800,55.006
2,2,16,16,109.500,70.593
3,3,14,14,103.357,75.483
4,4,20,21,89.952,89.880
...,...,...,...,...,...
995,995,13,13,131.154,118.476
996,996,17,18,61.222,53.114
997,997,15,16,107.500,80.602
998,998,18,18,68.667,53.639


## cbsa data

In [14]:
zip_cbsa = pd.read_excel('../data/NPPES/ZIP_CBSA_122021.xlsx')

In [15]:
zip_cbsa['zip'] = zip_cbsa['zip'].astype(str).str.zfill(5)
zip_cbsa['cbsa'] = zip_cbsa['cbsa'].astype(str).str.zfill(5)
zip_cbsa = zip_cbsa.rename(columns = {'usps_zip_pref_city': 'city',
                                      'usps_zip_pref_state': 'state'})

In [16]:
zip_cbsa

Unnamed: 0,zip,cbsa,city,state,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,00683,41900,SAN GERMAN,PR,0.999842,1.0,1.0,0.999855
1,00683,32420,SAN GERMAN,PR,0.000158,0.0,0.0,0.000145
2,00923,41980,SAN JUAN,PR,1.000000,1.0,1.0,1.000000
3,01010,44140,BRIMFIELD,MA,0.976896,1.0,1.0,0.977816
4,01010,49340,BRIMFIELD,MA,0.023104,0.0,0.0,0.022184
...,...,...,...,...,...,...,...,...
47479,60684,16980,CHICAGO,IL,0.000000,1.0,0.0,1.000000
47480,33945,15980,PINELAND,FL,0.000000,0.0,1.0,1.000000
47481,78144,99999,PANNA MARIA,TX,0.000000,1.0,0.0,1.000000
47482,12257,10580,ALBANY,NY,0.000000,1.0,0.0,1.000000


# Create SQLite Database
    - profile
    - referral_from
    - referral_to
    - referral
    - taxonomy
    - zip_cbsa

## Build and Insert profile Table into hopteam Database

In [None]:
# Save these column names for later
prof_cols = ['NPI',
             'Entity Type Code',
             'Provider Organization Name (Legal Business Name)',
             'Provider Last Name (Legal Name)',
             'Provider First Name',
             'Provider Middle Name',
             'Provider Name Prefix Text',
             'Provider Name Suffix Text',
             'Provider Credential Text',
             'Provider First Line Business Practice Location Address',
             'Provider Second Line Business Practice Location Address',
             'Provider Business Practice Location Address City Name',
             'Provider Business Practice Location Address State Name',
             'Provider Business Practice Location Address Postal Code']

# Create the SQLite database
db = sqlite3.connect('../data/hopteam.sqlite')    

# Chunk loop through npidata to build the nppes table in the hopteam database
for chunk in tqdm(pd.read_csv('../data/NPPES/npidata.csv', 
                              usecols = npi_cols,
                              dtype = {'Provider Business Practice Location Address Postal Code': str},
                              chunksize = 100000)):
  
    # Build the primary taxonomy table for this chunk
    taxes = chunk.filter(regex = '^NPI$|(Taxonomy (Code|Switch))')

    taxes_code_long = taxes.melt(id_vars = [col for col in taxes.columns if 'NPI' in col or 'Taxonomy Switch' in col],
               var_name = 'Taxonomy Code Number',
               value_name = 'Taxonomy Code'
              )
    taxes_long = taxes_code_long.melt(id_vars = [col for col in taxes_code_long.columns if 'NPI' in col or 'Taxonomy Code' in col],
                     var_name = 'Taxonomy Switch Number',
                     value_name = 'Taxonomy Switch'
                )

    taxes_long['Taxonomy Code Number'] = taxes_long['Taxonomy Code Number'].str.extract('(\d+)')
    taxes_long['Taxonomy Switch Number'] = taxes_long['Taxonomy Switch Number'].str.extract('(\d+)')

    primary_taxes = (taxes_long[(taxes_long['Taxonomy Code Number'] == taxes_long['Taxonomy Switch Number']) 
                    & 
                    (taxes_long['Taxonomy Switch'] == 'Y')]
                    .drop(columns = ['Taxonomy Code Number','Taxonomy Switch Number'])
                )
    
    # Build the profile table for this chunk
    
    prof = chunk.filter(items = prof_cols)
    prof['Provider Business Practice Location Address Postal Code'] = (prof['Provider Business Practice Location Address Postal Code']
                                                                      .apply(zip_filler)
                                                                     )
    
    # Merge profile and taxonomy tables to get
    
    profile_chunk = pd.merge(prof, primary_taxes)
    
    profile_chunk.columns = [x.lower().replace(' ', '_') for x in profile_chunk.columns]
    
    # Append chunk to nppes table in hopteam database
    
    profile_chunk.to_sql('profile', db, if_exists = 'append', index = False)

0it [00:00, ?it/s]

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


## Build and Insert referral tables into hopteam Database

In [None]:
for chunk in tqdm(pd.read_csv('../data/hop_team/DocGraph_Hop_Teaming.csv', chunksize = 10000)):
    ht_chunk = chunk.reset_index().rename(columns = {'index': 'referral_id'})
    from_npi_chunk = ht_chunk[['referral_id', 'from_npi']]
    to_npi_chunk = ht_chunk[['referral_id', 'to_npi']]
    referrals_chunk = ht_chunk[['referral_id', 
                          'patient_count',
                          'transaction_count',
                          'average_day_wait',
                          'std_day_wait']]
    
    from_npi_chunk.to_sql('referral_from', db, if_exists = 'append', index = False)
    to_npi_chunk.to_sql('referral_to', db, if_exists = 'append', index = False)
    referrals_chunk.to_sql('referrals', db, if_exists = 'append', index = False)

In [None]:
tax_detail.to_sql('taxonomy', db, if_exists = 'append', index = False)
zip_cbsa.to_sql('zip_cbsa', db, if_exists = 'append', index = False)

In [21]:
db.close()