# Imports

In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import sqlite3

# Create SQLite DBs from Hopteam and NPPES

## Explore NPPES CSVs

In [2]:
# Open each NPPES CSV as a chunked reader (1,000 rows per chunk) so a sample
# can be pulled with `next(...)` without reading the whole file.
end = pd.read_csv('../data/NPPES/endpoint.csv', chunksize=1000)
npi = pd.read_csv('../data/NPPES/npidata.csv', chunksize=1000)
other = pd.read_csv('../data/NPPES/othername.csv', chunksize=1000)
pl = pd.read_csv('../data/NPPES/pl.csv', chunksize=1000)

## endpoint

In [4]:
# Pull the next 1,000-row chunk from the endpoint reader and preview its first rows.
# NOTE(review): the saved output is indexed from 1000 and the execution counter
# skips In [3], so a chunk was presumably consumed by a since-removed cell; on a
# fresh kernel this cell would show rows 0-999 instead.
end_chunk1 = next(end)
end_chunk1.head()

Unnamed: 0,NPI,Endpoint Type,Endpoint Type Description,Endpoint,Affiliation,Endpoint Description,Affiliation Legal Business Name,Use Code,Use Description,Other Use Description,Content Type,Content Description,Other Content Description,Affiliation Address Line One,Affiliation Address Line Two,Affiliation Address City,Affiliation Address State,Affiliation Address Country,Affiliation Address Postal Code
1000,1639173594,DIRECT,Direct Messaging Address,julie.grannanfnpc.p63@direct.ahni.nextgenshare...,N,,,DIRECT,Direct,,OTHER,Other,CCDA,4101 Technology Ave,,New Albany,IN,US,471508548.0
1001,1548264401,DIRECT,Direct Messaging Address,robert.caffrey@digichart.direct-ci.com,N,,,,,,,,,19550 E 39th St S,Ste 300,Independence,MO,US,640572303.0
1002,1003810961,DIRECT,Direct Messaging Address,michael.d.marshall2@upmcdirect.com,Y,,UPMC,,,,,,,600 Grant St,Floor 58,Pittsburgh,PA,US,152192702.0
1003,1912901877,DIRECT,Direct Messaging Address,Direct@IllinoisEyeCenterPeoriaIL.CompulinkDire...,N,"CCD only, does not accept attachments",,DIRECT,Direct,,,,,8921 N Wood Sage Rd,,Peoria,IL,US,616157822.0
1004,1558365411,DIRECT,Direct Messaging Address,eric.kanter.p1@direct.rvc-nj.nextgenshare.com,N,Direct Address,,HIE,Health Information Exchange (HIE),,CSV,CSV,,349 E Northfield Rd,Ste 100,Livingston,NJ,US,70394802.0


In [5]:
# Summarise the column dtypes and non-null counts of the sampled endpoint chunk.
end_chunk1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 1000 to 1999
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   NPI                              1000 non-null   int64  
 1   Endpoint Type                    999 non-null    object 
 2   Endpoint Type Description        999 non-null    object 
 3   Endpoint                         999 non-null    object 
 4   Affiliation                      1000 non-null   object 
 5   Endpoint Description             291 non-null    object 
 6   Affiliation Legal Business Name  148 non-null    object 
 7   Use Code                         454 non-null    object 
 8   Use Description                  454 non-null    object 
 9   Other Use Description            39 non-null     object 
 10  Content Type                     307 non-null    object 
 11  Content Description              307 non-null    object 
 12  Other Content Des

## npi
    - 'NPI'
    - 'Entity Type Code'
    - 'Provider Organization Name (Legal Business Name)'
    - 'Provider Last Name (Legal Name)'
    - 'Provider First Name'
    - 'Provider Middle Name'
    - 'Provider Name Prefix Text'
    - 'Provider Name Suffix Text'
    - 'Provider Credential Text'
    - 'Provider First Line Business Practice Location Address'
    - 'Provider Second Line Business Practice Location Address'
    - 'Provider Business Practice Location Address City Name'
    - 'Provider Business Practice Location Address State Name'
    - 'Provider Business Practice Location Address Postal Code'

In [6]:
# Pull the next 1,000-row chunk from the npidata reader to explore its columns.
npi_chunk1 = next(npi)

In [7]:
# Derive each provider's primary taxonomy code from the wide taxonomy columns.
# Keep the NPI column plus every column whose name contains
# "Taxonomy Code" or "Taxonomy Switch".
taxes = npi_chunk1.filter(regex = '^NPI$|(Taxonomy (Code|Switch))')

# First melt: stack the code columns into rows, carrying NPI and all of the
# switch columns along as identifiers.
taxes_code_long = taxes.melt(id_vars = [col for col in taxes.columns if 'NPI' in col or 'Taxonomy Switch' in col],
           var_name = 'Taxonomy Code Number',
           value_name = 'Taxonomy Code'
          )
# Second melt: stack the switch columns as well, which pairs every code slot
# with every switch slot for each NPI.
taxes_long = taxes_code_long.melt(id_vars = [col for col in taxes_code_long.columns if 'NPI' in col or 'Taxonomy Code' in col],
                 var_name = 'Taxonomy Switch Number',
                 value_name = 'Taxonomy Switch'
            )

# Reduce the original column names to the digits they contain (the slot number).
# Raw strings keep `\d` a regex token instead of an invalid string escape, and
# `expand = False` makes `str.extract` return a Series for the single group.
taxes_long['Taxonomy Code Number'] = taxes_long['Taxonomy Code Number'].str.extract(r'(\d+)', expand = False)
taxes_long['Taxonomy Switch Number'] = taxes_long['Taxonomy Switch Number'].str.extract(r'(\d+)', expand = False)

# A code is primary when it is paired with the switch of the same slot number
# and that switch is 'Y'. The helper slot columns are dropped afterwards.
primary_taxes = (taxes_long[(taxes_long['Taxonomy Code Number'] == taxes_long['Taxonomy Switch Number'])
                & 
                (taxes_long['Taxonomy Switch'] == 'Y')]
                .drop(columns = ['Taxonomy Code Number','Taxonomy Switch Number'])
            )

primary_taxes

Unnamed: 0,NPI,Taxonomy Code,Taxonomy Switch
0,1962405183,2085R0202X,Y
1,1871596098,174400000X,Y
2,1780687905,207RG0100X,Y
3,1598768715,174400000X,Y
4,1407859622,363LF0000X,Y
...,...,...,...
96505,1568465789,225100000X,Y
128559,1295738409,3336H0001X,Y
144705,1962405159,261QP0905X,Y
208019,1952304164,225100000X,Y


In [8]:
 profile_cols = ['NPI',
                 'Entity Type Code',
                 'Provider Organization Name (Legal Business Name)',
                 'Provider Last Name (Legal Name)',
                 'Provider First Name',
                 'Provider Middle Name',
                 'Provider Name Prefix Text',
                 'Provider Name Suffix Text',
                 'Provider Credential Text',
                 'Provider First Line Business Practice Location Address',
                 'Provider Second Line Business Practice Location Address',
                 'Provider Business Practice Location Address City Name',
                 'Provider Business Practice Location Address State Name',
                 'Provider Business Practice Location Address Postal Code']
profile = npi_chunk1.filter(items = profile_cols)
profile

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,Provider Second Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code
1000,1962405183,1.0,,FOX,DANIEL,R,DR.,,MD,11995 SINGLETREE LN STE 500,,EDEN PRAIRIE,MN,553445349.0
1001,1871596098,1.0,,DIAZ-LACAYO,MARVIN,,,,MD,21150 BISCAYNE BLVD,STE 101,AVENTURA,FL,331801226.0
1002,1780687905,1.0,,KIRK,ROBERT,M,DR.,,M.D.,815 E PARRISH AVE,STE 450,OWENSBORO,KY,423033223.0
1003,1598768715,1.0,,MOSCA,PHILIP,,DR.,,MD,4200 S DOUGLAS AVE,STE 300,OKLAHOMA CITY,OK,731093215.0
1004,1407859622,1.0,,MARKWARDT,GEORGE,LEE,MR.,,NP,55 CENTRAL PLZ,,ILION,NY,133571701.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1659374775,1.0,,CLARK,MARY,MCGINN,DR.,,PH.D.,10981 SAN DIEGO MISSION RD,STE 114,SAN DIEGO,CA,921082448.0
1996,1568465680,1.0,,HORGAN,JOHN,D.,,,M.D.,10707 PACIFIC ST,SUITE 101,OMAHA,NE,681144762.0
1997,1477556595,1.0,,CLOYD,DAVID,G,DR.,,MD,2710 E HARNEY ST,SUITE 100,LARAMIE,WY,820722884.0
1998,1386647402,2.0,"PHYSICIANS DAY SURGERY CENTER, LLC",,,,,,,850 111TH AVE N,,NAPLES,FL,341081803.0


In [10]:
# Attach the primary taxonomy to each profile row. The join key is spelled out
# so the merge cannot silently change if the two frames ever share another
# column name; the inner join keeps only NPIs present in both frames.
nppes_chunk = pd.merge(profile, primary_taxes, on = 'NPI', how = 'inner')
nppes_chunk

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,Provider Second Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code,Taxonomy Code,Taxonomy Switch
0,1962405183,1.0,,FOX,DANIEL,R,DR.,,MD,11995 SINGLETREE LN STE 500,,EDEN PRAIRIE,MN,553445349.0,2085R0202X,Y
1,1871596098,1.0,,DIAZ-LACAYO,MARVIN,,,,MD,21150 BISCAYNE BLVD,STE 101,AVENTURA,FL,331801226.0,174400000X,Y
2,1780687905,1.0,,KIRK,ROBERT,M,DR.,,M.D.,815 E PARRISH AVE,STE 450,OWENSBORO,KY,423033223.0,207RG0100X,Y
3,1598768715,1.0,,MOSCA,PHILIP,,DR.,,MD,4200 S DOUGLAS AVE,STE 300,OKLAHOMA CITY,OK,731093215.0,174400000X,Y
4,1407859622,1.0,,MARKWARDT,GEORGE,LEE,MR.,,NP,55 CENTRAL PLZ,,ILION,NY,133571701.0,363LF0000X,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,1659374775,1.0,,CLARK,MARY,MCGINN,DR.,,PH.D.,10981 SAN DIEGO MISSION RD,STE 114,SAN DIEGO,CA,921082448.0,103TC1900X,Y
920,1568465680,1.0,,HORGAN,JOHN,D.,,,M.D.,10707 PACIFIC ST,SUITE 101,OMAHA,NE,681144762.0,208800000X,Y
921,1477556595,1.0,,CLOYD,DAVID,G,DR.,,MD,2710 E HARNEY ST,SUITE 100,LARAMIE,WY,820722884.0,207V00000X,Y
922,1386647402,2.0,"PHYSICIANS DAY SURGERY CENTER, LLC",,,,,,,850 111TH AVE N,,NAPLES,FL,341081803.0,261QA1903X,Y


## pl

## Hopteam

In [12]:
# Open the Hop Teaming CSV as a chunked reader (1,000 rows per chunk) and pull
# the first chunk for inspection.
ht = pd.read_csv(
    '../data/hop_team/DocGraph_Hop_Teaming.csv',
    chunksize=1000,
)
ht_chunk = next(ht)

In [13]:
# Display the sampled Hop Teaming chunk.
ht_chunk

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508062167,1730166109,350,370,53.922,72.612
1,1508065640,1730166109,25,25,49.800,55.006
2,1508052093,1730166109,16,16,109.500,70.593
3,1508172545,1730166109,14,14,103.357,75.483
4,1508285131,1730166109,20,21,89.952,89.880
...,...,...,...,...,...,...
995,1508868506,1730177452,13,13,131.154,118.476
996,1508864257,1730177452,17,18,61.222,53.114
997,1508875980,1730177452,15,16,107.500,80.602
998,1508870163,1730177478,18,18,68.667,53.639


# Create SQLite Database

## Build and Insert nppes Table into Referral Database

In [None]:
# Save these column names for later: they define the provider profile kept
# from each npidata chunk.
profile_cols = ['NPI',
                'Entity Type Code',
                'Provider Organization Name (Legal Business Name)',
                'Provider Last Name (Legal Name)',
                'Provider First Name',
                'Provider Middle Name',
                'Provider Name Prefix Text',
                'Provider Name Suffix Text',
                'Provider Credential Text',
                'Provider First Line Business Practice Location Address',
                'Provider Second Line Business Practice Location Address',
                'Provider Business Practice Location Address City Name',
                'Provider Business Practice Location Address State Name',
                'Provider Business Practice Location Address Postal Code']

# Create the SQLite database
# NOTE(review): this path is relative to the notebook's own directory, while
# the input CSVs are read from '../data/...'; confirm 'data/' is intended.
db = sqlite3.connect('data/hopteam.sqlite')

# Chunk loop through npidata to build the nppes table in the hopteam database.
# NOTE: `if_exists = 'append'` means re-running this cell appends the same rows
# again; remove the table (or the database file) before a re-run.
for chunk in tqdm(pd.read_csv('../data/NPPES/npidata.csv', chunksize = 10000)):

    # Build the primary taxonomy table for this chunk: keep NPI plus every
    # column whose name contains "Taxonomy Code" or "Taxonomy Switch".
    taxes = chunk.filter(regex = '^NPI$|(Taxonomy (Code|Switch))')

    # Stack the code columns, then the switch columns, so every code slot is
    # paired with every switch slot for each NPI.
    taxes_code_long = taxes.melt(id_vars = [col for col in taxes.columns if 'NPI' in col or 'Taxonomy Switch' in col],
               var_name = 'Taxonomy Code Number',
               value_name = 'Taxonomy Code'
              )
    taxes_long = taxes_code_long.melt(id_vars = [col for col in taxes_code_long.columns if 'NPI' in col or 'Taxonomy Code' in col],
                     var_name = 'Taxonomy Switch Number',
                     value_name = 'Taxonomy Switch'
                )

    # Reduce the original column names to the digits they contain. Raw strings
    # keep `\d` a regex token instead of an invalid string escape.
    taxes_long['Taxonomy Code Number'] = taxes_long['Taxonomy Code Number'].str.extract(r'(\d+)', expand = False)
    taxes_long['Taxonomy Switch Number'] = taxes_long['Taxonomy Switch Number'].str.extract(r'(\d+)', expand = False)

    # Keep the code whose slot number matches a switch that is set to 'Y'.
    primary_taxes = (taxes_long[(taxes_long['Taxonomy Code Number'] == taxes_long['Taxonomy Switch Number'])
                    &
                    (taxes_long['Taxonomy Switch'] == 'Y')]
                    .drop(columns = ['Taxonomy Code Number','Taxonomy Switch Number'])
                )

    # Build the profile table for this chunk
    profile = chunk.filter(items = profile_cols)

    # Merge profile and taxonomy tables to get one row per provider profile
    # with its primary taxonomy; the join key is explicit.
    nppes_chunk = pd.merge(profile, primary_taxes, on = 'NPI', how = 'inner')

    # Append the merged chunk to the nppes table in the hopteam database.
    # (Previously the raw `chunk` was written, discarding the work above.)
    nppes_chunk.to_sql('nppes', db, if_exists = 'append', index = False)

## Insert hopteam Table into Referral Database

In [None]:
# Stream the Hop Teaming CSV in 10,000-row chunks and append each chunk to the
# `hopteam` table through the open `db` connection.
for chunk in tqdm(
    pd.read_csv('../data/hop_team/DocGraph_Hop_Teaming.csv', chunksize=10000)
):
    chunk.to_sql('hopteam', db, if_exists='append', index=False)

In [None]:
# Close the SQLite connection opened when the database was created.
db.close()