In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

## Hopteam 

In [2]:
hop_team = pd.read_csv('data/Docgraph_Hop_Teaming_2018.csv', nrows = 100)
hop_team.head(5)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508062167,1730166109,350,370,53.922,72.612
1,1508065640,1730166109,25,25,49.8,55.006
2,1508052093,1730166109,16,16,109.5,70.593
3,1508172545,1730166109,14,14,103.357,75.483
4,1508285131,1730166109,20,21,89.952,89.88


#### Adding 'referral_id' as an index 

In [3]:
hop_team=hop_team.reset_index().rename(columns={'index': 'referral_id'})
hop_team.head(5)

Unnamed: 0,referral_id,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,0,1508062167,1730166109,350,370,53.922,72.612
1,1,1508065640,1730166109,25,25,49.8,55.006
2,2,1508052093,1730166109,16,16,109.5,70.593
3,3,1508172545,1730166109,14,14,103.357,75.483
4,4,1508285131,1730166109,20,21,89.952,89.88


#### Splitting hop_team table into 3 tables

In [4]:
from_npi=hop_team[['referral_id', 'from_npi']]
to_npi=hop_team[['referral_id', 'to_npi']]
referrals= hop_team[['referral_id', 'patient_count', 'transaction_count', 'average_day_wait', 'std_day_wait']]

### cbsa

In [5]:
cbsa_zip=pd.read_excel('data/ZIP_CBSA_122021.xlsx')
cbsa_zip.head(5)

Unnamed: 0,zip,cbsa,usps_zip_pref_city,usps_zip_pref_state,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,683,41900,SAN GERMAN,PR,0.999842,1.0,1.0,0.999855
1,683,32420,SAN GERMAN,PR,0.000158,0.0,0.0,0.000145
2,923,41980,SAN JUAN,PR,1.0,1.0,1.0,1.0
3,1010,44140,BRIMFIELD,MA,0.976896,1.0,1.0,0.977816
4,1010,49340,BRIMFIELD,MA,0.023104,0.0,0.0,0.022184


In [6]:
#changing the column type
cbsa_zip['zip']=cbsa_zip['zip'].astype(str).str.zfill(5)
cbsa_zip['cbsa']=cbsa_zip['cbsa'].astype(str).str.zfill(5)

#renaming columns 
cbsa_zip=cbsa_zip.rename(columns={'usps_zip_pref_city': 'city',
                                 'usps_zip_pref_state': 'state'})

cbsa_zip.head(5)

Unnamed: 0,zip,cbsa,city,state,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,683,41900,SAN GERMAN,PR,0.999842,1.0,1.0,0.999855
1,683,32420,SAN GERMAN,PR,0.000158,0.0,0.0,0.000145
2,923,41980,SAN JUAN,PR,1.0,1.0,1.0,1.0
3,1010,44140,BRIMFIELD,MA,0.976896,1.0,1.0,0.977816
4,1010,49340,BRIMFIELD,MA,0.023104,0.0,0.0,0.022184


### NPI

In [7]:
profile = pd.read_csv('data/npidata_pfile_20050523-20220213.csv', nrows = 100)
profile.head(5)

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Employer Identification Number (EIN),Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,...,Healthcare Provider Taxonomy Group_7,Healthcare Provider Taxonomy Group_8,Healthcare Provider Taxonomy Group_9,Healthcare Provider Taxonomy Group_10,Healthcare Provider Taxonomy Group_11,Healthcare Provider Taxonomy Group_12,Healthcare Provider Taxonomy Group_13,Healthcare Provider Taxonomy Group_14,Healthcare Provider Taxonomy Group_15,Certification Date
0,1679576722,1.0,,,,WIEBE,DAVID,A,,,...,,,,,,,,,,
1,1588667638,1.0,,,,PILCHER,WILLIAM,C,DR.,,...,,,,,,,,,,
2,1497758544,2.0,,<UNAVAIL>,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,...,,,,,,,,,,
3,1306849450,,,,,,,,,,...,,,,,,,,,,
4,1215930367,1.0,,,,GRESSOT,LAURENT,,DR.,,...,,,,,,,,,,


In [8]:
#list of all columns
col_list = profile.columns.values.tolist()
col_list

['NPI',
 'Entity Type Code',
 'Replacement NPI',
 'Employer Identification Number (EIN)',
 'Provider Organization Name (Legal Business Name)',
 'Provider Last Name (Legal Name)',
 'Provider First Name',
 'Provider Middle Name',
 'Provider Name Prefix Text',
 'Provider Name Suffix Text',
 'Provider Credential Text',
 'Provider Other Organization Name',
 'Provider Other Organization Name Type Code',
 'Provider Other Last Name',
 'Provider Other First Name',
 'Provider Other Middle Name',
 'Provider Other Name Prefix Text',
 'Provider Other Name Suffix Text',
 'Provider Other Credential Text',
 'Provider Other Last Name Type Code',
 'Provider First Line Business Mailing Address',
 'Provider Second Line Business Mailing Address',
 'Provider Business Mailing Address City Name',
 'Provider Business Mailing Address State Name',
 'Provider Business Mailing Address Postal Code',
 'Provider Business Mailing Address Country Code (If outside U.S.)',
 'Provider Business Mailing Address Telephone Nu

### creating taxonomy table

In [9]:
#extracting NPI and Taxonomy code/switch columns
taxes = profile.filter(regex = '^NPI$|(Taxonomy (Code|Switch))')


#using melt (kind of pivoting)
taxes_code_long = taxes.melt(id_vars = [col for col in taxes.columns if 'NPI' in col or 'Taxonomy Switch' in col],
               var_name = 'Taxonomy Code Number',
               value_name = 'Taxonomy Code')


taxes_long = taxes_code_long.melt(id_vars = [col for col in taxes_code_long.columns if 'NPI' in col or 'Taxonomy Code' in col],
                     var_name = 'Taxonomy Switch Number',
                     value_name = 'Taxonomy Switch')


#extracting numbers from the string
taxes_long['Taxonomy Code Number'] = taxes_long['Taxonomy Code Number'].str.extract('(\d+)')
taxes_long['Taxonomy Switch Number'] = taxes_long['Taxonomy Switch Number'].str.extract('(\d+)')


taxes_long


Unnamed: 0,NPI,Taxonomy Code Number,Taxonomy Code,Taxonomy Switch Number,Taxonomy Switch
0,1679576722,1,207X00000X,1,Y
1,1588667638,1,207RC0000X,1,Y
2,1497758544,1,251G00000X,1,Y
3,1306849450,1,,1,
4,1215930367,1,174400000X,1,N
...,...,...,...,...,...
22495,1649273673,15,,15,
22496,1083617013,15,,15,
22497,1992708929,15,,15,
22498,1801899836,15,,15,


In [10]:
primary_taxes= (taxes_long[(taxes_long['Taxonomy Code Number']==taxes_long['Taxonomy Switch Number'])
                          &
                          (taxes_long['Taxonomy Switch']=='Y')]
                          .drop(columns=['Taxonomy Code Number', 'Taxonomy Switch Number'])
    )
                          
primary_taxes          


Unnamed: 0,NPI,Taxonomy Code,Taxonomy Switch
0,1679576722,207X00000X,Y
1,1588667638,207RC0000X,Y
2,1497758544,251G00000X,Y
5,1023011178,251G00000X,Y
7,1841293990,231H00000X,Y
...,...,...,...
3227,1740283795,313M00000X,Y
3228,1659374601,207Q00000X,Y
4849,1043213093,3336H0001X,Y
4869,1720081763,3336C0003X,Y


In [11]:
#list of columns we want to keep in the npi table
profile_cols={'NPI',
             'Entity Type Code',
             'Provider Organization Name (Legal Business Name)',
            'Provider Last Name (Legal Name)',
            'Provider First Name',
            'Provider Middle Name',
            'Provider Name Prefix Text',
            'Provider Name Suffix Text',
            'Provider Credential Text',
             'Provider First Line Business Practice Location Address',
            'Provider Second Line Business Practice Location Address',
            'Provider Business Practice Location Address City Name',
            'Provider Business Practice Location Address State Name',
            'Provider Business Practice Location Address Postal Code'   
    
}

#changing the type of the column
profile['Provider Business Practice Location Address Postal Code']=profile['Provider Business Practice Location Address Postal Code'].astype(str)



In [12]:
#keeping the columns we want
profile=profile.loc[:, profile_cols] #approch 1

#profile=profile.filter(items=profile_cols) #approach 2
profile

Unnamed: 0,NPI,Provider Business Practice Location Address Postal Code,Provider First Name,Provider Middle Name,Provider Name Suffix Text,Provider Credential Text,Provider Name Prefix Text,Provider Organization Name (Legal Business Name),Provider Second Line Business Practice Location Address,Entity Type Code,Provider First Line Business Practice Location Address,Provider Business Practice Location Address State Name,Provider Last Name (Legal Name),Provider Business Practice Location Address City Name
0,1679576722,688472944.0,DAVID,A,,M.D.,,,,1.0,3500 CENTRAL AVE,NE,WIEBE,KEARNEY
1,1588667638,322044736.0,WILLIAM,C,,MD,DR.,,SUITE 300,1.0,1824 KING STREET,FL,PILCHER,JACKSONVILLE
2,1497758544,283044552.0,,,,,,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,2.0,3418 VILLAGE DR,NC,,FAYETTEVILLE
3,1306849450,,,,,,,,,,,,,
4,1215930367,770901243.0,LAURENT,,,M.D.,DR.,,,1.0,17323 RED OAK DR,TX,GRESSOT,HOUSTON
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1649273673,281052387.0,TODD,M,,M.D.,,,SUITE 300,1.0,1450 MATTHEWS TOWNSHIP PKWY,NC,KOPCZYNSKI,MATTHEWS
96,1083617013,31011325.0,,,,,,AMOSKEAG HEALTH,,2.0,145 HOLLIS ST,NH,,MANCHESTER
97,1992708929,303271610.0,,,,,,NOVAMED MANAGEMENT SERVICES LLC,STE 200,2.0,3200 DOWNWOOD CIR NW,GA,,ATLANTA
98,1801899836,958165120.0,SARAH,L,,PA-C,MRS.,,SUITE 410,1.0,2801 K ST,CA,ZICHELLA,SACRAMENTO


In [13]:
#merging profile and primary_taxes tables
profile=pd.merge(profile,primary_taxes)
profile.head(5)

Unnamed: 0,NPI,Provider Business Practice Location Address Postal Code,Provider First Name,Provider Middle Name,Provider Name Suffix Text,Provider Credential Text,Provider Name Prefix Text,Provider Organization Name (Legal Business Name),Provider Second Line Business Practice Location Address,Entity Type Code,Provider First Line Business Practice Location Address,Provider Business Practice Location Address State Name,Provider Last Name (Legal Name),Provider Business Practice Location Address City Name,Taxonomy Code,Taxonomy Switch
0,1679576722,688472944.0,DAVID,A,,M.D.,,,,1.0,3500 CENTRAL AVE,NE,WIEBE,KEARNEY,207X00000X,Y
1,1588667638,322044736.0,WILLIAM,C,,MD,DR.,,SUITE 300,1.0,1824 KING STREET,FL,PILCHER,JACKSONVILLE,207RC0000X,Y
2,1497758544,283044552.0,,,,,,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,2.0,3418 VILLAGE DR,NC,,FAYETTEVILLE,251G00000X,Y
3,1215930367,770901243.0,LAURENT,,,M.D.,DR.,,,1.0,17323 RED OAK DR,TX,GRESSOT,HOUSTON,207RH0003X,Y
4,1023011178,945594515.0,,,,,,COLLABRIA CARE,,2.0,414 S JEFFERSON ST,CA,,NAPA,251G00000X,Y


In [14]:
profile.columns = [c.replace(' ', '_') for c in profile.columns]
profile.columns

Index(['NPI', 'Provider_Business_Practice_Location_Address_Postal_Code',
       'Provider_First_Name', 'Provider_Middle_Name',
       'Provider_Name_Suffix_Text', 'Provider_Credential_Text',
       'Provider_Name_Prefix_Text',
       'Provider_Organization_Name_(Legal_Business_Name)',
       'Provider_Second_Line_Business_Practice_Location_Address',
       'Entity_Type_Code',
       'Provider_First_Line_Business_Practice_Location_Address',
       'Provider_Business_Practice_Location_Address_State_Name',
       'Provider_Last_Name_(Legal_Name)',
       'Provider_Business_Practice_Location_Address_City_Name',
       'Taxonomy_Code', 'Taxonomy_Switch'],
      dtype='object')

In [15]:
profile.head(3)

Unnamed: 0,NPI,Provider_Business_Practice_Location_Address_Postal_Code,Provider_First_Name,Provider_Middle_Name,Provider_Name_Suffix_Text,Provider_Credential_Text,Provider_Name_Prefix_Text,Provider_Organization_Name_(Legal_Business_Name),Provider_Second_Line_Business_Practice_Location_Address,Entity_Type_Code,Provider_First_Line_Business_Practice_Location_Address,Provider_Business_Practice_Location_Address_State_Name,Provider_Last_Name_(Legal_Name),Provider_Business_Practice_Location_Address_City_Name,Taxonomy_Code,Taxonomy_Switch
0,1679576722,688472944.0,DAVID,A,,M.D.,,,,1.0,3500 CENTRAL AVE,NE,WIEBE,KEARNEY,207X00000X,Y
1,1588667638,322044736.0,WILLIAM,C,,MD,DR.,,SUITE 300,1.0,1824 KING STREET,FL,PILCHER,JACKSONVILLE,207RC0000X,Y
2,1497758544,283044552.0,,,,,,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,2.0,3418 VILLAGE DR,NC,,FAYETTEVILLE,251G00000X,Y


In [25]:
taxonomy_code=pd.read_csv('data/nucc_taxonomy_220.csv')
taxonomy_code.head(5)

Unnamed: 0,Code,Grouping,Classification,Specialization,Definition,Notes,Display Name,Section
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Multi-Specialty Group,Individual
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Single Specialty Group,Individual
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician,Individual
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,Definition to come...,,Allergy Physician,Individual
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,Definition to come...,,Clinical & Laboratory Immunology (Allergy & Im...,Individual


In [22]:
#taxonomy_code.columns=[c.replace(' ', '_') for c in taxonomy_code.columns]

### Creating the SQLite database

In [19]:
db = sqlite3.connect('data/hopteam.sqlite')  

In [26]:
profile.to_sql('profile', db, if_exists='append', index=False)
from_npi.to_sql('referral_from', db, if_exists = 'append', index = False)
to_npi.to_sql('referral_to', db, if_exists = 'append', index = False)
referrals.to_sql('referrals', db, if_exists = 'append', index = False)
cbsa_zip.to_sql('zip_cbsa', db, if_exists = 'append', index = False)
taxonomy_code.to_sql('taxonomy_code', db, if_exists='append', index=False )


  sql.to_sql(


In [27]:
db.close()