In [54]:
import pandas as pd

In [67]:
# Explicit dtypes for every column that is kept further down in the notebook.
# Columns that are not listed here are still loaded, with pandas inferring the type.
schema = {
    # Identifiers and codes are read as text rather than inferred as numbers.
    'DESYNPUF_ID': str,
    'CLM_ID': str,
    'SEGMENT': str,
    # All date columns are read as YYYYMMDD text so that they share one dtype
    # and so that a missing value does not turn the column into floats, which
    # would not match the '%Y%m%d' format used when parsing them later.
    'CLM_FROM_DT': str,
    'CLM_THRU_DT': str,
    'CLM_ADMSN_DT': str,
    'NCH_BENE_DSCHRG_DT': str,
    'PRVDR_NUM': str,
    'AT_PHYSN_NPI': str,
    'OP_PHYSN_NPI': str,
    'OT_PHYSN_NPI': str,
    'ADMTNG_ICD9_DGNS_CD': str,
    'CLM_DRG_CD': str,
    # Monetary amounts.
    'CLM_PMT_AMT': float,
    'NCH_PRMRY_PYR_CLM_PD_AMT': float,
    'CLM_PASS_THRU_PER_DIEM_AMT': float,
    'NCH_BENE_IP_DDCTBL_AMT': float,
    'NCH_BENE_PTA_COINSRNC_LBLTY_AM': float,
    'NCH_BENE_BLOOD_DDCTBL_LBLTY_AM': float,
    # Nullable integer: stays integral even when some rows have no value.
    'CLM_UTLZTN_DAY_CNT': 'Int64',
}

In [68]:
# Load the raw claims file; columns named in `schema` get the dtype given there,
# every other column is typed by pandas' inference.
df = pd.read_csv(filepath_or_buffer='raw_inpatient_claims.csv', dtype=schema)

In [69]:
# Show the dtype each loaded column ended up with.
df.dtypes

DESYNPUF_ID     object
CLM_ID          object
SEGMENT         object
CLM_FROM_DT     object
CLM_THRU_DT     object
                ...   
HCPCS_CD_41    float64
HCPCS_CD_42    float64
HCPCS_CD_43    float64
HCPCS_CD_44    float64
HCPCS_CD_45    float64
Length: 81, dtype: object

In [70]:
# List every column name present in the loaded frame.
df.columns

Index(['DESYNPUF_ID', 'CLM_ID', 'SEGMENT', 'CLM_FROM_DT', 'CLM_THRU_DT',
       'PRVDR_NUM', 'CLM_PMT_AMT', 'NCH_PRMRY_PYR_CLM_PD_AMT', 'AT_PHYSN_NPI',
       'OP_PHYSN_NPI', 'OT_PHYSN_NPI', 'CLM_ADMSN_DT', 'ADMTNG_ICD9_DGNS_CD',
       'CLM_PASS_THRU_PER_DIEM_AMT', 'NCH_BENE_IP_DDCTBL_AMT',
       'NCH_BENE_PTA_COINSRNC_LBLTY_AM', 'NCH_BENE_BLOOD_DDCTBL_LBLTY_AM',
       'CLM_UTLZTN_DAY_CNT', 'NCH_BENE_DSCHRG_DT', 'CLM_DRG_CD',
       'ICD9_DGNS_CD_1', 'ICD9_DGNS_CD_2', 'ICD9_DGNS_CD_3', 'ICD9_DGNS_CD_4',
       'ICD9_DGNS_CD_5', 'ICD9_DGNS_CD_6', 'ICD9_DGNS_CD_7', 'ICD9_DGNS_CD_8',
       'ICD9_DGNS_CD_9', 'ICD9_DGNS_CD_10', 'ICD9_PRCDR_CD_1',
       'ICD9_PRCDR_CD_2', 'ICD9_PRCDR_CD_3', 'ICD9_PRCDR_CD_4',
       'ICD9_PRCDR_CD_5', 'ICD9_PRCDR_CD_6', 'HCPCS_CD_1', 'HCPCS_CD_2',
       'HCPCS_CD_3', 'HCPCS_CD_4', 'HCPCS_CD_5', 'HCPCS_CD_6', 'HCPCS_CD_7',
       'HCPCS_CD_8', 'HCPCS_CD_9', 'HCPCS_CD_10', 'HCPCS_CD_11', 'HCPCS_CD_12',
       'HCPCS_CD_13', 'HCPCS_CD_14', 'HCPCS_CD_15', 

In [71]:
# Columns retained for the simplified extract, listed in their original file order.
col_keep = [
    'DESYNPUF_ID',
    'CLM_ID',
    'SEGMENT',
    'CLM_FROM_DT',
    'CLM_THRU_DT',
    'PRVDR_NUM',
    'CLM_PMT_AMT',
    'NCH_PRMRY_PYR_CLM_PD_AMT',
    'AT_PHYSN_NPI',
    'OP_PHYSN_NPI',
    'OT_PHYSN_NPI',
    'CLM_ADMSN_DT',
    'ADMTNG_ICD9_DGNS_CD',
    'CLM_PASS_THRU_PER_DIEM_AMT',
    'NCH_BENE_IP_DDCTBL_AMT',
    'NCH_BENE_PTA_COINSRNC_LBLTY_AM',
    'NCH_BENE_BLOOD_DDCTBL_LBLTY_AM',
    'CLM_UTLZTN_DAY_CNT',
    'NCH_BENE_DSCHRG_DT',
    'CLM_DRG_CD',
]

In [72]:
# Keep only the columns named in `col_keep`. The explicit .copy() makes the result
# an independent frame, so the column assignments in later cells are not treated
# as writes to a selection of the full frame (SettingWithCopyWarning).
df = df[col_keep].copy()

In [73]:
# Number of columns left after the selection.
df.shape[1]

20

In [74]:
# Confirm which columns remain in the frame.
df.columns

Index(['DESYNPUF_ID', 'CLM_ID', 'SEGMENT', 'CLM_FROM_DT', 'CLM_THRU_DT',
       'PRVDR_NUM', 'CLM_PMT_AMT', 'NCH_PRMRY_PYR_CLM_PD_AMT', 'AT_PHYSN_NPI',
       'OP_PHYSN_NPI', 'OT_PHYSN_NPI', 'CLM_ADMSN_DT', 'ADMTNG_ICD9_DGNS_CD',
       'CLM_PASS_THRU_PER_DIEM_AMT', 'NCH_BENE_IP_DDCTBL_AMT',
       'NCH_BENE_PTA_COINSRNC_LBLTY_AM', 'NCH_BENE_BLOOD_DDCTBL_LBLTY_AM',
       'CLM_UTLZTN_DAY_CNT', 'NCH_BENE_DSCHRG_DT', 'CLM_DRG_CD'],
      dtype='object')

In [75]:
# Parse the YYYYMMDD admission and discharge columns into datetimes, then derive
# the length of stay in whole days (discharge minus admission).
admitted_on = pd.to_datetime(df['CLM_ADMSN_DT'], format='%Y%m%d')
df['Admission Date'] = admitted_on
discharged_on = pd.to_datetime(df['NCH_BENE_DSCHRG_DT'], format='%Y%m%d')
df['Discharge Date'] = discharged_on
stay_length = df['Discharge Date'] - df['Admission Date']
df['Admission Duration Days'] = stay_length.dt.days

In [76]:
# Replace the source column codes with human-readable labels.
# Keys are the original column names; values are the labels used from here on.
# Columns that are not listed (e.g. the derived date columns) keep their names.
name_map = {
    'DESYNPUF_ID': 'Beneficiary Code',
    'CLM_ID': 'Claim ID',
    'SEGMENT': 'Claim Line Segment',
    'CLM_FROM_DT': 'Claims start date',
    'CLM_THRU_DT': 'Claims end date',
    'PRVDR_NUM': 'Provider Institution',
    'CLM_PMT_AMT': 'Claim Payment Amount',
    'NCH_PRMRY_PYR_CLM_PD_AMT': 'NCH Primary Payer Claim Paid Amount',
    'AT_PHYSN_NPI': 'Attending Physician National Provider Identifier Number',
    'OP_PHYSN_NPI': 'Operating Physician National Provider Identifier Number',
    'OT_PHYSN_NPI': 'Other Physician National Provider Identifier Number',
    'CLM_ADMSN_DT': 'Inpatient admission date',
    'ADMTNG_ICD9_DGNS_CD': 'Claim Admitting Diagnosis Code',
    'CLM_PASS_THRU_PER_DIEM_AMT': 'Claim Pass Thru Per Diem Amount',
    'NCH_BENE_IP_DDCTBL_AMT': 'NCH Beneficiary Inpatient Deductible Amount',
    'NCH_BENE_PTA_COINSRNC_LBLTY_AM': 'NCH Beneficiary Part A Coinsurance Liability Amount',
    'NCH_BENE_BLOOD_DDCTBL_LBLTY_AM': 'NCH Beneficiary Blood Deductible Liability Amount',
    'CLM_UTLZTN_DAY_CNT': 'Claim Utilization Day Count',
    'NCH_BENE_DSCHRG_DT': 'Inpatient discharged date',
    'CLM_DRG_CD': 'Claim Diagnosis Related Group Code',
}
# Reassign rather than using inplace=True: rename returns a new frame, which keeps
# the step explicit and avoids mutating an object other names may still refer to.
df = df.rename(columns=name_map)


In [77]:
# Preview the first five rows of the renamed frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,Beneficiary Code,Claim ID,Claim Line Segment,Claims start date,Claims end date,Provider Institution,Claim Payment Amount,NCH Primary Payer Claim Paid Amount,Attending Physician National Provider Identifier Number,Operating Physician National Provider Identifier Number,...,Claim Pass Thru Per Diem Amount,NCH Beneficiary Inpatient Deductible Amount,NCH Beneficiary Part A Coinsurance Liability Amount,NCH Beneficiary Blood Deductible Liability Amount,Claim Utilization Day Count,Inpatient discharged date,Claim Diagnosis Related Group Code,Admission Date,Discharge Date,Admission Duration Days
0,00013D2EFD8E45D1,196661176988405,1,20100312,20100313,2600GD,4000.0,0.0,3139083564,,...,0.0,1100.0,0.0,0.0,1,20100313,217,2010-03-12,2010-03-13,1
1,00016F745862898F,196201177000368,1,20090412,20090418,3900MB,26000.0,0.0,6476809087,,...,0.0,1068.0,0.0,0.0,6,20090418,201,2009-04-12,2009-04-18,6
2,00016F745862898F,196661177015632,1,20090831,20090902,3900HM,5000.0,0.0,611998537,611998537.0,...,0.0,1068.0,0.0,0.0,2,20090902,750,2009-08-31,2009-09-02,2
3,00016F745862898F,196091176981058,1,20090917,20090920,3913XU,5000.0,0.0,4971602784,,...,0.0,1068.0,0.0,0.0,3,20090920,883,2009-09-17,2009-09-20,3
4,00016F745862898F,196261176983265,1,20100626,20100701,3900MB,16000.0,0.0,6408400473,1960859579.0,...,0.0,1100.0,0.0,0.0,5,20100701,983,2010-06-26,2010-07-01,5


In [78]:
# Write the simplified frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='desynpuf_inpatient_simple.csv', index=False)

In [79]:
# Dtype map keyed by the human-readable column labels.
# NOTE(review): this rebinds `name_map`, which earlier held the code -> label
# rename mapping; from this cell on it maps label -> dtype instead.
# NOTE(review): presumably intended for reading the simplified CSV back in — confirm
# against the cells that follow.
name_map = {
    'Beneficiary Code': str,
    'Claim ID': str,
    'Claim Line Segment': str,
    'Claims start date': str,
    'Claims end date': str,
    'Provider Institution': str,
    'Claim Payment Amount': 'Int64',
    'NCH Primary Payer Claim Paid Amount': float,
    'Attending Physician National Provider Identifier Number': str,
    'Operating Physician National Provider Identifier Number': str,
    'Other Physician National Provider Identifier Number': str,
    'Inpatient admission date': str,
    'Claim Admitting Diagnosis Code': str,
    'Claim Pass Thru Per Diem Amount': float,
    'NCH Beneficiary Inpatient Deductible Amount': float,
    'NCH Beneficiary Part A Coinsurance Liability Amount': float,
    'NCH Beneficiary Blood Deductible Liability Amount': float,
    'Claim Utilization Day Count': 'Int64',
    'Inpatient discharged date': str,
    'Claim Diagnosis Related Group Code': str,
}