# Texas Hospital Discharge - Import

## Setup

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from scipy import stats
import yaml, time, sys, os, glob

import seaborn as sns
sns.set_style("darkgrid")

from IPython.display import display, Markdown
pd.set_option('display.max_columns', None)  

DATASET = "Texas_Inpatient_Discharge"
SPLIT_TRAINING = True
DEBUG = False
SEED = 42

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

In [7]:
if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(d): os.makedirs(d)
  if not os.path.isdir(ROOT): os.makedirs(ROOT)

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['doc','orig','data','output']: makedirs(d)

## Download Files

In [10]:
URL = "https://setu-datamining2.github.io/live/topics/21-Assignments/03-Texas_Inpatient_Discharge/files/"

files = "my_lib.py train.csv.gz grading.csv.gz Facility_type1q2013_tab.zip Facility_type2q2013_tab.zip Facility_type3q2013_tab.zip Facility_type4q2013_tab.zip UserManual1Q2013.pdf"

for filename in files.split(" "):
    
    ext = filename.split(".")[-1]
    dest = {"pdf":"doc", "py":".", "ipynb":".", "gz":"orig", "zip":"orig"}[ext]
   
    source = f"{URL}/{filename}"
    target = f"{ROOT}/{dest}/{filename}"

    if not os.path.isfile(target):
        print (f"Downloading remote file {filename}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}") 

Using local copy of my_lib.py in folder .
Downloading remote file train.csv.gz


HTTPError: HTTP Error 404: Not Found

In [None]:
%load_ext autoreload
%autoreload 2

import my_lib

In [None]:
df = pd.read_csv(f"src/train.csv.gz", dtype=str)
df.shape

(1000000, 194)

Dataset is waaaay too big - everything is going to be slow => split into smaller sets for development

## 1.2. Construct Target 

In [None]:
df.LENGTH_OF_STAY.describe()

count     999698
unique       382
top         0002
freq      259935
Name: LENGTH_OF_STAY, dtype: object

In [None]:
# Drop missing values based on variable LENGTH_OF_STAY
df.dropna(subset=["LENGTH_OF_STAY"], inplace=True)

# Convert to int
df.LENGTH_OF_STAY = df.LENGTH_OF_STAY.astype(int)

In [None]:
df.LENGTH_OF_STAY.describe()

count    999698.000000
mean          5.280482
std          11.420006
min           1.000000
25%           2.000000
50%           3.000000
75%           6.000000
max        1961.000000
Name: LENGTH_OF_STAY, dtype: float64

In [None]:
df["TARGET"] = df.LENGTH_OF_STAY.apply(lambda x: "short" if x < 3 else ("medium" if x <= 6 else "long"))

In [None]:
df.TARGET.value_counts(dropna=False)

short     414152
medium    381437
long      204109
Name: TARGET, dtype: int64

## 1.3. Dividing training datasets into subsets

In [None]:
if SPLIT_TRAINING:
    
    # Randomising the rows in the dataset
    print("Randomising order ...")
    df_sample = df.sample(frac=1, random_state=SEED)
    
    for parts in [20,10,5,2,1]:
        nrows = df.shape[0] // parts
        print(f"\nBreak dataset into {parts} each with {nrows} rows ... ", end="")

        for k in range(parts):
            filename = "data/df_train_sample_%02d_of_%d.csv" % (k, parts)

            print(k, end=" ")
            df_sample.iloc[k * nrows : (k + 1) * nrows].to_csv(filename, index=False)

Randomising order ...

Break dataset into 20 each with 49984 rows ... 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 
Break dataset into 10 each with 99969 rows ... 0 1 2 3 4 5 6 7 8 9 
Break dataset into 5 each with 199939 rows ... 0 1 2 3 4 
Break dataset into 2 each with 499849 rows ... 0 1 
Break dataset into 1 each with 999698 rows ... 0 

Dividing the training dataset into subsets will make EDA more performant and easier.

In [None]:
!say "splits are saved"

## 1.4. Grading Dataset

Moving the grading dataset from `src/` to `data/` for usage in `03-Model.ipynb`.

In [None]:
df_grading = pd.read_csv(f"src/grading.csv.gz", dtype=str)
print(df_grading.shape)

(100000, 193)


In [None]:
df_grading.to_csv("data/grading.csv", index=False)

## Faculty Type

In [None]:
!ls src

Facility_type1q2013_tab.zip Facility_type4q2013_tab.zip
Facility_type2q2013_tab.zip grading.csv.gz
Facility_type3q2013_tab.zip train.csv.gz


In [None]:
df_f = pd.read_csv("src/Facility_type1q2013_tab.zip", sep="\t")
print(df_f.shape)
df_f.head()

(577, 11)


Unnamed: 0,THCIC_ID,PROVIDER_NAME,FAC_TEACHING_IND,FAC_PSYCH_IND,FAC_REHAB_IND,FAC_ACUTE_CARE_IND,FAC_SNF_IND,FAC_LONG_TERM_AC_IND,FAC_OTHER_LTC_IND,FAC_PEDS_IND,Unnamed: 10
0,100,Austin State Hospital,,X,,,,,,,
1,101,Big Spring State Hospital,,X,,,,,,,
2,102,UT Medical Branch Hospital,A,,X,X,,,,X,
3,104,Rio Grande State Center,,X,,,,,,,
4,105,UT MD Anderson Cancer Center,A,,,X,,,,,


In [None]:
df_fs = [ pd.read_csv("src/Facility_type%sq2013_tab.zip" % k, sep="\t") for k in range(1,5) ]

In [None]:
for d in df_fs:
    print(d.shape)

(577, 11)
(584, 11)
(587, 11)
(587, 11)


In [None]:
df_f = pd.concat(df_fs, ignore_index=True)
df_f.shape

(2335, 11)

In [None]:
df_f.drop_duplicates(keep="last", inplace=True)
df_f.shape

(635, 11)

In [None]:
df_f.drop_duplicates(subset="THCIC_ID", keep="last", inplace=True)
df_f.shape

(606, 11)

In [None]:
df_f.columns

Index(['THCIC_ID', 'PROVIDER_NAME', 'FAC_TEACHING_IND', 'FAC_PSYCH_IND',
       'FAC_REHAB_IND', 'FAC_ACUTE_CARE_IND', 'FAC_SNF_IND',
       'FAC_LONG_TERM_AC_IND', 'FAC_OTHER_LTC_IND', 'FAC_PEDS_IND',
       'Unnamed: 10'],
      dtype='object')

In [None]:
df_f.drop(columns=["Unnamed: 10"], inplace=True)

In [None]:
df_f.fillna(0, inplace=True)

In [None]:
for c in df_f.columns[2:]:
    df_f[c] = df_f[c].map({0:0, "A":1, "C":1, "X":1 ,"x":1})

In [None]:
df_f.head()

Unnamed: 0,THCIC_ID,PROVIDER_NAME,FAC_TEACHING_IND,FAC_PSYCH_IND,FAC_REHAB_IND,FAC_ACUTE_CARE_IND,FAC_SNF_IND,FAC_LONG_TERM_AC_IND,FAC_OTHER_LTC_IND,FAC_PEDS_IND
347,724900,Brownsville Doctors Hospital,0,0,0,1,0,0,0,1
499,854000,Twin Creeks Hospital,0,0,1,0,0,0,0,0
531,907000,Renaissance Hospital-Groves,0,0,0,1,0,0,0,0
544,939000,GlobalRehab Hospital-San Antonio,0,0,1,0,0,0,0,0
983,798500,Austin Surgical Hospital,0,0,0,0,0,0,0,1


In [None]:
df_f.to_csv("data/facility.csv", index=False)

In [None]:
df_f.sum()

THCIC_ID                                                        342296121
PROVIDER_NAME           Brownsville Doctors HospitalTwin Creeks Hospit...
FAC_TEACHING_IND                                                       42
FAC_PSYCH_IND                                                          94
FAC_REHAB_IND                                                         164
FAC_ACUTE_CARE_IND                                                    396
FAC_SNF_IND                                                            51
FAC_LONG_TERM_AC_IND                                                   99
FAC_OTHER_LTC_IND                                                       4
FAC_PEDS_IND                                                           99
dtype: object