# EMBA 1520 Project

This project is written in Python.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Libraries to access the data.medicare.gov API
!pip install sodapy #package to access general API
from sodapy import Socrata #Socrata is the API

# Libraries for plotting graphs
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pyplot import figure

## Nursing Homes

Import the data. 

Loading directly from data.medicare.gov in order to always have the most up-to-date data set.

From Medicare:
    Nursing Home Compare has detailed information about every Medicare- and Medicaid-certified nursing home in the country. A nursing home is a place for people who can’t be cared for at home and need 24-hour nursing care.

Provider Subset:
General information on currently active nursing homes, including number of certified beds, quality measure scores, staffing and other information used in the Five-Star Rating System. Data are presented as one row per nursing home.


In [None]:
# This code is adapted from: https://dev.socrata.com/foundry/data.medicare.gov/4pq5-n9py

# Unauthenticated client: only works with public data sets. Note the `None`
# in place of an application token, and that no username or password is given.
client = Socrata("data.medicare.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.medicare.gov",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# "4pq5-n9py" is the identifier of the nursing home providers data set.
# The API returns JSON, which sodapy converts to a Python list of dictionaries.
# The data set had ~15.5k records when this was written, so a cap of 20,000
# rows is expected to retrieve all of them.
NH_ROW_LIMIT = 20000
results = client.get("4pq5-n9py", limit=NH_ROW_LIMIT)

# If the response fills the cap, rows beyond it were most likely cut off;
# say so instead of silently continuing with a partial data set.
if len(results) >= NH_ROW_LIMIT:
    print("WARNING: nursing home request returned", len(results),
          "rows, which reaches the row limit; the data may be truncated. Raise NH_ROW_LIMIT.")

# Convert to pandas DataFrame
NH_Providers = pd.DataFrame.from_records(results)

In [None]:
# Preview the first five rows of the raw nursing home data.
NH_Providers.head(5)


In [None]:
# Print a summary of the frame's columns.
NH_Providers.info() # look at the columns (number, name, data type, etc.)


The distribution of nursing homes around the country by state.

In [None]:
# Bar chart of the number of nursing homes per state.
# An explicit figure/axes pair is used so the chart can be sized and labelled
# directly; the figure settings match the original (15x6 in, 80 dpi).
fig, ax = plt.subplots(figsize=(15, 6), dpi=80, facecolor='w', edgecolor='k')

NH_Providers['provider_state'].value_counts().plot(kind='bar', ax=ax)

# Title and axis labels let the figure stand on its own when skimmed.
ax.set_title("Nursing homes by state")
ax.set_xlabel("State")
ax.set_ylabel("Number of nursing homes")
plt.show()

Extracting only the columns we'll need for the merge.

Need to add the facility type to distinguish the facilities upon merge.

In [None]:
# Tag every row with its facility kind so the sources can be told apart
# once the data sets are merged.
NH_Providers["type"] = "nursing_home"

# Re-print the column summary to confirm the new column is there.
NH_Providers.info()

In [None]:
# Keep only the contact columns needed for the merge.
# .copy() makes the result an independent frame, so the column assignments in
# the cells below do not trigger pandas' SettingWithCopyWarning.
NH_Providers_contact = NH_Providers[['provider_name','provider_address','provider_city','provider_state','provider_zip_code','provider_county_name',
                                             'provider_phone_number','type']].copy()




Set the county name to uppercase

In [None]:
# Upper-case the county names.
# .assign builds a new frame rather than writing into the existing one, which
# pandas may flag with SettingWithCopyWarning because the frame was produced
# by a column selection.
NH_Providers_contact = NH_Providers_contact.assign(
    provider_county_name=NH_Providers_contact['provider_county_name'].str.upper()
)

Format phone numbers: remove parentheses, spaces, and dashes.


In [None]:
# Strip "(", ")", "-" and spaces from the phone numbers in one pass.
# The pattern is an explicit regex character class (regex=True), so the result
# does not depend on the default value of `regex`, which differs between
# pandas versions.
# .assign builds a new frame rather than writing into the existing one, which
# pandas may flag with SettingWithCopyWarning because the frame was produced
# by a column selection.
NH_Providers_contact = NH_Providers_contact.assign(
    provider_phone_number=NH_Providers_contact['provider_phone_number'].str.replace(
        r"[() -]", "", regex=True
    )
)

In [None]:
# Preview the first five rows of the cleaned contact table.
NH_Providers_contact.head(5)

Checking for null values in the data set.

In [None]:
# Number of missing values in each contact column.
NH_Providers_contact.isna().sum()

Delete the larger dataframe to clear memory.

In [None]:
# Drop the name bound to the full nursing home frame; the cells below only use
# NH_Providers_contact.
del NH_Providers

## Dialysis Centers

Import the data.

Loading directly from data.medicare.gov in order to always have the most up-to-date data set.

From Medicare: Dialysis Facility Compare datasets: These are the official datasets used on the Medicare.gov Dialysis Facility Compare Website provided by the Centers for Medicare & Medicaid Services.  These data allow you to compare the quality of care provided in Medicare-certified dialysis facilities nationwide.

Facility Subset: A list of all dialysis facilities registered with Medicare that includes addresses and phone numbers, as well as services and quality of care provided.

In [None]:
# Unauthenticated client: only works with public data sets. Note the `None`
# in place of an application token, and that no username or password is given.
client = Socrata("data.medicare.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.medicare.gov",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# "23ew-n7w9" is the data set requested for the dialysis facilities.
# The API returns JSON, which sodapy converts to a Python list of dictionaries.
# The data set had ~7.5k records when this was written, so a cap of 10,000
# rows is expected to retrieve all of them.
DIALYSIS_ROW_LIMIT = 10000
results = client.get("23ew-n7w9", limit=DIALYSIS_ROW_LIMIT)

# If the response fills the cap, rows beyond it were most likely cut off;
# say so instead of silently continuing with a partial data set.
if len(results) >= DIALYSIS_ROW_LIMIT:
    print("WARNING: dialysis facility request returned", len(results),
          "rows, which reaches the row limit; the data may be truncated. Raise DIALYSIS_ROW_LIMIT.")

# Convert to pandas DataFrame
D_Facilities = pd.DataFrame.from_records(results)

In [None]:
# Print a summary of the dialysis facility frame's columns.
D_Facilities.info()

In [None]:
# Show every column name as a plain Python list.
D_Facilities.columns.tolist()

In [None]:
# Count the NaN values in every column.
# The frame is wide (118 columns when this was written), so the row cap on
# printed output is lifted to show all of them. option_context applies the
# setting to this display only and restores the previous value afterwards, so
# later cells are not affected by it.
with pd.option_context('display.max_rows', None):
    print(D_Facilities.isna().sum())

Distribution of dialysis centers across the country by state.

In [None]:
# Bar chart of the number of dialysis centers per state.
# An explicit figure/axes pair is used so the chart can be sized and labelled
# directly; the figure settings match the original (15x6 in, 80 dpi).
fig, ax = plt.subplots(figsize=(15, 6), dpi=80, facecolor='w', edgecolor='k')

D_Facilities['state'].value_counts().plot(kind='bar', ax=ax)

# Title and axis labels let the figure stand on its own when skimmed.
ax.set_title("Dialysis centers by state")
ax.set_xlabel("State")
ax.set_ylabel("Number of dialysis centers")
plt.show()

Adding the type of center

In [None]:
# Tag every row with its facility kind so the sources can be told apart
# once the data sets are merged.
D_Facilities["type"] = "dialysis_center"

# Re-print the column summary to confirm the new column is there.
D_Facilities.info()

Extracting the columns needed for the merge.

In [None]:
# Keep only the contact columns needed for the merge.
# .copy() makes the result an independent frame, so the column assignments in
# the cells below do not trigger pandas' SettingWithCopyWarning.
D_Facilities_contact = D_Facilities[['facility_name','address_line_1','city','state','zip','county','phone_number','type']].copy()

Make sure county names are uppercase

In [None]:
# Upper-case the county names.
# .assign builds a new frame rather than writing into the existing one, which
# pandas may flag with SettingWithCopyWarning because the frame was produced
# by a column selection.
D_Facilities_contact = D_Facilities_contact.assign(
    county=D_Facilities_contact['county'].str.upper()
)

In [None]:
# Strip "(", ")", "-" and spaces from the phone numbers in one pass.
# The pattern is an explicit regex character class (regex=True), so the result
# does not depend on the default value of `regex`, which differs between
# pandas versions.
# .assign builds a new frame rather than writing into the existing one, which
# pandas may flag with SettingWithCopyWarning because the frame was produced
# by a column selection.
D_Facilities_contact = D_Facilities_contact.assign(
    phone_number=D_Facilities_contact['phone_number'].str.replace(
        r"[() -]", "", regex=True
    )
)

In [None]:
# Print a summary of the dialysis contact table's columns.
D_Facilities_contact.info()

In [None]:
# Preview the first twenty rows of the cleaned dialysis contact table.
D_Facilities_contact.head(20)

In [None]:
# Number of missing values in each contact column.
D_Facilities_contact.isna().sum()

In [None]:
# Cap displayed output at 50 rows from here on.
pd.set_option('display.max_rows', 50)

# Show the contact rows that have a missing value in at least one column.
D_Facilities_contact.loc[D_Facilities_contact.isna().any(axis='columns')]

Memory management

In [None]:
# Drop the name bound to the full dialysis frame; the cells below only use
# D_Facilities_contact.
del D_Facilities

## Long Term Care

Import the data.

Loading directly from data.medicare.gov in order to always have the most up-to-date data set.

Long-Term Care Hospital Compare datasets: These are the official datasets used on the Medicare.gov Long-Term Care Hospital Compare Website provided by the Centers for Medicare & Medicaid Services.  These data allow you to compare the quality of care provided by over 420 Medicare-certified long-term care hospitals across the nation.

General Information Subset: A list of long-term care hospitals with information such as address, phone number, ownership data and more.

In [None]:
# Unauthenticated client: only works with public data sets. Note the `None`
# in place of an application token, and that no username or password is given.
client = Socrata("data.medicare.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.medicare.gov",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# "azum-44iv" is the data set requested for the long-term care hospitals.
# The API returns JSON, which sodapy converts to a Python list of dictionaries.
# The data set had ~0.37k records when this was written, so a cap of 1,000
# rows is expected to retrieve all of them.
LTC_ROW_LIMIT = 1000
results = client.get("azum-44iv", limit=LTC_ROW_LIMIT)

# If the response fills the cap, rows beyond it were most likely cut off;
# say so instead of silently continuing with a partial data set.
if len(results) >= LTC_ROW_LIMIT:
    print("WARNING: long-term care request returned", len(results),
          "rows, which reaches the row limit; the data may be truncated. Raise LTC_ROW_LIMIT.")

# Convert to pandas DataFrame
LTC_Facilities = pd.DataFrame.from_records(results)

In [None]:
# Print a summary of the long-term care frame's columns.
LTC_Facilities.info()

In [None]:
# Number of missing values in each column.
LTC_Facilities.isna().sum()

Distribution of LTC facilities across the country by state.

In [None]:
# Bar chart of the number of long-term care facilities per state.
# An explicit figure/axes pair is used so the chart can be sized and labelled
# directly; the figure settings match the original (15x6 in, 80 dpi).
fig, ax = plt.subplots(figsize=(15, 6), dpi=80, facecolor='w', edgecolor='k')

LTC_Facilities['state'].value_counts().plot(kind='bar', ax=ax)

# Title and axis labels let the figure stand on its own when skimmed.
ax.set_title("Long-term care facilities by state")
ax.set_xlabel("State")
ax.set_ylabel("Number of long-term care facilities")
plt.show()

Adding facility type

In [None]:
# Tag every row with its facility kind so the sources can be told apart
# once the data sets are merged.
LTC_Facilities["type"] = "ltc_facility"

# Re-print the column summary to confirm the new column is there.
LTC_Facilities.info()

Extracting columns needed for merge

In [None]:
# Keep only the contact columns needed for the merge.
# .copy() makes the result an independent frame, so the column assignments in
# the cells below do not trigger pandas' SettingWithCopyWarning.
LTC_Facilities_contact = LTC_Facilities[['facility_name','address_line_1','city','state','zip_code','county_name','phonenumber','type']].copy()

In [None]:
# Upper-case the county names.
# .assign builds a new frame rather than writing into the existing one, which
# pandas may flag with SettingWithCopyWarning because the frame was produced
# by a column selection.
LTC_Facilities_contact = LTC_Facilities_contact.assign(
    county_name=LTC_Facilities_contact['county_name'].str.upper()
)

In [None]:
# Strip "(", ")", "-" and spaces from the phone numbers in one pass.
# The pattern is an explicit regex character class (regex=True), so the result
# does not depend on the default value of `regex`, which differs between
# pandas versions.
# .assign builds a new frame rather than writing into the existing one, which
# pandas may flag with SettingWithCopyWarning because the frame was produced
# by a column selection.
LTC_Facilities_contact = LTC_Facilities_contact.assign(
    phonenumber=LTC_Facilities_contact['phonenumber'].str.replace(
        r"[() -]", "", regex=True
    )
)

Memory management

In [None]:
# Drop the name bound to the full long-term care frame; the cells below only
# use LTC_Facilities_contact.
del LTC_Facilities

## Hospitals

In [None]:
# Unauthenticated client: only works with public data sets. Note the `None`
# in place of an application token, and that no username or password is given.
client = Socrata("data.medicare.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.medicare.gov",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# "xubh-q36u" is the data set requested for the hospitals.
# The API returns JSON, which sodapy converts to a Python list of dictionaries.
# The data set had ~5.7k records when this was written, so a cap of 10,000
# rows is expected to retrieve all of them.
HOSPITAL_ROW_LIMIT = 10000
results = client.get("xubh-q36u", limit=HOSPITAL_ROW_LIMIT)

# If the response fills the cap, rows beyond it were most likely cut off;
# say so instead of silently continuing with a partial data set.
if len(results) >= HOSPITAL_ROW_LIMIT:
    print("WARNING: hospital request returned", len(results),
          "rows, which reaches the row limit; the data may be truncated. Raise HOSPITAL_ROW_LIMIT.")

# Convert to pandas DataFrame
Hospitals = pd.DataFrame.from_records(results)

In [None]:
# Print a summary of the hospital frame's columns.
Hospitals.info()

In [None]:
# Tag every row with its facility kind so the sources can be told apart
# once the data sets are merged.
Hospitals['type'] = "hospital"

In [None]:
# Keep only the contact columns needed for the merge, and upper-case the county
# names so they follow the same convention as the nursing home, dialysis and
# long-term care tables (the merged output is sorted by county).
# .assign returns a new, independent frame, so the column assignments in the
# cell below do not trigger pandas' SettingWithCopyWarning.
Hospitals_contact = Hospitals[
    ['hospital_name','address','city','state','zip_code','county_name','phone_number','type']
].assign(county_name=lambda frame: frame['county_name'].str.upper())

In [None]:
# Strip "(", ")", "-" and spaces from the phone numbers in one pass.
# The pattern is an explicit regex character class (regex=True), so the result
# does not depend on the default value of `regex`, which differs between
# pandas versions.
# .assign builds a new frame rather than writing into the existing one, which
# pandas may flag with SettingWithCopyWarning because the frame was produced
# by a column selection.
Hospitals_contact = Hospitals_contact.assign(
    phone_number=Hospitals_contact['phone_number'].str.replace(
        r"[() -]", "", regex=True
    )
)

In [None]:
# Preview the first five rows of the cleaned hospital contact table.
Hospitals_contact.head(5)

## Merge into one dataset

Renaming the columns with a consistent naming scheme.

In [None]:
# Map the nursing home column names onto the shared naming scheme used for the
# merge ('provider_name' and 'type' already match and are left alone).
NH_Providers_contact = NH_Providers_contact.rename(
    columns={
        'provider_address': 'address',
        'provider_city': 'city',
        'provider_state': 'state',
        'provider_zip_code': 'zip',
        'provider_county_name': 'county',
        'provider_phone_number': 'phone_number',
    }
)

# Confirm the new column names.
NH_Providers_contact.info()

In [None]:
# Map the dialysis column names onto the shared naming scheme used for the
# merge; only the name and address columns differ from it.
D_Facilities_contact = D_Facilities_contact.rename(
    columns={
        'facility_name': 'provider_name',
        'address_line_1': 'address',
    }
)

# Confirm the new column names.
D_Facilities_contact.info()

In [None]:
# Map the long-term care column names onto the shared naming scheme used for
# the merge.
LTC_Facilities_contact = LTC_Facilities_contact.rename(
    columns={
        'facility_name': 'provider_name',
        'address_line_1': 'address',
        'zip_code': 'zip',
        'county_name': 'county',
        'phonenumber': 'phone_number',
    }
)

# Confirm the new column names.
LTC_Facilities_contact.info()

In [None]:
# Print the hospital contact table's columns before they are renamed.
Hospitals_contact.info()

In [None]:
# Map the hospital column names onto the shared naming scheme used for the
# merge.
Hospitals_contact = Hospitals_contact.rename(
    columns={
        'hospital_name': 'provider_name',
        'zip_code': 'zip',
        'county_name': 'county',
    }
)

# Confirm the new column names.
Hospitals_contact.info()

Merging the data into one dataframe

In [None]:
# Stack the four contact tables on top of each other.
# join='inner' keeps only the columns that all four frames have in common.
# The other arguments previously spelled out were the pandas defaults, so they
# are omitted.
Output = pd.concat(
    [NH_Providers_contact, D_Facilities_contact, LTC_Facilities_contact, Hospitals_contact],
    axis=0,
    join='inner',
)

# Order the rows by location. Each source frame carried its own row labels, so
# the labels repeat after stacking; reset_index replaces them with a single
# 0..n-1 range that follows the sorted order.
Output = Output.sort_values(by=['state','county', 'city']).reset_index(drop=True)

In [None]:
# Preview the first ten rows of the merged table.
Output.head(10)

In [None]:
# Print a summary of the merged table's columns.
Output.info()

Exporting dataframe as a CSV file

In [None]:
# Write the merged table to a CSV file in the working directory, with a header
# row and without the row index.
Output.to_csv(
    'Medicare Consolidated Contact Info.csv',
    header=True,
    index=False,
)