## HCSIS analysis (+ tutorial)

In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

In [2]:
# Pandas options
pd.set_option("display.max_columns", 40)
pd.set_option("display.width", 3000)
pd.set_option("display.max_rows", 2000)

In [3]:
# Data location
data_dir = Path("../scraped_data/")
data_file_path = list(data_dir.glob('*.csv'))[0]  # get first csv in dir

In [4]:
# Read CSV
raw = pd.read_csv(data_file_path, dtype={
    'provider_id': 'object', 
    'service_location_id': 'object',
},
parse_dates=['inspection_date'])


In [5]:
# Rearrange cols
raw = raw[['provider_id', 'provider_name', 'service_location', 'service_location_id', 'inspections_found', 'inspection_id','inspection_date','inspection_reason','regulation','non_compliance_area','plans_of_correction','poc_status','certified_locations_url']]


In [6]:
# Basic stats
def unique_vals(df):
    print('UNIQUE VALS:')
    for col in df:
        unique_vals = len(df[col].unique())
        print(f'{col}: {unique_vals}')

n_rows, n_cols = raw.shape
print(f"No. of rows: {n_rows}, No. of cols: {n_cols}\n")
unique_vals(raw)
print()
raw.info()

No. of rows: 44509, No. of cols: 13

UNIQUE VALS:
provider_id: 998
provider_name: 986
service_location: 5955
service_location_id: 899
inspections_found: 3
inspection_id: 17857
inspection_date: 1653
inspection_reason: 7
regulation: 1156
non_compliance_area: 32902
plans_of_correction: 31417
poc_status: 7
certified_locations_url: 998

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44509 entries, 0 to 44508
Data columns (total 13 columns):
provider_id                44509 non-null object
provider_name              44509 non-null object
service_location           44509 non-null object
service_location_id        43928 non-null object
inspections_found          43928 non-null object
inspection_id              43290 non-null object
inspection_date            43290 non-null datetime64[ns]
inspection_reason          43290 non-null object
regulation                 34218 non-null object
non_compliance_area        34218 non-null object
plans_of_correction        34218 non-null object
poc_status

In [7]:
# Date range
earliest_inspection = raw['inspection_date'].min()
most_recent_inspection = raw['inspection_date'].max()
timedelta_between_dates = (most_recent_inspection - earliest_inspection)
days_between_dates = timedelta_between_dates.days
years_between_dates = days_between_dates / 365
total_inspections = len(raw['inspection_id'].unique())
inspections_per_day = total_inspections / days_between_dates
inspections_per_year = total_inspections / years_between_dates

print(f'Earliest inspection: {earliest_inspection}')
print(f'Most recent inspection: {most_recent_inspection}')
print(f'Total number of inspections over period: {total_inspections}')
print(f'Avg. no of inspections per day: {inspections_per_day}')
print(f'Avg. no of inspections per year: {inspections_per_year}')

Earliest inspection: 2010-11-10 00:00:00
Most recent inspection: 2019-10-07 00:00:00
Total number of inspections over period: 17857
Avg. no of inspections per day: 5.489394405164464
Avg. no of inspections per year: 2003.6289578850292


In [8]:
# Create clean copy of df
clean = raw.copy(deep=True)

# Calculate unique service locations
prov_service_loc_id = clean['provider_id'] + '-' + clean['service_location_id']
clean.insert(2,'prov_service_loc_id',prov_service_loc_id)
unique_service_locs = clean['prov_service_loc_id'].unique()
count_unique_service_locs = len(unique_service_locs)
print(f'Unique service locations: {count_unique_service_locs}')

# clean.iloc[500:600].head()

Unique service locations: 6796


In [9]:
# Count of service_locations with no inspection data
locs_no_inspection_data = clean[clean["inspections_found"] == False]
count_locs_no_inspection_data = len(locs_no_inspection_data)
percent_no_inspection_data = (100 / count_unique_service_locs) * count_locs_no_inspection_data
round_percent_no_inspection_data = round(percent_no_inspection_data, 2)

print(f"Service locations with no inspections: {len(locs_no_inspection_data)}")
print(f'% of service locations with no inspections: {round_percent_no_inspection_data}')


Service locations with no inspections: 638
% of service locations with no inspections: 9.39


In [11]:
# interesting violations
df_keyword = clean[clean['non_compliance_area'].str.contains('sex',na=False)]
print(df_keyword)

Unnamed: 0,provider_id,provider_name,prov_service_loc_id,service_location,service_location_id,inspections_found,inspection_id,inspection_date,inspection_reason,regulation,non_compliance_area,plans_of_correction,poc_status,certified_locations_url
203,1763,AVENUES,1763-0033,Habilitation,33,True,SIN-00130514,2018-03-20,Renewal,2390.124(12),-- Individual #3's ISP talks about him being i...,Program specialist updated individual #3's ass...,Corrected,https://www.hcsis.state.pa.us/hcsis-ssd/ssd/od...
3550,1576,KEYSTONE SERVICE SYSTEMS INC,1576-0969,KSS Willow Way,969,True,SIN-00149225,2019-01-24,Unannounced Monitoring,6400.143(a),Individual #1 refuses gynecology appointments ...,A desensitization plan was developed to suppor...,Corrected,https://www.hcsis.state.pa.us/hcsis-ssd/ssd/od...
5734,1744,WOODS SERVICES,1744-0023,WOODS SERVICES-WILDWOOD,23,True,SIN-00078312,2015-04-22,Unannounced Monitoring,6400.185(b),"Individual #1's ISP dated 7/15/14, indicates t...",The current staffing levels for Individual #2 ...,Submitted,https://www.hcsis.state.pa.us/hcsis-ssd/ssd/od...
5749,1744,WOODS SERVICES,1744-0023,WOODS SERVICES-WILDWOOD,23,True,SIN-00078312,2015-04-22,Unannounced Monitoring,6400.33(a),"On 4/4/15, sometime between 8:45pm and 8:55pm,...","In order to maintain safety of Individual #1, ...",Not Corrected,https://www.hcsis.state.pa.us/hcsis-ssd/ssd/od...
6296,1892,WES HEALTH CENTERS INC,1892-0002,WES HEALTH CENTERS INC,2,True,SIN-00054614,2013-10-28,Renewal,2390.124(1),"Individual¿s # 1, 2, 3 & 4 records did not inc...",New form developed to include identified infor...,Corrected,https://www.hcsis.state.pa.us/hcsis-ssd/ssd/od...
