# Metric Design

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
#reading the cleaned master data 
master_data = pd.read_csv("Master Data.csv")

In [3]:
#glancing
master_data.head()

Unnamed: 0,hospital_pk,collection_week,total_beds_7_day_sum,all_adult_hospital_beds_7_day_sum,all_adult_hospital_inpatient_beds_7_day_sum,inpatient_beds_used_7_day_sum,all_adult_hospital_inpatient_bed_occupied_7_day_sum,inpatient_beds_used_covid_7_day_sum,total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum,total_adult_patients_hospitalized_confirmed_covid_7_day_sum,...,previous_day_admission_adult_covid_confirmed_7_day_avg,previous_day_admission_pediatric_covid_confirmed_7_day_avg,previous_day_admission_adult_covid_suspected_7_day_avg,previous_day_admission_pediatric_covid_suspected_7_day_avg,all_pediatric_inpatient_bed_occupied_7_day_avg,all_pediatric_inpatient_beds_7_day_avg,staffed_icu_pediatric_patients_confirmed_covid_7_day_avg,staffed_pediatric_icu_bed_occupancy_7_day_avg,total_staffed_pediatric_icu_beds_7_day_avg,state_full
0,330219,2020-12-04,4011,4011,3626,3297,3297,391,391,390,...,5.571429,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,New York
1,351312,2022-07-01,0,0,70,24,24,2,2,2,...,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,North Dakota
2,452083,2020-09-25,343,343,343,335,335,145,145,145,...,1.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Texas
3,340114,2020-11-27,3947,3331,3331,2595,2588,375,375,319,...,5.0,0.0,13.571429,0.285714,0.0,0.0,0.0,0.0,0.0,North Carolina
4,330406,2021-06-18,805,749,749,612,605,0,0,0,...,0.0,0.0,0.0,0.0,1.0,8.0,0.0,0.0,0.0,New York


# Formulas:


### Calculating total confirmed covid beds occupied over total inpatient beds
inpatient_confirmed = inpatient_beds_used_covid_7_day_avg / inpatient_beds_7_day_avg

### Calculating total confirmed covid beds occupied over total ICU beds
icu_confirmed = (staffed_icu_adult_patients_confirmed_covid_7_day_avg + staffed_icu_pediatric_patients_confirmed_covid_7_day_avg) / total_icu_beds_7_day_avg

### Calcualting fully vaccinated staff over total staff
staff_vaccinated = total_personnel_covid_vaccinated_doses_all_7_day/( total_personnel_covid_vaccinated_doses_none_7_day + total_personnel_covid_vaccinated_doses_one_7_day + total_personnel_covid_vaccinated_doses_all_7_day)

### Calculating the hospital burden ranking based on initial weights assigned on the basis of domain knowledge
hospital_burden_ranking = 40 * inpatient_confirmed + 50 * icu_confirmed + 10 * staff_vaccinated

In [5]:
# creating a dataframe "metric" that has all the required columns to create the new features
metric = master_data[["staffed_icu_adult_patients_confirmed_covid_7_day_avg", 
              "staffed_icu_pediatric_patients_confirmed_covid_7_day_avg",
              "total_icu_beds_7_day_avg",
              "inpatient_beds_used_covid_7_day_avg", 
              "inpatient_beds_7_day_avg",
              "total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg", 
              "total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg", 
              "total_personnel_covid_vaccinated_doses_none_7_day",
              "total_personnel_covid_vaccinated_doses_one_7_day",
              "total_personnel_covid_vaccinated_doses_all_7_day"]]

In [6]:
metric['inpatient_confirmed'] = metric.apply(lambda row: row.inpatient_beds_used_covid_7_day_avg / row.inpatient_beds_7_day_avg, axis=1)

metric['icu_confirmed'] = metric.apply(lambda row: (row.staffed_icu_adult_patients_confirmed_covid_7_day_avg + row.staffed_icu_pediatric_patients_confirmed_covid_7_day_avg)/ row.total_icu_beds_7_day_avg, axis=1) 

metric['staff_vaccinated'] = metric.apply(lambda row: row.total_personnel_covid_vaccinated_doses_all_7_day/( row.total_personnel_covid_vaccinated_doses_none_7_day + row.total_personnel_covid_vaccinated_doses_one_7_day + row.total_personnel_covid_vaccinated_doses_all_7_day), axis=1)


  metric['inpatient_confirmed'] = metric.apply(lambda row: row.inpatient_beds_used_covid_7_day_avg / row.inpatient_beds_7_day_avg, axis=1)
  metric['inpatient_confirmed'] = metric.apply(lambda row: row.inpatient_beds_used_covid_7_day_avg / row.inpatient_beds_7_day_avg, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metric['inpatient_confirmed'] = metric.apply(lambda row: row.inpatient_beds_used_covid_7_day_avg / row.inpatient_beds_7_day_avg, axis=1)
  metric['icu_confirmed'] = metric.apply(lambda row: (row.staffed_icu_adult_patients_confirmed_covid_7_day_avg + row.staffed_icu_pediatric_patients_confirmed_covid_7_day_avg)/ row.total_icu_beds_7_day_avg, axis=1)
  metric['icu_confirmed'] = metric.apply(lambda row: (row.staffed_icu_adult_patients_confirmed_covid_7_day

In [7]:
metric_col = metric[["inpatient_confirmed", "icu_confirmed", "staff_vaccinated"]]

In [8]:
metric_col.describe()

Unnamed: 0,inpatient_confirmed,icu_confirmed,staff_vaccinated
count,730852.0,487641.0,204416.0
mean,inf,inf,0.556532
std,,,0.243781
min,0.0,0.0,0.0
25%,0.002380952,0.0,0.389313
50%,0.03469388,0.03467406,0.494679
75%,0.09047619,0.14,0.75983
max,inf,inf,1.0


In [11]:
# dealing with the "inf" and replacing it with the finite max-value 
metric_col['inpatient_confirmed'].replace([np.inf], 59.642857, inplace=True)
metric_col['icu_confirmed'].replace([np.inf], 7, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metric_col['inpatient_confirmed'].replace([np.inf], 59.642857, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metric_col['icu_confirmed'].replace([np.inf], 7, inplace=True)


In [12]:
metric_col.describe()

Unnamed: 0,inpatient_confirmed,icu_confirmed,staff_vaccinated
count,730852.0,487641.0,204416.0
mean,1.159295,0.109623,0.556532
std,7.996425,0.247478,0.243781
min,0.0,0.0,0.0
25%,0.002381,0.0,0.389313
50%,0.034694,0.034674,0.494679
75%,0.090476,0.14,0.75983
max,59.642857,7.0,1.0


In [14]:
#min-max scaling the data

# creating a copy
df_sklearn = metric_col.copy()
  
# apply normalization techniques
column = 'inpatient_confirmed'
df_sklearn[column] = MinMaxScaler().fit_transform(np.array(df_sklearn[column]).reshape(-1,1))
  
# view normalized data  
#display(df_sklearn)

In [15]:
column = 'icu_confirmed'
df_sklearn[column] = MinMaxScaler().fit_transform(np.array(df_sklearn[column]).reshape(-1,1))

In [16]:
column = 'staff_vaccinated'
df_sklearn[column] = MinMaxScaler().fit_transform(np.array(df_sklearn[column]).reshape(-1,1))

In [17]:
# replacing the null values with 0
df_sklearn = df_sklearn.fillna(0)


In [18]:
df_sklearn.describe()

Unnamed: 0,inpatient_confirmed,icu_confirmed,staff_vaccinated
count,742253.0,742253.0,742253.0
mean,0.019139,0.010289,0.153269
std,0.13306,0.029604,0.279597
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.000565,0.0,0.0
75%,0.00149,0.010204,0.280374
max,1.0,1.0,1.0


In [19]:
df_sklearn['hbr'] = df_sklearn.apply(lambda row: 40 * row.inpatient_confirmed + 50 * row.icu_confirmed + 10 * row.staff_vaccinated, axis=1)


In [20]:
df_sklearn.describe()

Unnamed: 0,inpatient_confirmed,icu_confirmed,staff_vaccinated,hbr
count,742253.0,742253.0,742253.0,742253.0
mean,0.019139,0.010289,0.153269,2.81266
std,0.13306,0.029604,0.279597,6.016567
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.013405
50%,0.000565,0.0,0.0,0.370409
75%,0.00149,0.010204,0.280374,4.063838
max,1.0,1.0,1.0,97.329192


# Further Steps:
This gives us ranking of Hospital Burden based on the initialized weights (that were chosen considering various factors and domain knowledge) 

To update the weights of the equation calculating HBR, we came up with an algortihm (which could not be implemented due to the time constraint)

- Create 10 clusters using these new developed features (inpatient_confirmed, icu_confirmed, staff_vaccinated) using K-means clustering. 
- These clusters are representative of "unordered" Hospital Burden (Basically, each of this cluster represents one bin of the ranked Hospital burden. Let's say 80-90 ranked could be closely represented by cluster 5, and similarly other bins of ranks could be represented by some other cluster.) 
- So, we try to map the cluster to these bins and calculate the weights each of these features carry using grid-search in a way that we have 10 clusters closesly represent the 10 bins of HBR. 

Due to the time constraint, we were not able to complete this algo. 

