In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyxlsb import convert_date
import utils

In [None]:
import importlib
import sys
# Re-execute the already-imported local `utils` module so that edits made to it
# are picked up without restarting the kernel.
importlib.reload(utils)

In [None]:
# Input workbooks are stored in the data directory under the root of this repo.
data_dir = "../../data/data/dataanalysis/"
# NOTE(review): output_dir is not referenced by any other code visible in this notebook.
output_dir = "Plots2/"
# Name of the workbook being read
case_sheet = "bmc_cases.xlsb"

# Read the workbook with no header row, so the first sheet row arrives as ordinary data.
input_df = pd.read_excel(
    f"{data_dir}{case_sheet}",
    engine='pyxlsb',
    header=None,
)

In [None]:
# Some sheets start with a blank row and some do not. Because the file was read
# without a header, we can drop every fully-empty row and then promote the first
# remaining row to be the column header, which handles both layouts.
# If there is a better solution, or this stops being a problem after a move to
# S3 containers, feel free to edit.
input_df = input_df.dropna(axis='index', how='all')
# Everything after the first remaining row is data ...
data = input_df.iloc[1:]
# ... and that first row supplies the column names.
data.columns = input_df.iloc[0]

In [None]:
# Columns required for this analysis. Filtering keeps the visualizations simple.
columns_required = ['Date','Age','Gender','Patient Location (Central)','Patient Status (Central)']

# Take an explicit copy: `data[columns_required]` is a selection from `data`, and
# assigning new columns to such a selection triggers pandas' SettingWithCopyWarning.
df = data[columns_required].copy()

# utils.get_age_bracket receives the whole row (axis=1) and its result is stored per row.
df['Age Bracket'] = df.apply(utils.get_age_bracket, axis=1)

# Only the 'Date' column is needed for the conversion, so map over that column
# directly instead of applying a function over every full row.
df['Date'] = df['Date'].map(convert_date)

# TODO: add a week column so the data can be grouped week-wise (not implemented yet).

In [None]:
# Number of non-NA (non-empty) entries in every column of the cleaned sheet
nvals = data.count()
print(nvals)

In [None]:
def plot_age_gender(df,suffix):
    """Plot the age-bracket and gender time series for one cohort of patients.

    Parameters
    ----------
    df : the frame handed to ``utils.age_gender_distribution``.
    suffix : str
        Cohort label; appended to the plot title and embedded in both
        output file names (``AgeDist_<suffix>.png``, ``GenderDist_<suffix>.png``).

    Returns
    -------
    tuple
        The two objects returned by ``utils.age_gender_distribution(df)``,
        in the same order (age distribution first, gender distribution second).
    """
    age_counts, gender_counts = utils.age_gender_distribution(df)

    # Both charts share the same axis labels and title; only the series being
    # plotted, the grouping column and the file-name prefix differ.
    x_label = "Date When Patient Was Detected Positive"
    y_label = "Number of Patients"
    chart_title = "No. of new cases of patients whose final status was : " + suffix

    chart_specs = (
        (age_counts, 'Age Bracket', "AgeDist_"),
        (gender_counts, 'Gender', "GenderDist_"),
    )
    for series, group_col, prefix in chart_specs:
        utils.plot_multiple_time_series(
            series, group_col, 'Date', 'patient count',
            x_label, y_label, chart_title, prefix + suffix + ".png",
        )

    return age_counts, gender_counts
    

In [None]:
# Results of plot_age_gender, keyed by cohort label.
age_dist, gender_dist = {}, {}

# Whole data set.
an1, an2 = plot_age_gender(df, '_all_')
age_dist['All'], gender_dist['All'] = an1, an2

# One pair of plots per patient location.
facility_types = ['Home Isolation','CCC2','DCHC','Hospitalized','Private Hospital','DCH']
for fac in facility_types:
    df_fac = df.loc[df['Patient Location (Central)'] == fac]
    an1, an2 = plot_age_gender(df_fac, fac)
    age_dist[fac], gender_dist[fac] = an1, an2

# Patients whose central status is "Dead".
df_fatal = df.loc[df['Patient Status (Central)'] == "Dead"]
an1, an2 = plot_age_gender(df_fatal, 'Dead')
age_dist['Dead'], gender_dist['Dead'] = an1, an2