In [266]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyxlsb import convert_date

In [267]:
# The data files is stored in the data directory under the root directory of this repo. 
data_dir = "../../data/data/dataanalysis/"
output_dir = "Plots/"
# Name of the file being read
case_sheet = "bmc_cases.xlsb"

#Read the file
data = pd.read_excel(data_dir+case_sheet,engine='pyxlsb',header=None)

In [268]:
# Some sheets have a blank row at top and some don't. By reading without header, dropping the blank rows and assigning the top row as header back, we can fix this. 
# If there is a better solution to this, or later if this problem is resolved if we shift to S3 containers, feel free to edit. 

data = data.dropna(how='all')
data.columns = data.iloc[0]
data = data[1:]

In [269]:
# Count all the non-na (non-empty) values in each columns
nvals = data.notna().sum()
print(nvals)

0
Date                                           100487
S No                                            52926
Ward                                           100487
Test ID (ICMR)                                 100473
Patient name                                        0
Age                                            100487
Gender                                         100487
Mobile Number                                       0
Present Address                                     0
Transferred from Ward                            9015
HealthPost Allocation                           74370
Traceable?                                      98535
Out of Mumbai (Yes / No)                        72999
Assign to other ward                             5828
Person / Official from other ward                   0
Duplicate Entry\n(Yes / No)                     66879
Previous Test ID (only if Duplicate is Yes)      3262
Patient Type (Index / Contact)                  70012
Patient Status            

In [270]:
# Returns the age bracket given the age of the individual. Currently, divided into buckets of 10 years
def get_age_bracket(row):
    age_low = int(row['Age']/10)*10
    age_high = age_low+9
    return str(age_low)+"-"+str(age_high)

In [271]:
# Returns the age and gender distribution (time - series) for the input dataframe,df.
def age_gender_distribution(df):
    x1 = df.groupby(['Date','Age Bracket']).agg('size').reset_index().rename(columns={0:'patient count'})
    x2 = df.groupby(['Date','Gender']).agg('size').reset_index().rename(columns={0:'patient count'})
    return x1,x2

In [272]:
def plot_age(df,filename):
    age_brackets = df['Age Bracket'].unique()
    age_brackets.sort()
    fig = plt.figure(figsize=(20,20))
    for agbr in age_brackets:
        time_series = df[df['Age Bracket']==agbr]
        plt.plot(time_series['Date'],time_series['patient count'],label=agbr)
    fig.autofmt_xdate()
    plt.ylabel('Number of Patients')
    plt.xlabel('Date')
    plt.title("Time series of number of positive cases by age")
    plt.savefig(output_dir+filename+"_ageDist.png")
    plt.show()

In [273]:
def plot_gender(df,filename):
    gender_types = df['Gender'].unique()
    gender_types.sort()
    fig = plt.figure(figsize=(20,20))
    for gender in gender_types:
        time_series = df[df['Gender']==gender]
        plt.plot(time_series['Date'],time_series['patient count'],label=gender)
    fig.autofmt_xdate()
    plt.ylabel('Number of Patients')
    plt.xlabel('Date')
    plt.title("Time series of number of positive cases by gender")
    plt.savefig(output_dir+filename+"_genderDist.png")
    plt.show()

In [274]:
# Columns Required for this analysis. Filtering for easy visualizations
columns_required = ['Date','Age','Gender','Patient Location (Central)','Patient Status (Central)']
df = data[columns_required]
df['Age Bracket'] = df.apply(lambda x: get_age_bracket(x),axis=1)
df['Date'] = df.apply(lambda x: convert_date(x['Date']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
age_dist = {}
gender_dist = {}

an1,an2 = age_gender_distribution(df)
age_dist['All'] = an1
gender_dist['All'] = an2
plot_age(an1,'All')
plot_gender(an2,'All')

facility_types = ['Home Isolation','CCC2','DCHC','Hospitalized','Private Hospital','DCH']
for fac in facility_types :
    df_fac = df[df['Patient Location (Central)'] == fac]
    an1,an2 = age_gender_distribution(df_fac)
    age_dist[fac] = an1
    gender_dist[fac] = an2
    plot_age(an1,fac)
    plot_gender(an2,fac)

df_fatal = df[df['Patient Status (Central)'] == "Dead"]
an1,an2 = age_gender_distribution(df_fatal)
age_dist['Dead'] = an1
gender_dist['Dead'] = an2
plot_age(an1,'Dead')
plot_gender(an2,'Dead')