In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

In [None]:
# List every file available under the Kaggle input directory, one path per line.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# Load the CDC case surveillance dataset into the notebook-wide frame `df`.
csv_path = '../input/covid19-case-surveillance-public-use-dataset/COVID-19_Case_Surveillance_Public_Use_Data.csv'
df = pd.read_csv(csv_path)

# Columns Explanation
Information from [here](https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Public-Use-Data/vbim-akqf).
* cdc_report_dt - Date received by the Centers for Disease Control and Prevention (CDC)
* pos_spec_dt - Date of first positive specimen collection
* onset_dt - Date of symptoms onset (only if symptomatic)
* current_status - Lab confirmed, or Probable Case
* sex - Sex of patient
* age_group - Age of patient, binned on 10 year periods
* Race and ethnicity (combined) - **Interesting one.**
* hosp_yn - Hospitalised?
* icu_yn  - ICU Admitted
* death_yn - Died?
* medcond_yn - Underlying disease?

## Let's do some Data Cleansing
Since we do not have unique patient identifiers, it would not be wise to remove "duplicates".
Instead, let's look at the unique values and column types.

In [None]:
# Exploration helper, left disabled: prints the value counts of every column
# from the fourth one onwards (df.columns[3:]), with a dashed separator line
# after each column.
#df_explore = df[df.columns[3:]]
#for col in df_explore:
#    print( df_explore[col].value_counts() )
#    print('- - - - - - - - - - - - - - - - - - -')
#

#### Unknown vs. Missing
According to a CDC Word document ([here](https://wwwn.cdc.gov/nndss/document/Methods_for_Conveying_Unknown_Values_in_Case_Notification_Messages_20161108.docx); the link starts a download), *Missing* means that no data entry was provided, while *Unknown* means the field is applicable but no value could be given. In this context, *Unknown* could reflect backlogged reports, or a value that was not available even though the field is required.
#### Dates
I've left the dates out of this search, but they range from 14 Jan 2020 to 10 Nov 2020. Earliest symptoms were the 14th January, earliest positive test was 21st January, and earliest reported case to CDC was 28th January.
#### Sex
I would be interested in seeing what could cause sex to be missing or unknown.
#### Age
Binned data on 10 year periods, fairly good split of data
#### Race & Ethnicity
Interestingly, they have combined this data but are not distinguishing between white/non-hispanic and white/hispanic. A CDC report on this ([here](https://www.cdc.gov/phin/resources/vocabulary/documents/cdc-race--ethnicity-background-and-purpose.pdf)) explains that a user can select a race (6 options) and an ethnicity (2 options: hispanic/latino OR not hispanic/latino). Yet in this dataset they have combined them into either hispanic/latino only, or a race combined with not hispanic, leaving out the possibility of races with hispanic ethnicity. They do have a link to federal standards for classifying race and ethnicity, but it is a [broken link](http://www.whitehouse.gov/omb/fedreg/ombdir15.html) to the White House website. Thanks Trump. The reason hispanic is not in combination with a race is that the CDC defines it as:
<br>Hispanic or Latino – A person of Cuban, Mexican, Puerto Rican, South or Central American, or other Spanish culture or origin, regardless of race. ([Here](https://www.cdc.gov/nchs/nhis/rhoi/rhoi_glossary.htm))
#### Status Type
A **Confirmed Case** or death is defined by meeting confirmatory laboratory evidence for COVID-19.
A **Probable Case** or death is defined by one of the following:
* Meeting clinical criteria AND epidemiologic evidence with no confirmatory laboratory testing performed for COVID-19
* Meeting presumptive laboratory evidence AND either clinical criteria OR epidemiologic evidence
* Meeting vital records criteria with no confirmatory laboratory testing performed for COVID-19

In [None]:
# Converting columns to the desired data type: parse the three date columns
# so they can be compared and plotted as dates.
for date_col in ['cdc_report_dt', 'pos_spec_dt', 'onset_dt']:
    df[date_col] = pd.to_datetime(df[date_col])

# Gather results from March onwards, very little before that.
# An explicit Timestamp (1 March 2020) avoids the day/month ambiguity of the
# string '03/01/2020'. The .copy() makes the filtered frame independent of the
# original, so adding the 'Race' column below does not raise pandas'
# SettingWithCopyWarning.
df = df[df['cdc_report_dt'] >= pd.Timestamp(year=2020, month=3, day=1)].copy()
#df = df[df['pos_spec_dt'] >= '03/01/2020']
#df = df[df['onset_dt'] >= '03/01/2020']

# Remove 'non-hispanic' since redundant in this dataset: keep only the text
# before the first comma of the combined race/ethnicity value.
# The guard keeps this cell re-runnable once the combined column has been dropped.
if 'Race and ethnicity (combined)' in df.columns:
    df['Race'] = df['Race and ethnicity (combined)'].str.split(',').str[0]
    df = df.drop('Race and ethnicity (combined)', axis=1)

### Plotting Distribution of Data

In [None]:
# Number of cases for every (current_status, sex) combination.
# count() ignores missing values, so rows without a cdc_report_dt are not counted;
# the counted column is renamed to 'count'.
df_gender_cases = (
    df.groupby(['current_status', 'sex'], as_index=False)['cdc_report_dt']
      .count()
      .rename(columns={'cdc_report_dt': 'count'})
)

# Select each status by name rather than by row position (head/tail), so the two
# frames stay correct even if a status does not have exactly five sex categories.
df_gender_cases_probable = df_gender_cases[df_gender_cases['current_status'] == 'Probable Case']
df_gender_cases_lab = df_gender_cases[df_gender_cases['current_status'] == 'Laboratory-confirmed case']

In [None]:
# Grouped bar chart: case counts per sex category, split by case status.
labels = ['Female', 'Male', 'Missing', 'Other', 'Unknown']
x = np.arange(len(labels))
width = 0.35  # width of a single bar; the two bars of a group sit side by side

# Look the counts up by sex label instead of relying on row order, so every bar
# is guaranteed to sit above its own tick label. A category that is absent for
# a status is drawn as a zero-height bar.
probable_counts = df_gender_cases_probable.set_index('sex')['count'].reindex(labels, fill_value=0)
lab_counts = df_gender_cases_lab.set_index('sex')['count'].reindex(labels, fill_value=0)

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, probable_counts, width, label='Probable')
rects2 = ax.bar(x + width/2, lab_counts, width, label='Lab-confirmed')

ax.set_xlabel('Sex')
ax.set_ylabel('Count')
ax.set_title('Status by Gender')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

fig.tight_layout()

plt.show()

From the distribution we can see that there are more lab-confirmed cases among women than among men; however, this may simply be because more women than men are going to get tested.

In [None]:
def _timeline_counts(date_col):
    """Count cases per (date, current_status) for one date column, in long format.

    The result has the columns 'current_status', 'sex', 'report_type' and 'date':
    'sex' holds the number of rows with a non-missing sex value in each group
    (not the sex itself), 'report_type' holds the name of `date_col`, and
    'date' holds the group's date.
    """
    return (
        df.groupby(by=[date_col, 'current_status'], as_index=False)['sex']
          .count()
          .melt(id_vars=['current_status', 'sex'])
          .rename(columns={'variable': 'report_type', 'value': 'date'})
    )


df_timeline_cdc = _timeline_counts('cdc_report_dt')
df_timeline_pos = _timeline_counts('pos_spec_dt')
df_timeline_onset = _timeline_counts('onset_dt')

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
df_reports = pd.concat([df_timeline_cdc, df_timeline_pos, df_timeline_onset])
df_reports = df_reports.sort_values(by=['date'], ascending=True)

# One frame per case status, used for the two panels of the timeline figure.
df_reports_1 = df_reports[df_reports['current_status'] == 'Laboratory-confirmed case']
df_reports_2 = df_reports[df_reports['current_status'] == 'Probable Case']

In [None]:
# Two stacked panels (lab-confirmed on top, probable below), one line per date type.
reports = df_reports['report_type'].unique()
colors = ['#72B34C','#4C72B3','#B34C72']

fig, (ax1, ax2) = plt.subplots(2, 1,figsize=(24,8))
fig.suptitle('CDC Report Date, by Status Type')

for axis, status_frame in ((ax1, df_reports_1), (ax2, df_reports_2)):
    for idx, report in enumerate(reports):
        # Rows of this status that belong to the current date type.
        series = status_frame[status_frame['report_type'] == report]
        # The 'sex' column holds the per-date case count, not the sex itself.
        axis.plot(series['date'], series['sex'], label=report, color=colors[idx])
    axis.grid(linewidth=0.5)
    axis.legend()

ax1.set_ylabel('Lab-confirmed')
ax2.set_ylabel('Probable')
ax2.set_xlabel('Date')
plt.show()

The dip in Positive (pos_spec_dt) and Symptoms (onset_dt) towards the end is due to the lag between symptomatic cases being identified as positive and being reported, i.e. the backlog of official reports (cdc_report_dt) has not finished being compiled. I expect this to increase/adjust in future uploads of the dataset.
<br>We can also see a rise in symptoms without a rise in positive results, from the probable case status (bottom graph). To me, this indicates people becoming more cautious of getting covid during winter months when symptoms can overlap with the common cold or flu.

In [None]:
# Daily case counts per (report date, status, race). The 'sex' column of the
# result holds the number of rows with a non-missing sex value in each group.
df_race = df.groupby(by=['cdc_report_dt','current_status', 'Race'])['sex'].count().reset_index()
df_race1 = df_race[df_race['current_status']=='Laboratory-confirmed case']
df_race1a = df_race[df_race['current_status']=='Probable Case']

fig, (ax1, ax2) = plt.subplots(2, 1,figsize=(30,15))
fig.suptitle('Status Type by Race')

# NOTE(review): [:-1] drops whichever race value happens to appear last in the
# data; presumably meant to exclude one specific category - confirm which one.
races = df['Race'].unique()[:-1]
# One colour per race, matched by position. The final entry is 'white', which
# would not be visible on a white background if it were ever reached.
colors = ['#1ecbe1','#E1341E','#CBE11E','saddlebrown','navy','#961EE1','indigo','darkorange','#3364CC','white']

# Top panel: lab-confirmed cases; bottom panel: probable cases.
for axis, status_frame in ((ax1, df_race1), (ax2, df_race1a)):
    for idx, race in enumerate(races):
        race_rows = status_frame[status_frame['Race'] == race]
        axis.plot(race_rows['cdc_report_dt'], race_rows['sex'], label=race, color=colors[idx])
    axis.grid()
    axis.legend()

# Label this figure's own axes. The previous code called ax.set(...) on `ax`,
# which belongs to the earlier gender bar chart, so this figure stayed unlabelled.
ax1.set(ylabel='Lab-confirmed cases',
        title='CDC Report Date, by Status Type')
ax2.set(xlabel='date',
        ylabel='Probable cases')

plt.show()