
# COVID-Team-1
### Dr. Mohanty
### Christopher Brantley
### CSC 405 01
### Last Updated: 10/27/2020
### I, Christopher Brantley, have abided by the UNCG academic policy.

## Imports.

In [None]:
import ipynb
from ipynb.fs.full import covid_data as cd
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

## Pandas Display Options.

In [None]:
# Configure pandas display in one call; set_option accepts (option, value) pairs.
pd.set_option(
    # Publish a Table Schema representation for frontends that support it.
    'display.html.table_schema', True,
    # Upper bounds on the number of columns and rows shown when a frame is displayed.
    'display.max_columns', 15,
    'display.max_rows', 8,
)

### Data Paths

In [None]:
# All generated CSV files live in the same directory (path relative to "../").
_data_dir = "./chris/DATA/"

# Daily new cases / deaths per county.
_state_daily_cases = _data_dir + "state_county_daily_cases.csv"
_state_daily_deaths = _data_dir + "state_county_daily_deaths.csv"

# Weekly cases / deaths per county.
# NOTE: "_state_weekly_casees" is misspelled, but the name is kept because
# other cells in this file reference it.
_state_weekly_casees = _data_dir + "state_county_weekly_cases.csv"
_state_weekly_deaths = _data_dir + "state_county_weekly_deaths.csv"

# Per-county statistics (mean, median, mode) of the weekly values.
_state_county_weekly_stats_cases = _data_dir + "state_county_weekly_stats_cases.csv"
_state_county_weekly_stats_deaths = _data_dir + "state_county_weekly_stats_deaths.csv"

# Generate weekly statistics (mean, median, mode) for number of cases and deaths across a specific state.


### Generating daily new cases for State by county.

In [None]:
# Gets the daily new cases for any state and exports to csv.
def state_daily_cases_to_csv(_state):
    """Write the per-county daily new cases of one state to ``_state_daily_cases``.

    Rows of the primary data whose ``State`` equals ``_state`` are selected and
    every column whose name contains "cases" is differenced against the
    previous column (the columns are treated as running totals in date order).
    The first column has no predecessor and is copied unchanged.

    Args:
        _state: Value compared against the ``State`` column of the primary data.

    The CSV has one row per county (index "County Name") and one column per
    day, in the same order as the source columns.
    """
    df_primary_data = cd.get_covid_primary_data()
    df_state = df_primary_data[df_primary_data.State == _state]
    # County names become the index; only the case columns remain as data.
    df_state_cases = df_state.set_index("County Name").filter(regex="cases")
    # Difference along the date axis in one step. prepend=0 makes the first
    # day equal to its own value instead of dropping it.
    daily_values = np.diff(df_state_cases.to_numpy(), axis=1, prepend=0)
    df_daily_cases = pd.DataFrame(daily_values,
                                  index=df_state_cases.index,
                                  columns=df_state_cases.columns)
    # We will export the data.
    df_daily_cases.to_csv(_state_daily_cases)

### Generating daily new deaths for State by county.

In [None]:
# Gets the daily new deaths and exports to csv.
def state_daily_deaths_to_csv(state):
    """Write the per-county daily new deaths of one state to ``_state_daily_deaths``.

    Rows of the primary data whose ``State`` equals ``state`` are selected and
    every column whose name contains "deaths" is differenced against the
    previous column (the columns are treated as running totals in date order).
    The first column has no predecessor and is copied unchanged.

    Args:
        state: Value compared against the ``State`` column of the primary data.

    The CSV has one row per county (index "County Name") and one column per
    day, in the same order as the source columns.
    """
    # First get our primary data.
    df_primary_data = cd.get_covid_primary_data()
    df_state = df_primary_data[df_primary_data.State == state]
    # County names become the index; only the death columns remain as data.
    df_state_deaths = df_state.set_index("County Name").filter(regex="deaths")
    # Difference along the date axis in one step. prepend=0 makes the first
    # day equal to its own value instead of dropping it.
    daily_values = np.diff(df_state_deaths.to_numpy(), axis=1, prepend=0)
    df_daily_deaths = pd.DataFrame(daily_values,
                                   index=df_state_deaths.index,
                                   columns=df_state_deaths.columns)
    # Export data to csv.
    df_daily_deaths.to_csv(_state_daily_deaths)

### Accessor methods for State Daily Cases and Deaths.

In [None]:
# Accessor method for state daily cases by county.
def get_state_daily_cases(_state):
    """Regenerate the daily-cases CSV for ``_state`` and load it as a dataframe."""
    # The CSV is rewritten on every call before it is read back.
    state_daily_cases_to_csv(_state)
    # The first CSV column holds the row labels written by the generator.
    df_daily_cases = pd.read_csv(_state_daily_cases, index_col=[0])
    return df_daily_cases
    
# Accessor method for state daily deaths by county.
def get_state_daily_deaths(_state):
    """Regenerate the daily-deaths CSV for ``_state`` and load it as a dataframe."""
    # The CSV is rewritten on every call before it is read back.
    state_daily_deaths_to_csv(_state)
    # The first CSV column holds the row labels written by the generator.
    df_daily_deaths = pd.read_csv(_state_daily_deaths, index_col=[0])
    return df_daily_deaths

### Accessor/Generator for Normalized Daily Cases

In [None]:
# Generates daily normalized cases.
def state_daily_cases_normalized(_state):
    """Return the daily new cases of one state per 100,000 residents, by county.

    Every column whose name contains "cases" is differenced against the
    previous column (the first column is kept as is), divided by the county's
    own ``population`` value, scaled to 100,000 and rounded to a whole number.

    Every column, including the first, is normalized with the population of
    its own county. The previous implementation reused the population of the
    last county for the whole first column.

    Args:
        _state: Value compared against the ``State`` column of the primary data.

    Returns:
        Dataframe indexed by "County Name" with one column per day. A
        population of 0 produces inf/NaN entries.
    """
    # Get the covid primary data.
    df_primary_data = cd.get_covid_primary_data()
    df_state = df_primary_data[df_primary_data.State == _state]
    # County names become the index; only the case columns remain as data.
    df_state_cases = df_state.set_index("County Name").filter(regex="cases")
    # Populations come from the same rows in the same order, so they line up
    # positionally with the case rows.
    population = df_state["population"].to_numpy()
    # prepend=0 keeps the first day as its own value.
    daily_values = np.diff(df_state_cases.to_numpy(), axis=1, prepend=0)
    # Broadcast each county's population across all of its days.
    normalized = np.round(daily_values / population[:, np.newaxis] * 100000, 0)

    return pd.DataFrame(normalized,
                        index=df_state_cases.index,
                        columns=df_state_cases.columns)

In [None]:
# Generates log10 daily cases.
def state_daily_cases_log10(_state):
    """Return log10 of the daily new cases per 100,000 residents, by county.

    The daily values are computed as in the normalized variant: difference of
    consecutive "cases" columns (first column kept as is), divided by the
    county's own ``population``, scaled to 100,000 and rounded. The base-10
    logarithm is then taken.

    Every column, including the first, is normalized with the population of
    its own county. The previous implementation reused the population of the
    last county for the whole first column.

    Args:
        _state: Value compared against the ``State`` column of the primary data.

    Returns:
        Dataframe indexed by "County Name" with one column per day. Entries
        whose normalized value is 0 become -inf and negative ones become NaN,
        as returned by ``np.log10``.
    """
    # We get the primary covid data.
    df_primary_data = cd.get_covid_primary_data()
    df_state = df_primary_data[df_primary_data.State == _state]
    # County names become the index; only the case columns remain as data.
    df_state_cases = df_state.set_index("County Name").filter(regex="cases")
    # Populations come from the same rows in the same order, so they line up
    # positionally with the case rows.
    population = df_state["population"].to_numpy()
    # prepend=0 keeps the first day as its own value.
    daily_values = np.diff(df_state_cases.to_numpy(), axis=1, prepend=0)
    # Broadcast each county's population across all of its days.
    normalized = np.round(daily_values / population[:, np.newaxis] * 100000, 0)

    return pd.DataFrame(np.log10(normalized),
                        index=df_state_cases.index,
                        columns=df_state_cases.columns)

### Accessor/Generator for Normalized Daily Deaths

In [None]:
# Generates normalized daily deaths.
def state_daily_deaths_normalized(_state):
    """Return the daily new deaths of one state per 100,000 residents, by county.

    Every column whose name contains "deaths" is differenced against the
    previous column (the first column is kept as is), divided by the county's
    own ``population`` value, scaled to 100,000 and rounded to a whole number.

    Every column, including the first, is normalized with the population of
    its own county and rounded. The previous implementation reused the
    population of the last county for the whole first column and left that
    column unrounded.

    Args:
        _state: Value compared against the ``State`` column of the primary data.

    Returns:
        Dataframe indexed by "County Name" with one column per day. A
        population of 0 produces inf/NaN entries.
    """
    # Get covid primary data.
    df_primary_data = cd.get_covid_primary_data()
    df_state = df_primary_data[df_primary_data.State == _state]
    # County names become the index; only the death columns remain as data.
    df_state_deaths = df_state.set_index("County Name").filter(regex="deaths")
    # Populations come from the same rows in the same order, so they line up
    # positionally with the death rows.
    population = df_state["population"].to_numpy()
    # prepend=0 keeps the first day as its own value.
    daily_values = np.diff(df_state_deaths.to_numpy(), axis=1, prepend=0)
    # Broadcast each county's population across all of its days.
    normalized = np.round(daily_values / population[:, np.newaxis] * 100000, 0)
    return pd.DataFrame(normalized,
                        index=df_state_deaths.index,
                        columns=df_state_deaths.columns)

In [None]:
# Generates log10 daily deaths.
def state_daily_deaths_log10(_state):
    """Return log10 of the daily new deaths per 100,000 residents, by county.

    The daily values are computed as in the normalized variant: difference of
    consecutive "deaths" columns (first column kept as is), divided by the
    county's own ``population``, scaled to 100,000 and rounded. The base-10
    logarithm is then taken.

    Every column, including the first, is normalized with the population of
    its own county and rounded. The previous implementation reused the
    population of the last county for the whole first column and left that
    column unrounded.

    Args:
        _state: Value compared against the ``State`` column of the primary data.

    Returns:
        Dataframe indexed by "County Name" with one column per day. Entries
        whose normalized value is 0 become -inf and negative ones become NaN,
        as returned by ``np.log10``.
    """
    # Get primary data.
    df_primary_data = cd.get_covid_primary_data()
    df_state = df_primary_data[df_primary_data.State == _state]
    # County names become the index; only the death columns remain as data.
    df_state_deaths = df_state.set_index("County Name").filter(regex="deaths")
    # Populations come from the same rows in the same order, so they line up
    # positionally with the death rows.
    population = df_state["population"].to_numpy()
    # prepend=0 keeps the first day as its own value.
    daily_values = np.diff(df_state_deaths.to_numpy(), axis=1, prepend=0)
    # Broadcast each county's population across all of its days.
    normalized = np.round(daily_values / population[:, np.newaxis] * 100000, 0)
    return pd.DataFrame(np.log10(normalized),
                        index=df_state_deaths.index,
                        columns=df_state_deaths.columns)

### Generating Weekly Cases for State by County.

In [None]:
# Generates weekly new cases.
def state_weekly_cases_to_csv(state):
    """Sum the daily new cases into weeks per county and write them to CSV.

    The daily columns (names containing "cases") are grouped into consecutive
    runs of seven, starting at the first column; a final run shorter than
    seven days is summed as a partial week. The number of weeks is therefore
    ceil(days / 7). The previous formula added an extra, empty week whenever
    the day count was an exact multiple of seven.

    Args:
        state: Passed to ``get_state_daily_cases``.

    The CSV written to ``_state_weekly_casees`` has a two-level index
    (county, "week N" with N starting at 1) and a single "cases" column.
    """
    # Get daily cases.
    df_county_cases = get_state_daily_cases(state).filter(regex="cases")
    number_days = df_county_cases.shape[1]
    # First column position of each week.
    week_starts = range(0, number_days, 7)
    week_labels = ["week " + str(week + 1) for week in range(len(week_starts))]
    # One array of per-county sums for every week.
    weekly_sums = [df_county_cases.iloc[:, start:start + 7].sum(axis=1).to_numpy()
                   for start in week_starts]

    counties = df_county_cases.index.to_numpy()
    # County-major layout: all weeks of the first county, then the next one.
    index_county = np.repeat(counties, len(week_labels))
    index_weeks = week_labels * len(counties)
    # Stacking gives (weeks, counties); transpose before flattening so the
    # values follow the same county-major order as the index.
    values = np.array(weekly_sums).T.ravel()

    df_county_weekly = pd.DataFrame(
        {"cases": values},
        index=pd.MultiIndex.from_arrays([index_county, index_weeks]))
    df_county_weekly.to_csv(_state_weekly_casees)

### Generating Weekly Deaths for State by County.

In [None]:
# Generates weekly new deaths.
def state_weekly_deaths_to_csv(state):
    """Sum the daily new deaths into weeks per county and write them to CSV.

    The daily columns (names containing "deaths") are grouped into consecutive
    runs of seven, starting at the first column; a final run shorter than
    seven days is summed as a partial week. The number of weeks is therefore
    ceil(days / 7). The previous formula added an extra, empty week whenever
    the day count was an exact multiple of seven.

    Args:
        state: Passed to ``get_state_daily_deaths``.

    The CSV written to ``_state_weekly_deaths`` has a two-level index
    (county, "week N" with N starting at 1) and a single "deaths" column.
    """
    # Get state daily deaths.
    df_county_deaths = get_state_daily_deaths(state).filter(regex="deaths")
    number_days = df_county_deaths.shape[1]
    # First column position of each week.
    week_starts = range(0, number_days, 7)
    week_labels = ["week " + str(week + 1) for week in range(len(week_starts))]
    # One array of per-county sums for every week.
    weekly_sums = [df_county_deaths.iloc[:, start:start + 7].sum(axis=1).to_numpy()
                   for start in week_starts]

    counties = df_county_deaths.index.to_numpy()
    # County-major layout: all weeks of the first county, then the next one.
    index_county = np.repeat(counties, len(week_labels))
    index_weeks = week_labels * len(counties)
    # Stacking gives (weeks, counties); transpose before flattening so the
    # values follow the same county-major order as the index.
    values = np.array(weekly_sums).T.ravel()

    df_county_weekly = pd.DataFrame(
        {"deaths": values},
        index=pd.MultiIndex.from_arrays([index_county, index_weeks]))
    df_county_weekly.to_csv(_state_weekly_deaths)

### Accessor Methods for State Weekly Cases and Deaths

In [None]:
# Accessor method for state weekly cases by county.
def get_state_weekly_cases(_state):
    """Regenerate the weekly-cases CSV for ``_state`` and load it as a dataframe."""
    # The CSV is rewritten on every call before it is read back.
    state_weekly_cases_to_csv(_state)
    # The first two CSV columns together form the row MultiIndex.
    df_weekly_cases = pd.read_csv(_state_weekly_casees, index_col=[0, 1])
    return df_weekly_cases
    
# Accessor method for state weekly deaths by county.
def get_state_weekly_deaths(_state):
    """Regenerate the weekly-deaths CSV for ``_state`` and load it as a dataframe."""
    # The CSV is rewritten on every call before it is read back.
    state_weekly_deaths_to_csv(_state)
    # The first two CSV columns together form the row MultiIndex.
    df_weekly_deaths = pd.read_csv(_state_weekly_deaths, index_col=[0, 1])
    return df_weekly_deaths

### Generating Weekly Case Stats for State by county.

In [None]:
# Generates weekly stats for state cases.
def state_weekly_cases_stats_to_csv(_state):
    """Write per-county mean, median and mode of normalized weekly cases to CSV.

    The weekly values from ``get_state_weekly_cases`` are divided by the
    county's ``population``, scaled to 100,000 and rounded to whole numbers
    (so that a mode is meaningful). For every county the mean, median and
    smallest mode across its weeks are written to
    ``_state_county_weekly_stats_cases``.

    Args:
        _state: Value compared against the ``State`` column of the primary
            data and passed to ``get_state_weekly_cases``.

    Raises:
        KeyError: If a county in the weekly data has no population entry.
    """
    # Get primary data.
    df_primary_data = cd.get_covid_primary_data()
    # Population per county for the requested state, indexed by county name.
    df_state_pop = df_primary_data[df_primary_data.State == _state].filter(regex="County|population")
    population = df_state_pop.set_index("County Name")["population"]
    # Get weekly case values (index: county, week).
    df_weekly = get_state_weekly_cases(_state)

    # Look up the population once per (county, week) row.
    # NOTE(review): assumes county names are unique within the state; a
    # duplicated name makes the lengths differ and the division raise.
    counties = df_weekly.index.get_level_values(0)
    row_population = population.loc[counties].to_numpy()
    # Normalize to 100,000 and round so that mode can be calculated.
    normalized = (df_weekly.iloc[:, 0] / row_population * 100000).round(0)

    def first_mode(week_values):
        # Series.mode() is sorted ascending, so this is the smallest mode;
        # an all-NaN county has no mode at all.
        modes = week_values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    # sort=False keeps the counties in their order of first appearance.
    per_county = normalized.groupby(level=0, sort=False)
    df_county_normalized = pd.DataFrame({
        "mean": per_county.mean(),
        "median": per_county.median(),
        "mode": per_county.agg(first_mode),
    })
    df_county_normalized.to_csv(_state_county_weekly_stats_cases)

### Generating Weekly Deaths Stats for State by county.

In [None]:
# Generates weekly stats for state deaths.
def state_weekly_deaths_stats_to_csv(_state):
    """Write per-county mean, median and mode of normalized weekly deaths to CSV.

    The weekly values from ``get_state_weekly_deaths`` are divided by the
    county's ``population``, scaled to 100,000 and rounded to whole numbers.
    For every county the mean, median and smallest mode across its weeks are
    computed, each rounded to a whole number, and written to
    ``_state_county_weekly_stats_deaths``.

    Args:
        _state: Value compared against the ``State`` column of the primary
            data and passed to ``get_state_weekly_deaths``.

    Raises:
        KeyError: If a county in the weekly data has no population entry.
    """
    # Get covid primary data.
    df_primary_data = cd.get_covid_primary_data()
    # Population per county for the requested state, indexed by county name.
    df_state_pop = df_primary_data[df_primary_data.State == _state].filter(regex="County|population")
    population = df_state_pop.set_index("County Name")["population"]
    # Get weekly deaths (index: county, week).
    df_weekly = get_state_weekly_deaths(_state)

    # Look up the population once per (county, week) row.
    # NOTE(review): assumes county names are unique within the state; a
    # duplicated name makes the lengths differ and the division raise.
    counties = df_weekly.index.get_level_values(0)
    row_population = population.loc[counties].to_numpy()
    # Normalize weekly data to 100,000.
    normalized = (df_weekly.iloc[:, 0] / row_population * 100000).round(0)

    def first_mode(week_values):
        # Series.mode() is sorted ascending, so this is the smallest mode;
        # an all-NaN county has no mode at all.
        modes = week_values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    # sort=False keeps the counties in their order of first appearance.
    per_county = normalized.groupby(level=0, sort=False)
    df_county_normalized = pd.DataFrame({
        "mean": per_county.mean().round(0),
        "median": per_county.median().round(0),
        "mode": per_county.agg(first_mode).round(0),
    })
    df_county_normalized.to_csv(_state_county_weekly_stats_deaths)

### Accessor Methods for County Weekly Case and Death Stats

In [None]:
# Accessor method for per-county weekly case statistics.
def get_county_weekly_cases_stats(_state):
    """Regenerate the weekly case statistics CSV for ``_state`` and load it."""
    # The CSV is rewritten on every call before it is read back.
    state_weekly_cases_stats_to_csv(_state)
    # The first CSV column holds the row labels written by the generator.
    df_cases_stats = pd.read_csv(_state_county_weekly_stats_cases, index_col=[0])
    return df_cases_stats
    
# Accessor method for per-county weekly death statistics.
def get_county_weekly_deaths_stats(_state):
    """Regenerate the weekly death statistics CSV for ``_state`` and load it."""
    # The CSV is rewritten on every call before it is read back.
    state_weekly_deaths_stats_to_csv(_state)
    # The first CSV column holds the row labels written by the generator.
    df_deaths_stats = pd.read_csv(_state_county_weekly_stats_deaths, index_col=[0])
    return df_deaths_stats

### Generator and Accessor for Overall Weekly Case and Death Stats for State.

In [None]:
# Generates county overall statistical measurements for both cases/deaths.
def county_weekly_stats_overall(_state):
    """Summarize the per-county weekly statistics of ``_state`` in two rows.

    Returns a dataframe with the rows "<_state> weekly cases" and
    "<_state> weekly deaths" and the columns "mean", "median" and "mode".
    Each cell aggregates the column of the same name across counties: the
    mean of the means, the median of the medians and the first mode of the
    modes.
    """
    # Fetch both per-county tables up front, cases before deaths.
    county_stats = (get_county_weekly_cases_stats(_state),
                    get_county_weekly_deaths_stats(_state))
    row_labels = [_state + " weekly cases", _state + " weekly deaths"]
    # Empty result frame; the cells are filled one by one below.
    df_overall_weekly_county = pd.DataFrame(index = row_labels,
                                            columns = ["mean", "median", "mode"])
    for position, row_label in enumerate(row_labels):
        stats = county_stats[position]
        overall_mean = stats["mean"].mean()
        overall_median = stats["median"].median()
        # Series.mode() may return several values; take the one labelled 0.
        overall_mode = stats["mode"].mode()[0]
        df_overall_weekly_county.loc[row_label, "mean"] = overall_mean
        df_overall_weekly_county.loc[row_label, "median"] = overall_median
        df_overall_weekly_county.loc[row_label, "mode"] = overall_mode

    return df_overall_weekly_county