# Corona Virus Data

The Corona Virus, COVID-19, is one of the most significant events of recent years, affecting the entire world. Social distancing measures and shelter in place orders have been enacted to slow the spread of the pandemic. These interventions come with the cost of slowed social and economic activity.

The goal of this notebook is to explore corona virus data from a list of sources including those aggregated by [Johns Hopkins University Center for Systems Science and Engineering](https://github.com/CSSEGISandData/COVID-19) and the [New York Times](https://github.com/nytimes/covid-19-data). This work is intended to make use of *data science* skills, including programming, data visualization and modeling to remind readers of the bigger picture. 

The structure of this notebook is as follows:

1. [Convenience Functions](#functions)
2. [Data Preprocessing](#data)
3. [Exploratory Data Analysis](#eda)
4. [SIR Model Background](#SIR)
5. [Model Implementation and Fits](#fits)


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import re
import plotly.express as px
from fuzzywuzzy import fuzz, process
from collections import defaultdict

# <a id='functions'></a>Convenience Functions

In [38]:
# Functions to load data 
def load_raw_global():
    """Loads confirmed, deaths, recovered data frames in a dict."""
    path = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/' \
            'csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_'
    world = {i: pd.read_csv(path+i.lower()+'_global.csv').drop(columns=['Lat', 'Long']) 
             for i in ['Confirmed', 'Deaths', 'Recovered']}
    return world

def load_local():
    """Loads state, county data in tidy format, packaged in a dict."""
    path = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/'
    data ={'state': pd.read_csv(path + 'us-states.csv'),
            'county': pd.read_csv(path + 'us-counties.csv')}
    data['state'].rename(columns={'date': 'Date', 'state':'State',
                                  'cases':'Confirmed','deaths':'Deaths'}, inplace=True)
    data['county'].rename(columns={'date': 'Date', 'county':'County',
                                   'state':'State','cases':'Confirmed',
                                   'deaths':'Deaths'}, inplace=True)
    return data

def search_country(data, countries):
    """Narrow search results for confirmed, deaths"""
    data = data[data['Country/Region'].isin(countries)]
    return confirmed, deaths

def time_series(data):
    """Transpose data with time series index, and swaps index and columns"""
    idx = pd.to_datetime(data.columns, errors='coerce')                        
    ts = (data
          .T
          .loc[data.columns.drop(labels=['Province/State', 'Country/Region', 'Lat', 'Long'],
                                 errors='ignore')
              ]
          .set_index(idx)          
         )
    return ts

def load_merged_daily_global(datadict):
    """Create dataframes of daily confirmed, deaths, recovered in tidy (long) format"""        
    d_ = list(datadict.values())
    merged = d_[0]
    for d in d_:
        merged = pd.merge(merged, d)
    for key in ['Confirmed', 'Deaths', 'Recovered']:
        merged['Daily ' + key] = (merged
                                  .groupby('Country/Region')
                                  [key]
                                  .diff()
                                  .fillna(merged[key])
                                 )
    return merged
    
def load_tidy_global(data):
    """Load confirmed, deaths, recovered in tidy (long) format"""   
    datadict = {}
    for key, name in zip(data, ['Confirmed', 'Deaths', 'Recovered']):
        country_aggregated = data[key].groupby('Country/Region').agg(sum).reset_index()
        datadict[key] = pd.melt(country_aggregated, 'Country/Region', var_name='Date', value_name=name)        
    return datadict

def load_merged_daily_local(data):
    """Create dict of dataframes of state or county level daily cases, deaths in tidy (long) format in a dictionary."""
    data['Date'] = pd.to_datetime(data.Date)
    iscounty = 'County' in data.columns
    group = 'State'
    if iscounty:
        group = ['County', 'State']
    for i in ['Confirmed', 'Deaths']:
        data['Daily ' + i]= data.groupby(group)[i].diff().fillna(data[i])
    return data

def load_daily_county(county):
    """Create dict of dataframes of county-level daily cases, deaths
        in tidy (long) format in a dictionary"""
    daily_county = {}
    county['Date'] = pd.to_datetime(county['Date'])
    multi_idx = county.set_index(['County', 'State']).index
    county['County_State'] = multi_idx
    for i in ['Cases', 'Deaths']:
        df = (county
                     .pivot(index='Date', columns='County_State', values=i)
                     .fillna(0)
                     .diff()
                     .reset_index()
                    )
        daily_county[i] = pd.melt(df, id_vars='Date', var_name=['County_State'],
                                  value_name='Daily '+i)
        daily_county[i]['County'] = daily_county[i]['County_State'].apply(lambda x: x[0])
        daily_county[i]['State'] = daily_county[i]['County_State'].apply(lambda x: x[1])        
    return daily_county

# Functions to filter data
def top_n_countries(world_confirmed, feature='Confirmed', n=10):
    """Return list of top n countries with highest feature (Confirmed, Deaths, Recovered, Daily ...)"""
    top_n = (world_confirmed
              .groupby('Country/Region')
              .agg('max')
              .sort_values(feature, ascending=False)
              .head(n)
              .index
              .values)
    return list(top_n)

def top_n_states(states, feature='Confirmed', n=10):
    top_n = (states
              .groupby('State')
              .agg('max')
              .sort_values(feature, ascending=False)
              .head(n)
              .index
              .values)
    return list(top_n)

def top_n_counties(counties, feature='Confirmed', n=10):
    top_n = (counties
                .groupby('County_State')
                .agg['max']
                .sort_values('Confirmed', ascending=False)
                .head(n)
                .index
                .values)
    return list(top_n)

def filter_countries(data, countries):
    return data[data['Country/Region'].isin(countries)]

def filter_states(data, states):
    return data[data['States'].isin(states)]

def filter_counties(data, counties):
    return data[data['County_States'].isin(counties)]

# Text processing
def convert2num(text):
    """
    Converts unit free text into numbers
    
    Parameters:
    ----------------
    text: string
        description
        
    Returns:    
    ----------------
    num: float
         Data converted to numerical form
    
    """
    if text is np.nan:
        return 0
    if type(text) in [np.float64, float, int, np.int64, np.float32, np.float16,
                      np.int32, np.int16]:        
        return text
    
    factor = 1
    dd = defaultdict(lambda: 1, {'m': 1e6, 'b': 1e9, 'tr': 1e12}) 
    unit = re.search(r'([a-z]*)(illion)', text)    
    if unit:
        factor = dd[unit.group(1)]
        text = text.replace(unit.group(),'').rstrip()
        
    found =  re.search(r'-?\d+[\.]?\d*', text.replace('$', '').replace(',', ''))
    
    if found:        
        try:
            num= float(found.group())*factor
        except:
            num = text
            print('Had trouble converting %s' % text)
            print('Obtained: ', found.group())
            return num
    else:
        num = text
    return num

def remove_stop_words(stop_words, text):
    """
    Description
    
    Parameters:
    ----------------
    stop_words: list
        List of strings to remove
    text: string
        String to process
        
    Returns:    
    ----------------
    filtered: string
         String with stop words removed    
    """
    text = text.lower()
    for word in stop_words:
        text = re.sub(word.lower(), '', text)
    
    return text

def search_fields(search_term, search_space, scorer='partial', cutoff=80, max_results=15):
    """
    Searches fields in countries dataset by partial string matching
    
    Parameters:
    ----------------
    search_term: string
        A string that approximates the matches
    search_space: list-like object
        Words to search from
    scorer: {'simple', 'partial', 'token sort', 'tokens set ratio'}
        Strings to specify scorers fuzz.ratio, fuzz.partial_ratio, fuzz.token_set_ratio
        respectively
    cutoff: int
        Minimum score to count as a match
    max_results: int
        Limit on how many results to show
        
    Returns:    
    ----------------
    results: list
        List of words that meet cutoff score
    
    """
    scorers = {'simple':fuzz.ratio,
           'partial': fuzz.partial_ratio,
           'token sort': fuzz.token_sort_ratio,
           'token set': fuzz.token_set_ratio}
    
    if scorer not in scorers.keys():
        print(scorer, ' not found among valid scorers. Using default instead.')
        scorer = 'partial'
    
    results = process.extract(search_term, search_space,
                       scorer=scorers[scorer], limit=max_results)
    
    filtered_results = [(field, score) for field, score in results if score >=cutoff]
    
    return filtered_results

def find_aliases(match_space, search_space, stop_words=[],
                 scorers=['partial', 'token sort'], cutoff=70, max_results=5):
    """
    Creates a dictionary between entries of one list and the analogs of another
    through consecutive string matches
    
    Parameters:
    ----------------
    match_space: list-like
        Index to translate to best matches
    search_space: list-like
        Analogous list to search from
    stop_words: list
        Words to exclude from the match_space
    scorers: list of strings
        List containing first scorer and second scorer for string matching
    cutoff: int
        Threshold for words to consider for matching
    max_results: int
        Expected number of results with top scores
    
    Returns:    
    ----------------
    best_matches: dict
        Dictionary with match_space words as keys and best matches from search_space
        as values
    unmatched: list
        List of words from the match_space with no matches
    
    """
    match_set = set(match_space)
    search_set = set(search_space)
    intersection = match_set & search_set
    diff = match_set - intersection
    search_space = search_set - intersection
    unmatched = []
    best_matches = {word: word for word in intersection}
    for word in diff:
        
        # First filter keeping only max scores
        match1 = search_fields(remove_stop_words(stop_words, word), search_space,
                              scorer=scorers[0], max_results=max_results, cutoff=cutoff)
        if len(match1)>0:
            best_score = match1[0][1]
            first_round_best = [tup[0] for tup in match1 if tup[1]==best_score]
            
            # Second filter 
            match2 = search_fields(remove_stop_words(stop_words, word), first_round_best,
                                             scorer=scorers[1], max_results=1,
                                             cutoff=cutoff)
            if len(match2) > 0:
                best_matches[word] = match2[0][0]
            else:
                best_matches[word] = match1[0][0]
        else:
            unmatched.append(word)
                
    return best_matches, unmatched

def translate(text, d):
        """Translate text to alias if present in dicitonary"""
        try:
            return d[text]
        except:
            return text        
    
def load_population_data():
    """Load and prepare country populations with same keys as covid19 data."""
    # Load population data
    populations_counties = pd.read_csv('https://www2.census.gov/programs-surveys/popest/datasets/'\
                           '2010-2019/counties/totals/co-est2019-alldata.csv',encoding='ISO-8859-1')
    populations_states = populations_counties[populations_counties.STNAME==populations_counties.CTYNAME]
    populations_countries = pd.read_csv('countries.csv')

    # Load COVID-19 data
    state, county = load_local().values()
    world = load_raw_global()
    
    # Sets of keys from each data source
    covid19_keys = {'county': set(county.groupby(['County','State']).max().index.values),
                        'state': set(state.State.unique()),
                        'country': set(world['Confirmed'].groupby('Country/Region').max().index.values),
                       }

    pop_keys = {'county': set(populations_counties[~populations_counties
                                                     .duplicated(['STNAME', 'CTYNAME'])
                                                ]
                                    .groupby(['CTYNAME', 'STNAME'])
                                    .max()
                                    .index
                            ),
                    'state': set(populations_states.groupby('STNAME').max().index),
                    'country': set(populations_countries.country.unique())
    }
    
    # Preparing initial nation, state and county dataframes
    df_countries = populations_countries.applymap(convert2num)[['country', 'Population']]
    df_countries.rename(columns={'country': 'Country/Region'}, inplace=True)
    df_states = (populations_states[~populations_states[['STNAME', 'CTYNAME']]
                                .duplicated()][['STNAME', 'POPESTIMATE2019']]
            .rename(columns={'STNAME': 'State', 'POPESTIMATE2019': 'Population'})          
            )
    
    territories = ['American Samoa', 'Guam', 'Northern Mariana Islands', 'Puerto Rico',
                 'Virgin Islands']
    df_territories = (df_countries[df_countries['Country/Region'].isin(territories)]
                  .rename(columns={'Country/Region': 'State'})
                 )
    df_states_territories = pd.concat([df_states, df_territories], ignore_index=True)

    #  Country Key Dictionary, including cruise ships
    country_search_space = df_countries['Country/Region']
    country_match_space = covid19_keys['country'] 
    covid19_pop_countries, unmatched_countries = find_aliases(country_match_space, country_search_space)
    pop_covid19_countries = {val: key for key, val in covid19_pop_countries.items()}
    # manual inputs
    pop_covid19_countries['United States'] = 'US'
    pop_covid19_countries['Congo, Republic of the'] = 'Congo (Brazzaville)'
    pop_covid19_countries['Congo, Democratic Republic of the'] = 'Congo (Kinshasa)'
    df_cruise_ships = pd.DataFrame([['The Diamond Princess',3711], ['MS Zaandam', 1243]],
                                      columns=['Country/Region', 'Population'])
    df_global = df_countries.append(df_cruise_ships, ignore_index=True)
    # State Key Dictionary
    state_search_space = df_states_territories.State
    state_match_space = covid19_keys['state'] 
    covid19_pop_states, unmatched_states = find_aliases(state_match_space, state_search_space)
    pop_covid19_states = {val: key for key, val in covid19_pop_states.items()}
    
    # County Key Dictionary, including NYC entry
    county_search_space = county.County.unique()
    county_match_space = populations_counties.CTYNAME.unique()
    covid19_pop_counties, unmatched_counties = find_aliases(county_search_space, county_match_space)
    boroughs = ['Richmond County', 'Queens County', 'Kings County', 'Bronx County', 'New York County']
    inNYstate = populations_counties.STNAME=='New York'
    same_counties = populations_counties.CTYNAME.isin(boroughs)
    inNYC = same_counties & inNYstate
    NYC_pop = populations_counties[inNYC].loc[:, ('CTYNAME', 'POPESTIMATE2019')]
    NYC_2019est = populations_counties[inNYC].loc[:,'POPESTIMATE2019'].sum()
    df_NYC = pd.DataFrame({'CTYNAME': 'New York City', 'STNAME': 'New York',
                       'POPESTIMATE2019': NYC_2019est}, index=[0])
    populations_counties.append(df_NYC, ignore_index=True)
    covid19_pop_counties['New York City'] = 'New York City'
    pop_covid19_counties = {val: key for key, val in covid19_pop_counties.items()}

    pop_covid19_US = {**pop_covid19_counties, **pop_covid19_states}
    df_counties = (populations_counties
                              .append(df_NYC, ignore_index=True)
                              .loc[:, ('CTYNAME', 'STNAME','POPESTIMATE2019')]
                              .rename(columns={'CTYNAME':'County', 'STNAME': 'State',
                                               'POPESTIMATE2019':'Population'})
                              
                             )
    
    df_counties.loc[:,('County','State')] = (df_counties[['County','State']]
                                        .applymap(lambda x: translate(x, pop_covid19_US))
                                       )
    
    return df_global, df_states_territories, df_counties

def create_per_capita_features(data, population):
    """Merges data set with population data and creates per capita numerical features"""
   
    num_types = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = pd.merge(data, population)
    numerical_fields = [i for i in data.columns 
                        if data[i].dtype in num_types]
    for i in numerical_fields:
        df[i+' per capita'] = df[i]/df['Population']
    return df


# <a id='data'></a> Population Data Preprocessing

## Translating to Common Keys

The COVID-19 data contains raw number of confirmed cases, deaths and recovered. To compare these numbers among different countries, states, and counties, we may wish to factor in populations, which requires population estimates for each entity. To eventually join these datasets, which will require *joins* based on common *keys*, specifically the geographic names. 

This requires that the population data and COVID-19 data to have consistent analogous keys:

1. COVID-19 Data:
    - countries
    - states
    - counties
2. Population Data:
    - countries
    - states
    - counties

Our approach will be:
1. loading population data
2. loading COVID-19 data
3. create sets of unique location entities of each data set
3. compare number of location entities in each data set
4. use string matching library `fuzzywuzzy` to create dictionaries between location keys
5. check a few examples manually

In [3]:
# Load population data
populations_counties = pd.read_csv('https://www2.census.gov/programs-surveys/popest/datasets/'\
                                   '2010-2019/counties/totals/co-est2019-alldata.csv',encoding='ISO-8859-1')
populations_states = populations_counties[populations_counties.STNAME==populations_counties.CTYNAME]
populations_countries = pd.read_csv('countries.csv')

# Load COVID-19 data
state, county = load_local().values()
world = load_raw_global()

In [4]:
# Sets of keys from each data source
covid19_keys = {'county': set(county.groupby(['County','State']).max().index.values),
                    'state': set(state.State.unique()),
                    'country': set(world['Confirmed'].groupby('Country/Region').max().index.values),
                   }

pop_keys = {'county': set(populations_counties[~populations_counties
                                                 .duplicated(['STNAME', 'CTYNAME'])
                                            ]
                                .groupby(['CTYNAME', 'STNAME'])
                                .max()
                                .index
                        ),
                'state': set(populations_states.groupby('STNAME').max().index),
                'country': set(populations_countries.country.unique())
}

In [5]:
# Comparing set cardinalities
d = {'Countries (Population)': [populations_countries.country.nunique(),
                                len(pop_keys['country'])],
     'Countries (COVID-19)': [world['Confirmed']['Country/Region'].nunique(),
                              len(covid19_keys['country'])],
     'States (Population)': [populations_states.STNAME.nunique(),
                             len(pop_keys['state'])],
     'States (COVID-19)': [state.State.nunique(),
                           len(covid19_keys['state'])],
     'Counties (Population)': [populations_counties.CTYNAME.nunique(),
                               len(pop_keys['country'])],
     'Counties (COVID-19)': [county.County.nunique(),
                             len(covid19_keys['county'])]
     }
df = pd.DataFrame(d, index=['Unique Names','Unique Entities'])
df

Unnamed: 0,Countries (Population),Countries (COVID-19),States (Population),States (COVID-19),Counties (Population),Counties (COVID-19)
Unique Names,250,188,51,55,1927,1726
Unique Entities,250,188,51,55,250,2947


In [6]:
# Set differences and intersections
covid19_keys['state'] - pop_keys['state'] 

{'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'Virgin Islands'}

In [7]:
covid19_keys['state'] & pop_keys['country']

{'Georgia',
 'Guam',
 'Northern Mariana Islands',
 'Puerto Rico',
 'Virgin Islands'}

In [8]:
pop_keys['country'] & pop_keys['state']

{'Georgia'}

In [9]:
covid19_keys['country'] - pop_keys['country'] 

{'Bahamas',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Diamond Princess',
 'Gambia',
 'Holy See',
 'MS Zaandam',
 'Taiwan*',
 'US',
 'West Bank and Gaza'}

### Note:

- The population data has more entities than covid-19 data in the case of countries and counties
- States include:
    - 50 US States
    - District of Columbia
    - territories: American Samoa, Guam, Northern Mariana Islands, Puerto Rico, Virgin Islands 
 

So...
- population data is likely available for all entities with covid19 data
- population data of states and countries should be merged for convenient joins, were it not for Georgia
- state population data should be augmented with the territory data from country population dataset:
    - American Samoa
    - Guam
    - Northern Mariana Islands
    - Puerto Rico
    - Virgin Islands
 

### Augmenting States Data with Territories from Country Data

In [10]:
df_countries = populations_countries.applymap(convert2num)[['country', 'Population']]
df_countries.rename(columns={'country': 'Country/Region'}, inplace=True)
df_countries.head()

Unnamed: 0,Country/Region,Population
0,American Samoa,49437.0
1,Afghanistan,36643815.0
2,Albania,3074579.0
3,Angola,32522339.0
4,Algeria,42972878.0


In [11]:
territories = ['American Samoa', 'Guam', 'Northern Mariana Islands', 'Puerto Rico',
                 'Virgin Islands']
df_territories = (df_countries[df_countries['Country/Region'].isin(territories)]
                  .rename(columns={'Country/Region': 'State'})
                 )
df_territories

Unnamed: 0,State,Population
0,American Samoa,49437.0
91,Guam,168485.0
166,Northern Mariana Islands,51433.0
181,Puerto Rico,3189068.0
238,Virgin Islands,106235.0


In [12]:
df_states = (populations_states[~populations_states[['STNAME', 'CTYNAME']]
                                .duplicated()][['STNAME', 'POPESTIMATE2019']]
            .rename(columns={'STNAME': 'State', 'POPESTIMATE2019': 'Population'})          
            )
df_states.head()

Unnamed: 0,State,Population
0,Alabama,4903185
68,Alaska,731545
98,Arizona,7278717
114,Arkansas,3017804
190,California,39512223


In [13]:
df_states_territories = pd.concat([df_states, df_territories], ignore_index=True)
df_states_territories

Unnamed: 0,State,Population
0,Alabama,4903185.0
1,Alaska,731545.0
2,Arizona,7278717.0
3,Arkansas,3017804.0
4,California,39512223.0
5,Colorado,5758736.0
6,Connecticut,3565287.0
7,Delaware,973764.0
8,District of Columbia,705749.0
9,Florida,21477737.0


### Note:

- Concatenation of state and country population data sets looks successful


- Time to move toward checking for consistency between keys

### Creating Dictionaries Between State Keys

In [14]:
from pprint import pprint
state_search_space = df_states_territories.State
state_match_space = covid19_keys['state'] 
covid19_pop_states, unmatched_states = find_aliases(state_match_space, state_search_space)
pop_covid19_states = {val: key for key, val in covid19_pop_states.items()}

In [15]:
unmatched_states

[]

In [16]:
covid19_pop_states

{'Mississippi': 'Mississippi',
 'Northern Mariana Islands': 'Northern Mariana Islands',
 'Hawaii': 'Hawaii',
 'Nevada': 'Nevada',
 'South Carolina': 'South Carolina',
 'Guam': 'Guam',
 'Colorado': 'Colorado',
 'California': 'California',
 'North Carolina': 'North Carolina',
 'Michigan': 'Michigan',
 'Oregon': 'Oregon',
 'New Hampshire': 'New Hampshire',
 'Kentucky': 'Kentucky',
 'Alaska': 'Alaska',
 'West Virginia': 'West Virginia',
 'Missouri': 'Missouri',
 'Georgia': 'Georgia',
 'Massachusetts': 'Massachusetts',
 'Texas': 'Texas',
 'Utah': 'Utah',
 'South Dakota': 'South Dakota',
 'Oklahoma': 'Oklahoma',
 'Delaware': 'Delaware',
 'Iowa': 'Iowa',
 'Illinois': 'Illinois',
 'North Dakota': 'North Dakota',
 'Wyoming': 'Wyoming',
 'Indiana': 'Indiana',
 'Louisiana': 'Louisiana',
 'Wisconsin': 'Wisconsin',
 'Maine': 'Maine',
 'Florida': 'Florida',
 'Puerto Rico': 'Puerto Rico',
 'Minnesota': 'Minnesota',
 'New Jersey': 'New Jersey',
 'Connecticut': 'Connecticut',
 'Virgin Islands': 'Virgin

### Note
- no unmatched keys


So state populations data is ready to be joined with covid19 state data.

### Creating Dictionaries Between Country Keys

In [17]:
country_search_space = df_countries['Country/Region']
country_match_space = covid19_keys['country'] 
covid19_pop_countries, unmatched_countries = find_aliases(country_match_space, country_search_space)
pop_covid19_countries = {val: key for key, val in covid19_pop_countries.items()}
unmatched_countries

['US',
 'Diamond Princess',
 'MS Zaandam',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)']

### Note

- The US is an abbreviation for United States
- Congo (Kinshasa) : [Democratic Republic of Congo](https://en.wikipedia.org/wiki/Democratic_Republic_of_the_Congo)
- Congo (Brazzaville) : [Republic of Congo](https://en.wikipedia.org/wiki/Republic_of_the_Congo)
- [The Diamond Princess](https://en.wikipedia.org/wiki/Diamond_Princess_(ship)) had a total population of 3711 
- [MS Zaandaam](https://en.wikipedia.org/wiki/MS_Zaandam#cite_note-GraudianBlog-3) had 1243 passengers and 586 crew, for a total population of 1829



**Manual Modifications to be made include:**
- adding US: United States entries to the key dictionary
- appending cruise ship population data

In [18]:
df_countries[df_countries['Country/Region'].str.contains('Congo')]

Unnamed: 0,Country/Region,Population
51,"Congo, Republic of the",5293070.0
52,"Congo, Democratic Republic of the",101780263.0


In [19]:
pop_covid19_countries['United States'] = 'US'
pop_covid19_countries['Congo, Republic of the'] = 'Congo (Brazzaville)'
pop_covid19_countries['Congo, Democratic Republic of the'] = 'Congo (Kinshasa)'

In [20]:
df_cruise_ships = pd.DataFrame([['The Diamond Princess',3711], ['MS Zaandam', 1243]],
                                      columns=['Country/Region', 'Population'])
df_global = df_countries.append(df_cruise_ships, ignore_index=True)

In [21]:
df_global

Unnamed: 0,Country/Region,Population
0,American Samoa,49437.0
1,Afghanistan,36643815.0
2,Albania,3074579.0
3,Angola,32522339.0
4,Algeria,42972878.0
...,...,...
247,United States,332639102.0
248,Yemen,29884405.0
249,Zimbabwe,14546314.0
250,The Diamond Princess,3711.0


### Converting to a Common Key for Countries 

In [22]:
def translate(text, d):
    """Translate text to alias if present in dicitonary"""
    try:
        return d[text]
    except:
        return text

df_global.loc[:,'Country/Region'] = df_global[['Country/Region']].applymap(lambda x: translate(x,pop_covid19_countries))
df_global

Unnamed: 0,Country/Region,Population
0,American Samoa,49437.0
1,Afghanistan,36643815.0
2,Albania,3074579.0
3,Angola,32522339.0
4,Algeria,42972878.0
...,...,...
247,US,332639102.0
248,Yemen,29884405.0
249,Zimbabwe,14546314.0
250,The Diamond Princess,3711.0


### Checking for Consistency Between County Keys

All states were included in the covid19 data, so that leaves the county names to check. Since we're dealing with tuples, we'll have to invidually extract and translate the county portions.

In [23]:
populations_counties.CTYNAME.unique()

array(['Alabama', 'Autauga County', 'Baldwin County', ..., 'Uinta County',
       'Washakie County', 'Weston County'], dtype=object)

In [24]:
covid19_pop_counties, unmatched_counties = find_aliases(county.County.unique(), populations_counties.CTYNAME.unique())

In [25]:
unmatched_counties

[]

In [26]:
covid19_pop_counties['New York City']

'New York'

### Notes:

- All county names had matches
- New York City was connected to New York county, which only includes Manhattan. This entity in the COVID-19 data comprises of all boroughs, which includes:
    - Staten Island (Richmond County)
    - Queens (Queens County)
    - Brooklyn (Kings County)
    - Bronx (Bronx County)
    - Manhattan (New York County)
    
This implies we need to manually add each of the populations for the total NYC estimate

In [27]:
boroughs = ['Richmond County', 'Queens County', 'Kings County', 'Bronx County', 'New York County']
inNYstate = populations_counties.STNAME=='New York'
same_counties = populations_counties.CTYNAME.isin(boroughs)
inNYC = same_counties & inNYstate
NYC_pop = populations_counties[inNYC].loc[:, ('CTYNAME', 'POPESTIMATE2019')]
NYC_pop

Unnamed: 0,CTYNAME,POPESTIMATE2019
1863,Bronx County,1418207
1884,Kings County,2559903
1891,New York County,1628706
1901,Queens County,2253858
1903,Richmond County,476143


In [28]:
NYC_2019est = populations_counties[inNYC].loc[:,'POPESTIMATE2019'].sum()

In [29]:
df_NYC = pd.DataFrame({'CTYNAME': 'New York City', 'STNAME': 'New York',
                       'POPESTIMATE2019': NYC_2019est}, index=[0])
populations_counties.append(df_NYC, ignore_index=True)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40.0,3.0,6.0,1.0,0.0,Alabama,Alabama,4779736.0,4780125.0,4785437.0,...,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50.0,3.0,6.0,1.0,1.0,Alabama,Autauga County,54571.0,54597.0,54773.0,...,4.847310,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062
2,50.0,3.0,6.0,1.0,3.0,Alabama,Baldwin County,182265.0,182265.0,183112.0,...,24.017829,16.641870,17.488579,22.751474,20.184334,17.725964,21.279291,22.398256,24.727215,24.380567
3,50.0,3.0,6.0,1.0,5.0,Alabama,Barbour County,27457.0,27455.0,27327.0,...,-5.690302,0.292676,-6.897817,-8.132185,-5.140431,-15.724575,-18.238016,-24.998528,-8.754922,-5.165664
4,50.0,3.0,6.0,1.0,7.0,Alabama,Bibb County,22915.0,22915.0,22870.0,...,1.385134,-4.998356,-3.787545,-5.797999,1.331144,1.329817,-0.708717,-3.234669,-6.857092,1.831952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,50.0,4.0,8.0,56.0,39.0,Wyoming,Teton County,21294.0,21298.0,21296.0,...,-1.583464,-2.575509,1.905293,23.259519,12.508872,4.670450,1.253214,-0.171608,-12.432212,1.797445
3190,50.0,4.0,8.0,56.0,41.0,Wyoming,Uinta County,21118.0,21121.0,21089.0,...,-9.181105,-18.196975,-4.487730,-10.870861,-15.033634,-10.484550,-11.485101,-18.923455,-13.554993,-9.181105
3191,50.0,4.0,8.0,56.0,43.0,Wyoming,Washakie County,8533.0,8528.0,8530.0,...,-7.652085,-12.839390,-3.084589,-1.307811,-18.938032,0.000000,-15.204038,-15.950541,-17.246806,-8.289759
3192,50.0,4.0,8.0,56.0,45.0,Wyoming,Weston County,7208.0,7208.0,7198.0,...,0.000000,-9.065551,-10.549265,6.191515,0.420345,9.201171,0.693097,-36.227798,-6.910452,2.599090


In [30]:
covid19_pop_counties['New York City'] = 'New York City'
pop_covid19_counties = {val: key for key, val in covid19_pop_counties.items()}

pop_covid19_US = {**pop_covid19_counties, **pop_covid19_states}

In [31]:
df_counties = (populations_counties
                              .append(df_NYC, ignore_index=True)
                              .loc[:, ('CTYNAME', 'STNAME','POPESTIMATE2019')]
                              .rename(columns={'CTYNAME':'County', 'STNAME': 'State',
                                               'POPESTIMATE2019':'Population'})
                              
                             )
df_counties.loc[:,('County','State')] = (df_counties[['County','State']]
                                        .applymap(lambda x: translate(x, pop_covid19_US))
                                       )


In [32]:
df_counties

Unnamed: 0,County,State,Population
0,Alabama,Alabama,4903185
1,Autauga,Alabama,55869
2,Baldwin,Alabama,223234
3,Barbour,Alabama,24686
4,Bibb,Alabama,22394
...,...,...,...
3189,Teton,Wyoming,23464
3190,Uinta,Wyoming,20226
3191,Washakie,Wyoming,7805
3192,Weston County,Wyoming,6927


## Summary Code: Population Data Preprocessing

All the steps of the above are summarized into:

In [33]:
df_countries, df_states, df_counties = load_population_data()

## Creating Per Capita Features

We can get per capita cases, deaths and recovered by:
- Merging COVID-19 and Population Data Sets
- Creating a Per Capita Feature for each data set


### Persistence


In [34]:
populations = [df_countries, df_states, df_counties]
files = ['pop_countries.csv', 'pop_states.csv', 'pop_counties.csv']
[df.to_csv(file) for df, file in zip(populations, files)]    


[None, None, None]

In [49]:
df_countries, df_states, df_counties = [pd.read_csv(file, index_col=0) for file in files]

In [53]:
# Load tidy global data dictionaries
data = load_raw_global()
tidy_global = load_tidy_global(data)
merged_global = load_merged_daily_global(tidy_global)

# Load local data, and daily data dictionaries
state, county = load_local().values()
merged_state = load_merged_daily_local(state)
merged_county = load_merged_daily_local(county)

# Create per capita features on merged data sets
datasets = [merged_global, merged_state.drop(columns='fips'), merged_county.drop(columns='fips')]
populations = [df_countries, df_states, df_counties]
world_data, state_data, county_data = [create_per_capita_features(data, pop) for data, pop in zip(datasets, populations)]

In [63]:
world_data['Location'] = world_data['Country/Region']
state_data['Location'] = state_data['State']
county_data['Location'] = county_data['County'] + ', ' + county_data['State']

In [66]:
set(state_data.Location) & set(county_data.Location) 

set()

In [None]:
county_data.Locat

In [56]:
world_data.head()

Unnamed: 0,Country/Region,Date,Confirmed,Deaths,Recovered,Daily Confirmed,Daily Deaths,Daily Recovered,Population,Confirmed per capita,Deaths per capita,Recovered per capita,Daily Confirmed per capita,Daily Deaths per capita,Daily Recovered per capita
0,Afghanistan,1/22/20,0,0,0,0.0,0.0,0.0,36643815.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,1/23/20,0,0,0,0.0,0.0,0.0,36643815.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Afghanistan,1/24/20,0,0,0,0.0,0.0,0.0,36643815.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Afghanistan,1/25/20,0,0,0,0.0,0.0,0.0,36643815.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Afghanistan,1/26/20,0,0,0,0.0,0.0,0.0,36643815.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
state_data.head()

Unnamed: 0,Date,State,Confirmed,Deaths,Daily Confirmed,Daily Deaths,Population,Confirmed per capita,Deaths per capita,Daily Confirmed per capita,Daily Deaths per capita
0,2020-01-21,Washington,1,0,1.0,0.0,7614893.0,1.313216e-07,0.0,1.313216e-07,0.0
1,2020-01-22,Washington,1,0,0.0,0.0,7614893.0,1.313216e-07,0.0,0.0,0.0
2,2020-01-23,Washington,1,0,0.0,0.0,7614893.0,1.313216e-07,0.0,0.0,0.0
3,2020-01-24,Washington,1,0,0.0,0.0,7614893.0,1.313216e-07,0.0,0.0,0.0
4,2020-01-25,Washington,1,0,0.0,0.0,7614893.0,1.313216e-07,0.0,0.0,0.0


In [58]:
county_data.head()

Unnamed: 0,Date,County,State,Confirmed,Deaths,Daily Confirmed,Daily Deaths,Population,Confirmed per capita,Deaths per capita,Daily Confirmed per capita,Daily Deaths per capita
0,2020-01-21,Snohomish,Washington,1,0,1.0,0.0,822083,1e-06,0.0,1e-06,0.0
1,2020-01-22,Snohomish,Washington,1,0,0.0,0.0,822083,1e-06,0.0,0.0,0.0
2,2020-01-23,Snohomish,Washington,1,0,0.0,0.0,822083,1e-06,0.0,0.0,0.0
3,2020-01-24,Snohomish,Washington,1,0,0.0,0.0,822083,1e-06,0.0,0.0,0.0
4,2020-01-25,Snohomish,Washington,1,0,0.0,0.0,822083,1e-06,0.0,0.0,0.0


In [62]:
county_data['County'] +', ' + county_data['State']

0         Snohomish, Washington
1         Snohomish, Washington
2         Snohomish, Washington
3         Snohomish, Washington
4         Snohomish, Washington
                  ...          
124446           Dent, Missouri
124447         Thayer, Nebraska
124448            Archer, Texas
124449            Fisher, Texas
124450           Haskell, Texas
Length: 124451, dtype: object

In [None]:
import time

def timed(func):
    def wrapper(*args, **kwargs):        
        start = time.time()
        val = func(*args,**kwargs)
        end = time.time()
        print('Time: ', end-start)
        return val
    return wrapper 


numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

for i in df_countries.columns:
    print(df_countries[i].dtype in numerics)
    

## <a id='eda'></a>Exploratory Data Analysis

In [None]:
# Change this list to see other countries
countries = top_n_countries(confirmed,10)
countries

In [None]:
fig1 = px.line(filter_countries(confirmed,countries), x='Date', y='Confirmed',
              color='Country/Region', hover_name='Country/Region', title='Cumulative Confirmed')
fig1.show()

In [None]:
fig2 = px.line(filter_countries(daily_confirmed,countries), x='Date', y='New Confirmed',
              color='Country/Region', hover_name='Country/Region', title='Daily Confirmed Cases')
fig2.show()

## Deaths

In [None]:
fig3 = px.line(filter_countries(deaths,countries), x='Date', y='Deaths',
              color='Country/Region', hover_name='Country/Region', title='Cumulative Deaths')
fig3.show()

In [None]:
fig4 = px.line(filter_countries(daily_deaths, countries), x='Date', y='New Deaths',
              color='Country/Region', hover_name='Country/Region', title='Daily Reported Deaths')
fig4.show()

In [None]:
fig5 = px.line(filter_countries(recovered, countries), x='Date', y='Recovered',
              color='Country/Region', hover_name='Country/Region', title='Cumulative Recovered')
fig5.show()

In [None]:
fig6 = px.line(filter_countries(daily_recovered, countries), x='Date', y='New Recovered',
              color='Country/Region', hover_name='Country/Region', title='Daily Recovered')
fig6.show()

## <a id='SIR'></a>SIR Model Background

Under this model the population is compartmentalized into time dependent groups denoted as susceptible $S(t)$, infected $I(t)$, and removed $R(t)$ and  we have the following system of differential equations to describe their dynamics:

$$\begin{align} 
\dfrac{dS}{dt} &= -\beta SI \tag{1}\\
\dfrac{dI}{dt}&= \beta SI - \nu I \tag{2}\\
\dfrac{dR}{dt}&= \nu I \tag{3}
\end{align}$$


### Understanding the Model Parameters


We can break the factor $\beta$ further down by considering:
- $\kappa$ : the contacts per unit time an infected individual has with the rest of the population
- $\tau$ : the probability that contact leads to transmission
- $N$ : the total population

Which leads to: $$ \beta = \dfrac{\kappa \tau}{N}$$

We then can look at the term:

$$\beta SI = \dfrac{\kappa \tau SI}{N} = \Big(\kappa \tau I \Big)\Big(\dfrac{S}{N}\Big)$$


and interpret $\kappa \tau I$ as the contacts per unit time by the infected population to the rest of the population that would lead to transmission. We multiply this by the fraction or probability of encountering a susceptible individual, $\dfrac{S}{N}$, to arrive at new infections per unit time.


The parameter $\nu$ can be interpreted as the probability of an infected host being removed per time, either through natural recovery, cure, or death. The inverse $\dfrac{1}{\nu}$ can be interpreted as the time per removal, or death of an invidual. In other words, the lifetime of the virus per host.

### Threshold for Epidemics

If we look at the behavior of the infected population growth rate in (2) in more detail:

$$\begin{align}
\dfrac{dI}{dt} &= \beta SI - \nu I\\
               &= (\beta S - \nu)I\\
               &= \Bigg[\dfrac{\kappa \tau S}{N} - \nu\Bigg]I\\
               &= \nu\Bigg[\dfrac{\kappa \tau}{\nu}\dfrac{S}{N} - 1\Bigg]I\\
 \end{align}$$
 
 
 With this rearrangment, we see how the term $\dfrac{\kappa \tau}{\nu}\dfrac{S}{N} - 1$ determines the sign for the rate of change for the infected population. 
 
 
 We define a threshold quantity, **effective reproductive rate** $R_e$:
 
 $$ R_e = \dfrac{\kappa \tau}{\nu}\dfrac{S}{N}$$
 
 
 This effective reproductive number factors is appropriately named because it depends on the lifetime of the virus $\dfrac{1}{\nu}$ coupled with the effective contacts per host $\kappa \tau$, scaled by the composition of the population, $\dfrac{S}{N}$. 
 
 
 and compare this quantity with 1 to see the sign for growth rate:
 
 
 $$\dfrac{dI}{dt} = \nu[R_e - 1]I$$
 
 
 
 ### Public Health Interventions
 
 - Staying home while sick, social distancing, quarantines, lowers the contacts $\kappa$, and lowers the effective $S$ and $I$ into the removed $R$ category.
 
 
 - Washing hands, sneezing into elbows and other habits reduce the fraction of transmissions per contact, $\tau$.
 
 
 - Cures and vaccinations helps reduce the lifetime of the virus, or increase $\nu$.
 
 
 - Testing allows faster reactions like quarantining, social distancing, which all lower $\kappa$.
 
 
 - Becoming informed and acting appropriately on this information overall drastically lowers $R_e$.

## <a id='fits'></a>Model Implementation and Fits

### Model Functions

In [None]:
from scipy.integrate import solve_ivp
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error 
from hyperopt import hp, fmin, tpe


def SIR(t, y, N, kappa, tau, nu):
    """
    Expresses SIR model in initial value ODE format, including time,
    state variables, and parameters required to compute the derivatives
    
    Parameters
    ---------------   
    t : array
        Independent variable
    y : array-like
        Initial state of system, [S, I, R]
    parameters : array-like
        Parameters of model [N, kappa, tau, nu]
        
    Returns
    ----------------
    f : array-like
        Array of derivatives for the system to be numerically integrated
    """ 
    S, I, R = y
    
    dSdt = - kappa*tau/N*S*I
    dIdt = (kappa*tau/N*S-nu)*I
    dRdt = nu*I
    f = [dSdt, dIdt, dRdt]
    return f

def create_SIR_curves(parameters, data=data):
    N, kappa, tau, nu = parameters
    S, I, R = N-1, 1, 0
    sys0 = (S, I, R)
    total_points = len(data)
    start = 0
    stop = total_points
    t = np.linspace(start, stop, total_points) 
    sol = solve_ivp(fun=SIR, t_span=(start,stop), y0=sys0, args=parameters, t_eval=t)
    df = pd.DataFrame(sol.y.T, columns=['S', 'I', 'R'])
    df['Time'] = t
    df['Cases'] = df['I']+df['R']
    df[data.name] = data.values
    return df

def plot_SIR(parameters, df):
    N, kappa, tau, nu = parameters
    Re = kappa*tau/(nu)
    print(Re)
    
    df.plot(x='Time', y=df.columns.drop('Time'),figsize=(15,10))
    plt.title('SIR')
    


def model_loss(parameters):
    # initial state variables    
    df = create_SIR_curves(parameters)
    loss = mean_squared_error(df['Cases'].values, df[df.columns[-1]].values)    
    return loss


### 

In [None]:
# parameters
sns.set(context='talk')
N = 10000
kappa = 3.5
tau = .05
nu = .01
parameters = (N, kappa, tau, nu)
country = 'Italy'
data = filter_countries(confirmed,[country])   


# define configuration space
space = [hp.loguniform('N', 3, 20),
         hp.uniform('kappa', 0, 10),
         hp.uniform('tau', 0, 1),
         hp.loguniform('nu', -2, 2),
         ]

In [None]:
data

In [None]:
best = fmin(model_loss, space, algo=tpe.suggest, max_evals=2000)

In [None]:
parameters = (int(best['N']),best['kappa'],best['tau'], best['nu'])
parameters

In [None]:
df = create_SIR_curves(parameters)
plot_SIR(parameters,df)

In [None]:
model_loss(parameters)

In [None]:
mean_squared_error(df['Cases'].values, df[df.columns[-1]].values)

In [None]:
data.values

In [None]:
df.columns.drop('Time')

In [None]:
df = create_SIR_curves(parameters, data)
df

In [None]:
((df.Cases-df['Korea, South'])**2).mean()

In [None]:
((df.Cases-df['Korea, South'])**2).mean()

In [None]:
from hyperopt import hp, fmin, tpe

# define loss function

def loss()

In [None]:
len(ts_confirmed)

In [None]:
ts_confirmed['Korea, South'].values

In [None]:
dots = sns.load_dataset("dots")
dots

## Population Simulations

Here's a simulation of infected population using the approach from [MIT 6.002x](https://courses.edx.org/courses/course-v1:MITx+6.00.2x_5+1T2016/courseware/44b64e16aa524037be90cd2aa3552ef6/eb30b49da504469c81d87d037126ce40/?child=first)

In [None]:
a = [('aa',1), ('a',3), ('b', 2)]

a.sort(key=lambda x: x[1])
a

In [None]:
import plotly.graph_objects as go

import pandas as pd

# Load dataset
df = pd.read_csv(
    "https://raw.githubusercontent.com/plotly/datasets/master/finance-charts-apple.csv")

df.columns = [col.replace("AAPL.", "") for col in df.columns]

# Initialize figure
fig = go.Figure()

# Add Traces

fig.add_trace(
    go.Scatter(x=list(df.index),
               y=list(df.High),
               name="High",
               line=dict(color="#33CFA5")))

fig.add_trace(
    go.Scatter(x=list(df.index),
               y=[df.High.mean()] * len(df.index),
               name="High Average",
               visible=False,
               line=dict(color="#33CFA5", dash="dash")))

fig.add_trace(
    go.Scatter(x=list(df.index),
               y=list(df.Low),
               name="Low",
               line=dict(color="#F06A6A")))

fig.add_trace(
    go.Scatter(x=list(df.index),
               y=[df.Low.mean()] * len(df.index),
               name="Low Average",
               visible=False,
               line=dict(color="#F06A6A", dash="dash")))

# Add Annotations and Buttons
high_annotations = [dict(x="2016-03-01",
                         y=df.High.mean(),
                         xref="x", yref="y",
                         text="High Average:<br> %.2f" % df.High.mean(),
                         ax=0, ay=-40),
                    dict(x=df.High.idxmax(),
                         y=df.High.max(),
                         xref="x", yref="y",
                         text="High Max:<br> %.2f" % df.High.max(),
                         ax=0, ay=-40)]
low_annotations = [dict(x="2015-05-01",
                        y=df.Low.mean(),
                        xref="x", yref="y",
                        text="Low Average:<br> %.2f" % df.Low.mean(),
                        ax=-40, ay=40),
                   dict(x=df.High.idxmin(),
                        y=df.Low.min(),
                        xref="x", yref="y",
                        text="Low Min:<br> %.2f" % df.Low.min(),
                        ax=0, ay=40)]

fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="right",
            active=0,
            x=0.57,
            y=1.2,
            buttons=list([
                dict(label="None",
                     method="update",
                     args=[{"visible": [True, False, True, False]},
                           {"title": "Yahoo",
                            "annotations": []}]),
                dict(label="High",
                     method="update",
                     args=[{"visible": [True, True, False, False]},
                           {"title": "Yahoo High",
                            "annotations": high_annotations}]),
                dict(label="Low",
                     method="update",
                     args=[{"visible": [False, False, True, True]},
                           {"title": "Yahoo Low",
                            "annotations": low_annotations}]),
                dict(label="Both",
                     method="update",
                     args=[{"visible": [True, True, True, True]},
                           {"title": "Yahoo",
                            "annotations": high_annotations + low_annotations}]),
            ]),
        )
    ])

# Set title
fig.update_layout(
    title_text="Yahoo",
    xaxis_domain=[0.05, 1.0]
)

fig.show()

In [None]:
a = {'a':1,'b':2,'c':3}
b = [2,4,6]

for i,j in zip(a,b):
    print(i,j)

In [None]:
a

In [None]:
a.values()


In [None]:
import numpy as np
np.log(2/3)*(-450e-6)

In [None]:
def func(a):
    a = 1
    return a

In [None]:
a =2

b = func(a)
print(a,b)

In [None]:
a,b,c = [1,2,3]

In [None]:
a

In [None]:
b

In [None]:
c

In [None]:
a = ['a', 'b', 'c']
b = [1,2,3]

list(zip(a,b))

<div class="alert alert-block alert-info">
**Note:** You are not required to complete this notebook and it will not be submitted with your project, but it is designed to quickly introduce the relevant parts of the Pomegranate library that you will need to complete the part of speech tagger.
</div>

<div class="container">
  <h2>Dropdown Example</h2>
  <p>The data-toggle="dropdown" attribute is used to open the dropdown menu.</p>
  <div class="dropdown">
    <button class="btn btn-primary dropdown-toggle" id="menu1" type="button" data-toggle="dropdown">Dropdown Example
    <span class="caret"></span></button>
    <ul class="dropdown-menu" role="menu" aria-labelledby="menu1">
      <li role="presentation"><a role="menuitem" tabindex="-1" href="#">HTML</a></li>
      <li role="presentation"><a role="menuitem" tabindex="-1" href="#">CSS</a></li>
      <li role="presentation"><a role="menuitem" tabindex="-1" href="#">JavaScript</a></li>
      <li role="presentation" class="divider"></li>
      <li role="presentation"><a role="menuitem" tabindex="-1" href="#">About Us</a></li>    
    </ul>
  </div>
</div>


In [None]:
from collections import defaultdict

d = defaultdict(lambda: defaultdict(int))
d['a']['b']

In [None]:
d = {'a':1, 'b':2, 'c':3}
max(d.values())

In [None]:
a = [1,2,3,4]
for i in a:
    

In [None]:
sum(d.values())

In [None]:
from itertools import product
for i in product(d.values(), repeat=2):
    print(i)

In [None]:
d.keys()