In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import glob


In [None]:
# Load pickled snapshots of the prediction, latest and historical frames.
# NOTE(review): the following cell reassigns df_preds, df_latest and df_historical from
# parquet files, so this load appears to be redundant -- confirm before relying on it.
# Unpickling can execute arbitrary code; only read pickle files from a trusted source.
df_preds = pd.read_pickle("../data/results/df_preds_2024-03-04.pkl")
df_latest = pd.read_pickle("../data/interim/df_NNDSS_latest.pkl")
df_historical = pd.read_pickle("../data/interim/df_NNDSS_historical.pkl")

In [None]:
# Read the three dash-app frames and order each one by date, then item_id.
# The predictions frame stores its date as 'prediction_for_date'; rename it so that all
# three frames share a 'date' column.
df_preds = (
    pd.read_parquet("../dash_app/data/predictions.parquet")
    .rename(columns={'prediction_for_date': 'date'})
    .sort_values(['date', 'item_id'])
)
df_latest = (
    pd.read_parquet("../dash_app/data/latest.parquet")
    .sort_values(['date', 'item_id'])
)
df_historical = (
    pd.read_parquet("../dash_app/data/historical.parquet")
    .sort_values(['date', 'item_id'])
)

In [None]:
df_historical.dtypes

In [None]:
df_preds.dtypes

In [None]:
df_latest.dtypes

In [None]:
df_preds.head()

In [None]:
df_latest[df_latest.new_cases>0].head(11)

In [None]:
# Overwrite new_cases in df_latest for a single item_id on a single date.
# NOTE(review): presumably a synthetic value so that the CALIFORNIA_Campylobacteriosis
# chart further down exercises the "Potential Outbreak" marker -- confirm it is not meant
# to persist, since every later use of df_latest in this session sees the modified value.
selected_item_id = "CALIFORNIA_Campylobacteriosis"
df_latest.loc[
(df_latest['item_id'] == selected_item_id) & 
(df_latest['date'] == "2024-03-04"), 
'new_cases'] = 750 

In [None]:
df_latest[df_latest.new_cases>0].head(8)

In [None]:
df_historical[df_historical.new_cases>=0].item_id.unique()

In [None]:
df_latest[df_latest.item_id=='ARIZONA_Campylobacteriosis']

In [None]:
df_historical[df_historical.item_id=='ARIZONA_Campylobacteriosis']

In [None]:
len(df_historical[df_historical.item_id=='ARIZONA_Campylobacteriosis'].drop_duplicates())

In [None]:
def plot_outbreak(df_historical_chart, df_latest_chart, df_preds_chart, selected_item_id):
    """Display a dark-themed Plotly chart of history, forecast and latest value for one item.

    Each input frame is filtered to the rows whose ``item_id`` equals ``selected_item_id``.
    Whatever data exists is drawn; for every series without rows a legend-only placeholder
    trace is added instead, so the legend always lists Historical, Latest, Prediction,
    Prediction Interval and Potential Outbreak.

    A visible "Potential Outbreak" marker is drawn when the first matching latest
    ``new_cases`` value is greater than the first matching ``pred_upper`` value. Only the
    first matching row of the prediction and latest frames is used for that check.

    Args:
        df_historical_chart: frame with ``item_id``, ``date`` and ``new_cases`` columns.
        df_latest_chart: frame with ``item_id``, ``date`` and ``new_cases`` columns.
        df_preds_chart: frame with ``item_id``, ``date``, ``pred_mean``, ``pred_lower``
            and ``pred_upper`` columns.
        selected_item_id: the ``item_id`` value to plot; also used as the chart title.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    def only_selected(frame):
        # Rows of `frame` that belong to the requested item.
        return frame[frame['item_id'] == selected_item_id]

    def legend_placeholder(**style):
        # Trace without data points that starts in the 'legendonly' state.
        return go.Scatter(x=[None], y=[None], visible='legendonly', **style)

    fig = go.Figure(layout_template="plotly_dark")

    history = only_selected(df_historical_chart)
    latest = only_selected(df_latest_chart)
    forecast = only_selected(df_preds_chart)

    has_history = not history.empty
    has_latest = not latest.empty
    has_forecast = not forecast.empty

    # Stays None when there is no forecast, which disables the outbreak check below.
    upper_bound = None

    if has_history:
        fig.add_trace(go.Scatter(
            x=history['date'],
            y=history['new_cases'],
            mode='lines',
            name='Historical',
            line={'color': 'skyblue'},
        ))

    if has_forecast:
        forecast_date = forecast['date'].iloc[0]
        forecast_mean = forecast['pred_mean'].iloc[0]
        lower_bound = forecast['pred_lower'].iloc[0]
        upper_bound = forecast['pred_upper'].iloc[0]
        forecast_marker = {'color': '#FF6347', 'size': 12}

        # The interval is carried by asymmetric error bars around the mean.
        fig.add_trace(go.Scatter(
            x=[forecast_date],
            y=[forecast_mean],
            mode='lines',
            name='Prediction Interval',
            error_y={
                'type': 'data',
                'symmetric': False,
                'array': [upper_bound - forecast_mean],
                'arrayminus': [forecast_mean - lower_bound],
            },
            marker=forecast_marker,
        ))
        fig.add_trace(go.Scatter(
            x=[forecast_date],
            y=[forecast_mean],
            mode='markers',
            name='Prediction',
            marker=forecast_marker,
        ))

    if has_latest:
        fig.add_trace(go.Scatter(
            x=latest['date'],
            y=latest['new_cases'],
            mode='markers',
            name='Latest',
            marker={'color': '#3CB371', 'size': 12},
        ))

    # An outbreak needs both a forecast and a latest observation to compare.
    outbreak_detected = False
    if upper_bound is not None and has_latest:
        outbreak_detected = latest['new_cases'].iloc[0] > upper_bound

    if outbreak_detected:
        fig.add_trace(go.Scatter(
            x=[latest['date'].iloc[0]],
            y=[latest['new_cases'].iloc[0]],
            mode='markers+text',
            name='Potential Outbreak',
            marker={
                'color': 'yellow',
                'size': 15,
                'symbol': 'x',
                'line': {'color': '#B22222', 'width': 2},
            },
            text="Potential Outbreak",
            textposition="top center",
        ))
    else:
        fig.add_trace(legend_placeholder(
            mode='markers+text',
            name='Potential Outbreak',
            marker={
                'color': '#DAA520',
                'size': 15,
                'symbol': 'x',
                'line': {'color': '#800000', 'width': 2},
            },
            text="Potential Outbreak",
            textposition="top center",
        ))

    # Legend entries for whichever series had no rows for this item.
    if not has_history:
        fig.add_trace(legend_placeholder(
            mode='lines', name='Historical', line={'color': 'skyblue'}))
    if not has_latest:
        fig.add_trace(legend_placeholder(
            mode='markers', name='Latest', marker={'color': '#98FF98', 'size': 12}))
    if not has_forecast:
        fig.add_trace(legend_placeholder(
            mode='markers', name='Prediction', marker={'color': 'darkred', 'size': 12}))
        fig.add_trace(legend_placeholder(
            mode='lines', name='Prediction Interval', line={'color': 'red'}))

    fig.update_layout(title=f"{selected_item_id}", xaxis_title="Date", yaxis_title="New Cases")
    fig.show()


In [None]:
# chart shows: we have historical, latest, pred/interval, and NO outbreak
selected_item_id = "ARIZONA_Campylobacteriosis"
plot_outbreak(df_historical, df_latest,  df_preds, selected_item_id)

In [None]:
# chart shows: we have historical, pred/interval, NO latest and therefore NO outbreak
selected_item_id = "ARKANSAS_Chlamydia trachomatis infection"
plot_outbreak(df_historical, df_latest,  df_preds, selected_item_id)

In [None]:
# chart shows: we have historical, pred/interval, latest and OUTBREAK
selected_item_id = "CALIFORNIA_Campylobacteriosis"
plot_outbreak(df_historical, df_latest,  df_preds, selected_item_id)

In [None]:
# chart shows: we have latest and nothing else
selected_item_id = "VIRGINIA_Botulism, Infant"
plot_outbreak(df_historical, df_latest,  df_preds, selected_item_id)

In [None]:
# Observation (confirmed): if the last data point in the training dataset (i.e. the last
# date) is NA -- or 0 -- for ALL time series, every prediction for the next step will be 0.
# At least one series must have a non-NA, non-zero value on that date to get real predictions.
# This may only apply to the one-step-ahead forecast.

In [None]:
df_hist = pd.read_parquet("../data/interim/df_NNDSS_historical.parquet")
df_hist[(df_hist.item_id=='COLORADO_Chlamydia trachomatis infection')]

In [None]:
df_hist = pd.read_parquet("../data/interim/df_NNDSS_historical.parquet")
df_hist[(df_hist.item_id=='COLORADO_Chlamydia trachomatis infection')]

In [None]:
df_hist[df_hist.item_id=='ALABAMA_Anthrax']

In [None]:
# Remove the 'filled_value' column if it is still present. errors="ignore" keeps this cell
# re-runnable: without it a second run raises KeyError because the column is already gone.
df_hist = df_hist.drop(columns="filled_value", errors="ignore")
df_hist

In [None]:
df_hist[df_hist.new_cases.notna()]

In [None]:
df_hist.to_parquet("../data/interim/df_NNDSS_historical_mod.parquet",index=False)

In [None]:
df_hist = pd.read_parquet("../data/interim/df_NNDSS_historical_mod.parquet")
df_hist.head()

In [None]:
df_hist.dtypes

In [None]:
df_weekly_actuals = pd.read_parquet("../data/interim/weekly_actuals_2024-03-11.parquet")
df_weekly_actuals

In [None]:
df = pd.read_parquet("../data/results/df_preds_2024-03-04.parquet")

In [None]:
df.head()

In [None]:
df[df.pred_mean>1]

In [None]:
df.dtypes

In [None]:
df_historical = pd.read_parquet("../dash_app/data/historical.parquet")
print(df_historical.date.max())
df_historical.head(2)

In [None]:
df_preds1 = pd.read_parquet("../data/results/weekly_predictions_2024-02-12.parquet")
df_preds2 = pd.read_parquet("../data/results/weekly_predictions_2024-02-19.parquet")
df_preds3 = pd.read_parquet("../data/results/weekly_predictions_2024-03-04.parquet")

In [None]:
def load_preds(directory_path = "../data/results"):
    """Load every weekly prediction parquet file in a directory into one DataFrame.

    Args:
        directory_path: Directory searched (non-recursively) for files whose name
            matches ``weekly_predictions*.parquet``.

    Returns:
        One DataFrame holding the rows of all matching files, in sorted file-path order,
        with a fresh 0..n-1 index and the ``prediction_for_date`` column renamed to
        ``date``.

    Raises:
        ValueError: If no file in ``directory_path`` matches the pattern.
    """
    file_pattern = f"{directory_path}/weekly_predictions*.parquet"

    # glob returns matches in arbitrary order; sort so the row order of the result is
    # reproducible from run to run.
    file_list = sorted(glob.glob(file_pattern))
    if not file_list:
        # pd.concat([]) would fail with a bare "No objects to concatenate"; name the
        # pattern instead so a wrong directory is easy to spot.
        raise ValueError(f"No prediction files match {file_pattern!r}")

    dfs = []
    for filename in file_list:
        df = pd.read_parquet(filename)
        # Per-file progress: the path and how many distinct item_ids it contains.
        print(filename)
        print(len(df.item_id.unique()))
        dfs.append(df)

    concatenated_df = pd.concat(dfs, ignore_index=True)
    return concatenated_df.rename(columns={"prediction_for_date": "date"})

In [None]:
df_preds = load_preds(directory_path = "../data/results")

In [None]:
df_preds.head(2)

In [None]:
df_outbreak = pd.merge(df_historical,df_preds,on=['item_id','date'])

In [None]:
df_outbreak['potential_outbreak'] = df_outbreak['new_cases'] > df_outbreak['pred_upper_0_99']

In [None]:
df_outbreak.head(2)

In [None]:
df_outbreak[df_outbreak.potential_outbreak==True][['item_id','year','week','date','state','label','new_cases','pred_upper_0_99','potential_outbreak']]

In [None]:
def is_outbreak_resolved(df):
    """Add previous-observation outbreak flags to a per-item, per-date outbreak frame.

    The input is not modified; a new frame is returned.

    Args:
        df: DataFrame with ``item_id``, ``date``, ``new_cases`` and a boolean
            ``potential_outbreak`` column.

    Returns:
        A copy of ``df`` sorted by ``item_id`` then ``date``, without the rows whose
        ``new_cases`` is missing, and with two extra columns:

        * ``potential_outbreak_past_week``: ``potential_outbreak`` of the previous
          remaining row of the same ``item_id`` (missing for an item's first row).
        * ``Potential_Outbreak_Resolved``: ``False`` only when both the current and the
          previous row are flagged (the outbreak is continuing); ``True`` for every
          other row, including rows that had no outbreak on the previous row. Filter on
          ``potential_outbreak_past_week == True`` to count genuinely resolved outbreaks.
    """
    # sort_values without inplace leaves the caller's frame in its original order.
    ordered = df.sort_values(by=['item_id', 'date'])

    # Rows without a case count are dropped (a skipped report), so "previous" below means
    # the previous row that actually has data. The explicit copy makes the column
    # assignments below safe instead of writing into a slice of `ordered`.
    reported = ordered[ordered['new_cases'].notna()].copy()

    reported['potential_outbreak_past_week'] = (
        reported.groupby('item_id')['potential_outbreak'].shift(1)
    )

    # Compare against True explicitly: the shifted column holds a missing value for the
    # first row of each item, and that must count as "no outbreak".
    still_ongoing = (
        (reported['potential_outbreak'] == True)
        & (reported['potential_outbreak_past_week'] == True)
    )
    reported['Potential_Outbreak_Resolved'] = ~still_ongoing

    return reported


In [None]:
df_outbreak = is_outbreak_resolved(df_outbreak)

In [None]:
cols_wanted = ['item_id','year','week','date','state','label','new_cases','pred_upper_0_99','potential_outbreak','potential_outbreak_past_week','Potential_Outbreak_Resolved']

In [None]:
df_outbreak.head()[cols_wanted]

In [None]:
df_outbreak[df_outbreak.Potential_Outbreak_Resolved==False][cols_wanted]

In [None]:
df_outbreak[df_outbreak.item_id=='FLORIDA_Measles, Indigenous'][cols_wanted]

In [None]:
df_outbreak[df_outbreak.item_id=='KENTUCKY_Pertussis'][cols_wanted]

In [None]:
df_outbreak[df_outbreak.item_id=='OKLAHOMA_Hepatitis B, chronic, Confirmed'][cols_wanted]

In [None]:
df_historical[df_historical.item_id=='OKLAHOMA_Hepatitis B, chronic, Confirmed']

In [None]:
df_preds3[df_preds3.item_id=='OKLAHOMA_Hepatitis B, chronic, Confirmed']

In [None]:
df_historical[(df_historical.date==pd.to_datetime("2024-02-26"))]['new_cases'].notna().sum()

In [None]:
df_outbreak.label.unique()

In [None]:
df_outbreak.date.unique()[-1]

In [None]:
# Distinct dates in chronological order. Series.unique() keeps order of first appearance
# and df_outbreak is sorted by item_id before date, so the raw unique() array does not
# necessarily end with the most recent date. DatetimeIndex elements are Timestamps, which
# provide the strftime used below.
date_arr = pd.DatetimeIndex(df_outbreak['date'].unique()).sort_values()
if len(date_arr) < 2:
    raise ValueError("Need at least two distinct dates in df_outbreak to compare consecutive weeks")
date_latest = date_arr[-1]
date_previous = date_arr[-2]

date_previous_str = date_previous.strftime('%Y-%m-%d')
date_latest_str = date_latest.strftime('%Y-%m-%d')

week_1_data = df_outbreak[df_outbreak['date'] == date_previous]
week_2_data = df_outbreak[df_outbreak['date'] == date_latest]

# potential_outbreak_past_week is missing for an item's first row, so both flags are
# compared against True explicitly rather than negated with ~ on the raw column.
past_week_flagged = week_2_data['potential_outbreak_past_week'] == True
this_week_flagged = week_2_data['potential_outbreak'] == True

# Flagged rows on the previous date.
potential_outbreaks_week_1 = week_1_data['potential_outbreak'].sum()
# Flagged on the previous row but no longer flagged on the latest date.
resolved_outbreaks_week_2 = week_2_data[past_week_flagged]['Potential_Outbreak_Resolved'].sum()
# Flagged on the previous row and still flagged on the latest date.
ongoing_outbreaks_week_2 = week_2_data[past_week_flagged & this_week_flagged].shape[0]
# Flagged on the latest date without having been flagged on the previous row.
new_outbreaks_week_2 = week_2_data[~past_week_flagged & this_week_flagged].shape[0]


In [None]:
week_2_data.tail()[['item_id','date','new_cases','pred_upper_0_99', 'potential_outbreak_past_week','Potential_Outbreak_Resolved',]]

In [None]:
week_2_data[(week_2_data.potential_outbreak_past_week==True) & (week_2_data.potential_outbreak==True)][['state','label','new_cases']]

In [None]:
week_2_data[(week_2_data.Potential_Outbreak_Resolved==False)][['state','label','new_cases']]

In [None]:
week_2_data[(week_2_data['potential_outbreak_past_week'] == True)][['item_id','date','new_cases','pred_upper_0_99', 'potential_outbreak_past_week','Potential_Outbreak_Resolved',]]

In [None]:
len(week_2_data[week_2_data.potential_outbreak_past_week==True])

In [None]:
week_2_data[(week_2_data.potential_outbreak_past_week==True) & (week_2_data.Potential_Outbreak_Resolved==False)]

In [None]:
ongoing_outbreaks_week_2

In [None]:
print(potential_outbreaks_week_1)
print(resolved_outbreaks_week_2)
print(int(ongoing_outbreaks_week_2))
#print(new_outbreaks_week_2)
#ongoing_outbreaks_week_2 = 15
#resolved_outbreaks_week_2 = 140-15

In [None]:
# Node labels in node-index order: 0 = flagged on the previous date, 1 = still ongoing on
# the latest date, 2 = resolved by the latest date. "<br>" is a line break in the label.
labels = [
    f"{date_previous_str}<br><br>Potential Outbreaks: {int(potential_outbreaks_week_1)}",  # 0
    f"Ongoing Outbreaks: {int(ongoing_outbreaks_week_2)}",       # 1
    f"{date_latest_str}<br><br>Resolved Outbreaks: {int(resolved_outbreaks_week_2)}",     # 2
]

# Two links, both leaving node 0; source[i] -> target[i] carries value[i].
source = [0, 0]  # Both links start at node 0 (previous-date potential outbreaks)
target = [1, 2]  # First link ends at node 1 (ongoing), second at node 2 (resolved)
value = [
    ongoing_outbreaks_week_2,  # From Potential to Ongoing
    resolved_outbreaks_week_2,  # From Potential to Resolved
]

# Node colors, in the same order as `labels`.
node_colors = [
    'rgba(204,85,0,0.7)',  # Burnt orange for Potential Outbreaks
    'rgba(255,0,0,0.9)',   # Red for Ongoing Outbreaks
    'rgba(50,171,96,0.7)',  # Green for Resolved Outbreaks
]
# Link colors, in the same order as `source`/`target`.
link_colors = [
    'rgba(255, 50, 0, 0.5)',  # Reddish for the link to "Ongoing Outbreaks"
    'rgba(99, 217, 100, 0.3)'  # Greenish for the link to "Resolved Outbreaks"
]
# Link colors while hovered, in the same order as `link_colors` but more opaque.
hover_colors = [
    'rgba(255, 50, 27, 0.8)',  # Hover color for the link to "Ongoing Outbreaks"
    'rgba(76, 217, 100, 0.8)'   # Hover color for the link to "Resolved Outbreaks"
]
# Short node names exposed to the hover templates as %{customdata}.
node_customdata = [
    "Potential Outbreaks",
    "Ongoing Outbreaks",
    "Resolved Outbreaks"
]
node_hovertemplate = '%{customdata}: %{value}<extra></extra>'
# Per-link custom data; the link hover template below reads the node customdata instead.
link_customdata = [
    "Transition to Ongoing",
    "Transition to Resolved"
]
link_hovertemplate = '%{source.customdata} to %{target.customdata}: %{value}<extra></extra>'


# Sankey diagram: previous-date potential outbreaks split into ongoing and resolved.
fig = go.Figure(
    data=[
        go.Sankey(
            node={
                'pad': 20,
                'thickness': 20,
                'line': {'color': "rgba(50,50,50,0.5)", 'width': 1},  # subtle node border
                'label': labels,
                'color': node_colors,
                'customdata': node_customdata,
                'hovertemplate': node_hovertemplate,
            },
            link={
                'arrowlen': 15,
                'source': source,
                'target': target,
                'value': value,
                'color': link_colors,
                'hovercolor': hover_colors,
                'customdata': link_customdata,
                'hovertemplate': link_hovertemplate,
            },
        )
    ]
)

# Dark background, centered title, fixed figure size.
fig.update_layout(
    title={
        'text': "Outbreak Resolutions",
        'font': {'size': 20, 'color': 'white', 'family': "Arial, bold"},
        'x': 0.5,  # horizontal center of the figure
        'xanchor': 'center',  # anchor the title's center at x
    },
    font={'size': 15, 'color': 'white'},
    paper_bgcolor='rgba(10,10,10,0.95)',
    plot_bgcolor='rgba(10,10,10,0.95)',
    width=800,
    height=500,
)

fig.show()


In [None]:
df_outbreak.head(2)

In [None]:
df_outbreak.head()[['item_id','year','week','date','state','label','potential_outbreak','potential_outbreak_past_week','Potential_Outbreak_Resolved']]

In [None]:
# Number of rows flagged as a potential outbreak on each date.
outbreak_totals = df_outbreak['potential_outbreak'].groupby(df_outbreak['date']).sum()

# Dark-themed line chart of those per-date totals.
fig = go.Figure(layout_template="plotly_dark")
fig.add_trace(
    go.Scatter(
        x=outbreak_totals.index,
        y=outbreak_totals.values,
        mode='lines+markers',
        name='Total Potential Outbreaks',
        line={'color': 'rgba(204,85,0,0.7)'},  # burnt orange
    )
)

# Only a title is set; axis titles are left at their defaults.
fig.update_layout(title='State/Disease-specific Outbreaks')

fig.show()

In [None]:
# Number of distinct disease labels with at least one flagged row, per date.
# Filtering to flagged rows before grouping drops every date that has no flagged row, so
# the result is re-indexed over all dates in df_outbreak with 0 for the missing ones;
# otherwise the line would interpolate straight across weeks without any outbreak.
potential_outbreak_counts = (
    df_outbreak[df_outbreak['potential_outbreak']]
    .groupby('date')['label']
    .nunique()
    .reindex(pd.Index(df_outbreak['date'].unique(), name='date').sort_values(), fill_value=0)
)

# Creating the line chart with a dark theme
fig = go.Figure(layout_template="plotly_dark")

# Adding the line for potential outbreak counts
fig.add_trace(go.Scatter(
    x=potential_outbreak_counts.index,
    y=potential_outbreak_counts.values,
    mode='lines+markers',
    name='Potential Outbreaks',
    line=dict(color='rgba(204,85,0,0.7)'),  # Burnt orange color for the line
))

# Only a title is set; axis titles are left at their defaults.
fig.update_layout(
    title='Disease-specific Outbreaks',
)

fig.show()

In [None]:
df_outbreak.state.unique()

In [None]:
# Most recent date in df_outbreak, and the number of flagged rows per state on that date.
date_wanted = df_outbreak['date'].max()
outbreaks_per_state = (
    df_outbreak.loc[df_outbreak['date'] == date_wanted]
    .groupby('state')['potential_outbreak']
    .apply(lambda flags: flags.astype(int).sum())
    .reset_index()
)


In [None]:
# Upper-case jurisdiction name (as it appears in the 'state' column) -> short code used
# as the map location and to separate territories from states.
# NOTE(review): 'NYC' is not a two-letter state code, so the 'USA-states' choropleth
# presumably cannot place NEW YORK CITY rows -- confirm how they should be reported.
# Names missing from this mapping become NaN when the column is mapped.
state_code_mapping = {
    'ALABAMA': 'AL', 'ALASKA': 'AK', 'AMERICAN SAMOA': 'AS', 'ARIZONA': 'AZ',
    'ARKANSAS': 'AR', 'CALIFORNIA': 'CA', 'COLORADO': 'CO', 'CONNECTICUT': 'CT',
    'DELAWARE': 'DE', 'DISTRICT OF COLUMBIA': 'DC', 'FLORIDA': 'FL', 'GEORGIA': 'GA',
    'GUAM': 'GU', 'HAWAII': 'HI', 'IDAHO': 'ID', 'ILLINOIS': 'IL',
    'INDIANA': 'IN', 'IOWA': 'IA', 'KANSAS': 'KS', 'KENTUCKY': 'KY',
    'LOUISIANA': 'LA', 'MAINE': 'ME', 'MARYLAND': 'MD', 'MASSACHUSETTS': 'MA',
    'MICHIGAN': 'MI', 'MINNESOTA': 'MN', 'MISSISSIPPI': 'MS', 'MISSOURI': 'MO',
    'MONTANA': 'MT', 'NEBRASKA': 'NE', 'NEVADA': 'NV', 'NEW HAMPSHIRE': 'NH',
    'NEW JERSEY': 'NJ', 'NEW MEXICO': 'NM', 'NEW YORK CITY': 'NYC', 'NEW YORK': 'NY',
    'NORTH CAROLINA': 'NC', 'NORTH DAKOTA': 'ND', 'NORTHERN MARIANA ISLANDS': 'MP',
    'OHIO': 'OH', 'OKLAHOMA': 'OK', 'OREGON': 'OR', 'PENNSYLVANIA': 'PA',
    'PUERTO RICO': 'PR', 'RHODE ISLAND': 'RI', 'SOUTH CAROLINA': 'SC',
    'SOUTH DAKOTA': 'SD', 'TENNESSEE': 'TN', 'TEXAS': 'TX', 'U.S. VIRGIN ISLANDS': 'VI',
    'UTAH': 'UT', 'VERMONT': 'VT', 'VIRGINIA': 'VA', 'WASHINGTON': 'WA',
    'WEST VIRGINIA': 'WV', 'WISCONSIN': 'WI', 'WYOMING': 'WY'
}


In [None]:
# Attach the short code for each jurisdiction; names missing from the mapping become NaN.
outbreaks_per_state['state_code'] = outbreaks_per_state['state'].map(state_code_mapping)
territories = ['PR', 'GU', 'VI', 'AS', 'MP']  # Puerto Rico, Guam, U.S. Virgin Islands, American Samoa, Northern Mariana Islands

# Split into map rows (everything that is not a listed territory) and territory rows.
df_states = outbreaks_per_state[~outbreaks_per_state['state_code'].isin(territories)]
# rename() returns a new frame; renaming the filtered slice in place would raise
# SettingWithCopyWarning because the slice is derived from outbreaks_per_state.
df_territories = (
    outbreaks_per_state[outbreaks_per_state['state_code'].isin(territories)]
    .rename(columns={"state":"US Territory","potential_outbreak":"Potential Outbreaks"})
)

In [None]:
from IPython.display import display, HTML

In [None]:
# Choropleth of flagged-row counts per state for the most recent date.
fig = go.Figure(
    data=go.Choropleth(
        locations=df_states['state_code'],
        z=df_states['potential_outbreak'].astype(float),
        locationmode='USA-states',
        colorscale='Reds',
        colorbar={'x': 0.9, 'thickness': 5},
        marker_line_width=0.5,
    )
)

fig.update_layout(
    title={
        'text': f"Potential Outbreaks by State: {date_wanted.strftime('%Y-%m-%d')}",
        'x': 0.5,  # center the title horizontally
        'y': 0.9,  # vertical position of the title
    },
    template="plotly_dark",
    geo={
        'scope': 'usa',
        'landcolor': 'rgb(83, 83, 83)',
        'lakecolor': 'rgb(32, 32, 32)',
        'subunitcolor': 'rgb(100, 100, 100)',
        'countrycolor': 'rgb(100, 100, 100)',
        'bgcolor': 'rgb(23, 23, 23)',
    },
    width=600,
    height=400,
    margin={'l': 0, 'r': 10, 't': 20, 'b': 0},
)

fig.show()

# Territories are not drawn on the map above; list them in an HTML table instead.
html = df_territories[["US Territory", "Potential Outbreaks"]].to_html(index=False)
display(HTML(html))

In [None]:
# Organizing the diseases into a dictionary where each disease is mapped to its group or class
disease_groups = {
    "Anthrax": "Anthrax",
    "Arboviral diseases, Chikungunya virus disease": "Arboviral diseases",
    "Arboviral diseases, Eastern equine encephalitis virus disease": "Arboviral diseases",
    "Arboviral diseases, Jamestown Canyon virus disease": "Arboviral diseases",
    "Arboviral diseases, La Crosse virus disease": "Arboviral diseases",
    "Arboviral diseases, Powassan virus disease": "Arboviral diseases",
    "Arboviral diseases, St. Louis encephalitis virus disease": "Arboviral diseases",
    "Arboviral diseases, West Nile virus disease": "Arboviral diseases",
    "Arboviral diseases, Western equine encephalitis virus disease": "Arboviral diseases",
    "Babesiosis": "Babesiosis",
    "Botulism, Foodborne": "Botulism",
    "Botulism, Infant": "Botulism",
    "Botulism, Other (wound & unspecified)": "Botulism",
    "Brucellosis": "Brucellosis",
    "Campylobacteriosis": "Campylobacteriosis",
    "Candida auris, clinical": "Candidiasis",
    "Candida auris, screening": "Candidiasis",
    "Carbapenemase-Producing Organisms (CPO), Total": "Carbapenem-resistant Enterobacteriaceae (CRE)",
    "Chancroid": "Chancroid",
    "Chlamydia trachomatis infection": "Chlamydial infection",
    "Cholera": "Cholera",
    "Coccidioidomycosis, Confirmed": "Coccidioidomycosis",
    "Coccidioidomycosis, Probable": "Coccidioidomycosis",
    "Coccidioidomycosis, total": "Coccidioidomycosis",
    "Cronobacter invasive infection, infants, Confirmed": "Cronobacter infection",
    "Cronobacter invasive infection, infants, Probable": "Cronobacter infection",
    "Cryptosporidiosis": "Cryptosporidiosis",
    "Cyclosporiasis": "Cyclosporiasis",
    "Dengue virus infections, Dengue": "Dengue",
    "Dengue virus infections, Dengue-like illness": "Dengue",
    "Dengue virus infections, Severe dengue": "Dengue",
    "Giardiasis": "Giardiasis",
    "Gonorrhea": "Gonorrhea",
    "Haemophilus influenzae, invasive disease, Age <5 years, Non-b serotype": "Haemophilus influenzae infection",
    "Haemophilus influenzae, invasive disease, Age <5 years, Nontypeable": "Haemophilus influenzae infection",
    "Haemophilus influenzae, invasive disease, Age <5 years, Serotype b": "Haemophilus influenzae infection",
    "Haemophilus influenzae, invasive disease, Age <5 years, Unknown serotype": "Haemophilus influenzae infection",
    "Haemophilus influenzae, invasive disease, All ages, all serotypes": "Haemophilus influenzae infection",
    "Hansen's disease": "Leprosy",
    "Hantavirus infection, non-hantavirus pulmonary syndrome": "Hantavirus infection",
    "Hantavirus pulmonary syndrome": "Hantavirus infection",
    "Hemolytic uremic syndrome post-diarrheal": "Hemolytic uremic syndrome",
    "Hepatitis A, Confirmed": "Hepatitis A",
    "Hepatitis B, acute, Confirmed": "Hepatitis B",
    "Hepatitis B, acute, Probable": "Hepatitis B",
    "Hepatitis B, chronic, Confirmed": "Hepatitis B",
    "Hepatitis B, chronic, Probable": "Hepatitis B",
    "Hepatitis B, perinatal, Confirmed": "Hepatitis B",
    "Hepatitis C, acute, Confirmed": "Hepatitis C",
    "Hepatitis C, acute, Probable": "Hepatitis C",
    "Hepatitis C, chronic, Confirmed": "Hepatitis C",
    "Hepatitis C, chronic, Probable": "Hepatitis C",
    "Hepatitis C, perinatal, Confirmed": "Hepatitis C",
    "Influenza-associated pediatric mortality": "Influenza",
    "Invasive pneumococcal disease, age <5 years, Confirmed": "Pneumococcal disease",
    "Invasive pneumococcal disease, age <5 years, Probable": "Pneumococcal disease",
    "Invasive pneumococcal disease, all ages, Confirmed": "Pneumococcal disease",
    "Invasive pneumococcal disease, all ages, Probable": "Pneumococcal disease",
    "Legionellosis": "Legionellosis",
    "Leptospirosis": "Leptospirosis",
    "Listeriosis, Confirmed": "Listeriosis",
    "Listeriosis, Probable": "Listeriosis",
    "Malaria": "Malaria",
    "Measles, Imported": "Measles",
    "Measles, Indigenous": "Measles",
    "Melioidosis": "Melioidosis",
    "Meningococcal disease, All serogroups": "Meningococcal disease",
    "Meningococcal disease, Other serogroups": "Meningococcal disease",
    "Meningococcal disease, Serogroup B": "Meningococcal disease",
    "Meningococcal disease, Serogroups ACWY": "Meningococcal disease",
    "Meningococcal disease, Unknown serogroup": "Meningococcal disease",
    "Mpox": "Mpox",
    "Mumps": "Mumps",
    "Novel Influenza A virus infections": "Influenza",
    "Pertussis": "Pertussis",
    "Plague": "Plague",
    "Poliomyelitis, paralytic": "Poliomyelitis",
    "Poliovirus infection, nonparalytic": "Poliomyelitis",
    "Psittacosis": "Psittacosis",
    "Q fever, Acute": "Q fever",
    "Q fever, Chronic": "Q fever",
    "Q fever, Total": "Q fever",
    "Rabies, Human": "Rabies",
    "Rubella": "Rubella",
    "Rubella, congenital syndrome": "Rubella",
    "Salmonella Paratyphi infection": "Salmonellosis",
    "Salmonella Typhi infection": "Salmonellosis",
    "Salmonellosis": "Salmonellosis",
    "Severe acute respiratory syndrome-associated coronavirus disease": "SARS",
    "Shiga toxin-producing Escherichia coli (STEC)": "STEC infection",
    "Shigellosis": "Shigellosis",
    "Smallpox": "Smallpox",
    "Streptococcal toxic shock syndrome": "Toxic shock syndrome",
    "Syphilis, Congenital": "Syphilis",
    "Syphilis, Primary and secondary": "Syphilis",
    "Tetanus": "Tetanus",
    "Toxic shock syndrome (other than Streptococcal)": "Toxic shock syndrome",
    "Trichinellosis": "Trichinellosis",
    "Tuberculosis": "Tuberculosis",
    "Tularemia": "Tularemia",
    "Vancomycin-intermediate Staphylococcus aureus": "Staphylococcus aureus infection",
    "Vancomycin-resistant Staphylococcus aureus": "Staphylococcus aureus infection",
    "Varicella disease": "Varicella",
    "Yellow fever": "Yellow fever",
    "Zika virus disease, non-congenital": "Zika virus disease",
    "Arboviral diseases, Jamestown Canyon  virus disease": "Arboviral diseases",
    "Arboviral diseases, La Crosse  virus disease": "Arboviral diseases",
    "Salmonellosis (excluding Salmonella Typhi infection and Salmonella Paratyphi infection)": "Salmonellosis",
    "Vibriosis (any species of the family Vibrionaceae, other than toxigenic Vibrio cholerae O1 or O139), Confirmed": "Vibriosis",
    "Vibriosis (any species of the family Vibrionaceae, other than toxigenic Vibrio cholerae O1 or O139), Probable": "Vibriosis",
    "Viral hemorrhagic fevers, Chapare virus": "Viral hemorrhagic fevers",
    "Viral hemorrhagic fevers, Crimean-Congo hemorrhagic fever virus": "Viral hemorrhagic fevers",
    "Viral hemorrhagic fevers, Ebola virus": "Viral hemorrhagic fevers",
    "Viral hemorrhagic fevers, Guanarito virus": "Viral hemorrhagic fevers",
    "Viral hemorrhagic fevers, Junin virus": "Viral hemorrhagic fevers",
    "Viral hemorrhagic fevers, Lassa virus": "Viral hemorrhagic fevers",
    "Viral hemorrhagic fevers, Lujo virus": "Viral hemorrhagic fevers",
    "Viral hemorrhagic fevers, Machupo virus": "Viral hemorrhagic fevers",
    "Viral hemorrhagic fevers, Marburg virus": "Viral hemorrhagic fevers",
    "Viral hemorrhagic fevers, Sabia virus": "Viral hemorrhagic fevers",
}   

In [None]:
disease_details = {
    "Anthrax": {
        "group": "Anthrax",
        "category": "Bacterial",
        "body_system": ["Integumentary", "Respiratory", "Gastrointestinal"],
        "transmission": ["Contact", "Zoonotic"]
    },
    "Arboviral diseases": {
        "group": "Arboviral diseases",
        "category": "Viral",
        "body_system": ["Neurological", "Integumentary"],
        "transmission": ["Vector-borne"]
    },
    "Babesiosis": {
        "group": "Babesiosis",
        "category": "Parasitic",
        "body_system": ["Hematologic"],
        "transmission": ["Vector-borne"]
    },
    "Botulism": {
        "group": "Botulism",
        "category": "Bacterial",
        "body_system": ["Neurological", "Gastrointestinal"],
        "transmission": ["Oral-fecal", "Contact"]
    },
    "Brucellosis": {
        "group": "Brucellosis",
        "category": "Bacterial",
        "body_system": ["Musculoskeletal", "Reproductive"],
        "transmission": ["Contact", "Zoonotic"]
    },
    "Campylobacteriosis": {
        "group": "Campylobacteriosis",
        "category": "Bacterial",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Oral-fecal"]
    },
    "Candidiasis": {
        "group": "Candidiasis",
        "category": "Fungal",
        "body_system": ["Integumentary", "Gastrointestinal"],
        "transmission": ["Contact", "Vertical"]
    },
    "Carbapenem-resistant Enterobacteriaceae (CRE)": {
        "group": "Carbapenem-resistant Enterobacteriaceae (CRE)",
        "category": "Bacterial",
        "body_system": ["Gastrointestinal", "Urinary"],
        "transmission": ["Contact"]
    },
    "Chancroid": {
        "group": "Chancroid",
        "category": "Bacterial",
        "body_system": ["Reproductive"],
        "transmission": ["Sexual"]
    },
    "Chlamydial infection": {
        "group": "Chlamydial infection",
        "category": "Bacterial",
        "body_system": ["Reproductive", "Ocular"],
        "transmission": ["Sexual", "Vertical"]
    },
    "Cholera": {
        "group": "Cholera",
        "category": "Bacterial",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Oral-fecal"]
    },
    "Coccidioidomycosis": {
        "group": "Coccidioidomycosis",
        "category": "Fungal",
        "body_system": ["Respiratory", "Musculoskeletal"],
        "transmission": ["Inhalation"]
    },
    "Cronobacter infection": {
        "group": "Cronobacter infection",
        "category": "Bacterial",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Oral-fecal", "Contact"]
    },
    "Cryptosporidiosis": {
        "group": "Cryptosporidiosis",
        "category": "Parasitic",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Oral-fecal"]
    },
    "Cyclosporiasis": {
        "group": "Cyclosporiasis",
        "category": "Parasitic",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Oral-fecal"]
    },
    "Dengue": {
        "group": "Dengue",
        "category": "Viral",
        "body_system": ["Hematologic", "Integumentary"],
        "transmission": ["Vector-borne"]
    },
    "Giardiasis": {
        "group": "Giardiasis",
        "category": "Parasitic",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Oral-fecal"]
    },
    "Gonorrhea": {
        "group": "Gonorrhea",
        "category": "Bacterial",
        "body_system": ["Reproductive"],
        "transmission": ["Sexual"]
    },
    "Haemophilus influenzae infection": {
        "group": "Haemophilus influenzae infection",
        "category": "Bacterial",
        "body_system": ["Respiratory", "Neurological"],
        "transmission": ["Respiratory", "Contact"]
    },
    "Leprosy": {
        "group": "Leprosy",
        "category": "Bacterial",
        "body_system": ["Integumentary", "Nervous"],
        "transmission": ["Contact"]
    },
    "Hantavirus infection": {
        "group": "Hantavirus infection",
        "category": "Viral",
        "body_system": ["Respiratory", "Cardiovascular"],
        "transmission": ["Inhalation", "Zoonotic"]
    },
    "Hemolytic uremic syndrome": {
        "group": "Hemolytic uremic syndrome",
        "category": "Bacterial",
        "body_system": ["Renal", "Hematologic"],
        "transmission": ["Oral-fecal"]
    },
    "Hepatitis A": {
        "group": "Hepatitis A",
        "category": "Viral",
        "body_system": ["Hepatic"],
        "transmission": ["Oral-fecal"]
    },
    "Hepatitis B": {
        "group": "Hepatitis B",
        "category": "Viral",
        "body_system": ["Hepatic"],
        "transmission": ["Bloodborne", "Sexual", "Vertical"]
    },
    "Hepatitis C": {
        "group": "Hepatitis C",
        "category": "Viral",
        "body_system": ["Hepatic"],
        "transmission": ["Bloodborne", "Sexual", "Vertical"]
    },
    "Influenza": {
        "group": "Influenza",
        "category": "Viral",
        "body_system": ["Respiratory"],
        "transmission": ["Respiratory"]
    },
    "Pneumococcal disease": {
        "group": "Pneumococcal disease",
        "category": "Bacterial",
        "body_system": ["Respiratory"],
        "transmission": ["Respiratory", "Contact"]
    },
    "Legionellosis": {
        "group": "Legionellosis",
        "category": "Bacterial",
        "body_system": ["Respiratory"],
        "transmission": ["Inhalation"]
    },
    "Leptospirosis": {
        "group": "Leptospirosis",
        "category": "Bacterial",
        "body_system": ["Renal", "Hepatic"],
        "transmission": ["Contact", "Waterborne"]
    },
    "Listeriosis": {
        "group": "Listeriosis",
        "category": "Bacterial",
        "body_system": ["Gastrointestinal", "Nervous"],
        "transmission": ["Oral-fecal"]
    },
    "Malaria": {
        "group": "Malaria",
        "category": "Parasitic",
        "body_system": ["Hematologic"],
        "transmission": ["Vector-borne"]
    },
    "Measles": {
        "group": "Measles",
        "category": "Viral",
        "body_system": ["Respiratory", "Integumentary"],
        "transmission": ["Respiratory"]
    },
    "Melioidosis": {
        "group": "Melioidosis",
        "category": "Bacterial",
        "body_system": ["Respiratory", "Cutaneous"],
        "transmission": ["Inhalation", "Contact"]
    },
    "Meningococcal disease": {
        "group": "Meningococcal disease",
        "category": "Bacterial",
        "body_system": ["Nervous"],
        "transmission": ["Respiratory", "Contact"]
    },
    "Mpox": {
        "group": "Mpox",
        "category": "Viral",
        "body_system": ["Integumentary", "Respiratory"],
        "transmission": ["Contact", "Respiratory", "Zoonotic"]
    },
    "Mumps": {
        "group": "Mumps",
        "category": "Viral",
        "body_system": ["Respiratory", "Reproductive"],
        "transmission": ["Respiratory"]
    },
    "Pertussis": {
        "group": "Pertussis",
        "category": "Bacterial",
        "body_system": ["Respiratory"],
        "transmission": ["Respiratory"]
    },
    "Plague": {
        "group": "Plague",
        "category": "Bacterial",
        "body_system": ["Lymphatic", "Respiratory"],
        "transmission": ["Vector-borne", "Contact", "Respiratory"]
    },
    "Poliomyelitis": {
        "group": "Poliomyelitis",
        "category": "Viral",
        "body_system": ["Neurological"],
        "transmission": ["Oral-fecal", "Contact"]
    },
    "Psittacosis": {
        "group": "Psittacosis",
        "category": "Bacterial",
        "body_system": ["Respiratory"],
        "transmission": ["Inhalation"]
    },
    "Q fever": {
        "group": "Q fever",
        "category": "Bacterial",
        "body_system": ["Respiratory", "Hepatic"],
        "transmission": ["Inhalation", "Contact"]
    },
    "Rabies": {
        "group": "Rabies",
        "category": "Viral",
        "body_system": ["Neurological"],
        "transmission": ["Contact", "Zoonotic"]
    },
    "Rubella": {
        "group": "Rubella",
        "category": "Viral",
        "body_system": ["Respiratory", "Integumentary"],
        "transmission": ["Respiratory"]
    },
    "Salmonellosis": {
        "group": "Salmonellosis",
        "category": "Bacterial",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Oral-fecal"]
    },
    "SARS": {
        "group": "SARS",
        "category": "Viral",
        "body_system": ["Respiratory"],
        "transmission": ["Respiratory", "Contact"]
    },
    "STEC infection": {
        "group": "STEC infection",
        "category": "Bacterial",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Oral-fecal"]
    },
    "Shigellosis": {
        "group": "Shigellosis",
        "category": "Bacterial",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Oral-fecal"]
    },
    "Smallpox": {
        "group": "Smallpox",
        "category": "Viral",
        "body_system": ["Integumentary", "Respiratory"],
        "transmission": ["Respiratory", "Contact"]
    },
    "Syphilis": {
        "group": "Syphilis",
        "category": "Bacterial",
        "body_system": ["Reproductive", "Integumentary"],
        "transmission": ["Sexual", "Vertical"]
    },
    "Tetanus": {
        "group": "Tetanus",
        "category": "Bacterial",
        "body_system": ["Neurological"],
        "transmission": ["Contact"]
    },
    "Toxic shock syndrome": {
        "group": "Toxic shock syndrome",
        "category": "Bacterial",
        "body_system": ["Multiple"],
        "transmission": ["Contact"]
    },
    "Trichinellosis": {
        "group": "Trichinellosis",
        "category": "Parasitic",
        "body_system": ["Musculoskeletal", "Gastrointestinal"],
        "transmission": ["Oral-fecal"]
    },
    "Tuberculosis": {
        "group": "Tuberculosis",
        "category": "Bacterial",
        "body_system": ["Respiratory"],
        "transmission": ["Respiratory"]
    },
    "Tularemia": {
        "group": "Tularemia",
        "category": "Bacterial",
        "body_system": ["Multiple"],
        "transmission": ["Contact", "Vector-borne", "Inhalation"]
    },
    "Varicella": {
        "group": "Varicella",
        "category": "Viral",
        "body_system": ["Integumentary"],
        "transmission": ["Respiratory", "Contact"]
    },
    "Vibriosis": {
        "group": "Vibriosis",
        "category": "Bacterial",
        "body_system": ["Gastrointestinal"],
        "transmission": ["Waterborne"]
    },
    "Viral hemorrhagic fevers": {
        "group": "Viral hemorrhagic fevers",
        "category": "Viral",
        "body_system": ["Multiple"],
        "transmission": ["Contact", "Vector-borne", "Zoonotic"]
    },
    "Yellow fever": {
        "group": "Yellow fever",
        "category": "Viral",
        "body_system": ["Hepatic"],
        "transmission": ["Vector-borne"]
    },
    "Zika virus disease": {
        "group": "Zika virus disease",
        "category": "Viral",
        "body_system": ["Neurological", "Integumentary"],
        "transmission": ["Vector-borne", "Sexual", "Vertical"]
    },
    "Staphylococcus aureus infection": {
        "group": "Staphylococcus aureus infection",
        "category": "Bacterial",
        "body_system": ["Skin", "Respiratory", "Cardiovascular"],
        "transmission": ["Contact"]
    },
}

In [None]:
# Every key of disease_details, held in a set so membership tests are cheap
disease_detail_groups = set(disease_details.keys())

# Diseases whose mapped group is not one of the disease_details keys
missing_diseases = [
    disease
    for disease, group in disease_groups.items()
    if group not in disease_detail_groups
]

# Report the result of the check
if not missing_diseases:
    print("All diseases in disease_groups are accounted for in the disease_details dictionary.")
else:
    print("The following diseases are missing from the disease_details dictionary:")
    for disease in missing_diseases:
        print(disease)

In [None]:
# Cross-check disease_groups against disease_details in two ways.

# 1) (disease, group) pairs whose group is either not a key of disease_details,
#    or is a key whose entry carries a different 'group' value.
inconsistencies = [
    (disease, group)
    for disease, group in disease_groups.items()
    if not (group in disease_details and disease_details[group].get('group') == group)
]

# 2) Diseases whose group does not appear as the 'group' value of any
#    disease_details entry (regardless of that entry's key).
missing_in_details = [
    disease
    for disease, group in disease_groups.items()
    if not any(detail['group'] == group for detail in disease_details.values())
]

# Report the first check
if not inconsistencies:
    print("No inconsistencies in disease to group mappings.")
else:
    print("Found inconsistencies in these disease mappings to their groups:")
    for inc in inconsistencies:
        print(f"Disease: {inc[0]}, Group: {inc[1]}")

# Report the second check
if not missing_in_details:
    print("\nAll diseases from disease_groups are accounted for in disease_details.")
else:
    print("\nThese diseases are missing in disease_details based on their groups:")
    for missing in missing_in_details:
        print(missing)


In [None]:
# Number of entries in the disease -> group mapping
len(disease_groups)

In [None]:
# Distinct values of the `label` column in the historical data
diseases_list = df_historical['label'].unique()

In [None]:
# Labels found in the historical data that have no key in disease_groups.
# Note: this rebinds `missing_diseases`, replacing the list built by the earlier check cell.
missing_diseases = [
    label
    for label in diseases_list
    if label not in disease_groups
]

missing_diseases

In [None]:
# Rows for the most recent date in df_outbreak, restricted to cols_wanted
df_outbreak.loc[df_outbreak['date'] == df_outbreak['date'].max(), cols_wanted]

In [None]:
# Snapshot of the rows for the most recent date in df_outbreak.
# `.copy()` makes this an independent frame, so the column assignments performed
# later by add_disease_info do not raise SettingWithCopyWarning on a filtered slice.
df_outbreak_latest = df_outbreak[df_outbreak['date'] == df_outbreak['date'].max()].copy()

In [None]:
# Function to map disease to its details
def map_disease_to_details(disease, attribute):
    """Look up one attribute of a disease via its group.

    The disease is first translated to a group name through `disease_groups`;
    that group name is then used as the key into `disease_details`.

    Args:
        disease: Key to look up in `disease_groups`.
        attribute: Key to read from the group's details entry
            (e.g. 'group', 'category', 'body_system', 'transmission').

    Returns:
        The attribute value, or None when the disease has no group, the group
        has no details entry, or the entry lacks the attribute.
    """
    group_name = disease_groups.get(disease)
    if group_name not in disease_details:
        return None
    return disease_details[group_name].get(attribute)


In [None]:
def add_disease_info(df):
    """Add group/category/body_system/transmission columns derived from `label`.

    Each new column is filled by calling `map_disease_to_details` on the row's
    `label`. The `body_system` and `transmission` columns are then normalised so
    that every cell holds a list: existing lists are kept, any other value
    (including a missing one) is wrapped in a single-element list.

    Args:
        df: DataFrame with a `label` column. It is modified in place.

    Returns:
        The same DataFrame object, with the four columns added or overwritten.
    """
    # Same order as the columns are expected to appear in the frame
    for attribute in ('group', 'category', 'body_system', 'transmission'):
        df[attribute] = df['label'].apply(
            lambda label, attr=attribute: map_disease_to_details(label, attr)
        )

    # Guarantee list-valued cells so the columns can be exploded later
    for list_column in ('body_system', 'transmission'):
        df[list_column] = df[list_column].apply(
            lambda value: value if isinstance(value, list) else [value]
        )

    return df

In [None]:
# Attach the group / category / body_system / transmission columns, then preview the first rows
df_outbreak_latest = add_disease_info(df_outbreak_latest)
df_outbreak_latest.head()

In [None]:
# Distinct labels for which no group value was resolved
df_outbreak_latest.loc[df_outbreak_latest['group'].isna(), 'label'].unique()

In [None]:
# Full rows for which no group value was resolved
df_outbreak_latest.loc[df_outbreak_latest['group'].isna()]

In [None]:
# Number of flagged rows (potential_outbreak is truthy) in each category
outbreak_counts_category = (
    df_outbreak_latest
    .loc[df_outbreak_latest['potential_outbreak']]
    .groupby('category')
    .size()
)

# Show the tally
print("Potential Outbreak Counts Per Category:")
print(outbreak_counts_category)


In [None]:

# body_system holds a list per row, so each list is exploded into one row per
# system before counting; a row with several systems is counted under each of them.
body_system_counts = (
    df_outbreak_latest
    .loc[df_outbreak_latest['potential_outbreak']]
    .explode('body_system')
    .groupby('body_system')
    .size()
)

print("\nPotential Outbreak Counts Per Body System:")
print(body_system_counts)


In [None]:
# transmission holds a list per row; explode it so a row with several
# transmission types is counted under each of them.
transmission_counts = (
    df_outbreak_latest
    .loc[df_outbreak_latest['potential_outbreak']]
    .explode('transmission')
    .groupby('transmission')
    .size()
)

print("\nPotential Outbreak Counts Per Transmission type:")
print(transmission_counts)


In [None]:
def bar_chart_counts(counts_df, count_type_title="Pathogen Type", color="blue",
                     note_text="*Each instance of the same disease across different states is counted separately and may belong to multiple categories"):
    """Display a horizontal bar chart of outbreak counts with a footnote.

    Args:
        counts_df: Series-like of counts; its index supplies the bar labels and
            its values the bar lengths. Bars are drawn in ascending value order.
        count_type_title: Text appended to "Potential Outbreak Counts Per " to
            form the chart title.
        color: "blue", "green" or "purple" select a preset RGB shade. Any other
            value is passed straight to plotly as the marker colour, so CSS
            colour names or 'rgb(...)' strings also work (previously an
            unrecognised value raised UnboundLocalError).
        note_text: Footnote rendered below the x-axis.

    Returns:
        None. The figure is displayed via `fig.show()`.
    """
    # Preset shades kept for consistency between charts
    preset_colors = {
        "blue": 'rgb(31, 119, 180)',
        "green": 'rgb(44, 160, 44)',
        "purple": 'rgb(148, 103, 189)',
    }
    # Fall back to the caller's value so bar_color is always defined
    bar_color = preset_colors.get(color, color)

    outbreak_counts = counts_df.sort_values()

    fig = go.Figure(data=[go.Bar(
        y=outbreak_counts.index,   # Bar labels
        x=outbreak_counts.values,  # Counts
        orientation='h',           # Horizontal bars
        marker_color=bar_color,
    )])

    fig.update_layout(
        title=f"Potential Outbreak Counts Per {count_type_title}",
        yaxis_title="",
        xaxis_title="Count",
        template='plotly_dark',
        title_font_size=24,
        title_x=0.5,  # Centre the title horizontally
        title_y=0.9,
        # The footnote is an annotation in paper coordinates; the negative y
        # places it below the plotting area, under the x-axis.
        annotations=[
            dict(
                x=0.5,
                y=-0.25,
                xref="paper",
                yref="paper",
                text=f"{note_text}",
                showarrow=False,
                font=dict(size=10, color="white"),
            )
        ],
    )

    fig.show()

In [None]:
# Row-level tally per category, drawn in blue
bar_chart_counts(outbreak_counts_category, count_type_title="Pathogen Type", color="blue")

In [None]:
# Row-level tally per body system, drawn in green
bar_chart_counts(body_system_counts, count_type_title="Bodily System Affected", color="green")

In [None]:
# Row-level tally per transmission type, drawn in purple
bar_chart_counts(transmission_counts, count_type_title="Transmission Type", color="purple")

In [None]:
# Number of distinct labels with at least one flagged row, per category
unique_outbreak_counts_category = (
    df_outbreak_latest
    .loc[df_outbreak_latest['potential_outbreak']]
    .groupby('category')['label']
    .nunique()
)

# Show the tally
print("Potential Outbreak Counts Per Unique Disease Grouped by Category:")
print(unique_outbreak_counts_category)


In [None]:
# One row per (flagged row, body system) pair, so a label is counted under every system it lists
exploded_body_system = (
    df_outbreak_latest
    .loc[df_outbreak_latest['potential_outbreak']]
    .explode('body_system')
)
unique_outbreak_counts_body_system = (
    exploded_body_system
    .groupby('body_system')['label']
    .nunique()
)

# Show the tally
print("\nPotential Outbreak Counts Per Unique Disease Grouped by Body System:")
print(unique_outbreak_counts_body_system)


In [None]:
# One row per (flagged row, transmission type) pair, so a label is counted under every type it lists
exploded_transmission = (
    df_outbreak_latest
    .loc[df_outbreak_latest['potential_outbreak']]
    .explode('transmission')
)
unique_outbreak_counts_transmission = (
    exploded_transmission
    .groupby('transmission')['label']
    .nunique()
)

# Show the tally
print("\nPotential Outbreak Counts Per Unique Disease Grouped by Transmission:")
print(unique_outbreak_counts_transmission)


In [None]:
# Distinct-label tally per category, with a footnote explaining the counting rule
bar_chart_counts(
    unique_outbreak_counts_category,
    count_type_title="Pathogen Type",
    color="blue",
    note_text="*Each disease is counted once for potential outbreaks, regardless of its presence in multiple states. A single disease may belong to multiple categories",
)

In [None]:
# Distinct-label tally per body system, with a footnote explaining the counting rule
bar_chart_counts(
    unique_outbreak_counts_body_system,
    count_type_title="Bodily System Affected",
    color="green",
    note_text="*Each disease is counted once for potential outbreaks, regardless of its presence in multiple states. A single disease may belong to multiple categories",
)

In [None]:
# Distinct-label tally per transmission type, with a footnote explaining the counting rule
bar_chart_counts(
    unique_outbreak_counts_transmission,
    count_type_title="Transmission Type",
    color="purple",
    note_text="*Each disease is counted once for potential outbreaks, regardless of its presence in multiple states. A single disease may belong to multiple categories",
)

In [None]:
# Preview the first two rows of the latest-date snapshot
df_outbreak_latest.head(2)

In [None]:
# Recompute all six tallies from one shared filtered frame.
flagged_latest = df_outbreak_latest[df_outbreak_latest['potential_outbreak']]

# List-valued columns are exploded to one row per list element before grouping
exploded_body_system = flagged_latest.explode('body_system')
exploded_transmission = flagged_latest.explode('transmission')

# Row-level tallies: every flagged row counts
outbreak_counts_category = flagged_latest[['category']].groupby('category').size()
body_system_counts = exploded_body_system.groupby('body_system').size()
transmission_counts = exploded_transmission.groupby('transmission').size()

# Distinct-label tallies: each label counts once per group
unique_outbreak_counts_category = flagged_latest.groupby('category')['label'].nunique()
unique_outbreak_counts_body_system = exploded_body_system.groupby('body_system')['label'].nunique()
unique_outbreak_counts_transmission = exploded_transmission.groupby('transmission')['label'].nunique()


In [None]:
# Number of flagged rows per category, computed from the `category` column alone
outbreak_counts_category = (
    df_outbreak_latest
    .loc[df_outbreak_latest['potential_outbreak'], ['category']]
    .groupby('category')
    .size()
)

In [None]:
# Display the per-category tally
outbreak_counts_category

In [None]:
# Identifier, date and outbreak-flag columns used by the time-series aggregation
outbreak_time_columns = [
    'item_id',
    'state',
    'label',
    'date',
    'potential_outbreak',
    'potential_outbreak_past_week',
    'Potential_Outbreak_Resolved',
]
df_outbreaks_time = df_outbreak[outbreak_time_columns]

In [None]:
# Display the outbreak-flag subset of df_outbreak (identifier, date and flag columns)
df_outbreaks_time

In [None]:
# Earliest date present in the outbreak time-series frame
df_outbreaks_time['date'].min()

In [None]:
def agg_outbreak_counts(df, condition='potential_outbreak'):
    """Count rows matching an outbreak condition per week, with a running total.

    Args:
        df: DataFrame with a datetime `date` column and boolean
            `potential_outbreak` and `potential_outbreak_past_week` columns.
        condition: Which rows are counted:
            - 'potential_outbreak': `potential_outbreak` is True.
            - 'ongoing_outbreaks': `potential_outbreak` and
              `potential_outbreak_past_week` are both True.
            - 'resolved_outbreaks': `potential_outbreak` is False and
              `potential_outbreak_past_week` is True. Rows on the earliest
              date of `df` are excluded because they have no previous week.

    Returns:
        DataFrame with one row per week and the columns `date` (Monday of the
        week), `count` (matching rows in that week, 0 when none) and
        `cumulative_count` (running sum of `count`). The weeks run from the
        week of the first relevant date to the latest date in `df`. An empty
        frame with these columns is returned when `df` has no dates.

    Raises:
        ValueError: If `condition` is not one of the three supported values.
    """
    valid_conditions = ('potential_outbreak', 'ongoing_outbreaks', 'resolved_outbreaks')
    if condition not in valid_conditions:
        raise ValueError(
            f"Unknown condition {condition!r}; expected one of {valid_conditions}"
        )

    min_date = df['date'].min()
    max_date = df['date'].max()

    # Nothing to aggregate: avoid building a date range from NaT
    if pd.isna(min_date):
        return pd.DataFrame({
            'date': pd.Series(dtype='datetime64[ns]'),
            'count': pd.Series(dtype='int64'),
            'cumulative_count': pd.Series(dtype='int64'),
        })

    if condition == 'potential_outbreak':
        # `== True` (rather than plain truthiness) keeps missing values out of the mask
        matches = df[df['potential_outbreak'] == True]
    elif condition == 'ongoing_outbreaks':
        matches = df[(df['potential_outbreak']) & (df['potential_outbreak_past_week'])]
    else:  # 'resolved_outbreaks'
        matches = df[(df['potential_outbreak'] == False) & (df['potential_outbreak_past_week'] == True)]
        # The first date can never be "resolved": it has no previous week of data
        matches = matches[matches['date'] > min_date]
        if not matches.empty:
            min_date = matches['date'].min()
        else:
            # No resolved rows: still skip the first date when later dates exist,
            # instead of letting min_date become NaT
            later_dates = df.loc[df['date'] > min_date, 'date']
            if not later_dates.empty:
                min_date = later_dates.min()

    # Every week start (Monday) from the first relevant week up to the last date
    first_week_start = min_date - pd.to_timedelta(min_date.dayofweek, unit='d')
    df_all_weeks = pd.DataFrame({
        'date': pd.date_range(start=first_week_start, end=max_date, freq='7D')
    })

    if matches.empty:
        print(f"No rows match condition '{condition}'.")
        # Keep the output schema intact: every week simply has zero matches
        df_all_weeks['count'] = 0
    else:
        # Snap each date back to the Monday of its week, then count rows per week
        week_start = matches['date'] - pd.to_timedelta(matches['date'].dt.dayofweek, unit='d')
        weekly_counts = (
            matches.groupby(week_start)
            .size()
            .rename_axis('date')
            .reset_index(name='count')
        )
        # Left-merge so weeks without any matching row are kept (as 0)
        df_all_weeks = df_all_weeks.merge(weekly_counts, on='date', how='left')
        df_all_weeks['count'] = df_all_weeks['count'].fillna(0)

    df_all_weeks['count'] = df_all_weeks['count'].astype(int)
    df_all_weeks['cumulative_count'] = df_all_weeks['count'].cumsum()

    return df_all_weeks

In [None]:
def plot_time_series(df_aggregated, title="Time Series of Counts", display_col='count', primary_name = 'potential outbreaks',
                     df_secondary=None, secondary_display_col=None, secondary_name=None):
    """
    Build a dark-themed Plotly line chart of one or two series plotted against `date`.

    Args:
    - df_aggregated: DataFrame with a `date` column and the column named by `display_col`.
    - title: Title of the plot.
    - display_col: Column of `df_aggregated` used for the y values.
    - primary_name: Trace name of the primary series.
    - df_secondary: Optional. Second DataFrame (with a `date` column) drawn on the same chart.
    - secondary_display_col: Optional. Column of `df_secondary` used for the y values.
      The secondary trace is drawn only when both `df_secondary` and this are given.
    - secondary_name: Optional. Trace name of the secondary series; 'Secondary Counts' when empty.

    Returns:
    - Plotly Figure (not shown; the caller decides when to display it)
    """
    # (frame, y column, trace name) for every series to draw, primary first
    series_specs = [(df_aggregated, display_col, primary_name)]

    has_secondary = not (df_secondary is None or secondary_display_col is None)
    if has_secondary:
        series_specs.append(
            (df_secondary, secondary_display_col, secondary_name or 'Secondary Counts')
        )

    fig = go.Figure()
    for frame, y_column, trace_name in series_specs:
        trace = go.Scatter(
            x=frame['date'],
            y=frame[y_column],
            mode='lines+markers',
            name=trace_name,
        )
        fig.add_trace(trace)

    layout_options = {
        'title': title,
        'xaxis_title': 'Date',
        'yaxis_title': 'Count',
        'template': "plotly_dark",
    }
    fig.update_layout(**layout_options)

    return fig


In [None]:
# Weekly count of rows with both potential_outbreak and potential_outbreak_past_week set.
# The trace is named after the data it shows (it was mislabelled 'potential outbreaks').
df_weekly_ongoing = agg_outbreak_counts(df_outbreaks_time, 'ongoing_outbreaks')
plot_figure = plot_time_series(df_weekly_ongoing, title="Number of Ongoing Outbreaks",
                               display_col='count', primary_name='ongoing outbreaks')
plot_figure.show()


In [None]:
# Aggregate the flagged rows per week and plot the weekly counts
df_aggregated_example = agg_outbreak_counts(df_outbreaks_time, condition='potential_outbreak')
plot_figure = plot_time_series(
    df_aggregated_example,
    title="Potential Outbreaks",
    display_col='count',
    primary_name='potential outbreaks',
)
plot_figure.show()


In [None]:
# Aggregate the flagged rows per week again and plot the running total
df_aggregated_example = agg_outbreak_counts(df_outbreaks_time, condition='potential_outbreak')
plot_figure = plot_time_series(
    df_aggregated_example,
    title="Cumulative Potential Outbreaks",
    display_col="cumulative_count",
    primary_name='potential outbreaks',
)
plot_figure.show()


In [None]:
# Running total of the weekly ongoing-outbreak counts.
# The title says "Cumulative" so this chart is distinguishable from the weekly-count
# chart, and the trace is named for the data shown instead of the 'potential outbreaks' default.
df_weekly_ongoing = agg_outbreak_counts(df_outbreaks_time, 'ongoing_outbreaks')
plot_figure = plot_time_series(df_weekly_ongoing, title="Cumulative Number of Ongoing Outbreaks",
                               display_col="cumulative_count", primary_name='ongoing outbreaks')
plot_figure.show()


In [None]:
# Weekly aggregates for both conditions; they are reused by the next cell as well
df_weekly_potential = agg_outbreak_counts(df_outbreaks_time, condition='potential_outbreak')
df_weekly_resolved = agg_outbreak_counts(df_outbreaks_time, condition='resolved_outbreaks')

# Running totals of both series on one chart
fig = plot_time_series(
    df_weekly_potential,
    title="Cumulative Potential vs Resolved Outbreaks",
    display_col='cumulative_count',
    primary_name='Potential',
    df_secondary=df_weekly_resolved,
    secondary_display_col='cumulative_count',
    secondary_name='Resolved',
)
fig.show()


In [None]:
# Weekly (non-cumulative) counts of both series, using the aggregates from the previous cell
fig = plot_time_series(
    df_weekly_potential,
    title="Potential vs Resolved Outbreaks",
    display_col='count',
    primary_name='Potential',
    df_secondary=df_weekly_resolved,
    secondary_display_col='count',
    secondary_name='Resolved',
)
fig.show()