In [None]:
import pandas as pd
import json
import geopandas as gpd
# Display settings: never truncate long cell text and never hide columns.
pd.options.display.max_colwidth = None
pd.set_option("display.max_columns", None)

In [None]:
from pygam import LinearGAM, s
import plotly.graph_objects as go
import numpy as np

In [None]:
# Load the per-year county rankings and keep only the 2020 rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_ranking = pd.read_pickle("../data/interim/CDC_PLACES_county_rankings_by_year.pickle")
df_ranking = df_ranking.loc[df_ranking["Year"] == 2020]
df_ranking.head()

In [8]:
# Load the county GDP CSV. The first 3 lines and the last 11 lines of the file
# are skipped (skipfooter requires the python parser engine).
df_gdp = pd.read_csv(
    "../data/raw/BEA/county_gdp_fips_2020.csv",
    skiprows=3,
    skipfooter=11,
    engine='python',
)
df_gdp.columns = ['GeoFips', 'GeoName', 'gdp_thousands']

# NOTE(review): GeoFips is cast to str only after parsing, so any leading zero
# in the file is already gone (the preview shows 1001 for Autauga, AL).
# Confirm the join keys in the other tables use the same unpadded form.
df_gdp = df_gdp.assign(
    # Non-numeric GDP entries become NaN instead of raising.
    gdp_thousands=lambda frame: pd.to_numeric(frame['gdp_thousands'], errors='coerce'),
    # The source column is in thousands; scale to plain units.
    gdp=lambda frame: frame['gdp_thousands'] * 1000,
    GeoFips=lambda frame: frame['GeoFips'].astype(str),
)
df_gdp.head()

Unnamed: 0,GeoFips,GeoName,gdp_thousands,gdp
0,1001,"Autauga, AL",1813553.0,1813553000.0
1,1003,"Baldwin, AL",8762106.0,8762106000.0
2,1005,"Barbour, AL",786529.0,786529000.0
3,1007,"Bibb, AL",501320.0,501320000.0
4,1009,"Blount, AL",973414.0,973414000.0


In [9]:
# Check whether FIPS 51610 has its own row in the GDP table.
df_gdp.loc[df_gdp["GeoFips"] == "51610"]

Unnamed: 0,GeoFips,GeoName,gdp_thousands,gdp


In [10]:
# Find GDP rows whose name mentions Falls Church; probably fine to just ignore it.
df_gdp.loc[df_gdp["GeoName"].str.contains("Falls Church")]

Unnamed: 0,GeoFips,GeoName,gdp_thousands,gdp
2912,51919,"Fairfax, Fairfax City + Falls Church, VA*",130155397.0,130155400000.0


In [11]:
# Load the spending table from its interim pickle and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_spending = pd.read_pickle("../data/interim/USA_Spending_2020.pickle")
df_spending.head()

Unnamed: 0,shape_code,display_name,aggregated_amount,population,per_capita,state_code,fiscal_year
0,6037,Los Angeles County,97787490000.0,10014009.0,9765.07,CA,2020
1,12086,Miami-Dade County,24269400000.0,2701767.0,8982.79,FL,2020
2,17031,Cook County,43053570000.0,5275541.0,8160.98,IL,2020
3,17097,Lake County,6948460000.0,714342.0,9727.08,IL,2020
4,48113,Dallas County,30997720000.0,2613539.0,11860.44,TX,2020


In [12]:
# Look up shape_code 51610 in the spending table.
df_spending.loc[df_spending["shape_code"] == "51610"]

Unnamed: 0,shape_code,display_name,aggregated_amount,population,per_capita,state_code,fiscal_year
1934,51610,Falls Church City,211073500.0,14658.0,14399.88,VA,2020


In [None]:
# Inner-join the rankings with the GDP table on the county code; rows without
# a match on both sides are dropped.
merged_df = df_ranking.merge(df_gdp, how='inner', left_on='GEOID', right_on='GeoFips')
print(merged_df.shape)
merged_df.head()

In [None]:

# Step 1: rankings joined with GDP (inner join on the county code).
merged_df = df_ranking.merge(df_gdp, how='inner', left_on='GEOID', right_on='GeoFips')
print(merged_df.shape)

# Step 2: add the spending table (inner join again), then derive GDP per
# capita from the GDP figure and the spending table's population column.
merged_df = merged_df.merge(df_spending, how='inner', left_on='GEOID', right_on='shape_code')
merged_df = merged_df.assign(GDP_Per_Capita_2020=merged_df['gdp'] / merged_df['population'])
print(merged_df.shape)
merged_df.head()

In [None]:
# Preview the first rows of the combined rankings / GDP / spending frame.
merged_df.head()

In [None]:
# Persist the combined frame to the interim data folder as a pickle.
merged_df.to_pickle("../data/interim/combined_Rank_CDCLocals_BEAgdp_Spending.pickle")

In [None]:

# Bubble chart: GDP per capita vs. county health score, with a monotone GAM trend.

# Outlier fence on GDP per capita. NOTE: Q1/Q3 hold the 1st and 99th
# percentiles (not quartiles), so "IQR" is the 1%-99% spread and only very
# extreme values fall outside the fence.
Q1 = merged_df['GDP_Per_Capita_2020'].quantile(0.01)
Q3 = merged_df['GDP_Per_Capita_2020'].quantile(0.99)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows inside the fence. .copy() makes filtered_df an independent frame,
# so the column assignment below does not write into a slice of merged_df.
filtered_df = merged_df[
    (merged_df['GDP_Per_Capita_2020'] >= lower_bound)
    & (merged_df['GDP_Per_Capita_2020'] <= upper_bound)
].copy()
filtered_df['Weighted_Score_Normalized'] = round(filtered_df.Weighted_Score_Normalized, 2)

# One hover label per county, in row order.
hover_text = [
    f"{row['LocationName']}, {row['StateDesc']}<br>County Health Score: {row['Weighted_Score_Normalized']}<br>Rank: {row['Rank']}<br>GDP Per Capita: {row['GDP_Per_Capita_2020']}<br>Population: {row['population']}"
    for index, row in filtered_df.iterrows()
]

# Sample weights: log(population + 1), min-max scaled to [0, 1].
# The +1 avoids log(0).
normalized_weights = np.log(filtered_df['population'] + 1)
normalized_weights = (normalized_weights - normalized_weights.min()) / (normalized_weights.max() - normalized_weights.min())

# Monotonically increasing spline of health score on GDP per capita.
gam = LinearGAM(s(0, n_splines=15, constraints='monotonic_inc', lam=0.10))
gam.fit(filtered_df[['GDP_Per_Capita_2020']], filtered_df['Weighted_Score_Normalized'], weights=normalized_weights)

# Trend line and prediction band on an evenly spaced grid, computed once.
# The band width is a single constant so the legend label cannot drift away
# from what is actually drawn.
interval_width = 0.8
x_pred = pd.DataFrame({'GDP_Per_Capita_2020': np.linspace(filtered_df['GDP_Per_Capita_2020'].min(), filtered_df['GDP_Per_Capita_2020'].max(), 500)})
y_pred = gam.predict(x_pred)
y_intervals = gam.prediction_intervals(x_pred, width=interval_width)

# County bubbles: colour follows Rank, area follows population.
scatter_plot = go.Scatter(
    x=filtered_df['GDP_Per_Capita_2020'],
    y=filtered_df['Weighted_Score_Normalized'],
    mode='markers',
    marker=dict(
        color=filtered_df['Rank'],
        colorscale='RdYlGn_r',
        showscale=False,
        size=filtered_df['population'],
        sizemode='area',  # scale the marker area, not the diameter
        sizeref=2.*max(filtered_df['population'])/(40.**2),  # largest bubble is about 40 px across
        sizemin=4,
        line=dict(
            width=.2,
            color='black'
        )
    ),
    text=hover_text,
    hoverinfo='text',
    name='County',
)

# GAM trend line.
trend_line = go.Scatter(x=x_pred['GDP_Per_Capita_2020'], y=y_pred, mode='lines',
                        name='GAM Trend Line',
                        line=dict(color='darkgrey', width=5))

# Prediction band: the lower edge is drawn first so the upper edge can fill
# down to it with fill='tonexty'.
lower_interval = go.Scatter(
    x=x_pred['GDP_Per_Capita_2020'],
    y=y_intervals[:, 0],
    mode='lines',
    line=dict(color='lightgrey', width=1, dash='dot'),
    name='Lower Interval',
    showlegend=False
)

upper_interval = go.Scatter(
    x=x_pred['GDP_Per_Capita_2020'],
    y=y_intervals[:, 1],
    fill='tonexty',
    mode='lines',
    line=dict(color='lightgrey', width=1, dash='dot'),
    name=f'{interval_width:.0%} Prediction Interval',  # label derived from the width actually used
    fillcolor='rgba(150, 150, 150, 0.3)',
    showlegend=True
)

# Trace order matters: lower_interval must come directly before
# upper_interval for the fill, and the bubbles are added last so they sit on top.
fig_bubble = go.Figure()
fig_bubble.add_trace(trend_line)
fig_bubble.add_trace(lower_interval)
fig_bubble.add_trace(upper_interval)
fig_bubble.add_trace(scatter_plot)

# Dark, minimalist layout.
fig_bubble.update_layout(
    title='GDP per Capita vs Health Score by County',
    xaxis=dict(title='GDP per capita 2020', range=[0, 200000], showgrid=False, linecolor='darkgrey', linewidth=1),
    yaxis=dict(range=[0, 101], showgrid=False, linecolor='darkgrey', linewidth=1),
    yaxis_title='County Health Score',
    width=1000, height=700,
    coloraxis_showscale=False,
    legend=dict(
        x=0.02,
        y=1,
        traceorder="normal",
        font=dict(
            family="sans-serif",
            size=12,
            color="white"
        ),
        bordercolor="Black",
        borderwidth=1
    ),
    paper_bgcolor="black",
    plot_bgcolor="black",
    font=dict(color="white"),
    annotations=[
        dict(
            xref='paper', yref='paper',
            x=0.5, y=-0.13,
            showarrow=False,
            text="Bubble size represents the population size of the county",
            font=dict(size=10)
        )
    ]
)


#fig_bubble.show()



In [None]:
# Preview the outlier-filtered frame used for the bubble chart.
filtered_df.head()

In [None]:

# Data preparation and GAM fit for the quintile-coloured bubble chart.

# Outlier fence on GDP per capita. NOTE: Q1/Q3 hold the 1st and 99th
# percentiles (not quartiles), so "IQR" is the 1%-99% spread.
Q1 = merged_df['GDP_Per_Capita_2020'].quantile(0.01)
Q3 = merged_df['GDP_Per_Capita_2020'].quantile(0.99)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows inside the fence. .copy() makes filtered_df an independent frame,
# so column assignments on it do not write into a slice of merged_df.
filtered_df = merged_df[
    (merged_df['GDP_Per_Capita_2020'] >= lower_bound)
    & (merged_df['GDP_Per_Capita_2020'] <= upper_bound)
].copy()
filtered_df['Weighted_Score_Normalized'] = round(filtered_df.Weighted_Score_Normalized, 2)

# One hover label per county, in row order.
hover_text = [
    f"{row['LocationName']}, {row['StateDesc']}<br>County Health Score: {row['Weighted_Score_Normalized']}<br>Rank: {row['Rank']}<br>GDP Per Capita: {row['GDP_Per_Capita_2020']}<br>Population: {row['population']}"
    for index, row in filtered_df.iterrows()
]

# Sample weights: log(population + 1), min-max scaled to [0, 1].
# The +1 avoids log(0).
normalized_weights = np.log(filtered_df['population'] + 1)
normalized_weights = (normalized_weights - normalized_weights.min()) / (normalized_weights.max() - normalized_weights.min())

# Monotonically increasing spline of health score on GDP per capita.
gam = LinearGAM(s(0, n_splines=15, constraints='monotonic_inc', lam=0.10))
gam.fit(filtered_df[['GDP_Per_Capita_2020']], filtered_df['Weighted_Score_Normalized'], weights=normalized_weights)

# Trend line and prediction band on an evenly spaced grid, computed once
# (an earlier 0.95-wide band was computed and immediately overwritten).
x_pred = pd.DataFrame({'GDP_Per_Capita_2020': np.linspace(filtered_df['GDP_Per_Capita_2020'].min(), filtered_df['GDP_Per_Capita_2020'].max(), 500)})
y_pred = gam.predict(x_pred)
y_intervals = gam.prediction_intervals(x_pred, width=0.8)

# Score cut points at the 20th/40th/60th/80th percentiles.
quintiles = filtered_df['Weighted_Score_Normalized'].quantile([0.2, 0.4, 0.6, 0.8]).values

# Function to determine quintile number
def get_quintile_number(score, quintiles):
    """Return the 1-based quintile (1..5) that `score` falls into.

    `quintiles` holds the four ascending cut points; a score equal to a cut
    point belongs to the lower quintile. Anything above the last cut point
    (or not comparable as <=, e.g. NaN) lands in quintile 5.
    """
    for position in range(4):
        if score <= quintiles[position]:
            return position + 1
    return 5

# Assign a quintile number to each data point. .assign() returns a new frame,
# so this never writes into a slice of another frame.
filtered_df = filtered_df.assign(
    quintile=filtered_df['Weighted_Score_Normalized'].apply(
        lambda score: get_quintile_number(score, quintiles)
    )
)

# One flat colour per quintile (1 = lowest scores, 5 = highest).
quintile_colorscale = {
    1: "darkred",
    2: "orange",
    3: "yellow",
    4: "lightgreen",
    5: "darkgreen"
}

# County bubbles: colour follows the quintile, area follows population.
scatter_plot = go.Scatter(
    x=filtered_df['GDP_Per_Capita_2020'],
    y=filtered_df['Weighted_Score_Normalized'],
    mode='markers',
    marker=dict(
        color=[quintile_colorscale[q] for q in filtered_df['quintile']],
        size=filtered_df['population'],
        sizemode='area',  # scale the marker area, not the diameter
        sizeref=2.*max(filtered_df['population'])/(40.**2),
        sizemin=4,
        line=dict(
            width=.2,
            color='black'
        )
    ),
    text=hover_text,
    hoverinfo='text',
    name='County'
)

# GAM trend line.
trend_line = go.Scatter(x=x_pred['GDP_Per_Capita_2020'], y=y_pred, mode='lines',
                        name='GAM Trend Line',
                        line=dict(color='darkgrey', width=5))

# Prediction band. The width lives in one variable so the legend label is
# always the width that was actually computed.
interval_width = 0.8
y_intervals = gam.prediction_intervals(x_pred, width=interval_width)
lower_interval = go.Scatter(
    x=x_pred['GDP_Per_Capita_2020'],
    y=y_intervals[:, 0],
    mode='lines',
    line=dict(color='lightgrey', width=1, dash='dot'),
    name='Lower Interval',
    showlegend=False
)

# fill='tonexty' shades down to the previously added trace (the lower edge).
upper_interval = go.Scatter(
    x=x_pred['GDP_Per_Capita_2020'],
    y=y_intervals[:, 1],
    fill='tonexty',
    mode='lines',
    line=dict(color='lightgrey', width=1, dash='dot'),
    name=f'{interval_width:.0%} Prediction Interval',
    fillcolor='rgba(150, 150, 150, 0.3)',
    showlegend=True
)

# Trace order matters: lower_interval must come directly before
# upper_interval for the fill, and the bubbles are added last so they sit on top.
fig_bubble = go.Figure()
fig_bubble.add_trace(trend_line)
fig_bubble.add_trace(lower_interval)
fig_bubble.add_trace(upper_interval)
fig_bubble.add_trace(scatter_plot)

# Dark, minimalist layout.
fig_bubble.update_layout(
    title='GDP per Capita vs Health Score by County',
    xaxis=dict(title='GDP per capita 2020', range=[0, 200000], showgrid=False, linecolor='darkgrey', linewidth=1),
    yaxis=dict(range=[0, 101], showgrid=False, linecolor='darkgrey', linewidth=1),
    yaxis_title='County Health Score',
    width=1000, height=700,
    coloraxis_showscale=False,
    legend=dict(
        x=0.02,
        y=1,
        traceorder="normal",
        font=dict(
            family="sans-serif",
            size=12,
            color="white"
        ),
        bordercolor="Black",
        borderwidth=1
    ),
    paper_bgcolor="black",
    plot_bgcolor="black",
    font=dict(color="white"),
    annotations=[
        dict(
            xref='paper', yref='paper',
            x=0.5, y=-0.13,
            showarrow=False,
            text="Bubble size represents the population size of the county",
            font=dict(size=10)
        )
    ]
)


#fig_bubble.show()



In [None]:
# GeoJSON file
# Load the county GeoJSON used by the choropleth maps.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's default locale encoding.
file_path_geo_json = "../data/interim/us_census_counties_geojson.json"
with open(file_path_geo_json, encoding="utf-8") as f:
    counties = json.load(f)

In [None]:
# Rows of the rankings table for the year being mapped.
selected_year = 2020
filtered_df_county_map = df_ranking.loc[df_ranking['Year'] == selected_year]

# Score cut points at the 20th/40th/60th/80th percentiles.
quintiles = (
    filtered_df_county_map['Weighted_Score_Normalized']
    .quantile([0.2, 0.4, 0.6, 0.8])
    .values
)
# Function to determine quintile number
def get_quintile_number(score, quintiles):
    """Map `score` to its quintile number, 1 (lowest) to 5 (highest).

    `quintiles` is the sequence of four cut points. Each cut point is an
    inclusive upper bound for its quintile; scores above the last one, and
    scores for which no <= comparison holds, are assigned 5.
    """
    if score <= quintiles[0]:
        return 1
    if score <= quintiles[1]:
        return 2
    if score <= quintiles[2]:
        return 3
    if score <= quintiles[3]:
        return 4
    return 5

# Colour anchors for the five quintiles. The map is drawn with
# z = quintile / 5 (0.2 ... 1.0) and zmin/zmax pinned to 0.2 / 1.0, so plotly
# rescales the five possible z values to 0, 0.25, 0.5, 0.75 and 1 -- exactly
# the anchor positions below. Each quintile therefore gets its own flat colour
# rather than a blend of two neighbouring anchors.
quintile_scale = [0, 0.25, 0.5, 0.75, 1.0]

# Custom colorscale based on quintiles
quintile_colorscale = [
    [quintile_scale[0], "darkred"],
    [quintile_scale[1], "orange"],
    [quintile_scale[2], "yellow"],
    [quintile_scale[3], "lightgreen"],
    [quintile_scale[4], "darkgreen"],
]

# Quintile number per county, plus the same value scaled to (0, 1] for the
# colour axis. .assign() returns a new frame, so nothing is written into a
# slice of df_ranking.
county_quintile = filtered_df_county_map['Weighted_Score_Normalized'].apply(
    lambda score: get_quintile_number(score, quintiles)
)
filtered_df_county_map = filtered_df_county_map.assign(
    normalized_quintile=county_quintile / 5,
    quintile=county_quintile,
)
num_counties = len(filtered_df_county_map)

# Create the choropleth map using the normalized quintile for color
fig_choropleth = go.Figure(go.Choropleth(
    geojson=counties,
    featureidkey="properties.GEOID",
    locations=filtered_df_county_map['GEOID'],
    z=filtered_df_county_map['normalized_quintile'],
    zmin=0.2,  # quintile 1 -> first anchor, even if a quintile is absent from the data
    zmax=1.0,  # quintile 5 -> last anchor
    colorscale=quintile_colorscale,
    hovertemplate = '%{customdata[0]} County, %{customdata[1]}<br>Score: %{customdata[3]:.2f}<br>Rank: %{customdata[2]} of ' + str(num_counties) + '<br>Quintile: %{customdata[4]}',
    customdata=filtered_df_county_map[['LocationName', 'StateAbbr', 'Rank', 'Weighted_Score_Normalized','quintile']],
    marker_line_width=0,
    name="",
    showscale=False
))
fig_choropleth.update_layout(
    geo=dict(
        scope="usa",
        lakecolor='black',
        landcolor='black',
        bgcolor='black',
        subunitcolor='black',
        showlakes=True,
        showsubunits=True,
        showland=True,
        showcountries=False,
        showcoastlines=False,
    ),
    paper_bgcolor='black',
    plot_bgcolor='black',
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    title_text=f"Overall Health Score by County for {selected_year}",
    title_y=0.9,
    title_x=0.5,
    title_font=dict(size=20, color='white'),
    width=1000,
    height=700
)


fig_choropleth.show()

In [None]:
import plotly.graph_objects as go
import pandas as pd

# `df_ranking` (rankings frame) and `counties` (GeoJSON) come from earlier cells.
selected_year = 2020
year_mask = df_ranking['Year'] == selected_year
filtered_df_county_map = df_ranking[year_mask]
num_counties = len(filtered_df_county_map)

# Score cut points at the 20th/40th/60th/80th percentiles.
quintiles = (
    filtered_df_county_map['Weighted_Score_Normalized']
    .quantile([0.2, 0.4, 0.6, 0.8])
    .values
)

# Function to determine quintile number
def get_quintile_number(score, quintiles):
    """Return which of five quintiles (1..5) `score` belongs to.

    The four entries of `quintiles` are checked in order and the first cut
    point that is >= score decides the quintile; if none is, the result is 5.
    """
    return next(
        (rank + 1 for rank in range(4) if score <= quintiles[rank]),
        5,
    )

# Assign a quintile number to each row. .assign() returns a new frame, so
# nothing is written into a slice of df_ranking.
filtered_df_county_map = filtered_df_county_map.assign(
    quintile=filtered_df_county_map['Weighted_Score_Normalized'].apply(
        lambda score: get_quintile_number(score, quintiles)
    )
)

# One sub-frame per quintile, ordered 1..5.
quintile_dfs = [
    filtered_df_county_map[filtered_df_county_map['quintile'] == quintile_number]
    for quintile_number in range(1, 6)
]

# One flat colour per quintile, in the same order as quintile_dfs.
quintile_colorscale = ["darkred", "orange", "yellow", "green", "darkgreen"]

# Build the map from one trace per quintile so each quintile gets its own
# legend entry.
fig = go.Figure()

for i, quintile_df in enumerate(quintile_dfs):
    fig.add_trace(go.Choropleth(
        geojson=counties,
        featureidkey="properties.GEOID",
        locations=quintile_df['GEOID'],
        z=[i]*len(quintile_df),  # constant z: every county in the trace gets the same colour
        colorscale=[[0, quintile_colorscale[i]], [1, quintile_colorscale[i]]],  # both ends identical -> flat fill
        name=f"Quintile {i+1}",
        legendgroup=f"quintile{i+1}",
        customdata=quintile_df[['LocationName', 'StateAbbr', 'Rank', 'Weighted_Score_Normalized','quintile']],
        # The <extra> tag moves the quintile into the hover box's secondary label.
        hovertemplate = '%{customdata[0]} County, %{customdata[1]}<br>Score: %{customdata[3]:.2f}<br>Rank: %{customdata[2]} of ' + str(num_counties) + '<extra>Quintile: %{customdata[4]}</extra>',
        marker_line_width=0,
        showlegend=True,
        showscale=False  # hide the colour bar; the legend identifies the quintiles
    ))

fig.update_layout(
    geo=dict(
        scope="usa",
        lakecolor='black',
        landcolor='black',
        bgcolor='black',
        subunitcolor='black',
        showlakes=True,
        showsubunits=True,
        showland=True,
        showcountries=False,
        showcoastlines=True,
    ),
    paper_bgcolor='black',
    plot_bgcolor='black',
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    title_text=f"Overall Health Score by County for {selected_year}",
    title_y=0.9,
    title_x=0.5,
    title_font=dict(size=20, color='white'),
    width=1000,
    height=700,
    legend=dict(
        y=0.5,  # centre the legend vertically
        yanchor="middle",
        x=.87,
        font=dict(
            color="white"
        )
    ),
)

fig.show()


In [None]:
# Inspect the rows of the map frame whose StateDesc is Alaska.
filtered_df_county_map.loc[filtered_df_county_map['StateDesc'] == 'Alaska']

In [None]:
# Sanity check: list the distinct quintile values present in the map frame.
filtered_df_county_map.quintile.unique()

In [None]:
import plotly.graph_objects as go

# Rows for the mapped year and the score cut points at the
# 20th/40th/60th/80th percentiles.
selected_year = 2020
filtered_df_county_map = df_ranking.loc[df_ranking['Year'] == selected_year]
quintiles = (
    filtered_df_county_map['Weighted_Score_Normalized']
    .quantile([0.2, 0.4, 0.6, 0.8])
    .values
)

# One colour per category, indexed by the category number 0..4.
custom_colors = ["darkred", "orange", "yellow", "green", "darkgreen"]
# Create a function to map data values to categories
def map_to_category(score):
    """Return the 0-based category (0..4) of `score`.

    Uses the module-level `quintiles` cut points: the first cut point that is
    >= score gives the category; a score above all four (or one for which no
    <= comparison holds) is category 4.
    """
    for category in range(4):
        if score <= quintiles[category]:
            return category
    return 4

# Map the data to categories and get the corresponding colours.
# .assign() returns a new frame, so nothing is written into a slice of df_ranking.
filtered_df_county_map = filtered_df_county_map.assign(
    category=filtered_df_county_map['Weighted_Score_Normalized'].apply(map_to_category)
)
category_colors = [custom_colors[category] for category in filtered_df_county_map['category']]
filtered_df_county_map = filtered_df_county_map.assign(category_colors=category_colors)

# Restrict the map to Arizona.
filtered_df_county_map = filtered_df_county_map[filtered_df_county_map.StateDesc=='Arizona']

# Choropleth with one flat colour per category. The colorscale is the
# five-colour palette itself (plotly spaces a plain list of colours evenly at
# 0, 0.25, 0.5, 0.75, 1), and zmin/zmax are pinned to the category range 0..4
# so category k lands exactly on palette entry k. Passing the per-row colour
# column as the colorscale would instead build a scale from however many rows
# are on the map.
fig_choropleth = go.Figure(data=go.Choropleth(
    geojson=counties,
    featureidkey="properties.GEOID",
    locations=filtered_df_county_map['GEOID'],
    z=filtered_df_county_map['category'],
    zmin=0,
    zmax=len(custom_colors) - 1,
    colorscale=custom_colors,
    hovertemplate='%{customdata[0]} County, %{customdata[1]}<br>Score: %{customdata[3]:.2f}<br>Rank: %{customdata[2]}',
    customdata=filtered_df_county_map[['LocationName', 'StateAbbr', 'Rank', 'Weighted_Score_Normalized']],
    marker_line_width=0,
    autocolorscale=False,
    name=""
))

fig_choropleth.update_geos(
    scope="usa",
    lakecolor='black',
    landcolor='black',
    bgcolor='black',
    subunitcolor='black',
    showlakes=True,
    showsubunits=True,
    showland=True,
    showcountries=False,
    showcoastlines=False,
)

fig_choropleth.update_layout(
    paper_bgcolor='black',
    plot_bgcolor='black',
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    title_text=f"Overall Health Score by County for {selected_year}",
    title_font=dict(size=20, color='white'),
    width=1000,
    height=700
)

fig_choropleth.show()


In [None]:
import plotly.express as px

# Rows for the mapped year, and the score cut points at the
# 20th/40th/60th/80th percentiles.
selected_year = 2020
filtered_df_county_map = df_ranking.loc[df_ranking['Year'] == selected_year]
quintiles = (
    filtered_df_county_map['Weighted_Score_Normalized']
    .quantile([0.2, 0.4, 0.6, 0.8])
    .values
)

# Create a function to map data values to categories
def map_to_category(score):
    """Bucket `score` into a category from 0 (lowest) to 4 (highest).

    The four module-level `quintiles` cut points are inclusive upper bounds
    for categories 0..3; everything else falls into category 4.
    """
    return next(
        (bucket for bucket in range(4) if score <= quintiles[bucket]),
        4,
    )

# Map the data to categories. .assign() returns a new frame, so nothing is
# written into a slice of df_ranking.
filtered_df_county_map = filtered_df_county_map.assign(
    category=filtered_df_county_map['Weighted_Score_Normalized'].apply(map_to_category)
)

# Plotly Express choropleth with a five-colour scale. range_color pins the
# colour axis to the category range 0..4, so category k sits exactly on the
# k-th colour (positions 0, 0.25, 0.5, 0.75, 1). A midpoint of 2.5 would
# stretch the axis to 0..5 and push every category off its intended colour.
fig_choropleth = px.choropleth(
    filtered_df_county_map,
    geojson=counties,
    featureidkey="properties.GEOID",
    locations='GEOID',
    color='category',
    color_continuous_scale=["darkred", "orange", "yellow", "green", "darkgreen"],
    range_color=[0, 4],
    hover_name='LocationName',
    hover_data=['StateAbbr', 'Rank', 'Weighted_Score_Normalized'],
    title=f"Overall Health Score by County for {selected_year}",
)

fig_choropleth.update_geos(
    scope="usa",
    lakecolor='black',
    landcolor='black',
    bgcolor='black',
    subunitcolor='black',
    showlakes=True,
    showsubunits=True,
    showland=True,
    showcountries=False,
    showcoastlines=False,
)

fig_choropleth.update_layout(
    paper_bgcolor='black',
    plot_bgcolor='black',
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    title_font=dict(size=20, color='white'),
    width=1000,
    height=700
)

fig_choropleth.show()


In [None]:
# Display the current county map frame.
# NOTE(review): this renders the whole frame; consider .head() if it is large.
filtered_df_county_map

In [None]:
# Attach each county's colour, derived from its own 'category' value. This
# keeps the colours aligned with the current frame instead of depending on the
# `category_colors` list that an earlier cell built from a different copy of
# the data.
quintile_palette = ["darkred", "orange", "yellow", "green", "darkgreen"]
filtered_df_county_map = filtered_df_county_map.assign(
    color=filtered_df_county_map['category'].map(dict(enumerate(quintile_palette)))
)

In [None]:
# Inspect the rows of the map frame whose StateDesc is Arizona.
filtered_df_county_map.loc[filtered_df_county_map['StateDesc'] == 'Arizona']

In [None]:
from plotly.subplots import make_subplots

# Side-by-side figure: choropleth map on the left, bubble chart on the right.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "choropleth"}, {"type": "scatter"}]],  # the left cell must be a geo subplot
    horizontal_spacing=0.02
)

# Copy the traces of the two existing figures into the subplot grid.
# NOTE(review): only traces are copied; layout settings of the source figures
# (axis ranges, colour axis, titles) are not carried over -- confirm the
# result looks as intended.
for trace in fig_choropleth.data:
    fig.add_trace(trace, row=1, col=1)

for trace in fig_bubble.data:
    fig.add_trace(trace, row=1, col=2)

# Shared dark layout for both panels.
fig.update_layout(
    title_text='',
    showlegend=False,
    width=2200,  # wide enough to hold both plots
    height=800,
    paper_bgcolor='black',
    plot_bgcolor='black'
)

# Keep the dark background on the map panel.
fig.update_geos(
    bgcolor='black',
    lakecolor='black',
    landcolor='black',
    scope="usa",
    row=1, col=1
)

# Drop all annotations from the combined figure.
fig.update_layout(annotations=[])

# White axis titles for the bubble chart panel.
fig.update_xaxes(title_font=dict(color='white'), row=1, col=2)
fig.update_yaxes(title_font=dict(color='white'), row=1, col=2)

# Dark-theme styling for the shared colour bar, if one is shown. The title
# font is set through title.font; the older `titlefont` shorthand is
# deprecated and rejected by newer plotly releases.
fig.update_layout(coloraxis_colorbar=dict(
    title=dict(font=dict(color='white')),
    tickfont=dict(color='white'),
    outlinewidth=0,
    bordercolor='black'
))


fig.show()


In [None]:
# Load the measure-level health data.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
filepath_health_data = "../data/interim/CDC_PLACES_GEOID.pickle"
df = pd.read_pickle(filepath_health_data)
# Rescale Data_Value by 1/100 (presumably percent -> fraction; it is formatted
# with a % format later -- confirm the source units).
df['Data_Value'] = df['Data_Value'].div(100)

# NOTE(review): the measure is picked by its position in unique(), which
# depends on row order in the file; selecting by name would be more robust.
all_measures = list(df.Measure.unique())
selected_measure = all_measures[15]
selected_year = 2020

In [None]:
# Rows for the chosen measure and year.
measure_mask = df['Measure'] == selected_measure
year_mask = df['Year'] == selected_year
filtered_df_map = df[measure_mask & year_mask]

# 5th and 95th percentiles of the values; used to clip the colour scale.
percentile_low, percentile_high = filtered_df_map['Data_Value'].quantile([0.05, 0.95])

In [None]:
# County choropleth of the selected measure. The colour range is clipped to
# the 5th-95th percentile band so a few extreme counties do not flatten the
# scale for everyone else.
fig = go.Figure(go.Choropleth(
    geojson=counties,
    featureidkey="properties.GEOID",
    locations=filtered_df_map['GEOID'],
    z=filtered_df_map['Data_Value'],
    colorscale="RdYlGn_r",
    hovertemplate='%{customdata[0]} County, %{customdata[1]}<br>' + selected_measure + ': %{z:.2%}',
    customdata=filtered_df_map[['LocationName', 'StateAbbr']],
    colorbar=dict(
        thickness=15,
        len=0.5,
        tickformat=".1%",
        # The title font lives under title.font; the older `titlefont`
        # shorthand is deprecated and rejected by newer plotly releases.
        title=dict(text="", side="right", font=dict(size=12, color='white')),
        x=0.93,  # horizontal position of the colour bar
        outlinewidth=0,
        ticks="outside",
        ticklen=3,
        tickcolor='white',
        tickwidth=1,
        tickfont=dict(size=10, color='white'),
    ),
    zmin=percentile_low,
    zmax=percentile_high,
    showscale=True,
    marker_line_width=0,  # no borders on the counties
    name=""
))

fig.update_layout(
    geo=dict(
        scope="usa",
        lakecolor='black',
        landcolor='black',
        bgcolor='black',
        subunitcolor='black',  # state border colour
        showlakes=True,
        showsubunits=True,
        showland=True,
        showcountries=False,
        showcoastlines=False,
    ),
    paper_bgcolor='black',  # background outside the map
    plot_bgcolor='black',
    margin={"r": 0, "l": 0, "b": 0, "t": 0},
    title_text=selected_measure + ' by County',
    title_y=0.9,
    title_x=0.5,
    title_font=dict(size=20, color='white'),
    width=1200,
    height=900
)

# Explain the clipped colour range next to the colour bar.
fig.add_annotation(
    text="Color scale represents<br>5th to 95th percentile",
    align='left',
    showarrow=False,
    xref='paper', yref='paper',
    x=0.99, y=.21,
    bgcolor="black",
    bordercolor="white",
    borderpad=4,
    font=dict(color='white')
)

In [7]:
from pygam import LinearGAM, s
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np



# Hold out 30% of the counties for scoring (fixed seed for repeatability).
# NOTE(review): this cell needs `filtered_df` from the bubble-chart cell; the
# recorded NameError shows it was once run before that cell.
train_df, test_df = train_test_split(filtered_df, test_size=0.3, random_state=42)

# Training weights: log(population + 1), min-max scaled to [0, 1].
normalized_weights = np.log(train_df['population'] + 1)
weight_floor = normalized_weights.min()
weight_span = normalized_weights.max() - weight_floor
normalized_weights = (normalized_weights - weight_floor) / weight_span

# Hyper-parameter grid.
lambda_values = [0.1, 1, 5,10, 100, 200,300,500]
n_splines_values = [4,5,7,10, 15, 20, 25,50]

# Best configuration seen so far (lowest MSE on the held-out rows).
best_lam, best_n_splines, best_model = None, None, None
best_mse = float('inf')

for lam in lambda_values:
    for n_splines in n_splines_values:
        # Fit a monotone-increasing GAM with this (lam, n_splines) pair.
        gam = LinearGAM(s(0, n_splines=n_splines, constraints='monotonic_inc'), lam=lam)
        gam.fit(train_df[['GDP_Per_Capita_2020']], train_df['Weighted_Score_Normalized'], weights=normalized_weights)

        # Score it on the held-out rows.
        predictions = gam.predict(test_df[['GDP_Per_Capita_2020']])
        mse = mean_squared_error(test_df['Weighted_Score_Normalized'], predictions)
        print(f"lambda: {lam}, n-splines: {n_splines}")
        print(f"MSE: {mse}")
        print("")

        # Strict '<' keeps the earliest configuration on ties.
        if mse < best_mse:
            best_mse, best_lam, best_n_splines, best_model = mse, lam, n_splines, gam

print(f"Best lambda: {best_lam}, Best n_splines: {best_n_splines} with MSE: {best_mse}")


NameError: name 'filtered_df' is not defined

In [None]:
### Load data
# df1: 2020 rows of the county rankings.
df1 = pd.read_pickle("../data/interim/CDC_PLACES_county_rankings_by_year.pickle")
df1 = df1.loc[df1["Year"] == 2020]

# df2: the combined rankings / GDP / spending frame saved earlier.
df2 = pd.read_pickle("../data/interim/combined_Rank_CDCLocals_BEAgdp_Spending.pickle")

In [None]:
# Preview the 2020 rankings frame.
df1.head()

In [None]:
# Preview the combined frame loaded from the interim pickle.
df2.head()