# Choropleth visualization
# Sandy Chen

In [None]:
import pandas as pd
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns

In [None]:
# Locations of the raw train/test table (with climate covariates) and of its
# imputed counterpart.
path = "/Users/wenyuanchen/Desktop/Stanford/BIOMEDIN 212/Project/WIDS_DMMTS/Data/train_test_added_climate_data.csv"
imputed_path = "/Users/wenyuanchen/Desktop/Stanford/BIOMEDIN 212/Project/WIDS_DMMTS/Data/train_test_added_climate_data_imputed.csv"

data = pd.read_csv(path)
imputed_data = pd.read_csv(imputed_path)

# Rows are routed to train/test by the 'allocated_set' flag; the training
# frame additionally drops that flag and the patient identifier.
is_train_row = data['allocated_set'] == 'train'
train_df = data.loc[is_train_row].drop(columns=['allocated_set', 'patient_id'])
test_df = data.loc[data['allocated_set'] == 'test']

train_imputed = imputed_data.loc[imputed_data['allocated_set'] == 'train']
test_imputed = imputed_data.loc[imputed_data['allocated_set'] == 'test']

# Identifier-like columns are kept out of the numerical list; the 3-digit ZIP
# prefix is treated as a categorical label instead.
identifier_columns = ['patient_id', 'patient_zip3']
numerical_features = [
    column
    for column in train_df.select_dtypes(include=['number']).columns
    if column not in identifier_columns
]
categorical_features = [*train_df.select_dtypes(include=['object']).columns, 'patient_zip3']



In [None]:
# Columns describing the individual patient (demographics, diagnosis, treatment).
patient_level_features = [
    'patient_race',
    'payer_type',
    'patient_state',
    'patient_zip3',
    'patient_age',
    'patient_gender',
    'bmi',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'breast_cancer_diagnosis_year',
    'metastatic_cancer_diagnosis_code',
    'metastatic_first_treatment',
    'metastatic_first_treatment_type',
    'metastatic_first_novel_treatment',
    'metastatic_first_novel_treatment_type',
    'region',
    'division',
    'side',
    'quadrant',
    'metastatic_organ',
    'cleaned_metastatic_first_treatment',
    'cleaned_metastatic_first_treatment_type',
]

# Split the patient-level columns by the dtype-based lists built earlier.
patient_level_numerical_features = [
    name for name in patient_level_features if name in numerical_features
]
patient_level_categorical_features = [
    name for name in patient_level_features if name in categorical_features
]

# Sanity check: every patient-level column landed in exactly one of the two lists.
patient_level_split_total = (
    len(patient_level_numerical_features) + len(patient_level_categorical_features)
)
assert len(patient_level_features) == patient_level_split_total

# Columns describing the patient's area (demographics, income, housing,
# education, labour, race/ethnicity, health access, air quality).
population_level_features = [
    'population', 'density',
    'age_median', 'age_under_10', 'age_10_to_19', 'age_20s', 'age_30s',
    'age_40s', 'age_50s', 'age_60s', 'age_70s', 'age_over_80',
    'male', 'female',
    'married', 'divorced', 'never_married', 'widowed',
    'family_size', 'family_dual_income',
    'income_household_median',
    'income_household_under_5',
    'income_household_5_to_10',
    'income_household_10_to_15',
    'income_household_15_to_20',
    'income_household_20_to_25',
    'income_household_25_to_35',
    'income_household_35_to_50',
    'income_household_50_to_75',
    'income_household_75_to_100',
    'income_household_100_to_150',
    'income_household_150_over',
    'income_household_six_figure',
    'income_individual_median',
    'home_ownership', 'housing_units', 'home_value', 'rent_median', 'rent_burden',
    'education_less_highschool', 'education_highschool',
    'education_some_college', 'education_bachelors', 'education_graduate',
    'education_college_or_above', 'education_stem_degree',
    'labor_force_participation', 'unemployment_rate', 'self_employed', 'farmer',
    'race_white', 'race_black', 'race_asian', 'race_native',
    'race_pacific', 'race_other', 'race_multiple', 'hispanic',
    'disabled', 'poverty', 'limited_english', 'commute_time',
    'health_uninsured', 'veteran',
    'Ozone', 'PM25', 'N02',
]

# Split the population-level columns by the dtype-based lists built earlier.
population_level_numerical_features = [
    name for name in population_level_features if name in numerical_features
]
population_level_categorical_features = [
    name for name in population_level_features if name in categorical_features
]

# Sanity check: every population-level column landed in exactly one list.
population_level_split_total = (
    len(population_level_numerical_features) + len(population_level_categorical_features)
)
assert len(population_level_features) == population_level_split_total
# Outcome column; together with the two feature groups it should account for
# every column of the training frame.
target_variable = 'treatment_pd'
expected_column_count = len(patient_level_features) + len(population_level_features) + 1
# Report both counts on failure (the previous message added them together,
# which made the mismatch impossible to read).
assert expected_column_count == train_df.shape[1], (
    f"feature lists + target describe {expected_column_count} columns, "
    f"but train_df has {train_df.shape[1]}"
)

In [None]:
# Show the per-column count of missing entries; the row limit is raised so the
# full listing is not truncated.
pd.set_option('display.max_rows', 500)
missingValues = train_df.isna().sum()
display(missingValues)

In [None]:
# One boxplot per numerical feature. The identifier columns are skipped
# explicitly as a guard.
skipped_columns = ("patient_zip3", "patient_id")
for feature in numerical_features:
    if feature in skipped_columns:
        continue

    _, boxplot_axis = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=train_df[feature], ax=boxplot_axis)
    boxplot_axis.set_title(f'Boxplot of {feature}')
    plt.show()


In [None]:
# Flag rows lying more than 1.5 * IQR outside the quartiles, feature by feature.
# outliers_iqr maps feature name -> DataFrame of the flagged rows.
outliers_iqr = {}

for feature in numerical_features:
    if feature in ("patient_zip3", "patient_id"):
        continue
    column = train_df[feature]
    first_quartile, third_quartile = column.quantile([0.25, 0.75])
    whisker = 1.5 * (third_quartile - first_quartile)
    too_low = column < first_quartile - whisker
    too_high = column > third_quartile + whisker
    flagged_rows = train_df[too_low | too_high]
    outliers_iqr[feature] = flagged_rows
    print(f'Outliers in {feature} using IQR method:')
    print(flagged_rows[[feature]])


In [None]:
from scipy import stats

# Flag rows whose |z-score| exceeds `threshold`, feature by feature.
# outliers_zscore maps feature name -> DataFrame of the flagged rows.
#
# The z-scores are computed on the non-missing values only and then written
# back into a full-length boolean mask. Previously the shorter z-score array
# was used directly to index train_df; for any feature with missing values
# that raised, and the bare `except: pass` silently skipped the feature.
outliers_zscore = {}
threshold = 3
for feature in numerical_features:
    if feature in ("patient_zip3", "patient_id"):
        continue
    values = train_df[feature].to_numpy(dtype=float)
    is_present = ~np.isnan(values)
    if not is_present.any():
        # Nothing to standardise for an entirely missing column.
        continue
    z_scores = np.abs(stats.zscore(values[is_present]))
    is_outlier = np.zeros(len(train_df), dtype=bool)
    # NaN z-scores (constant column) compare False and are never flagged.
    is_outlier[is_present] = z_scores > threshold
    outliers = train_df[is_outlier]
    outliers_zscore[feature] = outliers
    print(f'Outliers in {feature} using Z-score method:')
    print(outliers[[feature]])


# Univariate analysis

# numerical features

In [None]:
# Side-by-side histogram and kernel density estimate for each numerical feature.
for feature in numerical_features:
    observed = train_df[feature].dropna()
    plt.figure(figsize=(20, 12))

    plt.subplot(1, 2, 1)
    sns.histplot(observed, kde=False, bins=30)
    plt.title(f'Histogram of {feature}')

    plt.subplot(1, 2, 2)
    # `fill` replaces the deprecated `shade` argument of kdeplot.
    sns.kdeplot(observed, fill=True)
    plt.title(f'KDE Plot of {feature}')

    plt.show()


# categorical features

In [None]:
# Frequency bar chart for each categorical feature.
for feature in categorical_features:
    _, count_axis = plt.subplots(figsize=(10, 5))
    sns.countplot(x=train_df[feature], ax=count_axis)
    count_axis.set_title(f'Bar Plot of {feature}')
    plt.xticks(rotation=45)
    plt.show()


# bivariate analysis

# Bivariate Analysis - Numerical vs. Numerical

In [None]:
# (The pair plot of the patient-level numerical features is drawn in the next cell.)


# Scatter plot of every numerical feature against the target variable.
for feature in numerical_features:
    if feature == target_variable:
        # The target plotted against itself carries no information.
        continue
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=train_df, x=feature, y=target_variable)
    plt.title(f'Scatter Plot of {feature} vs {target_variable}')
    plt.xlabel(feature)
    plt.ylabel(target_variable)
    plt.show()

    



In [None]:
# Pairwise relationships among the patient-level numerical features and the target.
pairplot_columns = [*patient_level_numerical_features, target_variable]
sns.pairplot(train_df[pairplot_columns])
plt.show()


# Bivariate Analysis - Categorical vs. Numerical

In [None]:
# For each categorical feature, draw the target's distribution per category
# twice: as a box plot (left panel) and as a violin plot (right panel).
# Each panel entry is (plot function, title prefix, x tick label font size).
distribution_panels = (
    (sns.boxplot, 'Box Plot', 20),
    (sns.violinplot, 'Violin Plot', 30),
)

for feature in categorical_features:
    plt.figure(figsize=(40, 20))

    for panel_number, (draw_panel, panel_kind, xtick_size) in enumerate(distribution_panels, start=1):
        plt.subplot(1, 2, panel_number)
        draw_panel(x=train_df[feature], y=train_df[target_variable])
        plt.title(f'{panel_kind} of {target_variable} by {feature}', fontsize=30)
        plt.xticks(rotation=90, fontsize=xtick_size)
        plt.yticks(fontsize=30)
        plt.xlabel(feature, fontsize=30)
        plt.ylabel(target_variable, fontsize=30)

    plt.tight_layout()
    plt.show()


# Bivariate Analysis - Categorical vs. Categorical

In [None]:
from scipy.stats import chi2_contingency

# Contingency table, chi-square test and heatmap for every unordered pair of
# categorical features.
from itertools import combinations

for feature1, feature2 in combinations(categorical_features, 2):
    contingency_table = pd.crosstab(train_df[feature1], train_df[feature2])
    if contingency_table.size == 0:
        # chi2_contingency raises on an empty table (no row has both values).
        print(f'Skipping {feature1} vs {feature2}: no rows with both values present')
        continue
    chi2, p, dof, ex = chi2_contingency(contingency_table)

    plt.figure(figsize=(30,15))
    sns.heatmap(contingency_table, annot=True, fmt="d", cmap="YlGnBu")
    plt.title(f'Contingency Table and Heatmap of {feature1} vs {feature2}\nChi2: {chi2:.2f}, p-value: {p:.2e}', fontsize=20)
    plt.xlabel(feature2, fontsize=20)
    plt.ylabel(feature1, fontsize=20)
    plt.xticks(rotation=90, fontsize=20)
    # Render each figure as it is produced so that open figures do not pile
    # up across the whole pair loop.
    plt.show()

# Correlation matrix

In [None]:

# Pairwise correlations between all numerical features (the target included),
# printed as a table and drawn as a heatmap on a fixed [-1, 1] colour scale.
correlation_matrix = train_df[numerical_features].corr()
print(correlation_matrix)

_, heatmap_axis = plt.subplots(figsize=(20, 10))
sns.heatmap(
    correlation_matrix,
    annot=False,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    ax=heatmap_axis,
)
heatmap_axis.set_title("Correlation Matrix Heatmap")
plt.show()



In [None]:
# Correlation of every numerical feature with the target, strongest (by
# absolute value) first.
correlation_matrix["treatment_pd"].sort_values(key=abs, ascending=False)

# Check for Multicollinearity Using VIF

In [None]:
# Create a DataFrame to hold the VIF values
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Variance inflation factors for the patient-level numerical features.
vif_data = pd.DataFrame()
vif_data["Feature"] = patient_level_numerical_features

# Fill missing values with the mean of each column: very simple imputation
train_df_filled = train_df[patient_level_numerical_features].fillna(train_df[patient_level_numerical_features].mean())

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept itself. Without a constant column the
# auxiliary regressions are forced through the origin and the VIFs come out
# inflated, so an explicit constant is prepended and then skipped (column 0)
# when reporting.
vif_design = sm.add_constant(train_df_filled, has_constant='add')
vif_data["VIF"] = [
    variance_inflation_factor(vif_design.values, column_position)
    for column_position in range(1, vif_design.shape[1])
]

# Display the VIF values
print(vif_data)


In [None]:
# Create a DataFrame to hold the VIF values
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Variance inflation factors for the population-level numerical features.
vif_data = pd.DataFrame()
vif_data["Feature"] = population_level_numerical_features

# Fill missing values with the mean of each column: very simple imputation
train_df_filled = train_df[population_level_numerical_features].fillna(train_df[population_level_numerical_features].mean())

# variance_inflation_factor regresses each column on the others exactly as
# given and does not add an intercept itself. Without a constant column the
# auxiliary regressions are forced through the origin and the VIFs come out
# inflated, so an explicit constant is prepended and then skipped (column 0)
# when reporting.
vif_design = sm.add_constant(train_df_filled, has_constant='add')
vif_data["VIF"] = [
    variance_inflation_factor(vif_design.values, column_position)
    for column_position in range(1, vif_design.shape[1])
]

# Display the VIF values
print(vif_data)

In [None]:
# Outcome column used to colour the points.
target_variable = 'treatment_pd'

# Scatter plot for every ordered pair of distinct numerical features, with
# points coloured by the target value.
for feature1 in numerical_features:
    for feature2 in numerical_features:
        if feature1 == feature2:
            continue
        plt.figure(figsize=(8, 6))
        sns.scatterplot(
            x=train_df[feature1],
            y=train_df[feature2],
            hue=train_df[target_variable],
        )
        plt.title(f'Scatter Plot of {feature1} vs {feature2}')
        plt.xlabel(feature1)
        plt.ylabel(feature2)
        plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from itertools import combinations

# Assuming train_df is your DataFrame and 'treatment_pd' is your outcome variable

# Select numerical features excluding the outcome variable


# Define the target variable
target_variable = 'treatment_pd'

# Build the modelling frame inside this cell. The cell used to read
# `train_df_filled`, which the preceding VIF cell leaves holding only the
# population-level columns, so patient-level features and the target raised
# KeyError. Rows without a target are dropped rather than imputed; missing
# predictor values are mean-imputed, as in the VIF cells.
interaction_source = train_df[numerical_features].dropna(subset=[target_variable])
interaction_source = interaction_source.fillna(interaction_source.mean())
predictor_features = [name for name in numerical_features if name != target_variable]

# Iterate over all unordered pairs of numerical predictors
for feature1, feature2 in combinations(predictor_features, 2):
    # Main effects, their product (the interaction term) and the outcome
    interaction_df = interaction_source[[feature1, feature2, target_variable]].copy()
    interaction_df['interaction'] = interaction_df[feature1] * interaction_df[feature2]

    # Fit outcome ~ const + feature1 + feature2 + feature1*feature2
    design_matrix = sm.add_constant(interaction_df[[feature1, feature2, 'interaction']])
    model = sm.OLS(interaction_df[target_variable], design_matrix).fit()
    print(f'Interaction effect of {feature1} and {feature2} on {target_variable}')
    print(model.summary())

    # Component-plus-residual plot for the interaction term
    fig = plt.figure(figsize=(8, 6))
    sm.graphics.plot_ccpr(model, 'interaction', ax=fig.add_subplot(111))
    plt.title(f'Interaction Plot of {feature1} and {feature2} on {target_variable}')
    plt.xlabel(f'Interaction of {feature1} and {feature2}')
    plt.ylabel(target_variable)
    plt.show()


# zipcode 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Number of training patients per 3-digit ZIP prefix, most frequent first.
zip_code_distribution = train_df['patient_zip3'].value_counts()

# Bar chart of those counts.
_, zip_axis = plt.subplots(figsize=(30, 12))
zip_code_distribution.plot(kind='bar', ax=zip_axis)
zip_axis.set_title('Distribution of Patients Across ZIP Codes')
zip_axis.set_xlabel('ZIP Code')
zip_axis.set_ylabel('Number of Patients')
plt.show()


In [None]:
import pandas as pd

# Work on a copy so the training frame itself is not modified.
temp = train_df.copy()

# Step 1: for every 3-digit ZIP prefix, record the state that occurs most
# often among its patients.
states_by_zip = temp.groupby('patient_zip3')['patient_state']
zip_to_state_mapping = states_by_zip.agg(lambda states: states.mode()[0]).to_dict()

# Step 2: Implement the Function
def map_zip_to_state(zip_code, state):
    """
    Map a 3-digit ZIP code to a state using the module-level
    ``zip_to_state_mapping`` dictionary.

    The dictionary is keyed by whatever values the ``patient_zip3`` column
    holds. Looking up only ``str(zip_code)`` never matches when those keys are
    numeric, and the fallback was then always returned. The code is therefore
    tried as given first, then as a string, then as a zero-padded 3-character
    string.

    Parameters:
    zip_code (int or str): 3-digit ZIP code
    state (str): State abbreviation (fallback if mapping not found)

    Returns:
    str: State abbreviation corresponding to the 3-digit ZIP code, or
    ``state`` when no form of the code is present in the mapping
    """
    zip_text = str(zip_code)
    for candidate_key in (zip_code, zip_text, zip_text.zfill(3)):
        if candidate_key in zip_to_state_mapping:
            return zip_to_state_mapping[candidate_key]
    return state

# Step 3: resolve a state for every row from its ZIP prefix, keeping the
# recorded state as the fallback.
zip_and_state = temp[['patient_zip3', 'patient_state']]
temp['mapped_state'] = zip_and_state.apply(
    lambda row: map_zip_to_state(row['patient_zip3'], row['patient_state']),
    axis=1,
)

# Preview the recorded state next to the resolved one.
preview_columns = ['patient_zip3', 'patient_state', 'mapped_state']
print(temp[preview_columns].head())


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Working copy restricted to the area-level covariates, the target and the
# two location columns.
state_map_columns = [*population_level_features, target_variable, "patient_zip3", "patient_state"]
temp = train_df[state_map_columns].copy()

# For every 3-digit ZIP prefix, record the state that occurs most often.
states_by_zip = temp.groupby('patient_zip3')['patient_state']
zip_to_state_mapping = states_by_zip.agg(lambda states: states.mode()[0]).to_dict()

def map_zip_to_state(zip_code, state):
    """Return the state recorded for a 3-digit ZIP code, or ``state`` if unknown.

    Uses the module-level ``zip_to_state_mapping``, which is keyed by whatever
    values the ``patient_zip3`` column holds. Looking up only
    ``str(zip_code)`` never matches when those keys are numeric, and the
    fallback was then always returned. The code is therefore tried as given
    first, then as a string, then as a zero-padded 3-character string.

    Parameters:
    zip_code (int or str): 3-digit ZIP code
    state (str): fallback state abbreviation

    Returns:
    str: mapped state abbreviation, or ``state`` when no form of the code is
    present in the mapping
    """
    zip_text = str(zip_code)
    for candidate_key in (zip_code, zip_text, zip_text.zfill(3)):
        if candidate_key in zip_to_state_mapping:
            return zip_to_state_mapping[candidate_key]
    return state

# Resolve a state for every row from its ZIP prefix (recorded state as fallback).
temp['mapped_state'] = temp.apply(
    lambda row: map_zip_to_state(row['patient_zip3'], row['patient_state']),
    axis=1,
)

# State abbreviation -> full name, used both as the list of states to show
# and for hover labels.
state_names = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming',
}

# Start from the full list of states so that states without patients still
# get a row.
all_states = list(state_names)
temp_all_states = pd.DataFrame({'mapped_state': all_states})

# Patient count and mean treatment period per state.
state_stats = (
    temp.groupby('mapped_state')
    .agg(
        num_patients=('patient_zip3', 'size'),
        avg_treatment_pd=(target_variable, 'mean'),
    )
    .reset_index()
)

# Left merge keeps every listed state; states absent from the data get 0.
temp_all_states = temp_all_states.merge(state_stats, on='mapped_state', how='left')
statistic_columns = ['num_patients', 'avg_treatment_pd']
temp_all_states[statistic_columns] = temp_all_states[statistic_columns].fillna(0)

# Full state names for hover labels.
temp_all_states['state_name'] = temp_all_states['mapped_state'].map(state_names)

def _state_level_map(color_column, legend_label, map_title, output_html):
    """Draw one state-level choropleth of `color_column` and save it as HTML."""
    figure = px.choropleth(
        temp_all_states,
        geojson='three_dig_zips.geojson',
        locations='mapped_state',
        locationmode='USA-states',
        color=color_column,
        color_continuous_scale='Viridis',
        scope='usa',
        labels={color_column: legend_label},
        hover_data=['state_name'],
    )
    figure.update_geos(fitbounds="locations", visible=False)
    figure.update_layout(title=map_title)
    figure.write_html(output_html)
    return figure


# Number of patients per state, saved to an HTML file.
fig1 = _state_level_map(
    'num_patients',
    'Number of Patients',
    'Number of Patients per State',
    "num_patients_map.html",
)

# Average treatment period per state, saved to an HTML file.
fig2 = _state_level_map(
    'avg_treatment_pd',
    'Average Treatment Period',
    'Average Treatment Period per State',
    "avg_treatment_pd_map.html",
)

# Show both saved maps in a Jupyter Notebook or IPython environment.
# Only the last bare expression of a cell is rendered, so the first IFrame
# used to be discarded; each one is now displayed explicitly.
from IPython.display import IFrame, display
display(IFrame(src='num_patients_map.html', width=800, height=600))
display(IFrame(src='avg_treatment_pd_map.html', width=800, height=600))


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
import os

# Working copy restricted to the area-level covariates, the target and the
# two location columns.
state_map_columns = [*population_level_features, target_variable, "patient_zip3", "patient_state"]
temp = train_df[state_map_columns].copy()

# Resolve a state for every row, using the map_zip_to_state helper and the
# ZIP-to-state mapping already defined in the notebook session.
temp['mapped_state'] = temp.apply(
    lambda row: map_zip_to_state(row['patient_zip3'], row['patient_state']),
    axis=1,
)

# State/territory abbreviation -> full name (50 states, DC and Puerto Rico).
state_name_mapping = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'DC': 'District of Columbia', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii',
    'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine',
    'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota',
    'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska',
    'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico',
    'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island',
    'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas',
    'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington',
    'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming', 'PR': 'Puerto Rico',
}
all_states = list(state_name_mapping)
all_state_names = list(state_name_mapping.values())

# Start from the full list so that states without patients still get a row.
temp_all_states = pd.DataFrame({'mapped_state': all_states})

# Per state: mean of every population-level feature, mean of the target, and
# the number of rows (taken as the size of the 'patient_zip3' column).
agg_dict_state = dict.fromkeys(population_level_features, 'mean')
agg_dict_state['patient_zip3'] = 'size'
agg_dict_state[target_variable] = 'mean'

state_stats = temp.groupby('mapped_state').agg(agg_dict_state).reset_index()
state_stats = state_stats.rename(
    columns={'patient_zip3': 'num_patients', target_variable: 'avg_treatment_pd'}
)

# Left merge keeps every listed state; states absent from the data get 0.
temp_all_states = temp_all_states.merge(state_stats, on='mapped_state', how='left')
temp_all_states = temp_all_states.fillna(0)

# Full state names for hover labels.
temp_all_states['state_name'] = temp_all_states['mapped_state'].map(state_name_mapping)



# Function to create and save choropleth map for each feature
def create_choropleth(feature):
    """Draw a state-level choropleth of one column and save it as HTML.

    Reads the module-level ``temp_all_states`` frame, colours each state by
    ``feature`` and overlays a red text label of the form
    ``"<state code>: <value truncated to int>"``. As a side effect the frame
    gains (or overwrites) a ``'label'`` column. The map is written to
    ``choropleth_maps/<feature>_map.html``; the directory is created if needed.

    Parameters:
    feature (str): name of the column of ``temp_all_states`` to plot

    Returns:
    str: path of the HTML file that was written
    """
    display_name = feature.replace('_', ' ').title()

    fig = px.choropleth(
        temp_all_states,
        geojson='https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json',
        locations='mapped_state',
        locationmode='USA-states',
        color=feature,
        color_continuous_scale='Viridis',
        scope='usa',
        labels={feature: display_name},
        hover_data=['state_name'],
    )

    # Text overlay: state code plus the integer part of the plotted value.
    # (Earlier variants that showed only the state code or only the raw value,
    # in white, were replaced by this combined red label.)
    value_text = temp_all_states[feature].astype(int).astype(str)
    temp_all_states['label'] = temp_all_states['mapped_state'] + ': ' + value_text
    fig.add_scattergeo(
        locations=temp_all_states['mapped_state'],
        locationmode="USA-states",
        text=temp_all_states['label'],
        mode='text',
        textfont=dict(color='red'),
    )

    fig.update_geos(fitbounds="locations", visible=False)
    fig.update_layout(title=f'Spatial Distribution of {display_name}')

    os.makedirs('choropleth_maps', exist_ok=True)
    file_name = f"choropleth_maps/{feature}_map.html"
    fig.write_html(file_name)

    return file_name

# Write one map for the patient count, one for the mean treatment period and
# one per population-level feature, collecting the output paths.
mapped_columns = ['num_patients', 'avg_treatment_pd', *population_level_features]
choropleth_files = []
for feature in mapped_columns:
    file_name = create_choropleth(feature)
    choropleth_files += [file_name]

# Display the choropleth maps in Jupyter Notebook or IPython environment
# from IPython.display import IFrame, display
# for file in choropleth_files:
#     display(IFrame(src=file, width=800, height=600))


In [None]:
# import pandas as pd
# import json
# import plotly.express as px
# from IPython.display import IFrame, display

# # Assuming 'train_df' is the DataFrame with the required data
# # and 'population_level_features' and 'target_variable' are defined

# # Step 1: Convert ZIP codes to string with leading zeros
# temp = train_df[population_level_features + [target_variable] + ["patient_zip3", "patient_state"]].copy()
# temp['patient_zip3'] = temp['patient_zip3'].astype(str).str.zfill(3)
# temp['mapped_zip'] = temp['patient_zip3']

# # # Step 2: Aggregate the data to get average values for numerical features and count for patients
# # agg_dict = {feature: 'mean' for feature in population_level_features}
# # agg_dict.update({'patient_zip3': 'size', target_variable: 'mean'})

# # # Perform the aggregation
# # zip_stats = temp.groupby('patient_zip3').agg(agg_dict).reset_index()




# # # Rename columns for clarity
# # zip_stats.rename(columns={'patient_zip3': 'num_patients', target_variable: 'avg_treatment_pd'}, inplace=True)
# # zip_stats.rename(columns={'patient_zip3': 'mapped_zip'}, inplace=True)
# # Calculate the number of patients, average treatment period, and averages of other numerical features per 3-digit ZIP code
# agg_dict = {feature: 'mean' for feature in population_level_features}
# agg_dict.update({'patient_zip3': 'size', 'treatment_pd': 'mean'})

# zip_stats = temp.groupby('mapped_zip').agg(agg_dict).reset_index()

# # Rename columns for clarity
# zip_stats.rename(columns={'patient_zip3': 'num_patients', 'treatment_pd': 'avg_treatment_pd'}, inplace=True)



# # Fill missing values with 0
# zip_stats.fillna(0, inplace=True)

# # Load the GeoJSON file
# geojson_path = 'three_dig_zips.geojson'
# with open(geojson_path, 'r') as f:
#     geojson_content = json.load(f)

# # Function to create and save choropleth map for each feature
# def create_choropleth(feature):
#     fig = px.choropleth(zip_stats,
#                         geojson=geojson_content,
#                         locations='mapped_zip',
#                         featureidkey='properties.3dig_zip',
#                         color=feature,
#                         color_continuous_scale='Viridis',
#                         scope='usa',
#                         labels={feature: feature.replace('_', ' ').title()},
#                         hover_data=['mapped_zip'])

#     fig.update_geos(fitbounds="locations", visible=False)
#     fig.update_layout(title=f'Spatial Distribution of {feature.replace("_", " ").title()}')
#     file_name = f"{feature}_map.html"
#     fig.write_html(file_name)
#     return file_name

# # Create and save choropleth maps for each population-level numerical feature
# choropleth_files = []
# for feature in ['num_patients', 'avg_treatment_pd'] + population_level_features:
#     file_name = create_choropleth(feature)
#     choropleth_files.append(file_name)

# # Display the choropleth maps in Jupyter Notebook or IPython environment
# for file in choropleth_files:
#     display(IFrame(src=file, width=800, height=600))


In [None]:
# import pandas as pd
# import numpy as np
# import plotly.express as px
# import json
# from IPython.display import IFrame, display


# # Ensure all states are included, even if no data
# all_states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 
#               'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 
#               'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'PR']
# all_state_names = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 
#                    'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 
#                    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 
#                    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 
#                    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 
#                    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 
#                    'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico']
# # Assuming train_df is already loaded with the necessary data

# # Convert ZIP codes to string with leading zeros
# temp = train_df[population_level_features + [target_variable] + ["patient_zip3", "patient_state"]].copy()
# temp['patient_zip3'] = temp['patient_zip3'].astype(str).str.zfill(3)
# temp['mapped_zip'] = temp['patient_zip3']

# # Aggregate data by ZIP code
# agg_dict = {feature: 'mean' for feature in population_level_features}
# # agg_dict.update({'patient_zip3': 'size', 'treatment_pd': 'mean'})
# agg_dict.update({'patient_zip3': 'size', 'treatment_pd': 'mean', 'patient_state': lambda x: x.mode()[0]})

# zip_stats = temp.groupby('mapped_zip').agg(agg_dict).reset_index()

# # Rename columns for clarity
# zip_stats.rename(columns={'patient_zip3': 'num_patients', 'treatment_pd': 'avg_treatment_pd'}, inplace=True)

# # Fill missing values with 0
# zip_stats.fillna(0, inplace=True)

# # Load the GeoJSON file for ZIP codes
# geojson_path = 'three_dig_zips.geojson'
# with open(geojson_path, 'r') as f:
#     geojson_content = json.load(f)

# # Create ZIP to State Mapping
# zip_to_state_mapping = temp.groupby('patient_zip3')['patient_state'].agg(lambda x: x.mode()[0]).to_dict()

# def map_zip_to_state(zip_code, state):
#     zip_code = str(zip_code)
#     return zip_to_state_mapping.get(zip_code, state)

# temp['mapped_state'] = temp.apply(lambda row: map_zip_to_state(row['patient_zip3'], row['patient_state']), axis=1)

# # Ensure all states are included, even if no data
# # all_states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']
# temp_all_states = pd.DataFrame({'mapped_state': all_states})

# # Calculate the number of patients, average treatment period, and averages of other numerical features per state
# agg_dict_state = {feature: 'mean' for feature in population_level_features}
# agg_dict_state.update({'patient_zip3': 'size', target_variable: 'mean'})

# state_stats = temp.groupby('mapped_state').agg(agg_dict_state).reset_index()

# # Rename columns for clarity
# state_stats.rename(columns={'patient_zip3': 'num_patients', target_variable: 'avg_treatment_pd'}, inplace=True)

# # Merge with the all_states DataFrame to ensure all states are included
# temp_all_states = temp_all_states.merge(state_stats, on='mapped_state', how='left')

# # Fill missing values with 0
# temp_all_states.fillna(0, inplace=True)

# # Add state names for labeling
# # state_names = {
# #     'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 
# #     'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 'KY': 'Kentucky', 
# #     'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri', 
# #     'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina', 
# #     'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 
# #     'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming',
    
# # }
# temp_all_states['state_name'] = temp_all_states['mapped_state'].map(state_names)

# # Create state-level choropleth map
# fig_state = px.choropleth(temp_all_states,
#                           geojson='https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json',
#                           locations='mapped_state',
#                           locationmode='USA-states',
#                           color='avg_treatment_pd',
#                           color_continuous_scale='Blues',
#                           scope='usa',
#                           labels={'avg_treatment_pd': 'Average Treatment Period'},
#                           hover_data=['state_name'])

# zip_stats_copy= zip_stats.merge(temp_all_states[['mapped_state',"avg_treatment_pd"]], left_on = "patient_state",right_on='mapped_state', how='left')

# # Create ZIP code-level choropleth map
# fig_zip = px.choropleth(zip_stats_copy,
#                         geojson=geojson_content,
#                         locations='mapped_zip',
#                         featureidkey='properties.3dig_zip',
#                         color='avg_treatment_pd',
#                         color_continuous_scale='Reds',
#                         scope='usa',
#                         labels={'avg_treatment_pd': 'Average Treatment Period'},
#                         hover_data=['mapped_zip', 'num_patients', "patient_state" ])

# # Combine the state-level and ZIP code-level maps
# fig = fig_state
# for data in fig_zip.data:
#     fig.add_trace(data)

# # Update layout
# fig.update_layout(title='Combined Choropleth Map: States and ZIP Codes')

# # Save the figure to an HTML file
# file_name = "combined_choropleth_map.html"
# fig.write_html(file_name)

# # Display the combined choropleth map
# display(IFrame(src=file_name, width=800, height=600))


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import json
import requests
from IPython.display import IFrame, display

# Requires train_df, population_level_features and target_variable from earlier cells.

# Work on a copy restricted to the columns needed for the maps, and left-pad
# the 3-digit ZIP prefix with zeros so it is a 3-character string.
temp = train_df[population_level_features + [target_variable] + ["patient_zip3", "patient_state"]].copy()
temp['patient_zip3'] = temp['patient_zip3'].astype(str).str.zfill(3)
temp['mapped_zip'] = temp['patient_zip3']

# Per-ZIP aggregation: mean of every population-level feature, row count,
# mean of the target, and the most frequent state recorded for the ZIP.
# The target column is referenced through target_variable (not a hard-coded
# name) so this stays consistent with the column selection above and with
# the state-level aggregation.
agg_dict = {feature: 'mean' for feature in population_level_features}
agg_dict.update({
    'patient_zip3': 'size',
    target_variable: 'mean',
    # NOTE(review): x.mode()[0] presumably fails if a ZIP group has no
    # non-null state at all -- confirm against the data.
    'patient_state': lambda x: x.mode()[0],
})

zip_stats = temp.groupby('mapped_zip').agg(agg_dict).reset_index()

# Rename the aggregated columns to say what they now hold.
zip_stats.rename(columns={'patient_zip3': 'num_patients', target_variable: 'avg_treatment_pd'}, inplace=True)

# Replace any remaining missing aggregates with 0.
zip_stats.fillna(0, inplace=True)

# Load the GeoJSON file with the 3-digit ZIP polygons.
# The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's default text encoding.
geojson_path = 'three_dig_zips.geojson'
with open(geojson_path, 'r', encoding='utf-8') as f:
    geojson_content = json.load(f)

# ZIP3 -> most frequent state recorded for that ZIP3 in temp.
zip_to_state_mapping = temp.groupby('patient_zip3')['patient_state'].agg(lambda x: x.mode()[0]).to_dict()

def map_zip_to_state(zip_code, state):
    """Return the state mapped to ``zip_code``, or ``state`` if it has no entry.

    ``zip_code`` is converted to ``str`` before the lookup in
    ``zip_to_state_mapping``.
    """
    return zip_to_state_mapping.get(str(zip_code), state)

# Resolve each row's state in one positional pass over the two columns
# instead of a row-wise DataFrame.apply, which builds a Series per row.
temp['mapped_state'] = [
    map_zip_to_state(zip_code, state)
    for zip_code, state in zip(temp['patient_zip3'], temp['patient_state'])
]

# Postal abbreviation -> full name for every region drawn on the state map
# (50 states, DC and Puerto Rico). The ordered code/name lists used
# elsewhere are derived from this single table.
state_name_mapping = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'DC': 'District of Columbia', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii',
    'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine',
    'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota',
    'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska',
    'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico',
    'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island',
    'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas',
    'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington',
    'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming', 'PR': 'Puerto Rico',
}

all_states = list(state_name_mapping)
all_state_names = list(state_name_mapping.values())

# Per-state aggregation: mean of every population-level feature, the number
# of rows, and the mean of the target variable.
agg_dict_state = dict.fromkeys(population_level_features, 'mean')
agg_dict_state['patient_zip3'] = 'size'
agg_dict_state[target_variable] = 'mean'

state_stats = (
    temp.groupby('mapped_state')
    .agg(agg_dict_state)
    .reset_index()
    .rename(columns={'patient_zip3': 'num_patients', target_variable: 'avg_treatment_pd'})
)

# Left-join onto the full list of state codes so that states without any
# rows still appear; their missing aggregates are filled with 0.
temp_all_states = (
    pd.DataFrame({'mapped_state': all_states})
    .merge(state_stats, on='mapped_state', how='left')
    .fillna(0)
)

# Full state name, used as a hover label.
temp_all_states['state_name'] = temp_all_states['mapped_state'].map(state_name_mapping)

# State-level layer: one region per state code, shaded by the state's
# average treatment period.
state_map_options = dict(
    geojson='https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json',
    locations='mapped_state',
    locationmode='USA-states',
    color='avg_treatment_pd',
    color_continuous_scale='Blues',
    scope='usa',
    labels={'avg_treatment_pd': 'Average Treatment Period'},
    hover_data=['state_name'],
)
fig_state = px.choropleth(temp_all_states, **state_map_options)

# Attach each ZIP's state-level average. Both frames carry an
# 'avg_treatment_pd' column, so the merge suffixes them (_x = ZIP side,
# _y = state side); give both readable names in a single rename.
state_averages = temp_all_states[['mapped_state', 'avg_treatment_pd']]
zip_stats_copy = zip_stats.merge(
    state_averages,
    left_on='patient_state',
    right_on='mapped_state',
    how='left',
).rename(columns={
    'avg_treatment_pd_y': 'State overall avg treatment_pd',
    'avg_treatment_pd_x': 'Zipcode overall avg treatment_pd',
})

# ZIP3-level layer: polygons are matched to rows through the GeoJSON
# property '3dig_zip'.
zip_map_options = dict(
    geojson=geojson_content,
    locations='mapped_zip',
    featureidkey='properties.3dig_zip',
    color='Zipcode overall avg treatment_pd',
    color_continuous_scale='Reds',
    scope='usa',
    labels={'Zipcode overall avg treatment_pd': 'Zipcode overall avg treatment_pd'},
    hover_data=['mapped_zip', 'num_patients', 'patient_state', 'State overall avg treatment_pd'],
)
fig_zip = px.choropleth(zip_stats_copy, **zip_map_options)

# os is needed for makedirs below and is not imported by this cell.
import os

# Overlay the ZIP-level traces on the state-level figure.
# The loop variable is deliberately not called `data`: that name is the
# DataFrame loaded at the top of the notebook and must not be overwritten.
fig = fig_state
for trace in fig_zip.data:
    fig.add_trace(trace)

# Update layout
fig.update_layout(title='Combined Choropleth Map: States and ZIP Codes')

# Save the figure to an HTML file, creating the output directory if needed.
os.makedirs('choropleth_maps_combined', exist_ok=True)
file_name = "choropleth_maps_combined/treatment_pd.html"
fig.write_html(file_name)

# Display the combined choropleth map
# display(IFrame(src=file_name, width=800, height=600))


In [None]:
# Display the merged ZIP-level table (ZIP aggregates plus state-level average).
zip_stats_copy

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import json
import requests
from IPython.display import IFrame, display
import os

# Requires train_df, population_level_features and target_variable from earlier cells.

# Work on a copy restricted to the columns needed for the maps, and left-pad
# the 3-digit ZIP prefix with zeros so it is a 3-character string.
temp = train_df[population_level_features + [target_variable] + ["patient_zip3", "patient_state"]].copy()
temp['patient_zip3'] = temp['patient_zip3'].astype(str).str.zfill(3)
temp['mapped_zip'] = temp['patient_zip3']

# Per-ZIP aggregation: mean of every population-level feature, row count,
# mean of the target, and the most frequent state recorded for the ZIP.
# The target column is referenced through target_variable (not a hard-coded
# name) so this stays consistent with the column selection above and with
# the state-level aggregation.
agg_dict = {feature: 'mean' for feature in population_level_features}
agg_dict.update({
    'patient_zip3': 'size',
    target_variable: 'mean',
    # NOTE(review): x.mode()[0] presumably fails if a ZIP group has no
    # non-null state at all -- confirm against the data.
    'patient_state': lambda x: x.mode()[0],
})

zip_stats = temp.groupby('mapped_zip').agg(agg_dict).reset_index()

# Rename the aggregated columns to say what they now hold.
zip_stats.rename(columns={'patient_zip3': 'num_patients', target_variable: 'avg_treatment_pd'}, inplace=True)

# Replace any remaining missing aggregates with 0.
zip_stats.fillna(0, inplace=True)

# Load the GeoJSON file with the 3-digit ZIP polygons.
# The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's default text encoding.
geojson_path = 'three_dig_zips.geojson'
with open(geojson_path, 'r', encoding='utf-8') as f:
    geojson_content = json.load(f)

# ZIP3 -> most frequent state recorded for that ZIP3 in temp.
zip_to_state_mapping = temp.groupby('patient_zip3')['patient_state'].agg(lambda x: x.mode()[0]).to_dict()

def map_zip_to_state(zip_code, state):
    """Return the state mapped to ``zip_code``, or ``state`` if it has no entry.

    ``zip_code`` is converted to ``str`` before the lookup in
    ``zip_to_state_mapping``.
    """
    return zip_to_state_mapping.get(str(zip_code), state)

# Resolve each row's state in one positional pass over the two columns
# instead of a row-wise DataFrame.apply, which builds a Series per row.
temp['mapped_state'] = [
    map_zip_to_state(zip_code, state)
    for zip_code, state in zip(temp['patient_zip3'], temp['patient_state'])
]

# Postal abbreviation -> full name for every region drawn on the state map
# (50 states, DC and Puerto Rico). The ordered code/name lists used
# elsewhere are derived from this single table.
state_name_mapping = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'DC': 'District of Columbia', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii',
    'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine',
    'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota',
    'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska',
    'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico',
    'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island',
    'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas',
    'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington',
    'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming', 'PR': 'Puerto Rico',
}

all_states = list(state_name_mapping)
all_state_names = list(state_name_mapping.values())

# Per-state aggregation: mean of every population-level feature, the number
# of rows, and the mean of the target variable.
agg_dict_state = dict.fromkeys(population_level_features, 'mean')
agg_dict_state['patient_zip3'] = 'size'
agg_dict_state[target_variable] = 'mean'

state_stats = (
    temp.groupby('mapped_state')
    .agg(agg_dict_state)
    .reset_index()
    .rename(columns={'patient_zip3': 'num_patients', target_variable: 'avg_treatment_pd'})
)

# Left-join onto the full list of state codes so that states without any
# rows still appear; their missing aggregates are filled with 0.
temp_all_states = (
    pd.DataFrame({'mapped_state': all_states})
    .merge(state_stats, on='mapped_state', how='left')
    .fillna(0)
)




# Function to create and save choropleth map for each feature
def create_combined_choropleth(feature):
    """Build a combined state + ZIP3 choropleth for ``feature`` and save it as HTML.

    Reads the module-level ``temp_all_states``, ``zip_stats``,
    ``geojson_content`` and ``state_name_mapping``. The state layer and the
    ZIP layer are both coloured by ``feature``; the ZIP traces are added on
    top of the state figure and the result is written to
    ``choropleth_maps_combined/<feature>.html``.

    Side effects: adds/refreshes the ``state_name`` column on
    ``temp_all_states``, prints the head of the merged ZIP table, creates the
    output directory if it is missing, and writes the HTML file.

    Args:
        feature: Name of a column present in both ``temp_all_states`` and
            ``zip_stats``.

    Returns:
        None.
    """
    # The state figure lists 'state_name' in hover_data, so the column has
    # to exist before the figure is built.
    temp_all_states['state_name'] = temp_all_states['mapped_state'].map(state_name_mapping)

    # State-level layer.
    fig_state = px.choropleth(temp_all_states,
                              geojson='https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json',
                              locations='mapped_state',
                              locationmode='USA-states',
                              color=feature,
                              color_continuous_scale='Blues',
                              scope='usa',
                              labels={feature: feature.replace('_', ' ').title()},
                              hover_data=['state_name'])

    # Attach state-level average and patient count to every ZIP row. Both
    # frames carry these two columns, so the merge suffixes them
    # (_x = ZIP side, _y = state side); rename them to readable labels.
    zip_stats_copy = zip_stats.merge(
        temp_all_states[['mapped_state', 'avg_treatment_pd', "num_patients"]],
        left_on='patient_state',
        right_on='mapped_state',
        how='left',
    )
    zip_stats_copy.rename(columns={
        'avg_treatment_pd_y': 'State overall avg treatment_pd',
        'avg_treatment_pd_x': 'Zipcode overall avg treatment_pd',
        'num_patients_x': "Zipcode num_patients",
        'num_patients_y': "State num_patients",
    }, inplace=True)
    print(zip_stats_copy.head())

    # ZIP3-level layer. The label is keyed by the column actually plotted
    # (``feature``); a key that matches no column would be ignored.
    fig_zip = px.choropleth(zip_stats_copy,
                            geojson=geojson_content,
                            locations='mapped_zip',
                            featureidkey='properties.3dig_zip',
                            color=feature,
                            color_continuous_scale='Reds',
                            scope='usa',
                            labels={feature: f'Zipcode {feature}'.replace('_', ' ').title()},
                            hover_data=['mapped_zip', 'patient_state', 'Zipcode num_patients', 'State num_patients'])

    # Overlay the ZIP-level traces on the state-level figure.
    fig = fig_state
    for trace in fig_zip.data:
        fig.add_trace(trace)

    # Update layout
    fig.update_layout(title=f'Combined Choropleth Map: {feature.replace("_", " ").title()}')

    # Save the figure to an HTML file
    os.makedirs('choropleth_maps_combined', exist_ok=True)
    file_name = f"choropleth_maps_combined/{feature}.html"
    fig.write_html(file_name)

    # Display the combined choropleth map
    # display(IFrame(src=file_name, width=800, height=600))

# Create and save a combined choropleth for every population-level feature
# that was aggregated per ZIP. The filter uses zip_stats (built in this
# cell) rather than a zip_stats_copy left over from another cell, so the
# cell does not depend on stale notebook state.
for feature in [name for name in population_level_features if name in zip_stats.columns]:
    create_combined_choropleth(feature)


In [None]:
# import pandas as pd
# import json
# import plotly.express as px
# from IPython.display import IFrame, display

# # Load the GeoJSON file
# geojson_path = 'three_dig_zips.geojson'
# with open(geojson_path, 'r') as f:
#     geojson_content = json.load(f)

# # Create a DataFrame with one ZIP code and a value of 1
# # zip_stats = pd.DataFrame({
# #   'mapped_zip': ['032', '033', '034', '035', '036', '037', '038', '039', '040', '041', '042', '043'],
# #     'value': [1, 2] * 6 
# # })

# # Create a DataFrame with 100 ZIP codes and a value of 1 for each
# zip_codes = [feature['properties']['3dig_zip'] for feature in geojson_content['features'][:100]]
# zip_stats = pd.DataFrame({
#     'mapped_zip': zip_codes,
#     'value': [1] * len(zip_codes)
# })

# # Simple choropleth plot with value 1 for one ZIP code
# fig = px.choropleth(zip_stats,
#                     geojson=geojson_content,
#                     locations='mapped_zip',
#                     featureidkey='properties.3dig_zip',
#                     color='value',
#                     color_continuous_scale='Viridis',
#                     scope='usa',
#                     labels={'value': 'Value'},
#                     hover_data=['mapped_zip'])

# fig.update_geos(fitbounds="locations", visible=False)
# fig.update_layout(title='Choropleth Map with Value 1 for One ZIP Code')

# # Save the figure to an HTML file
# file_name = "one_zip_value_map.html"
# fig.write_html(file_name)

# # Display the choropleth map
# IFrame(src=file_name, width=800, height=600)



# OLD CODE

In [None]:
# Put the imputed and the raw training frames side by side, then drop every
# column that contains a missing value.
full_data = pd.concat([train_imputed, train_df], axis="columns").dropna(axis="columns")

In [None]:
# Bar chart of the number of imputed-training rows per patient_state.
state_counts = train_imputed['patient_state'].value_counts()
plt.figure(figsize=(50, 20))
plt.bar(state_counts.index, state_counts.values)

In [None]:
# Bar chart of training rows per patient_zip3, largest count first; the
# last expression shows the ZIP3 values in that same order.
zip3_counts = train_df['patient_zip3'].value_counts().sort_values(ascending=False)
plt.figure(figsize=(50, 20))
zip3_counts.plot(kind='bar')
zip3_counts.index

# plt.bar(train_df['patient_zip3'].value_counts().index, train_df['patient_zip3'].value_counts().values)