# Spatiotemporal Analysis of Drought vs. Stroke Cases across the US

### Data Source: Centers for Disease Control and Prevention (CDC)

### Data Loading and Cleaning

In [None]:
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

# Both input files are read with the same single-byte encoding.
CSV_ENCODING = 'ISO-8859-1'

stroke = pd.read_csv('Stroke.csv', encoding=CSV_ENCODING)
drought = pd.read_csv('Drought.csv', encoding=CSV_ENCODING)

In [None]:
# Preview the stroke table exactly as loaded from Stroke.csv.
stroke

## Data Wrangling

In [None]:
# Matches an opening parenthesis, any run of characters other than ')',
# and the first closing parenthesis that follows.
_PARENTHESIZED = re.compile(r'\([^)]*\)')


def remove_parentheses(s):
    """Return *s* with every ``(...)`` group deleted.

    Only the parenthesised text itself is removed; whitespace around it is
    left untouched (``"Alabama (AL)"`` becomes ``"Alabama "``). An opening
    parenthesis without a matching ``)`` is kept as-is.

    Args:
        s: The string to clean.

    Returns:
        The string with all parenthesised groups removed.
    """
    return _PARENTHESIZED.sub('', s)

In [None]:
# Drop the columns that are not used in the analysis. Each label is listed
# once (the original list repeated 'CountyFIPS').
stroke = stroke.drop(
    columns=['StateFIPS', 'CountyFIPS', 'County', 'End Year', 'Data Comment'],
)
# Discard rows whose Value is the literal "Insufficient Data" marker.
# .copy() gives an independent frame, so the column assignment below does not
# write into a filtered slice of the previous frame (chained assignment).
stroke = stroke[stroke['Value'] != "Insufficient Data"].copy()
# Shorten the labels by removing the ", not including Hispanic" text.
stroke['Race/Ethnicity'] = stroke['Race/Ethnicity'].str.replace(
    ', not including Hispanic', '', regex=False,
)
stroke = stroke.rename(columns={'Start Year': 'Year'})
stroke

In [None]:
# Preview the drought table exactly as loaded from Drought.csv.
drought

In [None]:
# Drop the columns that are not used in the analysis. Each label is listed
# once (the original list repeated 'CountyFIPS').
drought = drought.drop(
    columns=[
        'StateFIPS',
        'CountyFIPS',
        'County',
        'Cumulative Drought Severity',
        'Data Comment',
    ],
)
drought

## Stroke count across Race/Ethnicity

In [None]:
import matplotlib.pyplot as plt

# Count how many rows of the stroke table fall under each Race/Ethnicity
# label and draw the counts as a bar chart.
race = stroke['Race/Ethnicity'].value_counts()
ax = race.plot(kind='bar')

# Decorate the axes returned by the plot call, then render the figure.
ax.set_title('stroke Counts of Race / Ethnicity')
ax.set_xlabel('Ethnicity')
ax.set_ylabel('Counts')
ax.grid(True)
plt.show()

## Stroke count across Sex

In [None]:
import matplotlib.pyplot as plt

# Count how many rows of the stroke table fall under each Gender value and
# draw the counts as a bar chart.
sex = stroke['Gender'].value_counts()
ax = sex.plot(kind='bar')

# Decorate the axes returned by the plot call, then render the figure.
ax.set_title('stroke Counts of Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Counts')
ax.grid(True)
plt.show()

In [None]:
# Value is compared against a text marker during cleaning, so cast it to
# float before aggregating.
stroke['Value'] = stroke['Value'].astype(float)
# Sum of the stroke Value column per state.
stroke_ttl = stroke.groupby("State")["Value"].sum().reset_index()

# Sort descending so the states with the highest total stroke value come first
stroke_ttl = stroke_ttl.sort_values(by="Value", ascending=False)

# Plot the data as a horizontal bar chart
plt.figure(figsize=(10, 8))
plt.barh(stroke_ttl['State'], stroke_ttl['Value'], color='skyblue')
plt.xlabel('Total stroke')  # was misspelled 'Toral stroke'
plt.ylabel('State')
plt.title('Total stroke by State in 2018')
plt.gca().invert_yaxis()  # invert the y-axis so that states with higher values are on top
plt.show()

In [None]:
# Mean of the drought Value column per state, ordered from the highest
# average to the lowest.
drought_avg = (
    drought.groupby("State")["Value"]
    .mean()
    .reset_index()
    .sort_values(by="Value", ascending=False)
)

# Horizontal bar chart; the y-axis is flipped so the first (largest) row of
# the sorted table is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(drought_avg['State'], drought_avg['Value'], color='skyblue')
ax.set_xlabel('Average drought')
ax.set_ylabel('State')
ax.set_title('Average consecutive weeks of drought by State over Year 2018')
ax.invert_yaxis()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import geopandas
import geopandas as gpd
import pandas as pd
import numpy as np
import folium
from folium.features import GeoJsonTooltip

In [None]:
#Read the geoJSON file using geopandas
geojson = gpd.read_file(r'georef-united-states-of-america-county.geojson')
geojson=geojson[['geometry','ste_name']] 

geojson['ste_name'] = geojson['ste_name'].str[0]
geojson

In [None]:
# Outer-join the boundaries with the per-state drought averages
# (geojson.ste_name <-> drought_avg.State), then keep only the rows that
# actually carry a geometry.
df_final = geojson.merge(drought_avg, how="outer", left_on="ste_name", right_on="State")
has_geometry = df_final['geometry'].notna()
df_final = df_final[has_geometry]
df_final

## Average consecutive weeks of drought by State in 2018

In [None]:
# Base map for the drought choropleth, centred on [40, -96] at zoom level 4
# with OpenStreetMap tiles.
us_map = folium.Map(
    location=[40, -96],
    zoom_start=4,
    tiles='openstreetmap',
)

In [None]:
# Bin edges at the 0/20/40/60/80/100th percentiles of the mapped values.
# The original cell computed this list but never handed it to folium; it is
# now passed as `bins`, so the colour classes follow these quantile edges.
custom_scale = df_final['Value'].quantile((0, 0.2, 0.4, 0.6, 0.8, 1)).tolist()
folium.Choropleth(
    geo_data=geojson,
    data=df_final,
    columns=['State', 'Value'],  # (join key, value that drives the colour)
    key_on='feature.properties.ste_name',  # GeoJSON property matched against 'State'
    bins=custom_scale,
    fill_color='YlOrRd',
    nan_fill_color="White",  # use white where no value is available for a feature
    fill_opacity=0.7,
    line_opacity=0.2,
    highlight=True,
).add_to(us_map)
us_map

## Total stroke by State in 2018 

In [None]:
# Outer-join the boundaries with the per-state stroke totals
# (geojson.ste_name <-> stroke_ttl.State); unmatched rows from either side
# are kept.
stroke_final = geojson.merge(
    stroke_ttl,
    how="outer",
    left_on="ste_name",
    right_on="State",
)
stroke_final

In [None]:
# Base map for the stroke choropleth, centred on [40, -96] at zoom level 4
# with OpenStreetMap tiles.
us_map1 = folium.Map(
    location=[40, -96],
    zoom_start=4,
    tiles='openstreetmap',
)

In [None]:
# Bin edges at the 0/20/40/60/80/100th percentiles of the mapped values.
# The original cell computed this list but never handed it to folium; it is
# now passed as `bins`, so the colour classes follow these quantile edges.
custom_scale = stroke_final['Value'].quantile((0, 0.2, 0.4, 0.6, 0.8, 1)).tolist()
folium.Choropleth(
    geo_data=geojson,
    data=stroke_final,
    columns=['State', 'Value'],  # (join key, value that drives the colour)
    key_on='feature.properties.ste_name',  # GeoJSON property matched against 'State'
    bins=custom_scale,
    fill_color='YlOrRd',
    nan_fill_color="White",  # use white where no value is available for a feature
    fill_opacity=0.7,
    line_opacity=0.2,
    highlight=True,
).add_to(us_map1)
us_map1

In [None]:
# Left-join the drought/geometry table onto the per-state stroke totals by
# 'State'. Both inputs have a 'Value' column, so the result carries
# 'Value_x' (from stroke_ttl) and 'Value_y' (from df_final).
new_df_org = stroke_ttl.merge(
    df_final,
    how='left',
    left_on='State',
    right_on='State',
)
new_df_org

In [None]:
# Discard every row that has a missing value in any column.
new_df_org = new_df_org.dropna(axis=0, how='any')
new_df_org

In [None]:
# Value_x (stroke total) and Value_y (mean drought) are per-state aggregates
# that the merges repeat on every row of a state. Collapse to one row per
# state so that each state contributes a single observation.
state_level = new_df_org.drop_duplicates(subset='State')[['State', 'Value_x', 'Value_y']]

# Pearson correlation across states between mean drought and total stroke.
overall_correlation = state_level['Value_y'].corr(state_level['Value_x'])
print(f"Cross-state correlation (drought vs. stroke): {overall_correlation:.3f}")

# Per-state table, kept so the existing names stay available. Within a single
# state both columns are constant, so every entry here is NaN; the
# cross-state figure printed above is the meaningful statistic.
correlation_by_year = new_df_org.groupby('ste_name').apply(
    lambda x: x['Value_y'].corr(x['Value_x'])
)

correlation_by_year_df = correlation_by_year.reset_index()
correlation_by_year_df.columns = ['State', 'correlation']

correlation_by_year_df