## Spatiotemporal Analysis of UV Exposure (J/m2) vs. Melanoma Cases across the US

### Data Source: Centers for Disease Control and Prevention (CDC)

### Data Loading and Cleaning

In [None]:
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

# Read both raw CSV extracts: UV exposure first, skin-cancer counts second.
uv, skn = (pd.read_csv(csv_name) for csv_name in ('UV.csv', 'skin.csv'))


In [None]:
def remove_parentheses(s):
    """Return *s* with every parenthesised fragment removed.

    Each ``(...)`` group (up to the first closing parenthesis) is deleted and
    the whitespace left at either end of the result is trimmed, so
    ``"White (includes Hispanic)"`` becomes ``"White"`` rather than
    ``"White "``.

    Non-string values (for example the float NaN pandas uses for missing
    cells) are returned unchanged instead of raising ``TypeError``, which
    keeps the function safe to use with ``Series.apply``.
    """
    if not isinstance(s, str):
        return s
    return re.sub(r'\([^)]*\)', '', s).strip()

In [None]:
# Keep only the rows whose count was actually reported. ``.copy()`` makes the
# result an independent frame, so the column assignment below does not write
# into a filtered slice of ``skn`` (the chained-assignment case pandas warns
# about; the warning is silenced globally in this notebook).
skn_clean = skn[skn['Value'] != "Suppressed"].copy()
# Drop parenthesised qualifiers from the race/ethnicity labels.
skn_clean['Race Ethnicity'] = skn_clean['Race Ethnicity'].apply(remove_parentheses)
skn_clean = skn_clean.rename(columns={'Value': 'Melanoma_cases'})
skn_clean

## Race / Ethnicity

In [None]:
import matplotlib.pyplot as plt
# Bar chart of how many rows carry each race/ethnicity label.
race = skn_clean['Race Ethnicity'].value_counts()
race_ax = race.plot.bar()
race_ax.set_title('Value Counts of Race / Ethnicity')
race_ax.set_xlabel('Ethnicity')
race_ax.set_ylabel('Counts')
race_ax.grid(True)
plt.show()

## Sex

In [None]:
# Bar chart of how many rows carry each value of the 'Gender' column.
sex = skn_clean['Gender'].value_counts()
sex_ax = sex.plot.bar()
sex_ax.set_title('Value Counts of Sex')
sex_ax.set_xlabel('Sex')
sex_ax.set_ylabel('Counts')
sex_ax.grid(True)
plt.show()

## Temporal Analysis

In [None]:
# Convert the counts to floats. The thousands separators are only stripped
# while the column is still text; once it is numeric the ``.str`` accessor
# would raise, so the guard keeps this cell safe to run more than once.
melanoma_counts = skn_clean['Melanoma_cases']
if not pd.api.types.is_numeric_dtype(melanoma_counts):
    melanoma_counts = melanoma_counts.str.replace(',', '')
skn_clean['Melanoma_cases'] = melanoma_counts.astype(float)

# Mean count per state over all rows for that state.
sc_avg = skn_clean.groupby("State")["Melanoma_cases"].mean().reset_index()

# Sort in descending order (states with the highest average count come first)
sc_avg = sc_avg.sort_values(by="Melanoma_cases", ascending=False)

# Horizontal bar chart of the per-state averages
plt.figure(figsize=(10, 8))
plt.barh(sc_avg['State'], sc_avg['Melanoma_cases'], color='skyblue')
plt.xlabel('Average Cancer Count')
plt.ylabel('State')
plt.title('Average Cancer Count by State over Years')
plt.gca().invert_yaxis()  # barh draws the first row at the bottom; flip so the highest value is on top
plt.show()

In [None]:
# Remove the comment column and the unnamed column, then give the
# measurement column a descriptive name.
uv = uv.drop(columns=['Data Comment', 'Unnamed: 5']).rename(columns={'Value': 'uv_exp'})
uv

In [None]:
# Convert the UV readings to floats. The thousands separators are only
# stripped while the column is still text; once it is numeric the ``.str``
# accessor would raise, so the guard keeps this cell safe to run more than once.
uv_readings = uv['uv_exp']
if not pd.api.types.is_numeric_dtype(uv_readings):
    uv_readings = uv_readings.str.replace(',', '')
uv['uv_exp'] = uv_readings.astype(float)

# Mean UV exposure per state over all rows for that state.
uv_avg = uv.groupby("State")["uv_exp"].mean().reset_index()

# Sort in descending order (states with the highest average UV exposure come first)
uv_avg = uv_avg.sort_values(by="uv_exp", ascending=False)

# Horizontal bar chart of the per-state averages
plt.figure(figsize=(10, 8))
plt.barh(uv_avg['State'], uv_avg['uv_exp'], color='skyblue')
plt.xlabel('Average UV Exposure')
plt.ylabel('State')
plt.title('Average UV Exposure by State over Years')
plt.gca().invert_yaxis()  # barh draws the first row at the bottom; flip so the highest value is on top
plt.show()

### Merging UV exposure data with Melanoma_cases data

In [None]:
# Preview the first five rows of the per-state UV averages.
uv_avg.head(5)

In [None]:
# Preview the first five rows of the per-state melanoma averages.
sc_avg.head(5)

In [None]:
# Left join on 'State': every state with a melanoma average is kept, and its
# UV average is attached where one exists.
agg_df = sc_avg.merge(uv_avg, how='left', on='State')

In [None]:
# Display the merged per-state table (melanoma average and UV average).
agg_df

In [None]:
import matplotlib.pyplot as plt
import geopandas

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import folium
from folium.features import GeoJsonTooltip

In [None]:
# Load the county GeoJSON with geopandas and keep only the code, geometry
# and state-name columns.
keep_columns = ['coty_code', 'geometry', 'ste_name']
geojson = gpd.read_file(r'georef-united-states-of-america-county.geojson')[keep_columns]




In [None]:
# Keep only element 0 of each value in both columns via the ``.str[0]`` accessor.
# NOTE(review): presumably these columns hold one-element lists in the source
# file, so this unwraps them to scalars — TODO confirm. If the values are plain
# strings (for instance when this cell is run a second time), ``.str[0]`` keeps
# only the first character.
geojson['coty_code'] = geojson['coty_code'].str[0]
geojson['ste_name'] = geojson['ste_name'].str[0]


In [None]:
# Preview the first rows of the trimmed GeoJSON table.
geojson.head()

In [None]:
# Outer-join the shapes with the per-state averages on the state name, then
# drop the rows that came only from ``agg_df`` and therefore have no geometry.
df_final = geojson.merge(agg_df, how="outer", left_on="ste_name", right_on="State")
df_final = df_final[df_final['geometry'].notna()]

In [None]:
# Preview the first ten rows of the shapes joined with the per-state averages.
df_final.head(10)

In [None]:
# Base map for the UV-exposure layer, centred at latitude 40, longitude -96.
us_map = folium.Map(
    location=[40, -96],
    zoom_start=4,
    tiles='openstreetmap',
)

## UV Exposure (J/m2) across US states (Annual)

In [None]:
# Quantile edges (0%, 20%, ..., 100%) of the UV values in ``df_final``. They
# were previously computed but never handed to the choropleth, so the map
# fell back to folium's default binning; they are now passed as ``bins``.
custom_scale = df_final['uv_exp'].quantile((0, 0.2, 0.4, 0.6, 0.8, 1)).tolist()
folium.Choropleth(
    geo_data=geojson,
    data=df_final,
    columns=['State', 'uv_exp'],  # key column first, then the value to colour by
    key_on='feature.properties.ste_name',  # GeoJSON property matched against 'State'
    bins=custom_scale,
    fill_color='YlOrRd',
    nan_fill_color="White",  # features without a UV value are drawn white
    fill_opacity=0.7,
    line_opacity=0.2,
    highlight=True,  # highlight a feature while the mouse hovers over it
).add_to(us_map)
us_map

## Melanoma cases across US states

In [None]:
# Base map for the melanoma layer, centred at latitude 40, longitude -96.
us_map1 = folium.Map(
    location=[40, -96],
    zoom_start=4,
    tiles='openstreetmap',
)

In [None]:
# Quantile edges (0%, 20%, ..., 100%) of the melanoma averages in
# ``df_final``. They were previously computed but never handed to the
# choropleth, so the map fell back to folium's default binning; they are now
# passed as ``bins``.
custom_scale = df_final['Melanoma_cases'].quantile((0, 0.2, 0.4, 0.6, 0.8, 1)).tolist()
folium.Choropleth(
    geo_data=geojson,
    data=df_final,
    columns=['State', 'Melanoma_cases'],  # key column first, then the value to colour by
    key_on='feature.properties.ste_name',  # GeoJSON property matched against 'State'
    bins=custom_scale,
    fill_color='YlOrRd',
    nan_fill_color="White",  # features without a melanoma value are drawn white
    fill_opacity=0.7,
    line_opacity=0.2,
    highlight=True,  # highlight a feature while the mouse hovers over it
).add_to(us_map1)
us_map1

In [None]:
# Attach the UV reading for the same state and year to every melanoma row
# (left join, so melanoma rows without a UV match are kept).
new_df_org = skn_clean.merge(uv, how='left', on=['State', 'Year'])
new_df_org.dtypes

In [None]:
# Display the melanoma rows joined with their state/year UV readings.
new_df_org

## Average normalised correlation across years between UV exposure and Melanoma cases

In [None]:
def _uv_melanoma_corr(state_rows):
    """Correlation between UV exposure and melanoma counts within one group."""
    return state_rows['uv_exp'].corr(state_rows['Melanoma_cases'])


# One correlation value per state. Despite the variable name, the grouping
# key is 'State': each value is computed over all rows of that state.
correlation_by_year = new_df_org.groupby('State').apply(_uv_melanoma_corr)

# Turn the State index into a column and label the value column.
correlation_by_year_df = correlation_by_year.reset_index(name='correlation')

correlation_by_year_df

In [None]:
# Outer-join the shapes with the per-state correlations on the state name.
df_final_cor = geojson.merge(
    correlation_by_year_df,
    how="outer",
    left_on="ste_name",
    right_on="State",
)
df_final_cor

In [None]:
# Base map for the correlation layer, centred at latitude 40, longitude -96.
us_map2 = folium.Map(
    location=[40, -96],
    zoom_start=4,
    tiles='openstreetmap',
)

In [None]:
# Quantile edges (0%, 20%, ..., 100%) of the per-state correlations in
# ``df_final_cor``. They were previously computed but never handed to the
# choropleth, so the map fell back to folium's default binning; they are now
# passed as ``bins``.
custom_scale = df_final_cor['correlation'].quantile((0, 0.2, 0.4, 0.6, 0.8, 1)).tolist()
folium.Choropleth(
    geo_data=geojson,
    data=df_final_cor,
    columns=['State', 'correlation'],  # key column first, then the value to colour by
    key_on='feature.properties.ste_name',  # GeoJSON property matched against 'State'
    bins=custom_scale,
    fill_color='YlOrRd',
    nan_fill_color="White",  # features without a correlation value are drawn white
    fill_opacity=0.7,
    line_opacity=0.2,
    highlight=True,  # highlight a feature while the mouse hovers over it
).add_to(us_map2)
us_map2