In [None]:
import pandas as pd
import networkx as nx
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

from shapely.geometry import Point
from geopy.distance import geodesic
from matplotlib.colors import LogNorm

from src.transform_data import get_city_name, get_state_name

In [None]:
# Load the municipality boundaries shapefile; the path is relative to the
# notebook's working directory.
municipios_shp_path = "data/BR_Municipios_2024/BR_Municipios_2024.shp"
mapa = gpd.read_file(municipios_shp_path)

In [None]:
# Index the map by a composite "<NM_MUN> - <NM_UF>" label, the same
# "<city> - <state>" format that is later built for the hospitalization rows
# and used for label lookups into this frame.
municipality_key = mapa['NM_MUN'].astype(str) + ' - ' + mapa['NM_UF']
mapa = mapa.assign(MUN=municipality_key).set_index('MUN')
mapa

In [None]:
# Hospitalization table; later cells read its MUNIC_RES, MUNIC_MOV and
# HOSPITALIZACOES columns. The path is relative to the working directory.
infectious_disease_csv = 'data/agg_data/infectious_disease.csv'
df = pd.read_csv(infectious_disease_csv)

In [None]:
# Derive the state columns while MUNIC_RES / MUNIC_MOV still hold the values
# read from the CSV; a later cell overwrites those columns with city names,
# so this cell must run first.
for state_col, munic_col in (('UF_RES', 'MUNIC_RES'), ('UF_MOV', 'MUNIC_MOV')):
    df[state_col] = get_state_name(df[munic_col])

In [None]:
# Overwrite the municipality columns with the output of get_city_name.
# NOTE: not idempotent — the values read from the CSV are lost here, so this
# cell must not be re-run without reloading `df`.
df = df.assign(
    MUNIC_RES=get_city_name(df['MUNIC_RES']),
    MUNIC_MOV=get_city_name(df['MUNIC_MOV']),
)

In [None]:
# Turn each municipality column into a "<city> - <state>" label so it matches
# the composite index of `mapa`.
for munic_col, state_col in (('MUNIC_RES', 'UF_RES'), ('MUNIC_MOV', 'UF_MOV')):
    df[munic_col] = df[munic_col].astype(str) + ' - ' + df[state_col]
df

In [None]:
# Spot-check: look up one municipality by its composite "<city> - <state>" index label.
mapa.loc['São Paulo - São Paulo']

In [None]:
def get_centroid_coords(mun_name):
    """Return the centroid of a municipality as a ``(y, x)`` tuple.

    The municipality is looked up by index label in the notebook-level
    ``mapa`` frame. The tuple is ordered ``(centroid.y, centroid.x)`` because
    the result is handed directly to ``geodesic``.
    NOTE(review): this presumes the shapefile uses a geographic CRS, so that
    y/x are latitude/longitude in degrees — confirm.

    Parameters
    ----------
    mun_name : hashable
        Index label in ``mapa`` ("<city> - <state>").

    Returns
    -------
    tuple[float, float] | None
        ``(y, x)`` of the centroid, or ``None`` when the label is not in
        ``mapa`` or its geometry is missing or empty.
    """
    # Only the label lookup can legitimately raise KeyError, so keep the
    # try block limited to it.
    try:
        geom = mapa.loc[mun_name].geometry
    except KeyError:
        return None
    # An empty geometry has no meaningful centroid (its coordinates are not
    # finite), which would make the downstream geodesic call fail; treat it
    # the same as a missing geometry.
    if geom is None or geom.is_empty:
        return None
    centroid = geom.centroid
    return (centroid.y, centroid.x)

def calc_distance(row):
    """Geodesic distance in kilometres between a row's two municipalities.

    Reads the ``MUNIC_RES`` and ``MUNIC_MOV`` labels from ``row``, resolves
    each to centroid coordinates with ``get_centroid_coords`` and measures
    the distance between them with ``geodesic``.

    Returns ``None`` when either municipality cannot be resolved (label not
    found in the map, or no usable geometry).
    """
    endpoints = [get_centroid_coords(row[col]) for col in ('MUNIC_RES', 'MUNIC_MOV')]
    if any(point is None for point in endpoints):
        return None
    origin, destination = endpoints
    return geodesic(origin, destination).kilometers

# Compute the distance row by row between the MUNIC_RES and MUNIC_MOV
# municipalities. Rows for which calc_distance returns None (a municipality
# could not be resolved in `mapa`) end up without a usable distance.
df['DIST_KM'] = df.apply(calc_distance, axis=1)
df

In [None]:
# Hospitalization-weighted mean distance per municipality of residence.
#
# Rows whose distance is unknown (NaN) must be left out of BOTH the numerator
# and the denominator. Previously `.sum()` silently skipped them in the
# numerator while their hospitalizations still counted in the denominator,
# which biased every mean downward and reported 0 km for municipalities with
# no known distance at all. Such municipalities are now simply absent from
# the result.
#
# Plain grouped sums are used instead of `groupby(...).apply(lambda ...)`,
# which newer pandas versions warn about when the function receives the
# grouping column.
dist_known = df.dropna(subset=['DIST_KM'])
weighted_km = dist_known['DIST_KM'] * dist_known['HOSPITALIZACOES']
grouped_weighted_mean = (
    weighted_km.groupby(dist_known['MUNIC_RES']).sum()
    / dist_known.groupby('MUNIC_RES')['HOSPITALIZACOES'].sum()
)

In [None]:
# Unweighted average of the per-municipality weighted means
# (every municipality counts once, regardless of its number of hospitalizations).
grouped_weighted_mean.mean()

In [None]:

# Distribution of the per-municipality weighted mean distances, drawn on an
# explicit Axes; the title reports the average of those means.
fig, ax = plt.subplots(figsize=(10, 6))
grouped_weighted_mean.hist(bins=30, edgecolor='black', ax=ax)
ax.set_title('Histogram of Mean Distances Traveled for Hospitalization by Municipality (mean = {:.2f} km)'.format(grouped_weighted_mean.mean()))
ax.set_xlabel('Weighted Mean Distance (km)')
ax.set_ylabel('Frequency')


In [None]:
# A log colour scale cannot represent 0, so municipalities whose weighted mean
# is exactly 0 km are bumped to 1 km to keep them on the scale.
grouped_weighted_mean = grouped_weighted_mean.replace(0, 1)

# Municipalities without data stay NaN (no fillna(0)): geopandas draws NaN
# rows using `missing_kwds`, whereas a 0 is masked by LogNorm and therefore
# never received the intended light-grey fill.
mapa['WEIGHTED_MEAN_DIST'] = mapa.index.map(grouped_weighted_mean)

# Colour limits come from the observed, strictly positive values only.
observed = mapa['WEIGHTED_MEAN_DIST'].dropna()
color_norm = LogNorm(vmin=observed[observed > 0].min(), vmax=observed.max())

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
mapa.plot(
    column='WEIGHTED_MEAN_DIST',
    ax=ax,
    legend=True,
    legend_kwds={
        'label': "Weighted Mean Distance (km)",
        'orientation': "horizontal"
    },
    cmap='viridis',
    norm=color_norm,
    missing_kwds={"color": "lightgrey"}
)
plt.title('Mean Distances Traveled for Hospitalization by Municipality (Log Scale)')
plt.show()


In [None]:
# Overall hospitalization-weighted mean distance.
total_dist_traveled = df['DIST_KM'] * df['HOSPITALIZACOES']

# Rows with an unknown distance contribute nothing to the numerator (NaN is
# skipped by .sum()), so their hospitalizations must not be counted in the
# denominator either; otherwise the mean is biased downward and disagrees
# with the mean of the expanded per-hospitalization series computed below.
hosp_with_known_dist = df.loc[df['DIST_KM'].notna(), 'HOSPITALIZACOES']

total_dist_traveled.sum() / hosp_with_known_dist.sum()

In [None]:
# Expand the table to one distance value per hospitalization: each row's
# DIST_KM is repeated HOSPITALIZACOES times.
t = [
    dist_km
    for dist_km, n_hosp in zip(df['DIST_KM'], df['HOSPITALIZACOES'])
    for _ in range(n_hosp)
]

In [None]:
# Wrap the expanded list in a Series so pandas' mean/hist can be used on it.
t = pd.Series(t)

In [None]:
# Display the per-hospitalization distance series.
t

In [None]:

# Distribution of individual travel distances (one value per hospitalization).
fig, ax = plt.subplots(figsize=(10, 6))
t.hist(bins=100, ax=ax)
ax.set_title('Histogram of Distances Traveled for Hospitalization (mean = {:.2f} km)'.format(t.mean()))
# `t` holds one raw distance per hospitalization, not a per-municipality
# weighted mean, so the axis is labelled as a plain distance (the previous
# label was carried over from the municipality-level histogram).
ax.set_xlabel('Distance (km)')
ax.set_ylabel('Frequency')

In [None]:
# Mean distance per hospitalization; NaN entries (unknown distances) are skipped by .mean().
t.mean()

In [None]:
# Histogram of the DIST_KM column itself: every row of `df` counts once,
# i.e. this view is NOT weighted by HOSPITALIZACOES.
df['DIST_KM'].hist(bins=100, figsize=(10, 6))

In [None]:
# Build a directed graph with one edge per row of `df`, pointing from
# MUNIC_RES to MUNIC_MOV; each edge carries the row's HOSPITALIZACOES and
# DIST_KM values as attributes.
edge_attributes = ['HOSPITALIZACOES', 'DIST_KM']
G = nx.from_pandas_edgelist(
    df,
    'MUNIC_RES',
    'MUNIC_MOV',
    edge_attr=edge_attributes,
    create_using=nx.DiGraph(),
)

In [None]:
# Weighted degrees, using HOSPITALIZACOES as the edge weight. Edges point
# from MUNIC_RES to MUNIC_MOV, so a node's in-degree sums the weights of
# edges arriving at it and its out-degree sums the weights of edges leaving it.
in_degree = dict(G.in_degree(weight='HOSPITALIZACOES'))
out_degree = dict(G.out_degree(weight='HOSPITALIZACOES'))

# A self-loop is counted in both degrees; remove it from the out-degree only,
# so it stays in the in-degree and is not counted twice in the denominator.
for node in G.nodes():
    if G.has_edge(node, node):
        out_degree[node] -= G[node][node].get('HOSPITALIZACOES', 0)


def _inflow_share(node):
    """Share in / (in + out) for one node; NaN when the node has no weight at all."""
    total = in_degree[node] + out_degree[node]
    # Guard against a zero denominator (all of the node's edges have weight 0),
    # which would otherwise raise ZeroDivisionError and abort the whole cell.
    return in_degree[node] / total if total else float('nan')


degree_diff = {node: _inflow_share(node) for node in G.nodes()}

degree_diff = pd.Series(degree_diff, name='DEGREE_DIFF')

degree_diff.sort_values(ascending=False).round(3)

In [None]:
# Bar chart of every node's DEGREE_DIFF value, largest first.
# One bar per node is too dense to label, so only about ten x ticks are shown.
# The step is clamped to at least 1: range() raises ValueError for a step of
# 0, which happened whenever there were fewer than 10 nodes.
tick_step = max(1, len(degree_diff) // 10)
degree_diff.sort_values(ascending=False).plot(
    kind='bar',
    figsize=(12, 6),
    title='Diferença percentual entre hospitalizações no município e moradores do município hospitalizados',
    ylabel='Degree Difference',
    xticks=range(0, len(degree_diff), tick_step),
)

In [None]:
# Node counts: share below 0.5, share at or above 0.5, and share below 0.5
# once the nodes with a share of exactly 0 are excluded.
below_half = degree_diff < 0.5
(
    below_half.sum(),
    (degree_diff >= 0.5).sum(),
    (below_half & (degree_diff != 0)).sum(),
)

In [None]:
# Number of nodes whose share is exactly 0 (nothing arrives at them).
# The stray trailing comma was removed so the cell shows the count itself
# instead of a one-element tuple.
(degree_diff == 0).sum()

In [None]:
# Number of distinct MUNIC_RES labels, shown as a one-element shape tuple
# (a missing label, if any, counts as one value).
(df['MUNIC_RES'].nunique(dropna=False),)

In [None]:
# Keep only the nodes whose share is strictly between 0 and 0.5.
d = degree_diff[(degree_diff > 0.0) & (degree_diff < 0.5)]

# Show about ten x ticks. The step is clamped to at least 1 because range()
# raises ValueError for a step of 0, which happened whenever fewer than 10
# nodes passed the filter.
tick_step = max(1, len(d) // 10)
d.sort_values(ascending=False).plot(
    kind='bar',
    figsize=(12, 6),
    title='Degree Difference by City',
    ylabel='Degree Difference',
    xticks=range(0, len(d), tick_step),
)

In [None]:
# Read the "MUNICÍPIOS" sheet of the .ods spreadsheet (the "odf" engine is
# needed for OpenDocument files). Presumably municipal population estimates,
# judging by the file name — verify against the data source.
pop = pd.read_excel(
    "data/estimativa_dou_2024.ods",
    sheet_name="MUNICÍPIOS",
    engine="odf",
)

In [None]:
# Display the loaded sheet to inspect its layout.
pop