In [None]:
# Have the inline matplotlib backend emit figures in the "retina" format.
%config InlineBackend.figure_formats = ["retina"]

In [None]:
!pip install -q geopy seaborn statsmodels

In [None]:
import sqlite3

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from functools import partial

from geopy import distance
from shapely.geometry import Point

In [None]:
# Load the full `optn` table. The connection is closed as soon as the read
# finishes (even on error) so the notebook does not keep the SQLite file open.
optn_conn = sqlite3.connect("../../preprocessing/outputs/optn_reduced.sqlite3")
try:
    optn_df = pd.read_sql("SELECT * FROM optn", optn_conn)
finally:
    optn_conn.close()

# Cast the two flag columns to booleans for filtering and plotting.
# NOTE(review): astype(bool) maps NaN to True -- confirm these columns contain no nulls.
optn_df["deceased_donor"] = optn_df["deceased_donor"].astype(bool)
optn_df["succeeded_1_year"] = optn_df["succeeded_1_year"].astype(bool)

In [None]:
# Inspect the freshly loaded table.
optn_df

In [None]:
def transplant_distance_miles(row):
    """Return the distance in miles between a row's donor and transplant locations.

    ``row`` must provide ``donor_lat``/``donor_lon`` and
    ``transplant_lat``/``transplant_lon``. Each pair is passed to geopy's
    ``distance.distance`` as a ``(lat, lon)`` tuple, donor first.
    """
    donor_coords = (row["donor_lat"], row["donor_lon"])
    transplant_coords = (row["transplant_lat"], row["transplant_lon"])
    separation = distance.distance(donor_coords, transplant_coords)
    return separation.miles

In [None]:
# Row-wise donor-to-transplant distance, stored as a new `distance_miles` column.
optn_df["distance_miles"] = optn_df.apply(transplant_distance_miles, axis=1)

In [None]:
# Inspect the table with the new `distance_miles` column.
optn_df

In [None]:
# Organs to analyse. The capitalised form is used in plot titles/labels; the
# plotting cells lower-case it before comparing against optn_df["organ"].
organs = ["Liver", "Kidney"]

In [None]:
# Number of rows per value of the `organ` column.
optn_df["organ"].value_counts()

In [None]:
# Overlay one density histogram of preservation time per organ on a shared axis,
# restricted to rows whose donor-to-transplant distance is positive.
fig, ax = plt.subplots(figsize=(18, 6))
for organ in organs:
    moved_mask = optn_df["distance_miles"] > 0
    organ_mask = optn_df["organ"] == organ.lower()
    organ_rows = optn_df[moved_mask & organ_mask]
    organ_rows.hist(
        "preservation_hours",
        ax=ax,
        alpha=0.8,
        bins=range(0, 49),
        density=True,
        label=organ,
    )

# Set labels after plotting so they replace the column-name title that .hist adds.
ax.set(
    title="Cold ischemic time by organ",
    xlabel="Cold ischemic time (hours)",
    ylabel="Freq",
)
plt.legend()
plt.show()

In [None]:
# One figure per organ: density histogram of transit distance in 50-mile bins
# up to 2,500 miles, for rows with a positive distance.
for organ in organs:
    moved_mask = optn_df["distance_miles"] > 0
    organ_mask = optn_df["organ"] == organ.lower()
    organ_rows = optn_df[moved_mask & organ_mask]

    fig, ax = plt.subplots(figsize=(12, 6))
    organ_rows.hist(
        "distance_miles",
        ax=ax,
        bins=range(0, 2500, 50),
        density=True,
        label=organ,
    )
    ax.set(
        title=f"{organ} transit distance",
        xlabel="Distance (miles)",
        ylabel="Freq",
    )
    plt.show()

In [None]:
# One figure per organ: scatter of preservation time against transit distance
# for rows with a positive distance. Tiny markers keep dense regions legible.
for organ in organs:
    moved_mask = optn_df["distance_miles"] > 0
    organ_mask = optn_df["organ"] == organ.lower()
    organ_rows = optn_df[moved_mask & organ_mask]

    fig, ax = plt.subplots(figsize=(12, 6))
    organ_rows.plot.scatter(
        x="distance_miles",
        y="preservation_hours",
        s=0.2,
        ax=ax,
    )
    ax.set(
        title=f"{organ} cold ischemic time vs. transit distance",
        xlabel="Distance (miles)",
        ylabel="Cold ischemic time (hours)",
    )
    plt.show()

In [None]:
# One filled 2-D KDE per organ of preservation time against transit distance,
# limited to rows with 0 < distance_miles <= 2000.
for organ in organs:
    miles = optn_df["distance_miles"]
    within_range = (miles > 0) & (miles <= 2000)
    organ_mask = optn_df["organ"] == organ.lower()
    joint_df = optn_df[within_range & organ_mask]

    ax = sns.kdeplot(data=joint_df, x="distance_miles", y="preservation_hours", fill=True)
    ax.set(
        title=f"{organ} cold ischemic time vs. transit distance",
        xlabel="Distance (miles)",
        ylabel="Cold ischemic time (hours)",
    )
    plt.show()

In [None]:
# For each organ, fit and plot a logistic regression of one-year success on
# transit distance, using a 20% subsample of rows with a positive distance.
for organ in organs:
    success_df = optn_df[
        (optn_df["distance_miles"] > 0) &
        (optn_df["organ"] == organ.lower())
    ].sample(frac=0.2, random_state=0)  # fixed seed: same subsample (and curve) on every run
    ax = sns.regplot(
        data=success_df,
        x="distance_miles",
        y="succeeded_1_year",
        logistic=True,  # logistic fit; this is what the statsmodels install is for
    )
    ax.set_title(f"{organ} p(success) by transit distance")
    ax.set_xlabel("Distance (miles)")
    ax.set_ylabel("p(success)")
    plt.show()

In [None]:
# For each organ, fit and plot a logistic regression of one-year success on
# preservation time, using a 20% subsample of that organ's rows.
for organ in organs:
    success_df = optn_df[(optn_df["organ"] == organ.lower())].sample(
        frac=0.2,
        random_state=0,  # fixed seed: same subsample (and curve) on every run
    )
    ax = sns.regplot(
        data=success_df,
        x="preservation_hours",
        y="succeeded_1_year",
        logistic=True,  # logistic fit; this is what the statsmodels install is for
    )
    ax.set_title(f"{organ} p(success) by cold ischemic time")
    ax.set_xlabel("Cold ischemic time (hours)")
    ax.set_ylabel("p(success)")
    plt.show()

## Transplant volume by Census region/division

In [None]:
# Census Bureau 2018 region boundary shapefile (cb_2018_us_region_500k), read
# straight from census.gov, so this cell needs network access.
region_gdf = gpd.read_file("https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_region_500k.zip")

In [None]:
# Census Bureau 2018 division boundary shapefile (cb_2018_us_division_500k), read
# straight from census.gov, so this cell needs network access.
division_gdf = gpd.read_file("https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_division_500k.zip")

In [None]:
def loc_to_label(row, label_gdf, prefix, cache=None):
    """Return the ``NAME`` of the ``label_gdf`` geometry that contains a row's location.

    Parameters
    ----------
    row : mapping
        Record providing ``f"{prefix}_lon"`` and ``f"{prefix}_lat"``.
    label_gdf : GeoDataFrame
        Geometries to label against; its spatial index (``sindex``) and ``NAME``
        column are used.
    prefix : str
        Column prefix selecting which location of the row to label.
    cache : dict, optional
        Memo of ``(lon, lat) -> label`` that is consulted first and updated in
        place. When omitted, every call performs the spatial lookup.

    Returns
    -------
    The ``NAME`` of the single intersecting geometry, or ``"N/A"`` when no
    geometry intersects the point.

    Raises
    ------
    AssertionError
        If more than one geometry intersects the point.
    """
    coords = (row[f"{prefix}_lon"], row[f"{prefix}_lat"])
    # `is not None` rather than truthiness: an empty dict is still a usable cache.
    if cache is not None and coords in cache:
        return cache[coords]

    # Point takes (x, y), hence the (lon, lat) ordering of `coords`.
    hit_positions = label_gdf.sindex.query(Point(*coords), predicate="intersects")
    matches = label_gdf.iloc[hit_positions]
    if not len(matches):
        label = "N/A"
    else:
        assert len(matches) == 1, f"{coords} intersects {len(matches)} geometries"
        label = matches.iloc[0]["NAME"]

    # Remember misses too, so out-of-coverage points are not re-queried per row.
    if cache is not None:
        cache[coords] = label
    return label

In [None]:
# Label the transplant and donor locations of every row with a Census region.
# Both passes share one coordinate-keyed cache, so a location seen on the
# transplant side is not looked up again on the donor side.
region_cache = {}
for endpoint in ("transplant", "donor"):
    optn_df[f"{endpoint}_census_region"] = optn_df.apply(
        loc_to_label,
        axis=1,
        # Extra keyword arguments are forwarded by DataFrame.apply to loc_to_label.
        label_gdf=region_gdf,
        prefix=endpoint,
        cache=region_cache,
    )

In [None]:
# Label the transplant and donor locations of every row with a Census division.
# Both passes share one coordinate-keyed cache, so a location seen on the
# transplant side is not looked up again on the donor side.
division_cache = {}
for endpoint in ("transplant", "donor"):
    optn_df[f"{endpoint}_census_division"] = optn_df.apply(
        loc_to_label,
        axis=1,
        # Extra keyword arguments are forwarded by DataFrame.apply to loc_to_label.
        label_gdf=division_gdf,
        prefix=endpoint,
        cache=division_cache,
    )

In [None]:
# Inspect the table with the Census region/division label columns added.
optn_df

In [None]:
# Row counts by transplant-location region (rows) and donor-location region (columns).
pd.crosstab(optn_df["transplant_census_region"], optn_df["donor_census_region"])

In [None]:
# Row counts by transplant-location division (rows) and donor-location division (columns).
pd.crosstab(optn_df["transplant_census_division"], optn_df["donor_census_division"])