## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

Add our `utils` directory to the system's `$PATH` so we can import Python files from sibling directories.

In [2]:
import os
import sys

In [3]:
this_dir = os.path.abspath("")

In [4]:
parent_df = os.path.dirname(os.path.dirname(this_dir))

In [5]:
sys.path.insert(0, parent_df)

Import utilities from `../utils` module.

In [6]:
from utils import env, reader, writer, cleaners

## Imports

In [7]:
import json
import numpy as np
import pandas as pd
import geopandas as gpd

In [8]:
mapped_df = (
    reader.processed_csv(
        "places/timeseries-mapped.csv",
        dtype={
            "fips": str,
            "id": str,
            "clean_name": str,
            "county": str,
            "raw_name": str,
            "slug": str,
            "confirmed_cases_note": str,
            "confirmed_cases": int,
            "lat": float,
            "lon": float,
            "population": float,
            "zcta_id": str,
            "id": str,
            "source": str,
        },
        parse_dates=["date"],
    )
    .rename(columns={"clean_name": "name"})
    .drop("raw_name", axis=1)
)

📖🗄️ /Users/stiles/github/coronavirus-tracker/_notebooks/data/processed/places/timeseries-mapped.csv ➡️ 612,105 records


## Analyze

In [9]:
grouper = ["county", "slug"]

New case counts

In [10]:
mapped_df["new_confirmed_cases"] = mapped_df.groupby(grouper).confirmed_cases.diff()

Replace all the values below zero with zero

In [11]:
mapped_df.new_confirmed_cases = mapped_df.new_confirmed_cases.apply(
    lambda x: 0 if x < 0 else x
).fillna(0)

Seven day averages

In [12]:
mapped_df["new_confirmed_cases_seven_day_total"] = (
    mapped_df.groupby(grouper).new_confirmed_cases.rolling(7).sum().droplevel([0, 1])
)

In [13]:
mapped_df["new_confirmed_cases_seven_day_average"] = (
    mapped_df.groupby(grouper).new_confirmed_cases.rolling(7).mean().droplevel([0, 1])
)

In [14]:
mapped_df["new_confirmed_cases_fourteen_day_total"] = (
    mapped_df.groupby(grouper).new_confirmed_cases.rolling(14).sum().droplevel([0, 1])
)

In [15]:
mapped_df["new_confirmed_cases_fourteen_day_average"] = (
    mapped_df.groupby(grouper).new_confirmed_cases.rolling(14).mean().droplevel([0, 1])
)

Population adjustment

In [16]:
percap = lambda numerator, denominator: (numerator / denominator) * 100000

In [17]:
mapped_df["confirmed_cases_per_100k"] = percap(
    mapped_df.confirmed_cases,
    mapped_df.population,
)

In [18]:
mapped_df["new_confirmed_cases_per_100k"] = percap(
    mapped_df.new_confirmed_cases,
    mapped_df.population,
)

In [19]:
mapped_df["new_confirmed_cases_seven_day_per_100k"] = percap(
    mapped_df.new_confirmed_cases_seven_day_total,
    mapped_df.population,
)

In [20]:
mapped_df["new_confirmed_cases_fourteen_day_per_100k"] = percap(
    mapped_df.new_confirmed_cases_fourteen_day_total,
    mapped_df.population,
)

Wipe out percapita numbers for LA cities under 1000 people

In [22]:
small_la_cities = (
    (~pd.isnull(mapped_df.population))
    & (mapped_df.population < 1000)
    & (mapped_df.county == "Los Angeles")
)

In [23]:
mapped_df.loc[small_la_cities, "confirmed_cases_per_100k"] = pd.NA

In [24]:
mapped_df.loc[small_la_cities, "new_confirmed_cases_per_100k"] = pd.NA

In [25]:
mapped_df.loc[small_la_cities, "new_confirmed_cases_seven_day_per_100k"] = pd.NA

In [26]:
mapped_df.loc[small_la_cities, "new_confirmed_cases_fourteen_day_per_100k"] = pd.NA

Wipe out percapita numbers for areas under 50 people

In [27]:
small_ca_areas = (
    (~pd.isnull(mapped_df.population))
    & (mapped_df.population < 50)
    & (mapped_df.county != "Los Angeles")
)

In [28]:
mapped_df.loc[small_ca_areas, "confirmed_cases_per_100k"] = pd.NA

In [29]:
mapped_df.loc[small_ca_areas, "new_confirmed_cases_per_100k"] = pd.NA

In [30]:
mapped_df.loc[small_ca_areas, "new_confirmed_cases_seven_day_per_100k"] = pd.NA

In [31]:
mapped_df.loc[small_ca_areas, "new_confirmed_cases_fourteen_day_per_100k"] = pd.NA

Counties with de-identified cities

In [32]:
de_id_areas = ~pd.isnull(mapped_df.confirmed_cases_note)

In [33]:
mapped_df.loc[de_id_areas, "confirmed_cases_per_100k"] = pd.NA
mapped_df.loc[de_id_areas, "new_confirmed_cases_per_100k"] = pd.NA
mapped_df.loc[de_id_areas, "new_confirmed_cases_seven_day_per_100k"] = pd.NA
mapped_df.loc[de_id_areas, "new_confirmed_cases_fourteen_day_per_100k"] = pd.NA

In [34]:
filter_zeros = (~pd.isnull(mapped_df.population)) & (mapped_df.population < 1)

In [35]:
mapped_df.loc[filter_zeros, "confirmed_cases_per_100k"] = pd.NA

In [36]:
mapped_df.loc[filter_zeros, "new_confirmed_cases_per_100k"] = pd.NA

In [37]:
mapped_df.loc[filter_zeros, "new_confirmed_cases_seven_day_per_100k"] = pd.NA

In [38]:
mapped_df.loc[filter_zeros, "new_confirmed_cases_fourteen_day_per_100k"] = pd.NA

## Get the latest stuff

In [39]:
max_dates = mapped_df.groupby(["county"]).date.max().reset_index()

In [40]:
try:
    assert len(max_dates[max_dates["date"] <= env.get_date_in_progress()]) == len(
        max_dates
    )
except AssertionError as e:
    print(
        f"""ERROR: {len(max_dates[max_dates["date"] > env.get_date_in_progress()])} counties have place list dates in the future"""
    )
    print(mapped_df[mapped_df.date > env.get_date_in_progress()].county.unique())

In [110]:
trimmed_df = mapped_df

---

Calculate 28-day change to seven-day case rate in larger LA communities

In [139]:
trimmed_df_la = trimmed_df[
    (trimmed_df["county"] == "Los Angeles")
    & (trimmed_df["population"] > 1000)
    & (trimmed_df["new_confirmed_cases_seven_day_per_100k"] > 0)
].copy()

In [140]:
trimmed_df_la.columns

Index(['name', 'date', 'county', 'fips', 'slug', 'confirmed_cases_note',
       'confirmed_cases', 'lat', 'lon', 'population', 'zcta_id', 'id',
       'new_confirmed_cases', 'new_confirmed_cases_seven_day_total',
       'new_confirmed_cases_seven_day_average',
       'new_confirmed_cases_fourteen_day_total',
       'new_confirmed_cases_fourteen_day_average', 'confirmed_cases_per_100k',
       'new_confirmed_cases_per_100k',
       'new_confirmed_cases_seven_day_per_100k',
       'new_confirmed_cases_fourteen_day_per_100k'],
      dtype='object')

In [141]:
trimmed_df_la["new_confirmed_cases_month_change"] = (
    trimmed_df_la.new_confirmed_cases_seven_day_per_100k.pct_change(28)
    .replace([np.inf, -np.inf], -1)
    .round(2)
) * 100

In [142]:
slim_df_la = trimmed_df_la[
    [
        "name",
        "date",
        "county",
        "fips",
        "slug",
        "lat",
        "lon",
        "population",
        "zcta_id",
        "id",
        "new_confirmed_cases_seven_day_per_100k",
        "new_confirmed_cases_fourteen_day_per_100k",
        "new_confirmed_cases_month_change",
    ]
]

In [143]:
slim_latest_df_la = slim_df_la[slim_df_la["date"] == slim_df_la["date"].max()]

In [149]:
slim_latest_df_la.sort_values("new_confirmed_cases_month_change", ascending=False).head(
    10
)

Unnamed: 0,name,date,county,fips,slug,lat,lon,population,zcta_id,id,new_confirmed_cases_seven_day_per_100k,new_confirmed_cases_fourteen_day_per_100k,new_confirmed_cases_month_change,geometry
243716,Tarzana,2021-07-12,Los Angeles,37,los-angeles-tarzana,34.156974,-118.549107,30876.0,,Los Angeles - Tarzana,71.252753,100.401606,2100.0,POINT (-118.54911 34.15697)
167728,Green Meadows,2021-07-12,Los Angeles,37,los-angeles-green-meadows,33.950734,-118.263144,21505.0,,Los Angeles - Green Meadows,88.351546,125.552197,1800.0,POINT (-118.26314 33.95073)
152240,Downtown,2021-07-12,Los Angeles,37,los-angeles-downtown,34.044724,-118.257803,27507.0,,Los Angeles - Downtown,258.116116,352.63751,1675.0,POINT (-118.25780 34.04472)
169180,Harbor City,2021-07-12,Los Angeles,37,los-angeles-harbor-city,33.781993,-118.296386,29070.0,,Los Angeles - Harbor City,58.479532,79.119367,1600.0,POINT (-118.29639 33.78199)
267916,Watts,2021-07-12,Los Angeles,37,los-angeles-watts,33.940452,-118.241148,42674.0,,Los Angeles - Watts,39.836903,72.643764,1600.0,POINT (-118.24115 33.94045)
161436,Encino,2021-07-12,Los Angeles,37,los-angeles-encino,34.155925,-118.501698,45172.0,,Los Angeles - Encino,73.054104,110.688037,1550.0,POINT (-118.50170 34.15593)
168696,Hancock Park,2021-07-12,Los Angeles,37,los-angeles-hancock-park,34.069715,-118.32887,17039.0,,Los Angeles - Hancock Park,93.902224,181.93556,1500.0,POINT (-118.32887 34.06972)
211772,Park La Brea,2021-07-12,Los Angeles,37,los-angeles-park-la-brea,34.069731,-118.355628,13580.0,,Los Angeles - Park La Brea,103.092784,147.275405,1300.0,POINT (-118.35563 34.06973)
175472,Hollywood Hills,2021-07-12,Los Angeles,37,los-angeles-hollywood-hills,34.120516,-118.346365,29434.0,,Los Angeles - Hollywood Hills,88.33322,159.679282,1200.0,POINT (-118.34637 34.12052)
186604,Lakewood,2021-07-12,Los Angeles,37,city-of-lakewood,33.84707,-118.122171,80362.0,,City of Lakewood,47.286031,84.617108,1167.0,POINT (-118.12217 33.84707)


In [145]:
slim_latest_df_la_gdf = gpd.GeoDataFrame(
    slim_latest_df_la,
    geometry=gpd.points_from_xy(slim_latest_df_la.lon, slim_latest_df_la.lat),
)

In [147]:
slim_latest_df_la_gdf.to_file(
    "../../data/processed/places/latest_la_change.geojson", driver="GeoJSON"
)

---

In [42]:
latest_df = trimmed_df.merge(max_dates, on=["county", "date"], how="inner").sort_values(
    ["county", "name"]
)

Round off coordinates

In [43]:
latest_df.lon = latest_df.lon.round(6)
latest_df.lat = latest_df.lat.round(6)

In [44]:
trimmed_latest_df = latest_df[
    [
        "county",
        "fips",
        "id",
        "name",
        "slug",
        "confirmed_cases",
        "new_confirmed_cases_fourteen_day_total",
        "confirmed_cases_note",
        "confirmed_cases_per_100k",
        "new_confirmed_cases_per_100k",
        "new_confirmed_cases_fourteen_day_per_100k",
        "lat",
        "lon",
    ]
]

In [45]:
socal = [
    "Los Angeles",
    "Ventura",
    "San Diego",
    "Santa Barbara",
    "Riverside",
    "Orange",
    "San Bernardino",
    "San Luis Obispo",
    "Kern",
]

In [46]:
socal_latest_df = trimmed_latest_df[trimmed_latest_df.county.isin(socal)]

How much of the state do we have covered?

In [47]:
counties_with_cities = trimmed_latest_df.county.nunique()

In [48]:
population_covered = sum(
    [
        cleaners.get_county_metadata(c)["population"]
        for c in trimmed_latest_df.county.unique()
    ]
)

In [49]:
population_total = 39512000

In [50]:
population_covered_percent = (population_covered / population_total) * 100

## Tests

Verify we don't have any gigantic leaps

In [51]:
latest_df["new_confirmed_cases_pct_change"] = latest_df.new_confirmed_cases / (
    latest_df.confirmed_cases - latest_df.new_confirmed_cases
)

In [52]:
latest_df = latest_df.replace([np.inf, -np.inf], pd.NA)

In [53]:
test_df = latest_df[
    (latest_df.new_confirmed_cases_pct_change > 1)
    & (latest_df.new_confirmed_cases > 10)
]
try:
    assert len(test_df) == 0
except AssertionError:
    raise Exception(
        f"{len(test_df)} cities have a suspiciously big increase in new cases. \n {test_df}"
    )

In [54]:
test_df = latest_df[
    (latest_df.new_confirmed_cases_pct_change < 0)
    & (latest_df.new_confirmed_cases < -10)
]
try:
    assert len(test_df) == 0
except AssertionError:
    raise Exception(
        f"{len(test_df)} cities have a suspiciously big decrease in new cases. \n {test_df}"
    )

## Export

In [55]:
writer.processed_csv(trimmed_df, "places/timeseries.csv")

✒️🗄️ 612,105 records ➡️ ./processed//Users/stiles/github/coronavirus-tracker/_notebooks/data/processed/places/timeseries.csv


In [56]:
latest_df = trimmed_df[trimmed_df.date == trimmed_df.date.max()]

In [57]:
writer.processed_csv(latest_df, "places/latest.csv")

✒️🗄️ 1,256 records ➡️ ./processed//Users/stiles/github/coronavirus-tracker/_notebooks/data/processed/places/latest.csv


In [58]:
export_options = dict(
    dtype={
        "confirmed_cases": int,
        "id": str,
        "fips": str,
        "name": str,
    },
    indent=2,
)

In [59]:
trimmed_latest_df[trimmed_latest_df.name == "Glendale"].iloc[0]

county                                            Los Angeles
fips                                                      037
id                                           City of Glendale
name                                                 Glendale
slug                                         city-of-glendale
confirmed_cases                                         20428
new_confirmed_cases_fourteen_day_total                  149.0
confirmed_cases_note                                      NaN
confirmed_cases_per_100k                          9892.829297
new_confirmed_cases_per_100k                              0.0
new_confirmed_cases_fourteen_day_per_100k            72.15741
lat                                                 34.181933
lon                                               -118.246803
Name: 390, dtype: object

In [60]:
writer.baker_json(trimmed_latest_df, "places/latest/all.json", **export_options)

✒️🗄️ 1,383 records ➡️ /Users/stiles/github/coronavirus-tracker/_data/places/latest/all.json


In [61]:
writer.baker_json(socal_latest_df, "places/latest/socal.json", **export_options)

✒️🗄️ 740 records ➡️ /Users/stiles/github/coronavirus-tracker/_data/places/latest/socal.json


In [62]:
writer.baker_json(socal_latest_df, "apple/latest_cities_socal.json", **export_options)

✒️🗄️ 740 records ➡️ /Users/stiles/github/coronavirus-tracker/_data/apple/latest_cities_socal.json


In [63]:
writer.baker_json_by_county(trimmed_latest_df, "places/latest/", **export_options)

✒️🗄️ Writing 50 county files to places/latest/


In [64]:
with open(env.BAKER_DIR.joinpath("places/counties_with_cities.json"), "w") as f:
    json.dump(
        {
            "total": counties_with_cities,
            "population_covered": int(population_covered),
            "population_covered_percent": population_covered_percent,
        },
        f,
        indent=2,
    )

## GeoJSON

In [65]:
trimmed_gdf = gpd.GeoDataFrame(
    trimmed_latest_df,
    geometry=gpd.points_from_xy(trimmed_latest_df.lon, trimmed_latest_df.lat),
)

In [66]:
geojson_path = env.BAKER_DIR.joinpath("places/geojson/latest.json")

In [67]:
geojson_path.parent.mkdir(parents=True, exist_ok=True)

In [None]:
print(f"💽 {len(trimmed_latest_df)} ➡️ {geojson_path}")

In [69]:
trimmed_gdf[~pd.isnull(trimmed_gdf.lat)].drop(["lat", "lon"], axis=1).to_file(
    geojson_path, driver="GeoJSON"
)