In [None]:
# Ch02-2 - Pitfalls of joining data with pandas

In [None]:
########################## Notes & Updates ##############################################################
# If you are using Docker and your data directory is mapped to "/data", use the commented-out
#   "Docker" lines below in place of the primary lines (comment out the primary lines when you do).
# You will also find other alternative lines or blocks that can be used to avoid potential issues.
#########################################################################################################

In [None]:
# 1. Import Libraries
import pandas as pd

In [None]:
# 2. Jumble the data using random sampling
# Each table is sampled independently (90% of its rows, returned in shuffled order),
# so the two sample files are not guaranteed to contain matching VAERS_IDs.
# Fixed seeds make the sample files, and every count derived from them later,
# identical on each "Restart Kernel -> Run All".
SEED = 42
vdata = pd.read_csv("data/2021VAERSDATA.csv.gz", encoding="iso-8859-1")
# vdata = pd.read_csv("/data/2021VAERSDATA.csv.gz", encoding="iso-8859-1")  # Docker
vdata.sample(frac=0.9, random_state=SEED).to_csv("vdata_sample.csv.gz", index=False)
vax = pd.read_csv("data/2021VAERSVAX.csv.gz", encoding="iso-8859-1")
# vax = pd.read_csv("/data/2021VAERSVAX.csv.gz", encoding="iso-8859-1") # Docker
# A different seed than above, so the two draws are independent of each other.
vax.sample(frac=0.9, random_state=SEED + 1).to_csv("vax_sample.csv.gz", index=False)
# Note - it is ok to get a dtype warning here

In [None]:
# 3. Inner join on the tables
# Reload the two sample files from disk.
vdata = pd.read_csv("vdata_sample.csv.gz", low_memory=False)
vax = pd.read_csv("vax_sample.csv.gz", low_memory=False)
# Match on VAERS_ID and keep only the rows whose ID is present in both tables.
vdata_with_vax = vdata.join(
    other=vax.set_index("VAERS_ID"),
    on="VAERS_ID",
    how="inner",
)
# vdata_with_vax = vdata.merge(vax, on="VAERS_ID", how="inner") # Docker - Alternate method
# Row counts: left table, right table, joined result.
tuple(map(len, (vdata, vax, vdata_with_vax)))

In [None]:
# 4. Find the data not captured by the join
# A row is lost by the inner join when its VAERS_ID has no partner in the other table.
# Compare key column against key column: neither frame uses VAERS_ID as its index
# (the index is just the row position), so testing IDs against an index is meaningless.
# Working from the key columns also gives the same answer whether the joined frame
# was built with join() or with the merge() alternative.
lost_vdata = vdata.loc[~vdata["VAERS_ID"].isin(vax["VAERS_ID"])]
# NOTE: a notebook cell renders only its final expression, so this line shows nothing;
# move it to its own cell to inspect lost_vdata.
lost_vdata
lost_vax = vax.loc[~vax["VAERS_ID"].isin(vdata["VAERS_ID"])]
lost_vax

In [None]:
# 5. Left outer join
# how="left" is join()'s default; it is spelled out here to make the join type visible.
vdata_with_vax_left = vdata.join(
    vax.set_index("VAERS_ID"),
    on="VAERS_ID",
    how="left",
)
# Number of joined rows per VAERS_ID, smallest first.
(
    vdata_with_vax_left
    .groupby("VAERS_ID")
    .size()
    .sort_values()
)
# vdata_with_vax_left = vdata.merge(vax, on="VAERS_ID", how="left") # Docker - alternate version
# vdata_with_vax_left.groupby("VAERS_ID").size().sort_values() # Docker - alternate version

In [None]:
# 6. Right join
# Rows of vdata with DIED == "Y", and rows of vax with VAX_TYPE == "COVID19".
dead = vdata.loc[vdata["DIED"] == "Y"]
vax19 = vax.loc[vax["VAX_TYPE"] == "COVID19"]
# Right join: `dead` is the right-hand table, so all of its rows are retained.
vax19_dead = vax19.join(
    dead.set_index("VAERS_ID"),
    on="VAERS_ID",
    how="right",
)
# Sizes of both inputs and of the joined result.
len(vax19), len(dead), len(vax19_dead)
# Joined rows whose VAERS_ID already occurred in an earlier row.
len(vax19_dead.loc[vax19_dead["VAERS_ID"].duplicated()])
# How many more rows the join produced than `dead` contains.
len(vax19_dead) - len(dead)

In [None]:
# 6. Right join (alternate method)
# dead = vdata[vdata.DIED == "Y"] 
# vax19 = vax[vax.VAX_TYPE == "COVID19"] 
# vax19_dead = vax19.merge(dead, on="VAERS_ID", how="right")
# len(vax19), len(dead), len(vax19_dead)
# len(vax19_dead[vax19_dead.VAERS_ID.duplicated()])
# len(vax19_dead) - len(dead) 

In [None]:
# Problematic lots
# Upper-case the STATE values so differently-cased spellings fall into the same group.
vax19_dead["STATE"] = vax19_dead["STATE"].str.upper()
# Index by (VAERS_ID, VAX_LOT) so repeated pairs can be detected through the index.
dead_lot = vax19_dead[["VAERS_ID", "VAX_LOT", "STATE"]].set_index(
    ["VAERS_ID", "VAX_LOT"]
)
# Keep the first row of each (VAERS_ID, VAX_LOT) pair, then turn the keys back into columns.
dead_lot_clean = dead_lot.loc[~dead_lot.index.duplicated()].reset_index()
# Rows with a missing VAERS_ID.
dead_lot_clean.loc[dead_lot_clean["VAERS_ID"].isna()]
# Row count per lot, largest first.
baddies = (
    dead_lot_clean
    .groupby("VAX_LOT")
    .size()
    .sort_values(ascending=False)
)

In [None]:
# Print problematic lots
# One line for each of the first 11 entries of `baddies` (positions 0 through 10):
# the lot, its row count, and the number of STATE groups among that lot's rows.
for lot_id, n_rows in baddies.head(11).items():
    lot_rows = dead_lot_clean[dead_lot_clean.VAX_LOT == lot_id]
    print(lot_id, n_rows, len(lot_rows.groupby("STATE")))

In [None]:
# End of Notebook #