In [None]:
# Ch02-1-pandas-basic
#  Overview of basic pandas functionality for manipulating data files and tables

In [None]:
# Libraries
import pandas as pd

In [None]:
# Read in the VAERS vaccine adverse-event data.
# Note - the relative path below assumes you are running your notebook out of the
# "Ch02" directory with your data in the "data" directory underneath it - if not,
# replace it with your own path to the data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
vdata = pd.read_csv(
    "data/2021VAERSDATA.csv.gz",
    encoding="iso-8859-1",
    low_memory=False,
)

In [None]:
# List the column labels of the loaded table
vdata.columns

In [None]:
# Show the dtype pandas assigned to each column
vdata.dtypes

In [None]:
# Get the shape of your data as a (number of rows, number of columns) tuple
vdata.shape

In [None]:
# Access a row by its integer-based location with iloc (position 0 is the first row)
vdata.iloc[0]

In [None]:
# Set the index using the VAERS_ID column so rows can be looked up by report ID.
# The guard keeps this cell idempotent: once VAERS_ID has become the index it is
# no longer a column, so calling set_index a second time would raise a KeyError.
if vdata.index.name != "VAERS_ID":
    vdata = vdata.set_index("VAERS_ID")

In [None]:
# Get the data using a key: loc selects by index label rather than by position,
# so this looks up the label 916600 in the index (VAERS_ID once it has been set
# as the index)
vdata.loc[916600]

In [None]:
# Peek at the top of the table: head returns the first n rows
vdata.head(n=3)

In [None]:
# The same first 3 rows, this time selected with a positional slice
vdata.iloc[0:3]

In [None]:
# Slice both axes by position: the first 5 rows, and only the columns at
# positions 2 and 3 (the end of a positional slice is exclusive)
vdata.iloc[0:5, 2:4]

In [None]:
# Compute the maximum age in the dataset, selecting the column with bracket notation
vdata["AGE_YRS"].max()

In [None]:
# A different style of notation: attribute access selects the same AGE_YRS column
vdata.AGE_YRS.max()

In [None]:
# Line plot of the ages in ascending order; use_index=False plots them against
# their position in the sorted sequence instead of against the index labels
(
    vdata["AGE_YRS"]
    .sort_values()
    .plot(use_index=False)
)

In [None]:
# Second plot: distribution of ages as a 20-bin histogram
vdata["AGE_YRS"].plot(kind="hist", bins=20)

In [None]:
# Plot using matplotlib: both views of the ages side by side in one figure
import matplotlib.pyplot as plt

# One row, two panels; sharey=True because both panels put age on the y axis
fig, ax = plt.subplots(1, 2, sharey=True)
fig.suptitle("Age of adverse events")

# Left panel: ages in ascending order, plotted against their rank
vdata["AGE_YRS"].sort_values().plot(
    use_index=False, ax=ax[0], xlabel="Observation", ylabel="Age"
)

# Right panel: histogram drawn horizontally so its age axis lines up with the
# left panel. The target axes is passed explicitly rather than relying on
# whichever axes pyplot currently considers active.
vdata["AGE_YRS"].plot.hist(bins=20, orientation="horizontal", ax=ax[1])

In [None]:
# Count events per age in whole years: drop the missing ages, truncate each age
# to an integer number of years, then tally how many rows fall on each age.
# astype is the vectorised equivalent of applying int() to every element.
vdata["AGE_YRS"].dropna().astype("int64").value_counts()

In [None]:
# Tally the values of the DIED column; dropna=False keeps the missing entries
# as their own row in the result
vdata["DIED"].value_counts(dropna=False)

In [None]:
# Add a boolean is_dead column: True where DIED equals "Y", False otherwise
# (including where DIED is missing)
vdata["is_dead"] = vdata["DIED"].eq("Y")

In [None]:
# Associate data about deaths with the vaccine involved
# Keep only the rows flagged by the is_dead column
dead = vdata[vdata.is_dead]

# Load the vaccine table and index it by VAERS_ID, the same key vdata is indexed by
vax = pd.read_csv("data/2021VAERSVAX.csv.gz", encoding="iso-8859-1").set_index("VAERS_ID")

# Number of vaccine rows per vaccine type, smallest first. Bound to a name
# because a bare expression in the middle of a cell is evaluated and then
# discarded; inspect vax_type_counts in its own cell to see it.
vax_type_counts = vax.groupby("VAX_TYPE").size().sort_values()

# Restrict the vaccine table to the COVID19 rows
vax19 = vax[vax.VAX_TYPE == "COVID19"]

# Index-on-index join. DataFrame.join defaults to a left join, so every row of
# `dead` is kept and rows without a COVID19 vaccine entry get NaN in the
# vaccine columns.
vax19_dead = dead.join(vax19)
# Alternate line - this join may work better if you encounter errors with the line above
#vax19_dead = dead.join(vax19, lsuffix='_dead', rsuffix='_vax')

In [None]:
# Display the joined table: the rows of `dead` with the COVID19 vaccine columns attached
vax19_dead

In [None]:
# Rank the covid vaccine lots: number of rows in vax19_dead per VAX_LOT, with
# the most frequent lot first (all lots are kept; the top of the ranking is
# printed separately)
baddies = (
    vax19_dead
    .groupby("VAX_LOT")
    .size()
    .sort_values(ascending=False)
)

In [None]:
# Print out the top 10 lots, one per line: the lot identifier, its number of
# rows in vax19_dead, and the number of distinct states those rows come from.
# head(10) bounds the output to exactly ten lots.
for lot, cnt in baddies.head(10).items():
    # nunique ignores missing states, matching what grouping by STATE would count
    n_states = vax19_dead.loc[vax19_dead.VAX_LOT == lot, "STATE"].nunique()
    print(lot, cnt, n_states)

In [None]:
## End of Notebook ##