In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns

import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Read the yearly VAERS DATA and VAX tables and join each pair on VAERS_ID
# (DataFrame.merge defaults to an inner join), then stack both years.
csv_options = {"encoding": 'latin1', "low_memory": False}

# --- 2020 ---
df_data_2020 = pd.read_csv("../dane/2020VAERSDATA.csv", **csv_options)
df_vax_2020 = pd.read_csv("../dane/2020VAERSVAX.csv", **csv_options)
df_2020 = df_data_2020.merge(df_vax_2020, on='VAERS_ID')

# --- 2021 ---
df_data_2021 = pd.read_csv("../dane/2021VAERSDATA.csv", **csv_options)
df_vax_2021 = pd.read_csv("../dane/2021VAERSVAX.csv", **csv_options)
df_2021 = df_data_2021.merge(df_vax_2021, on='VAERS_ID')

# Row-wise concatenation; the original row labels of each year are kept.
df = pd.concat([df_2020, df_2021])

In [16]:
# Use the report identifier as the row label instead of the positional index.
# NOTE(review): VAERS_ID may repeat after the DATA/VAX merge — TODO confirm uniqueness.
df = df.set_index(keys='VAERS_ID')

In [24]:
# Parse the received-date strings (month/day/year) into datetime values.
df['RECVDATE'] = pd.to_datetime(df['RECVDATE'], format='%m/%d/%Y')

# Analysis window; both bounds are inclusive.
start_date = '2020-12-14'
end_date = '2021-01-22'

# Series.between is inclusive on both ends by default, matching >= / <=.
in_window = df['RECVDATE'].between(start_date, end_date)
df = df[in_window]

In [27]:
# Keep only the columns used from here on; every other column
# (including RECVDATE) is dropped at this point.
analysis_columns = [
    "STATE",
    "CAGE_YR",
    "SEX",
    "DIED",
    "HOSPITAL",
    "HOSPDAYS",
    "NUMDAYS",
    "VAX_TYPE",
    "VAX_MANU",
]
df = df[analysis_columns]

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64921 entries, 901875 to 2642529
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   STATE     52156 non-null  object 
 1   CAGE_YR   56362 non-null  float64
 2   SEX       64921 non-null  object 
 3   DIED      349 non-null    object 
 4   HOSPITAL  1044 non-null   object 
 5   HOSPDAYS  653 non-null    float64
 6   NUMDAYS   60416 non-null  float64
 7   VAX_TYPE  64921 non-null  object 
 8   VAX_MANU  64921 non-null  object 
dtypes: float64(3), object(6)
memory usage: 5.0+ MB


In [28]:
# Row filtering: keep reports whose SEX is F or M and that have no missing
# STATE, CAGE_YR or NUMDAYS. The dropna subset does not involve SEX, so doing
# both filters before the recoding yields the same rows as the original order.
known_sex = df["SEX"].isin(["F", "M"])
df = df[known_sex].dropna(subset=["STATE", "CAGE_YR", "NUMDAYS"])

# Recode the flag columns with vectorized comparisons instead of a row-wise
# .apply(lambda ...):
#   SEX      -> 1 for 'F', 0 otherwise
#   DIED     -> 1 for 'Y', 0 otherwise (missing values compare unequal -> 0)
#   HOSPITAL -> 1 for 'Y', 0 otherwise (missing values compare unequal -> 0)
#   HOSPDAYS -> numeric, unparseable or missing values become 0
# .assign builds a new frame, so nothing is written into a filtered slice, and
# the explicit int64 cast keeps the dtype stable even when the frame is empty.
df = df.assign(
    SEX=df["SEX"].eq("F").astype("int64"),
    DIED=df["DIED"].eq("Y").astype("int64"),
    HOSPITAL=df["HOSPITAL"].eq("Y").astype("int64"),
    HOSPDAYS=pd.to_numeric(df["HOSPDAYS"], errors="coerce").fillna(0),
)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45783 entries, 901875 to 2642010
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   STATE     45783 non-null  object 
 1   CAGE_YR   45783 non-null  float64
 2   SEX       45783 non-null  int64  
 3   DIED      45783 non-null  int64  
 4   HOSPITAL  45783 non-null  int64  
 5   HOSPDAYS  45783 non-null  float64
 6   NUMDAYS   45783 non-null  float64
 7   VAX_TYPE  45783 non-null  object 
 8   VAX_MANU  45783 non-null  object 
dtypes: float64(3), int64(3), object(3)
memory usage: 3.5+ MB


In [32]:
# Keep only the rows whose vaccine type is exactly 'COVID19'.
covid_rows = df["VAX_TYPE"].eq('COVID19')
df = df[covid_rows]

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15794 entries, 902418 to 2507444
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   STATE     15794 non-null  object 
 1   CAGE_YR   15794 non-null  float64
 2   SEX       15794 non-null  int64  
 3   DIED      15794 non-null  int64  
 4   HOSPITAL  15794 non-null  int64  
 5   HOSPDAYS  15794 non-null  float64
 6   NUMDAYS   15794 non-null  float64
 7   VAX_TYPE  15794 non-null  object 
 8   VAX_MANU  15794 non-null  object 
dtypes: float64(3), int64(3), object(3)
memory usage: 1.2+ MB


In [33]:
# Summary statistics for the numeric columns of the filtered frame.
# NOTE(review): the recorded output shows CAGE_YR min = 50 although no visible
# cell filters on age — this may depend on a cell that is not shown; confirm
# with Restart Kernel -> Run All. The NUMDAYS max of 7305 also looks like an
# outlier worth checking before any modelling.
df.describe()

Unnamed: 0,CAGE_YR,SEX,DIED,HOSPITAL,HOSPDAYS,NUMDAYS
count,15794.0,15794.0,15794.0,15794.0,15794.0,15794.0
mean,61.874509,0.79125,0.015069,0.024123,0.047486,3.549069
std,10.119965,0.406428,0.121831,0.153436,0.479054,76.907064
min,50.0,0.0,0.0,0.0,0.0,0.0
25%,54.0,1.0,0.0,0.0,0.0,0.0
50%,59.0,1.0,0.0,0.0,0.0,1.0
75%,67.0,1.0,0.0,0.0,0.0,2.0
max,104.0,1.0,1.0,1.0,15.0,7305.0
