## Download covid data from CSSE
- global and US data
- convert to unit of case per million

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

### World: daily new case by country

In [2]:
# case data
url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
case_world=pd.read_csv(url)

# convert date to integer
date_dict={}
i=22
for d in case_world.columns[4:]:
    date_dict[d]=i
    i+=1
case_world.rename(columns=date_dict,inplace=True)
days_total=i

case_world=case_world.groupby(by='Country/Region').sum()
case_world.drop(columns=["Lat","Long"],inplace=True)
case_world.reset_index(inplace=True)

# census data
pop_world=pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/"+ 
                 "COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/05-19-2021.csv")
pop_world.rename(columns={"Country_Region":"Country/Region"},inplace=True)
pop_world['population']=pop_world.Confirmed/pop_world.Incident_Rate*10**5
pop_world=pop_world.groupby(by='Country/Region').sum()
pop_world.reset_index(inplace=True)
case_world=case_world.merge(pop_world[['Country/Region','population']],on="Country/Region")

In [3]:
case_world

Unnamed: 0,Country/Region,22,23,24,25,26,27,28,29,30,...,808,809,810,811,812,813,814,815,816,population
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,176983,177039,177093,177191,177255,177321,177321,177321,177321,38928341.0
1,Albania,0,0,0,0,0,0,0,0,0,...,272961,273040,273088,273088,273146,273164,273257,273318,273387,2877800.0
2,Algeria,0,0,0,0,0,0,0,0,0,...,265524,265539,265550,265562,265573,265585,265599,265612,265621,43851043.0
3,Andorra,0,0,0,0,0,0,0,0,0,...,39234,39234,39234,39234,39234,39713,39713,39713,39713,77265.0
4,Angola,0,0,0,0,0,0,0,0,0,...,99003,99003,99003,99010,99058,99058,99081,99102,99106,32866268.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,West Bank and Gaza,0,0,0,0,0,0,0,0,0,...,654380,654380,654380,654954,654954,654954,655468,655468,655468,5101416.0
194,Winter Olympics 2022,0,0,0,0,0,0,0,0,0,...,535,535,535,535,535,535,535,535,535,0.0
195,Yemen,0,0,0,0,0,0,0,0,0,...,11797,11799,11801,11801,11802,11802,11803,11803,11803,29825968.0
196,Zambia,0,0,0,0,0,0,0,0,0,...,315623,315623,315623,315892,316088,316190,316312,316422,316501,18383956.0


### US: daily new case by state

In [29]:
#case data
url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv"
case_US=pd.read_csv(url)
case_US.drop(columns=['UID', 'code3', 'FIPS', 'Lat', 'Long_'],inplace=True)
states_rm=["Diamond Princess","Grand Princess",  "American Samoa", "Guam","Puerto Rico","Northern Mariana Islands","Virgin Islands"]
case_US=case_US[~case_US.Province_State.isin(states_rm)]
case_US=case_US.groupby(by='Province_State').sum()
case_US.reset_index(inplace=True)
case_US.rename(columns=date_dict,inplace=True)
# census data
pop_US=pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/"+ 
                 "COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/05-19-2021.csv")

pop_US=pop_US[['Province_State','Confirmed','Incident_Rate']]

case_US=case_US.merge(pop_US,on="Province_State")
case_US['population']=case_US['Confirmed']/case_US.Incident_Rate*10**5

### make prevalence data
- combine US and world data

In [30]:
case_world.rename(columns={'Country/Region':'location'},inplace=True)

In [31]:
case_US.drop(columns=['Confirmed','Incident_Rate'],inplace=True)
case_US.rename(columns={'Province_State':'location'},inplace=True)

In [32]:
case_world['country']=case_world['location']
case_US['country']='US'

In [33]:
def foo(loc):
    # rename US state to US.state
    return 'US.'+loc
case_US['location']=case_US['location'].apply(foo)

In [34]:
US=case_US.sum().to_frame().transpose()
US.location=['US']
US.country=['US']

In [35]:
case_world=case_world[case_world['location']!='US']
case_data=case_world.append(case_US,ignore_index=True)
case_data=case_data.append(US,ignore_index=True)

In [36]:
case_data

Unnamed: 0,location,22,23,24,25,26,27,28,29,30,...,678,679,680,681,682,683,684,685,population,country
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,156397,156397,156414,156456,156487,156510,156552,156610,38928341.0,Afghanistan
1,Albania,0,0,0,0,0,0,0,0,0,...,189355,190125,190815,191440,192013,192600,193075,193269,2877800.0,Albania
2,Algeria,0,0,0,0,0,0,0,0,0,...,207254,207385,207509,207624,207764,207873,207970,208104,43851043.0,Algeria
3,Andorra,0,0,0,0,0,0,0,0,0,...,15705,15717,15744,15744,15819,15819,15819,15907,77265.0,Andorra
4,Angola,0,0,0,0,0,0,0,0,0,...,64724,64762,64815,64857,64875,64899,64913,64913,32866268.0,Angola
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,US.Washington,1,1,1,1,1,1,1,1,1,...,742919,744364,746354,746354,750477,750477,750477,755011,7614893.0,US
243,US.West Virginia,0,0,0,0,0,0,0,0,0,...,278578,278837,279694,280727,281865,282405,283075,283561,1792147.0,US
244,US.Wisconsin,0,0,0,0,0,0,0,0,0,...,905245,909159,913237,917469,921221,921221,921221,928290,5822434.0,US
245,US.Wyoming,0,0,0,0,0,0,0,0,0,...,105990,106287,106698,106698,107483,107483,107483,108103,578759.0,US


In [37]:
# region list for simulations
with open('region_list.pkl','rb') as f:
    region_list=(pickle.load(f))

In [38]:
# make prevalence dataframe
# contain daily case/population
prevalence=pd.DataFrame(columns=case_data.columns)

# only include regions in cumcase
for r in region_list:
    prevalence=prevalence.append(case_data[ (case_data.location==r)],
                      ignore_index=True)
prevalence=prevalence.append(US,ignore_index=True)   

# populattion scale for simulation
prevalence['actual_pop_scale']=prevalence.population/10**6

In [39]:
prevalence

Unnamed: 0,location,22,23,24,25,26,27,28,29,30,...,679,680,681,682,683,684,685,population,country,actual_pop_scale
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,156397,156414,156456,156487,156510,156552,156610,38928341.0,Afghanistan,38.928341
1,Albania,0,0,0,0,0,0,0,0,0,...,190125,190815,191440,192013,192600,193075,193269,2877800.0,Albania,2.8778
2,Algeria,0,0,0,0,0,0,0,0,0,...,207385,207509,207624,207764,207873,207970,208104,43851043.0,Algeria,43.851043
3,Angola,0,0,0,0,0,0,0,0,0,...,64762,64815,64857,64875,64899,64913,64913,32866268.0,Angola,32.866268
4,Antigua and Barbuda,0,0,0,0,0,0,0,0,0,...,4091,4102,4102,4106,4118,4118,4118,97928.0,Antigua and Barbuda,0.097928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,US.West Virginia,0,0,0,0,0,0,0,0,0,...,278837,279694,280727,281865,282405,283075,283561,1792147.0,US,1.792147
195,US.Wisconsin,0,0,0,0,0,0,0,0,0,...,909159,913237,917469,921221,921221,921221,928290,5822434.0,US,5.822434
196,US.Wyoming,0,0,0,0,0,0,0,0,0,...,106287,106698,106698,107483,107483,107483,108103,578759.0,US,0.578759
197,China,548,643,920,1406,2075,2877,5509,6087,8141,...,110385,110454,110553,110629,110720,110773,110801,1404676330.0,China,1404.67633


In [40]:
for i in range(22,days_total):
    prevalence[i]=prevalence[i]/prevalence['population']

In [41]:
prevalence.to_csv("prevalance_cumlative.csv",index=False)
prevalence_cumcase=prevalence.copy()

In [42]:
# daily new 
for i in range(581,23,-1):
    prevalence[i]=prevalence[(i)]-prevalence[(i-1)]

In [43]:
prevalence.to_csv("prevalance_daily_new.csv",index=False)

In [44]:
prevalence

Unnamed: 0,location,22,23,24,25,26,27,28,29,30,...,679,680,681,682,683,684,685,population,country,actual_pop_scale
0,Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004018,0.004018,0.004019,0.00402,0.00402,0.004022,0.004023,38928341.0,Afghanistan,38.928341
1,Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.066066,0.066306,0.066523,0.066722,0.066926,0.067091,0.067159,2877800.0,Albania,2.8778
2,Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004729,0.004732,0.004735,0.004738,0.00474,0.004743,0.004746,43851043.0,Algeria,43.851043
3,Angola,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00197,0.001972,0.001973,0.001974,0.001975,0.001975,0.001975,32866268.0,Angola,32.866268
4,Antigua and Barbuda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041776,0.041888,0.041888,0.041929,0.042051,0.042051,0.042051,97928.0,Antigua and Barbuda,0.097928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,US.West Virginia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.155588,0.156066,0.156643,0.157278,0.157579,0.157953,0.158224,1792147.0,US,1.792147
195,US.Wisconsin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.156148,0.156848,0.157575,0.158219,0.158219,0.158219,0.159433,5822434.0,US,5.822434
196,US.Wyoming,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.183646,0.184357,0.184357,0.185713,0.185713,0.185713,0.186784,578759.0,US,0.578759
197,China,0.0,0.0,0.0,0.0,0.0,0.000001,0.000002,0.0,0.000001,...,0.000079,0.000079,0.000079,0.000079,0.000079,0.000079,0.000079,1404676330.0,China,1404.67633


## Determine start and end day for regional simulations
- start day: day when cumulative case reached 100 per million
- end day: day 516 (5/31/2021) or the day when vaccine coverage reached 3%, whichever was earlier


In [45]:
data=prevalence_cumcase

### start day

In [47]:
# start day adjustment
loc_refit=['Central African Republic','Djibouti','Mali','Mongolia','Nigeria','Papua New Guinea',
          'Somalia','South Africa','South Sudan','Thailand','Togo','Trinidad and Tobago']
# start day addition to the day that reached 100 cases per million 
start_day=[330,290,100,250,225,200,160,200,210,350,250,320]

start_day={loc_refit[i]:start_day[i] for i in range(12)}
start_day['South Africa']=320
start_day['Somalia']=300
start_day['Mali']= 340
start_day['Thailand']= 380

skip=['Belize','Congo (Kinshasa)','Eswatini','Lesotho','Saint Vincent and the Grenadines',
     'Tajikistan','Burkina Faso','China']

In [65]:
starts={}
offset={}
fit_info = pd.DataFrame(columns=['location','start','end','offset','actual_pop_scale'])
fit_info['location']=region_list
for i,row in fit_info.iterrows():
    r=row.location

    for d in np.arange(22,days_total):
        fit_info.loc[i,'actual_pop_scale']=prevalence.loc[prevalence.location==r,'actual_pop_scale'].item()
        # set start day to be the day when cumulative case exceed 100 per million 
        if data.loc[data.location==r,d].item()*10**6>100:
            
            fit_info.loc[i,'start']=d
            fit_info.loc[i,'offset']=0
            # if start day adjustment is needed
            # adjust start day and account for additional offset
            if r in start_day and fit_info.loc[i,'start']+start_day[r] < days_total:
                fit_info.loc[i,'start']=fit_info.loc[i,'start']+start_day[r]
                fit_info.loc[i,'offset']= data.loc[data.location==r,int(fit_info.loc[i,'start'])].item()*10**6-100
        
            break
            


### prepare vaccine data

In [68]:
# read in vaccine info from Qingfeng
vac=pd.read_csv("vaccineRateDaily_global_20210630.csv")
vac.rename(columns={'country':'location'},inplace=True)
vac_us=pd.read_csv("vaccineRateDaily_USA_20210630.csv")
# correct US state names
vac_us['state']=vac_us['state'].apply(lambda x:'US.'+x)
vac_us.rename(columns={'state':'location'},inplace=True)
# merge global and US files
vac=vac.append(vac_us,ignore_index=True)

# make a correction for vac1RateCumu_asOfLastMonth
for i,row in vac.iterrows():
    if not np.isnan(row['days_exceed3']):
        vac.loc[i,'vac1RateCumu_asOfLastMonth']= 1.5+((516-row['days_exceed3'])*row.dailyVac1_permillion_bw3andLastMonth)/10**4
vac.vac2RateCumu_asOfLastMonth=vac.vac1RateCumu_asOfLastMonth  

vac.to_csv("vaccine_info_83.csv",index=False)
# isoloate those regions that exceed 3% coverage before 5/31/2021
#vac_3percent=vac[~vac.days_exceed3.isnull()]

### end day

In [71]:
# set end day of first fitting period
for i,row in fit_info.iterrows():
    r = row.location
    #if region exceeds 3% coverage already, set end date to that day
    if not vac[vac.location==r].days_exceed3.isnull().item():
        fit_info.loc[i,'end']=int(vac[vac.location==r].days_exceed3.item())
    # else set to 5/31/2021
    else:fit_info.loc[i,'end']=516

In [72]:
fit_info

Unnamed: 0,location,start,end,offset,actual_pop_scale
0,Afghanistan,131,516,0,38.928341
1,Albania,94,459,0,2.8778
2,Algeria,124,516,0,43.851043
3,Angola,256,516,0,32.866268
4,Antigua and Barbuda,94,440,0,0.097928
...,...,...,...,...,...
193,US.Washington,76,409,0,7.614893
194,US.West Virginia,92,395,0,1.792147
195,US.Wisconsin,85,408,0,5.822434
196,US.Wyoming,87,408,0,0.578759


In [None]:
fit_info.to_csv("fitting_info_aug3.csv",index=False)