In [1]:
from matplotlib import pyplot as plt
import numpy as np
import gzip
import csv
import re
from datetime import datetime
# Open the CSV file
with open('covid.csv', mode='r', newline='') as f:
    #gp from raw data into filtered and clean data--------------------------------------------------
    raw = [re.sub(r',(?=\d{3}(?:|$|\.|\s))', '', row).replace('\n', '').split(',') for row in f.readlines()]
    header,data = raw[0],raw[1:]
idx = {header[i]:i for i in range(len(header))} #build the dictionary so we can select columns by name
print(header)
print(data[1])

['report_date', 'city', 'city_population', 'cumulative_cases', 'cumulative_tests_reportable', 'cumulative_deaths', 'cases_7days', 'tests_reportable_7days', 'positive_naat_7days', 'tests_naat_7days', 'naat_positivity_7days', 'cumulative_positive_naat', 'cumulative_tests_naat', 'positive_ag_7days', 'case_rate_weekly', 'town_number', 'data_updated']
['"06/01/2023"', '"Ansonia"', '"18654"', '"6074"', '"88343"', '"62"', '', '"13"', '', '"13"', '', '"6523"', '"77049"', '', '', '"2"', '"06/01/2023"']


In [2]:
for e in data:
    print(e[idx['city_population']])

"3236"
"18654"
"4255"
"18276"
"3606"
"6222"
"20436"
"5548"
"19800"
"3402"
"21211"
"4884"
"2726"
"27900"
"144399"
"1635"
"59947"
"16973"
"8272"
"9704"
"1053"
"5079"
"10254"
"2239"
"28937"
"4213"
"12925"
"15809"
"1400"
"5379"
"1362"
"12407"
"13839"
"84694"
"21728"
"4443"
"12339"
"7165"
"5140"
"8997"
"12800"
"49872"
"28569"
"18462"
"11668"
"1790"
"7521"
"16467"
"43659"
"6668"
"62045"
"25497"
"1920"
"34482"
"2863"
"11507"
"62840"
"11534"
"38436"
"22133"
"8193"
"60556"
"1842"
"122105"
"2120"
"5420"
"9504"
"2777"
"17336"
"6364"
"7144"
"14621"
"4220"
"8094"
"2316"
"18030"
"57584"
"25487"
"6335"
"59395"
"7798"
"4374"
"46258"
"54747"
"19434"
"18508"
"2254"
"31108"
"72495"
"20233"
"13878"
"6656"
"130250"
"26858"
"26805"
"30014"
"27891"
"1630"
"14146"
"3251"
"23683"
"5196"
"88816"
"38768"
"7306"
"10061"
"13926"
"13255"
"15125"
"17534"
"11598"
"4203"
"9267"
"4625"
"9702"
"9389"
"9116"
"24959"
"20115"
"2152"
"4083"
"3600"
"1672"
"16437"
"2689"
"41129"
"3630"
"25395"
"10784"
"26162"
"19571"
"43834"


In [2]:
cols = header
print(cols)

['report_date', 'city', 'city_population', 'cumulative_cases', 'cumulative_tests_reportable', 'cumulative_deaths', 'cases_7days', 'tests_reportable_7days', 'positive_naat_7days', 'tests_naat_7days', 'naat_positivity_7days', 'cumulative_positive_naat', 'cumulative_tests_naat', 'positive_ag_7days', 'case_rate_weekly', 'town_number', 'data_updated']


In [3]:
date_data = [datetime.strptime(e[idx['report_date']].replace('"', ''), '%m/%d/%Y').timestamp() for e in data] #stored as datetime object

city_data = [e[idx['city']].replace('"', '') for e in data] #stored as string

population_data = [(np.nan if (e[idx['city_population']] == '') else float(e[idx['city_population']].replace('"', ''))) for e in data] #stored as float

case_data = [(np.nan if (e[idx['cumulative_cases']] == '') else float(e[idx['cumulative_cases']].replace('"', ''))) for e in data] #stored as float

death_data = [(np.nan if (e[idx['cumulative_deaths']] == '') else float(e[idx['cumulative_deaths']].replace('"', ''))) for e in data] #stored as float

rate_data = [(np.nan if (e[idx['case_rate_weekly']] == '')  else float(e[idx['case_rate_weekly']].replace('"', ''))) for e in data] #stored as float

print("done cleaning!")

done cleaning!


In [4]:
print('population domain')
print(min(population_data))
print(max(population_data),'\n')

print('case domain')
print(min(case_data))
print(max(case_data),'\n')

print('death domain')
print(min(death_data))
print(max(death_data),'\n')

print('rate domain')
print(min(rate_data))
print(max(rate_data),'\n')

print('date domain')
print(min(date_data))
print(max(date_data),'\n')



population domain
839.0
144399.0 

case domain
28.0
48917.0 

death domain
0.0
538.0 

rate domain
0.0
632.2053 

date domain
1655352000.0
1685592000.0 



In [7]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
date = np.asarray(date_data)
city = np.asarray(city_data)
population = np.asarray(population_data)
case = np.asarray(case_data)
death = np.asarray(death_data)
rate = np.asarray(rate_data)

n = len(date)
D = np.concatenate([population.reshape(n,1),case.reshape(n,1),
                    death.reshape(n,1),rate.reshape(n,1)],axis=1)
print(D,D.shape) #this will have the nan which will will use MICE to fill in

[[3.236000e+03 6.140000e+02 6.000000e+00 0.000000e+00]
 [1.865400e+04 6.074000e+03 6.200000e+01          nan]
 [4.255000e+03 8.740000e+02 7.000000e+00 0.000000e+00]
 ...
 [9.502000e+03 1.637000e+03 1.200000e+01 1.473374e+02]
 [7.858000e+03 1.435000e+03 7.000000e+00          nan]
 [         nan 2.594000e+03 2.000000e+00          nan]] (59160, 4)


In [8]:
imp = IterativeImputer(max_iter=20,random_state=0) #this MICE
imp.fit(D)
X = imp.transform(D)
X #no more nan values...

array([[3.23600000e+03, 6.14000000e+02, 6.00000000e+00, 0.00000000e+00],
       [1.86540000e+04, 6.07400000e+03, 6.20000000e+01, 5.65002585e+01],
       [4.25500000e+03, 8.74000000e+02, 7.00000000e+00, 0.00000000e+00],
       ...,
       [9.50200000e+03, 1.63700000e+03, 1.20000000e+01, 1.47337400e+02],
       [7.85800000e+03, 1.43500000e+03, 7.00000000e+00, 6.32393043e+01],
       [1.08089301e+04, 2.59400000e+03, 2.00000000e+00, 6.11471952e+01]])

In [11]:
imp_data = []
for i in range(len(D)):
    imp_row =[
        date[i],
        city[i],
        X[i][0],(0 if X[i][0]==D[i][0] else 1),
        X[i][1],(0 if X[i][1]==D[i][1] else 1),
        X[i][2],(0 if X[i][2]==D[i][2] else 1),
        X[i][3],(0 if X[i][3]==D[i][3] else 1),]
    imp_data += [imp_row]
imp_data[0:10]

[[np.float64(1685592000.0),
  np.str_('Andover'),
  np.float64(3236.0),
  0,
  np.float64(614.0),
  0,
  np.float64(6.0),
  0,
  np.float64(0.0),
  0],
 [np.float64(1685592000.0),
  np.str_('Ansonia'),
  np.float64(18654.0),
  0,
  np.float64(6074.0),
  0,
  np.float64(62.0),
  0,
  np.float64(56.500258494071616),
  1],
 [np.float64(1685592000.0),
  np.str_('Ashford'),
  np.float64(4255.0),
  0,
  np.float64(874.0),
  0,
  np.float64(7.0),
  0,
  np.float64(0.0),
  0],
 [np.float64(1685592000.0),
  np.str_('Avon'),
  np.float64(18276.0),
  0,
  np.float64(3822.0),
  0,
  np.float64(89.0),
  0,
  np.float64(72.96950195177749),
  1],
 [np.float64(1685592000.0),
  np.str_('Barkhamsted'),
  np.float64(3606.0),
  0,
  np.float64(649.0),
  0,
  np.float64(12.0),
  0,
  np.float64(0.0),
  0],
 [np.float64(1685592000.0),
  np.str_('Beacon Falls'),
  np.float64(6222.0),
  0,
  np.float64(1601.0),
  0,
  np.float64(13.0),
  0,
  np.float64(58.85205398059663),
  1],
 [np.float64(1685592000.0),
  

In [13]:
s = ','.join(['date','city','population', 'population_imp', 'case',
              'case_imp', 'death', 'death_imp', 'rate', 'rate_imp'])+'\n'
s += '\n'.join([','.join([str(x) for x in row]) for row in imp_data])+'\n'
with open('covid_imp.csv','w') as f: f.write(s)
print('done')

done
