/
main.py
70 lines (62 loc) · 2.87 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pandas as pd
import matplotlib.pyplot as plt
import utils, plot_utils
# Seed for the synthetic-risk random state. Within this file it is referenced
# only by commented-out code in main().
SEED: int = 12345

# Year-month bounds of the synthetic-risk date range. Like SEED, these are
# referenced only by commented-out code in main() within this file.
START_DATE: str = "2008-01"
END_DATE: str = "2020-01"

# Excel workbook holding the risk data; main() reads it from 'input/risk/'.
FILENAME_ZIMBABWE: str = "01_zim_original-1.xlsx"
def main():
    """Evaluate the risk signal against recorded outbreaks for each admin-2 region.

    Steps:
      1. Load the outbreaks, the shocks, the risk workbook
         (``input/risk/<FILENAME_ZIMBABWE>``) and the running performance frame.
      2. For every ``(pcode, name)`` pair in the admin-2 shortlist: select that
         region's outbreaks and shocks, build its risk frame, flag the dates
         that match an outbreak month, compute per-threshold detection
         performance, add the region's TP / FP / FN counts to the running
         totals (summed per threshold), and draw the per-region plots.
      3. Save the stacked shocks / outbreaks figure to
         ``plots/outbreaks_shocks.png``.
      4. Recompute F1 on the summed counts and save
         ``plots/full_confusion_matrix.png`` and ``plots/full_performance.png``.

    Raises:
        ValueError: If the admin-2 shortlist is empty, since there would be
            nothing to analyze and no subplot grid could be created.
    """
    # NOTE: the synthetic-risk path below is disabled; the real Zimbabwe risk
    # data is used instead.
    # rs = np.random.RandomState(SEED)

    # Load the inputs shared by all admin regions
    df_outbreaks = utils.get_outbreaks()
    df_shocks = utils.get_shocks_data()
    df_risk_all = pd.read_excel(f'input/risk/{FILENAME_ZIMBABWE}')
    # Running TP / FP / FN totals per threshold.
    # NOTE(review): presumably starts out empty or zeroed -- confirm in utils.
    df_performance_all = utils.get_df_performance_all()

    # Get adm2 present in risks file
    adm2_shortlist = utils.get_adm2_shortlist(df_risk_all)
    n_regions = len(adm2_shortlist)
    if n_regions == 0:
        # plt.subplots() rejects a grid with zero rows; fail with a clear message instead
        raise ValueError('utils.get_adm2_shortlist returned no admin regions; nothing to analyze')

    # One stacked subplot per region for outbreaks and shocks.
    # squeeze=False keeps axs two-dimensional, so indexing also works when
    # there is only a single region.
    fig, axs = plt.subplots(n_regions, 1, figsize=(10, 10), squeeze=False)
    for iadm2, (admin2_pcode, admin2_name) in enumerate(adm2_shortlist):
        print(f'Analyzing admin region {admin2_name}')
        df_outbreak = df_outbreaks[df_outbreaks['admin2Pcode'] == admin2_pcode]
        df_shock = df_shocks[df_shocks['pcode'] == admin2_pcode]

        # Disabled alternative: make fake risk data
        # df_risk = utils.generate_fake_risk(rs, START_DATE, END_DATE)
        # Get risk from Zimbabwe data
        df_risk = utils.get_risk_df(df_risk_all, admin2_name)

        # Get outbreak date indices: rows whose date matches an outbreak month
        df_risk['outbreak'] = df_risk['date'].isin(df_outbreak['Outbreak month'])
        real_outbreaks = df_risk[df_risk['outbreak']].index.values

        # Get shocks (the helper also returns an updated risk frame)
        shocks, df_risk = utils.get_shocks(df_shock, df_risk)

        # Get detections per threshold
        df_performance = utils.loop_over_thresholds(df_risk['risk'], real_outbreaks)
        df_performance = utils.calculate_f1(df_performance)

        # Add this region's counts to the full frame, summing per threshold
        df_performance_all = (pd.concat([df_performance[['thresh', 'TP', 'FP', 'FN']], df_performance_all])
                              .groupby(['thresh'])
                              .sum()
                              .reset_index())

        # Make the per-region plots
        plot_utils.plot_adm2(df_risk, df_performance, real_outbreaks, shocks, admin2_pcode, admin2_name)

        # Plot shocks / outbreaks; only the bottom subplot shows the x axis
        plot_utils.plot_shocks_and_outbreaks(axs[iadm2, 0], real_outbreaks, shocks, admin2_name, df_risk,
                                             show_x_axis=(iadm2 == n_regions - 1))

    # TODO: evaluate the best threshold value and calculate the overall value of precision and recall

    # Save the shocks / outbreaks figure
    fig.savefig('plots/outbreaks_shocks.png')
    plt.close(fig)

    # Calculate overall performance from the summed counts
    df_performance_all = utils.calculate_f1(df_performance_all)

    # Confusion matrix
    fig, ax = plt.subplots()
    plot_utils.plot_confusion_matrix(df_performance_all, ax)
    fig.savefig('plots/full_confusion_matrix.png')
    # Close this specific figure rather than whichever one is current
    plt.close(fig)

    # Performance
    fig, ax = plt.subplots()
    plot_utils.plot_performance(df_performance_all, ax)
    fig.savefig('plots/full_performance.png')
    plt.close(fig)
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()