In [1]:
import geopandas as gpd
import pandas as pd
import os

In [2]:
# Tabular inputs: per-mauza QA summary, survey table, and the two flood-extent time series.
df_survey = pd.read_csv('../data/CDP_Survey/household_locations_impactevaluation_matched_floodlevel_exposure.csv')
df_summary = pd.read_csv('../data/FE_Results/June_Aug/MAUZ_flood_summary_QA.csv')
df_ts_interp = pd.read_csv('../data/FE_Results/June_Aug/MAUZ_flood_extent_interpolated.csv')
df_ts_sent = pd.read_csv('../data/FE_Results/June_Aug/MAUZ_flood_extent_sentinel.csv')

# Administrative boundaries, reprojected to EPSG:4326 right after loading.
gdf_adm = gpd.read_file('../data/ADM_Shp/selected_distict_mauza.shp')
gdf_adm = gdf_adm.to_crs('EPSG:4326')

Let's rename some of the columns to standardize them across datasets.

In [3]:
# Harmonise column names: every table is keyed on OBJECTID, and both time
# series expose FLOOD_FRACTION and DATE.
df_ts_sent = df_ts_sent.rename(
    {'MAUZ_PCODE': 'OBJECTID', 'flooded_fraction': 'FLOOD_FRACTION', 'date': 'DATE'},
    axis='columns',
)
df_ts_interp = df_ts_interp.rename(
    {'PCODE': 'OBJECTID', 'FLOOD_EXTENT': 'FLOOD_FRACTION', 'date': 'DATE'},
    axis='columns',
)
df_summary = df_summary.rename(
    {'COV': 'ERR', 'PCODE': 'OBJECTID'},
    axis='columns',
)

Let's add two additional QA flag columns.

In [4]:
# Boolean QA flags (strict comparisons against 20):
#   PEAK_DIFF - absolute value of DIFF_SAT exceeds 20
#   ERR_ERR   - ERR exceeds 20
# NOTE(review): the units of both thresholds are not visible here - confirm.
df_summary['PEAK_DIFF'] = df_summary['DIFF_SAT'].abs().gt(20)
df_summary['ERR_ERR'] = df_summary['ERR'].gt(20)

We'll subset the results to just the mauzas that have survey data.

In [5]:
# Distinct mauza identifiers that appear in the survey table.
survey_mauz = set(df_survey['OBJECTID'])

# Keep only the summary rows belonging to surveyed mauzas.
df_summary_survey = df_summary[df_summary['OBJECTID'].isin(survey_mauz)]

# Check coverage and uniqueness separately. Comparing only the row count with
# the number of survey mauzas can pass by accident when one mauza is missing
# from the summary and another one appears twice.
missing_mauz = survey_mauz - set(df_summary_survey['OBJECTID'])
assert not missing_mauz, (
    f'{len(missing_mauz)} survey mauza(s) have no flood summary row, '
    f'e.g. {list(missing_mauz)[:10]}'
)
assert df_summary_survey['OBJECTID'].is_unique, (
    'flood summary contains more than one row for some survey mauza(s)'
)

Let's also subset both of the time series datasets by the mauzas in the survey.

In [6]:
def _subset_to_survey(ts_frame, out_csv):
    """Filter a time-series table to the survey mauzas and save it.

    Keeps the rows of ``ts_frame`` whose OBJECTID is in ``survey_mauz``,
    asserts that the number of distinct OBJECTIDs kept equals the number of
    survey mauzas, writes the subset to ``out_csv`` without the index, and
    returns the subset.
    """
    subset = ts_frame[ts_frame['OBJECTID'].isin(survey_mauz)]
    assert len(set(subset.OBJECTID)) == len(survey_mauz)
    subset.to_csv(out_csv, index=False)
    return subset


df_ts_sent_survey = _subset_to_survey(
    df_ts_sent,
    '../data/FE_Results/June_Aug/MAUZ_flood_extent_sentinel_survey.csv',
)
df_ts_interp_survey = _subset_to_survey(
    df_ts_interp,
    '../data/FE_Results/June_Aug/MAUZ_flood_extent_interpolated_survey.csv',
)

Let's count the number of mauzas that have problems with the Gaussian fitting.

In [7]:
# Print the column sum of each QA flag, one per line, in this order.
for flag in ('NO_FIT', 'NEG', 'RIVER', 'FWHM_ERR', 'MAX_DIFF', 'PEAK_DIFF', 'ERR_ERR'):
    print(df_summary_survey[flag].sum())

3
0
1
1
1
1
4


Let's calculate the centroid of each mauza and join its latitude and longitude to the survey-subset summary table.

In [8]:
# Compute centroids in EPSG:32646 and convert them back to EPSG:4326 before
# reading off the coordinates.
# NOTE(review): presumably the round trip is there so the centroid is computed
# in a projected (planar) CRS rather than on lat/lon degrees - confirm.
gdf_adm = gdf_adm.to_crs('EPSG:32646')
gdf_adm['centroid'] = gdf_adm.centroid
gdf_adm = gdf_adm.set_geometry('centroid')
gdf_adm = gdf_adm.to_crs('EPSG:4326')
gdf_adm['LAT'] = gdf_adm['centroid'].y
gdf_adm['LON'] = gdf_adm['centroid'].x

# Attach the centroid coordinates to the survey-subset summary.
#  * Dropping any existing LAT/LON first keeps the cell re-runnable; without
#    it a second run would produce LAT_x/LAT_y and LON_x/LON_y columns.
#  * validate='many_to_one' raises if OBJECTID is duplicated in gdf_adm, which
#    would otherwise silently duplicate summary rows.
df_summary_survey = (
    df_summary_survey
    .drop(columns=['LAT', 'LON'], errors='ignore')
    .merge(
        gdf_adm[['OBJECTID', 'LAT', 'LON']],
        on='OBJECTID',
        how='left',
        validate='many_to_one',
    )
)

In [9]:
# Preview the first five rows of the joined table.
df_summary_survey.head()

Unnamed: 0,ERR,DIFF_SAT,FWHM,MAX_SAT,OBJECTID,PEAK_G,PEAK_SAT,RMSE,MAX_G,NO_FIT,NEG,RIVER,FWHM_ERR,MAX_DIFF,PEAK_DIFF,ERR_ERR,LAT,LON
0,1.808677,6.0,25.0,0.1145,425.0,2020-07-15,2020-07-21,0.018582,0.088123,False,False,False,False,False,False,False,26.21427,89.65501
1,3.122185,0.0,27.0,0.0653,489.0,2020-07-21,2020-07-21,0.013396,0.041417,False,False,False,False,False,False,False,26.195581,89.65772
2,3.678658,0.0,33.0,0.1253,533.0,2020-07-21,2020-07-21,0.029337,0.085286,False,False,False,False,False,False,False,26.181015,89.68044
3,2.594886,1.0,23.0,0.0698,546.0,2020-07-26,2020-07-27,0.013669,0.042601,False,False,False,False,False,False,False,26.185698,89.640361
4,1.789581,2.0,30.0,0.1427,605.0,2020-07-19,2020-07-21,0.020128,0.114673,False,False,False,False,False,False,False,26.171229,89.66635


In [10]:
# Save the survey-subset summary (with LAT/LON) to CSV, omitting the index.
df_summary_survey.to_csv(
    path_or_buf='../data/FE_Results/June_Aug/MAUZ_flood_summary_QA_survey.csv',
    index=False,
)