In [3]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import os
import pickle


### Raw Data

In [4]:
# Load the long-format KSA observation table and derive the two columns that the
# merges further down use as keys.
df_raw = pd.read_csv("/data/raw/data_ksa_long.csv").assign(
    # segment id followed by the sub-segment code, e.g. 110101001 + "A1" -> "110101001A1"
    idsubsegment2=lambda raw: raw["idsegmen"].astype("str") + raw["idsubsegmen"],
    # 22 / 23 are rewritten as the strings '2022' / '2023'; other values are left as they are
    tahun=lambda raw: raw["tahun"].replace({22: "2022", 23: "2023"}),
)
df_raw.head()

Unnamed: 0.1,Unnamed: 0,idsegmen,idsubsegmen,tahun,bulan,obs,idsubsegment2
0,0,110101001,A1,2022,1,8.0,110101001A1
1,1,110101001,A2,2022,1,4.0,110101001A2
2,2,110101001,A3,2022,1,4.0,110101001A3
3,3,110101001,B1,2022,1,8.0,110101001B1
4,4,110101001,B2,2022,1,4.0,110101001B2


In [5]:
# Sanity check: distinct `tahun` and `bulan` values present in the raw table.
print(df_raw["tahun"].unique(), df_raw["bulan"].unique(), sep="\n")

['2022' '2023']
[ 1  2  3  4  5  6  7  8  9 10 11 12]


In [7]:
# Number of rows per raw observation code (`obs`) in df_raw.
df_raw.groupby('obs').size()

obs
0.00       3414
1.00     507193
2.00     395336
3.00     527647
3.10       2887
3.20       2474
3.30       2207
4.00     981628
5.00     387644
6.00      31779
7.10     269072
7.11      50346
7.12        391
7.13       2143
7.14       3194
7.20       9908
7.30      19582
7.40      17976
7.50      36854
7.60      11658
7.70      56673
7.80      11259
7.90       1005
7.99     595387
8.00     469933
12.00      9890
dtype: int64

### Define column lists and input files

In [9]:
# Columns (and their order) kept for VV frames: the identifier/label columns,
# then VV_30 down to VV_0, then year and date. Originally copied from df.columns.
cols_VV = (
    ['idpoint', 'idsubsegment', 'idsegment', 'nth', 'periode', 'observation', 'MGRS']
    + [f'VV_{k}' for k in range(30, -1, -1)]
    + ['year', 'date']
)

In [10]:
# Columns (and their order) kept for each band. Both lists share one layout:
# identifier/label columns, <band>_30 down to <band>_0, then year and date.
cols_VH, cols_VV = (
    ['idpoint', 'idsubsegment', 'idsegment', 'nth', 'periode', 'observation', 'MGRS']
    + [f'{band}_{k}' for k in range(30, -1, -1)]
    + ['year', 'date']
    for band in ('VH', 'VV')
)

In [11]:
# Pickled input frames under training-test/32, one list per band. The order of
# each list is kept exactly as before because a later cell slices files_VV by position.
files_VV = [
    '/data/ksa/04_Data_Preprocessing/training-test/32/' + stem + '_32_VV.pkl'
    for stem in ('testing', 'training_imputation', 'training', 'testing_imputation')
]

files_VH = [
    '/data/ksa/04_Data_Preprocessing/training-test/32/' + stem + '_32_VH.pkl'
    for stem in ('testing_imputation', 'testing', 'training', 'training_imputation')
]

# files_VV = ['/data/ksa/04_Data_Preprocessing/training-test/32/training_imputation_varfilter10_32_VV.pkl']

# files_VH = ['/data/ksa/04_Data_Preprocessing/training-test/32/training_imputation_varfilter10_32_VH.pkl']

### VV-files

In [12]:
# Re-attach the raw observation codes from df_raw to the VV frames ("unrecode").
# `i`, `df` and `df2` stay bound after the loop; later cells inspect them.
# NOTE(review): only files_VV[2:] is processed and the pickle dump below is
# commented out, unlike the VH cell — confirm that this is intentional.
for i in files_VV[2:]:
    ## read data
    # pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(i, 'rb') as f:
        df = pickle.load(f)

    ## edit obs
    # nth is joined against df_raw.bulan below, so cast it to int first
    df['nth'] = df.nth.astype('int')

    merged = df.merge(
        df_raw[['obs', 'tahun', 'bulan', 'idsubsegment2']],
        how='left',
        left_on=['year', 'nth', 'idsubsegment'],
        right_on=['tahun', 'bulan', 'idsubsegment2'],
    )

    # Codes are handled as strings ('0.0', '7.11', ...). A row with no match in
    # df_raw becomes the string 'nan' here and is kept, exactly as before.
    obs = merged['obs'].astype('str')
    keep = ~obs.isin(['0.0', '12.0'])

    # Write the label on the freshly merged frame *before* filtering, so nothing is
    # assigned into a filtered slice (the original pattern raised
    # SettingWithCopyWarning). Every code starting with '7' collapses to '7.0';
    # Series.mask does this vectorised instead of a per-row lambda.
    merged['observation'] = obs.mask(obs.str.startswith('7'), '7.0')
    df2 = merged.loc[keep, cols_VV]

    ## dump pickle
    # with open(i[:-4]+'_unrecode.pkl', 'wb') as f:
    #     pickle.dump(df2, f)

    # break

In [15]:
# Row count per `observation` label in `df`, i.e. the labels stored in the
# most recently loaded pickle (before any relabelling).
df.groupby('observation').size()

observation
0.0    539950
1.0    124325
2.0     62900
3.0    109500
4.0    112950
5.0    160975
6.0    229500
7.0    176325
dtype: int64

In [16]:
# Row count per raw `obs` code in df_raw, shown again for comparison with the
# label counts of the loaded frame.
df_raw.groupby('obs').size()

obs
0.00       3414
1.00     507193
2.00     395336
3.00     527647
3.10       2887
3.20       2474
3.30       2207
4.00     981628
5.00     387644
6.00      31779
7.10     269072
7.11      50346
7.12        391
7.13       2143
7.14       3194
7.20       9908
7.30      19582
7.40      17976
7.50      36854
7.60      11658
7.70      56673
7.80      11259
7.90       1005
7.99     595387
8.00     469933
12.00      9890
dtype: int64

In [24]:
# Inspect the `nth` column of the most recently loaded frame (values and dtype).
df.nth 

0        10
1        10
2        10
3        10
4        10
         ..
14095    12
14096    12
14097    12
14098    12
14099    12
Name: nth, Length: 1519550, dtype: int64

In [30]:
# Rows of `df` labelled '6.0' with nth > 9 in year '2023'.
# Both `observation` and `year` are compared as strings here.
df.loc[(df.observation == '6.0') & (df.nth >9) & (df.year=='2023')]

Unnamed: 0,idpoint,idsubsegment,idsegment,nth,periode,observation,class,MGRS,VV_30,VV_29,...,VV_7,VV_6,VV_5,VV_4,VV_3,VV_2,VV_1,VV_0,year,date
150,321222001C1#01,321222001C1,321222001,10,2023_25,6.0,NP,48MZU,-6.284059,-7.502235,...,-12.424339,-10.866164,-12.132264,-11.510483,-11.188438,-10.690455,-11.453719,-6.526930,2023,2023-10-01
151,321222001C1#02,321222001C1,321222001,10,2023_25,6.0,NP,48MZU,-9.494167,-8.689418,...,-12.727200,-11.324289,-12.180267,-11.459779,-10.868052,-11.182851,-12.218193,-8.875887,2023,2023-10-01
152,321222001C1#03,321222001C1,321222001,10,2023_25,6.0,NP,48MZU,-10.717319,-8.771981,...,-12.172147,-11.157005,-12.315205,-11.588629,-11.450826,-10.494730,-9.949430,-8.307879,2023,2023-10-01
153,321222001C1#04,321222001C1,321222001,10,2023_25,6.0,NP,48MZU,-10.603805,-8.308215,...,-10.935440,-9.852500,-11.932938,-10.981853,-10.769513,-10.238853,-10.064614,-8.186713,2023,2023-10-01
154,321222001C1#05,321222001C1,321222001,10,2023_25,6.0,NP,48MZU,-11.170871,-9.290513,...,-11.196993,-9.976233,-11.402789,-10.881310,-11.026359,-9.924810,-9.812675,-6.455154,2023,2023-10-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13145,320608014C1#21,320608014C1,320608014,12,2023_30,6.0,NP,48MZS,-7.435377,-7.776232,...,-8.195408,-8.522762,-7.292422,-7.930414,-8.274657,-8.048275,-7.952509,-7.959874,2023,2023-12-01
13146,320608014C1#22,320608014C1,320608014,12,2023_30,6.0,NP,48MZS,-8.652041,-9.367982,...,-9.001326,-10.358762,-6.803561,-8.131852,-8.526231,-8.084930,-7.900497,-7.823379,2023,2023-12-01
13147,320608014C1#23,320608014C1,320608014,12,2023_30,6.0,NP,48MZS,-8.906265,-9.079432,...,-8.627879,-10.706377,-8.633032,-9.200946,-8.993944,-9.007940,-8.991070,-8.921111,2023,2023-12-01
13148,320608014C1#24,320608014C1,320608014,12,2023_30,6.0,NP,48MZS,-9.216525,-8.740955,...,-8.449161,-10.028898,-9.199806,-10.105874,-10.709974,-9.375631,-7.990122,-7.284871,2023,2023-12-01


In [13]:
# Shape of the last loaded frame, then label counts of its relabelled version.
print(df.shape, df2.groupby('observation').size(), sep='\n')

(1519550, 41)
observation
1.0    124325
2.0     62900
3.0     79600
3.1     10100
3.2      9925
3.3      9875
4.0    618675
5.0    195200
7.0    229500
8.0    176325
dtype: int64


### VH-files

In [11]:
# Re-attach the raw observation codes from df_raw to every VH frame ("unrecode")
# and save each result next to its input as <name>_unrecode.pkl.
# `i`, `df` and `df2` stay bound after the loop; later cells inspect them.
for i in files_VH:
    ## read data
    # pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(i, 'rb') as f:
        df = pickle.load(f)

    ## edit obs
    # nth is joined against df_raw.bulan below, so cast it to int first
    df['nth'] = df.nth.astype('int')

    merged = df.merge(
        df_raw[['obs', 'tahun', 'bulan', 'idsubsegment2']],
        how='left',
        left_on=['year', 'nth', 'idsubsegment'],
        right_on=['tahun', 'bulan', 'idsubsegment2'],
    )

    # Codes are handled as strings ('0.0', '7.11', ...). A row with no match in
    # df_raw becomes the string 'nan' here and is kept, exactly as before.
    obs = merged['obs'].astype('str')
    keep = ~obs.isin(['0.0', '12.0'])

    # Write the label on the freshly merged frame *before* filtering, so nothing is
    # assigned into a filtered slice (the original pattern raised
    # SettingWithCopyWarning). Every code starting with '7' collapses to '7.0';
    # Series.mask does this vectorised instead of a per-row lambda.
    merged['observation'] = obs.mask(obs.str.startswith('7'), '7.0')
    df2 = merged.loc[keep, cols_VH]

    ## dump pickle
    # i[:-4] strips the trailing '.pkl' of the input path
    with open(i[:-4]+'_unrecode.pkl', 'wb') as f:
        pickle.dump(df2, f)

    # break

In [12]:
# Shape of the last loaded frame, then label counts of its relabelled version.
print(df.shape, df2.groupby('observation').size(), sep='\n')

(1076875, 44)
observation
1.0    168350
2.0    130225
3.0    184650
4.0    233425
5.0     93300
7.0    138175
8.0    126375
dtype: int64


--------------

### Raw Data 2

In [49]:
# Columns (and their order) kept for each band: identifier/label columns,
# <band>_30 down to <band>_0, then year and date.
cols_VH, cols_VV = (
    ['idpoint', 'idsubsegment', 'idsegment', 'nth', 'periode', 'observation', 'MGRS']
    + [f'{band}_{k}' for k in range(30, -1, -1)]
    + ['year', 'date']
    for band in ('VH', 'VV')
)

# Input pickles for the second relabelling pass; compared with the earlier lists
# these also contain the varfilter5 / varfilter10 training sets. List order is
# unchanged from the original cell.
files_VV = [
    '/data/ksa/04_Data_Preprocessing/training-test/32/' + stem + '_32_VV.pkl'
    for stem in (
        'testing',
        'training_imputation',
        'training',
        'testing_imputation',
        'training_imputation_varfilter5',
        'training_imputation_varfilter10',
    )
]

files_VH = [
    '/data/ksa/04_Data_Preprocessing/training-test/32/' + stem + '_32_VH.pkl'
    for stem in (
        'testing_imputation',
        'testing',
        'training',
        'training_imputation',
        'training_imputation_varfilter5',
        'training_imputation_varfilter10',
    )
]


In [25]:
# Load the table whose `class` column supplies the labels for the second pass,
# and derive the same two merge-key columns as for df_raw.
df_raw2 = pd.read_csv("/data/raw/processed/relabelledandoriginal_data_ksa.csv").assign(
    # segment id followed by the sub-segment code
    idsubsegment2=lambda raw: raw["idsegmen"].astype("str") + raw["idsubsegmen"],
    # 22 / 23 are rewritten as the strings '2022' / '2023'; other values are left as they are
    tahun=lambda raw: raw["tahun"].replace({22: "2022", 23: "2023"}),
)
df_raw2.head()

Unnamed: 0,idsegmen,idsubsegmen,tahun,bulan,obs,nth,id_x,observation,class,idsubsegment2
0,110101001,A1,2022,1,8.0,0,110101001A1,8.0,NV,110101001A1
1,110101001,A2,2022,1,4.0,0,110101001A2,4.0,H,110101001A2
2,110101001,A3,2022,1,4.0,0,110101001A3,4.0,H,110101001A3
3,110101001,B1,2022,1,8.0,0,110101001B1,8.0,NV,110101001B1
4,110101001,B2,2022,1,4.0,0,110101001B2,4.0,H,110101001B2


In [26]:
# Sanity check: distinct `tahun` and `bulan` values present in df_raw2.
print(df_raw2["tahun"].unique(), df_raw2["bulan"].unique(), sep="\n")

['2022' '2023']
[ 1  2  3  4  5  6  7  8  9 10 11 12]


In [34]:
# Relabel every VV frame with the `class` column of df_raw2 and save the result
# next to its input as <name>_recode2.pkl.
# `i`, `df` and `df2` stay bound after the loop; later cells inspect them.
for i in files_VV:
    ## read data
    with open(i, 'rb') as src:
        df = pickle.load(src)

    ## edit obs
    df['nth'] = df['nth'].astype('int')
    # drop a pre-existing `class` column; otherwise the merge below would
    # produce class_x / class_y instead of a single `class`
    if 'class' in df.columns:
        df = df.drop(columns=['class'])

    labels = df_raw2[['class', 'tahun', 'bulan', 'idsubsegment2']]
    df2 = df.merge(
        labels,
        how='left',
        left_on=['year', 'nth', 'idsubsegment'],
        right_on=['tahun', 'bulan', 'idsubsegment2'],
    )

    # the merged class replaces the stored observation label
    df2 = df2.assign(observation=df2['class'])[cols_VV]

    # dump pickle
    with open(i[:-4] + '_recode2.pkl', 'wb') as dst:
        pickle.dump(df2, dst)

    # break

In [50]:
# Relabel every VH frame with the `class` column of df_raw2 and save the result
# next to its input as <name>_recode2.pkl.
# `i`, `df` and `df2` stay bound after the loop; later cells inspect them.
for i in files_VH:
    ## read data
    with open(i, 'rb') as src:
        df = pickle.load(src)

    ## edit obs
    df['nth'] = df['nth'].astype('int')
    # drop a pre-existing `class` column; otherwise the merge below would
    # produce class_x / class_y instead of a single `class`
    if 'class' in df.columns:
        df = df.drop(columns=['class'])

    labels = df_raw2[['class', 'tahun', 'bulan', 'idsubsegment2']]
    df2 = df.merge(
        labels,
        how='left',
        left_on=['year', 'nth', 'idsubsegment'],
        right_on=['tahun', 'bulan', 'idsubsegment2'],
    )

    # the merged class replaces the stored observation label
    df2 = df2.assign(observation=df2['class'])[cols_VH]

    # dump pickle
    with open(i[:-4] + '_recode2.pkl', 'wb') as dst:
        pickle.dump(df2, dst)

    # break

In [48]:
# Show the path currently held by the loop variable `i`.
# NOTE(review): the saved output is two paths fused into one string, which is what
# a missing comma between adjacent string literals produces. The file lists in this
# notebook do have that comma, so the output looks stale — re-run to confirm.
i

'/data/ksa/04_Data_Preprocessing/training-test/32/training_imputation_varfilter5_32_VH.pkl/data/ksa/04_Data_Preprocessing/training-test/32/training_imputation_varfilter10_32_VH.pkl'

In [44]:
# Shape and last rows of `df`, the most recently loaded input frame.
print(df.shape)
df.tail()

(1519550, 40)


Unnamed: 0,idpoint,idsubsegment,idsegment,nth,periode,observation,MGRS,VH_30,VH_29,VH_28,...,VH_7,VH_6,VH_5,VH_4,VH_3,VH_2,VH_1,VH_0,year,date
58720,321222101C3#21,321222101C3,321222101,12,2023_30,0.0,49MAN,-21.70273,-25.170452,-23.626261,...,-17.653775,-17.804113,-17.223524,-18.032181,-18.93556,-18.603713,-18.433805,-18.56768,2023,2023-12-01
58721,321222101C3#22,321222101C3,321222101,12,2023_30,0.0,49MAN,-21.438181,-25.02924,-22.548168,...,-17.362868,-16.814962,-17.157547,-17.204791,-17.553091,-17.691608,-17.909828,-18.144841,2023,2023-12-01
58722,321222101C3#23,321222101C3,321222101,12,2023_30,0.0,49MAN,-21.057226,-24.467693,-23.314026,...,-17.333859,-15.136828,-18.657932,-17.326017,-17.058886,-17.866014,-18.541021,-18.946023,2023,2023-12-01
58723,321222101C3#24,321222101C3,321222101,12,2023_30,0.0,49MAN,-22.040913,-24.115118,-25.292135,...,-17.599831,-16.47254,-18.111565,-17.559373,-17.519373,-18.82587,-20.140518,-21.0009,2023,2023-12-01
58724,321222101C3#25,321222101C3,321222101,12,2023_30,0.0,49MAN,-22.65252,-25.59034,-25.1819,...,-17.766392,-16.909813,-18.877167,-18.549246,-18.789925,-19.337137,-19.836735,-20.265898,2023,2023-12-01


In [45]:
# Shape and last rows of `df2`, the relabelled frame, for comparison with `df`.
print(df2.shape)
df2.tail()

(1519550, 40)


Unnamed: 0,idpoint,idsubsegment,idsegment,nth,periode,observation,MGRS,VH_30,VH_29,VH_28,...,VH_7,VH_6,VH_5,VH_4,VH_3,VH_2,VH_1,VH_0,year,date
1519545,321222101C3#21,321222101C3,321222101,12,2023_30,BP,49MAN,-21.70273,-25.170452,-23.626261,...,-17.653775,-17.804113,-17.223524,-18.032181,-18.93556,-18.603713,-18.433805,-18.56768,2023,2023-12-01
1519546,321222101C3#22,321222101C3,321222101,12,2023_30,BP,49MAN,-21.438181,-25.02924,-22.548168,...,-17.362868,-16.814962,-17.157547,-17.204791,-17.553091,-17.691608,-17.909828,-18.144841,2023,2023-12-01
1519547,321222101C3#23,321222101C3,321222101,12,2023_30,BP,49MAN,-21.057226,-24.467693,-23.314026,...,-17.333859,-15.136828,-18.657932,-17.326017,-17.058886,-17.866014,-18.541021,-18.946023,2023,2023-12-01
1519548,321222101C3#24,321222101C3,321222101,12,2023_30,BP,49MAN,-22.040913,-24.115118,-25.292135,...,-17.599831,-16.47254,-18.111565,-17.559373,-17.519373,-18.82587,-20.140518,-21.0009,2023,2023-12-01
1519549,321222101C3#25,321222101C3,321222101,12,2023_30,BP,49MAN,-22.65252,-25.59034,-25.1819,...,-17.766392,-16.909813,-18.877167,-18.549246,-18.789925,-19.337137,-19.836735,-20.265898,2023,2023-12-01


In [46]:
# Label counts in the input frame `df` (groupby leaves out missing labels by default).
df.groupby("observation").size()

observation
0.0    539950
1.0    124325
2.0     62900
3.0    109500
4.0    112950
5.0    160975
6.0    229500
7.0    176325
dtype: int64

In [43]:
# Label counts in the relabelled frame `df2` (groupby leaves out missing labels by default).
df2.groupby("observation").size()

observation
BP     505725
BPL     34225
G      109500
H      112950
NP     229500
NV     176325
PL     160975
V1     124325
V2      62900
dtype: int64

--------------------------

### Catatan (notes)

In [None]:
# Template: load one pickled frame whose path is built from idprov, cond and band.
# NOTE(review): idprov, cond and band are not defined in any visible cell; they must
# be set (as strings) before this cell can run.
# pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/data/ksa/04_Data_Preprocessing/training-test/'+idprov+'/'+cond+'_'+idprov+'_'+band+'.pkl', 'rb') as f:
    df = pickle.load(f)

In [None]:

# Template: attach the raw observation codes from df_raw to `df` and keep the
# columns listed in `cols_`.
# NOTE(review): `cols_` is not defined in any visible cell — presumably it stands for
# cols_VV or cols_VH depending on the band; set it before running.
merged = df.merge(
    df_raw[['obs', 'tahun', 'bulan', 'idsubsegment2']],
    how='left',
    left_on=['year', 'nth', 'idsubsegment'],
    right_on=['tahun', 'bulan', 'idsubsegment2'],
)

# Codes are handled as strings ('0.0', '7.11', ...). A row with no match in df_raw
# becomes the string 'nan' here and is kept, exactly as before.
obs = merged['obs'].astype('str')
keep = ~obs.isin(['0.0', '12.0'])

# Write the label on the freshly merged frame before filtering, so nothing is
# assigned into a filtered slice (the original pattern raised
# SettingWithCopyWarning). Every code starting with '7' collapses to '7.0'.
merged['observation'] = obs.mask(obs.str.startswith('7'), '7.0')
df2 = merged.loc[keep, cols_]

In [21]:
# Label counts in the relabelled frame `df2`.
df2.groupby('observation').size()#.sum()

observation
1.0    1683475
2.0    1302225
3.0    1846325
4.0    2334050
5.0     932875
7.0    1381675
8.0    1263550
dtype: int64

In [None]:
# (rows, columns) of the loaded input frame `df`.
df.shape

In [None]:
# Template: save the relabelled frame `df2` as <cond>_<idprov>_<band>_unrecode.pkl.
# pickle.dump needs both the object and the open file. The original call passed only
# the file handle, which raises TypeError after open(..., 'wb') has already truncated
# the target file.
# NOTE(review): idprov, cond and band are not defined in any visible cell.
with open('/data/ksa/04_Data_Preprocessing/training-test/'+idprov+'/'+cond+'_'+idprov+'_'+band+'_unrecode.pkl', 'wb') as f:
    pickle.dump(df2, f)