## Label file creation + reorganization for ActiveFire dataset

The output of this notebook (`labels/labels.csv`) is already included in the latest version of the repository, but to recreate it, first clone the Oxen repository:

```bash
oxen clone https://hub.oxen.ai/ba/ActiveFire
```

Then run this file from the root of the ActiveFire directory.

In [None]:
import glob
import os

import pandas as pd

In [93]:
# Glob existing files to get counts per directory after unpacking.
# glob returns matches in an arbitrary, filesystem-dependent order, so each listing
# is sorted to make anything derived from these lists reproducible across re-runs.
landsat = sorted(glob.glob('data/landsat_patches/*'))
manual = sorted(glob.glob('data/manual_annotations_patches/*'))
masks = sorted(glob.glob('data/masks_patches/*'))

In [94]:
# Report how many files were found in each of the three patch directories
patch_count_report = [
    f"Landsat patches: {len(landsat)}",
    f"Manually annotated patches: {len(manual)}",
    f"Masks annotated patches: {len(masks)}",
]
print("\n".join(patch_count_report))

Landsat patches: 9045
Manually annotated patches: 100
Masks annotated patches: 1098


In [95]:
# Collect the bare file name of every patch / label file so data can be matched to labels.
# os.path.basename is used rather than splitting on '/' and taking a fixed position, so the
# result does not depend on the path separator glob returns or on the directory depth.
landsat_paths = pd.Series([os.path.basename(path) for path in landsat])
manual_paths = pd.Series([os.path.basename(path) for path in manual])
mask_paths = pd.Series([os.path.basename(path) for path in masks])

In [96]:
# Separate out the 5 different non-manual masking methods and get their counts
def _select_masks(method_name):
    """Return the mask file names that contain `method_name`."""
    return mask_paths[mask_paths.str.contains(method_name)]


kumar = _select_masks('Kumar')
intersection = _select_masks('Intersection')
murphy = _select_masks('Murphy')
schroeder = _select_masks('Schroeder')
voting = _select_masks('Voting')

# One line per method: its name followed by the number of matching mask files
for method_name, subset in (
    ("Kumar", kumar),
    ("Intersection", intersection),
    ("Murphy", murphy),
    ("Schroeder", schroeder),
    ("Voting", voting),
):
    print(method_name, len(subset))

Kumar 391
Intersection 118
Murphy 164
Schroeder 227
Voting 198


In [97]:
# Convert all to dataframes to make new columns.
# Each resulting frame has a single column labelled 0 holding the file names.
landsat, manual = pd.DataFrame(landsat_paths), pd.DataFrame(manual_paths)
kumar, intersection, murphy, schroeder, voting = (
    pd.DataFrame(subset)
    for subset in (kumar, intersection, murphy, schroeder, voting)
)


In [98]:
# Helper function to slice out the unique additive to each path, leaving only the matchable scene_ids
def get_scene_name(path: str, key: str) -> str:
    """Return `path` with every occurrence of `key` removed.

    An empty `key` leaves `path` unchanged.
    """
    return path.replace(key, '')

# Marker text that each source inserts into its file names; removing it leaves the
# shared patch id. Landsat file names carry no marker, hence the empty string.
keys = {
    'kumar': 'Kumar-Roy_',
    'intersection': 'Intersection_',
    'murphy': 'Murphy_',
    'schroeder': 'Schroeder_',
    'voting': 'Voting_',
    'manual': 'v1_',
    'landsat': '',
}

# Frames keyed by source name; landsat comes first
dfs = {
    'landsat': landsat,
    'kumar': kumar,
    'intersection': intersection,
    'murphy': murphy,
    'schroeder': schroeder,
    'voting': voting,
    'manual': manual,
}

# Derive the matchable patch_id column from the raw file names (column 0) of every frame
for source, frame in dfs.items():
    marker = keys[source]
    frame['patch_id'] = frame[0].apply(get_scene_name, args=(marker,))


In [99]:
# Rename each frame's raw file-name column (labelled 0) to '<source>_path', in place
for key in dfs:
    dfs[key].rename(columns={0: f'{key}_path'}, inplace=True)

In [100]:
# Left-join every label frame onto the landsat frame via the shared patch_id column,
# so the landsat patches remain the source of truth for which rows exist
merged = dfs['landsat']
for key in dfs:
    if key == 'landsat':
        continue
    merged = merged.merge(dfs[key], on='patch_id', how='left')

In [101]:
# Ensure merge happened properly and all paths in sub-files matched a landsat file:
# the number of non-missing paths per source must equal that source's original row count
for key, df in dfs.items():
    original_count = len(df)
    merged_count = merged[f'{key}_path'].notna().sum()
    assert merged_count == original_count, f"❌ Error with {key} merge. Merged count: {merged_count}, Original count: {original_count}"
    print(f"✅ {key} count: {merged_count} equals original count of {original_count}")


✅ landsat count: 9045 equals original count of 9045
✅ kumar count: 391 equals original count of 391
✅ intersection count: 118 equals original count of 118
✅ murphy count: 164 equals original count of 164
✅ schroeder count: 227 equals original count of 227
✅ voting count: 198 equals original count of 198
✅ manual count: 100 equals original count of 100


In [102]:
# Create a scene id for merging with metadata before changing column identifiers:
# the landsat file name without its final '_'-separated piece
merged['scene_id'] = merged.landsat_path.apply(lambda name: "_".join(name.split('_')[:-1]))


def _prepend_folder(folder):
    """Return a mapper that prefixes a file name with `folder`, passing missing values through."""
    def prepend(name):
        # A missing value (NaN) is not equal to itself, so unmatched rows are left untouched
        return folder + name if name == name else name
    return prepend


# Generate the full correct relative paths - these columns have different base folders
merged['landsat_path'] = merged.landsat_path.apply(_prepend_folder('data/landsat_patches/'))
merged['manual_path'] = merged.manual_path.apply(_prepend_folder('data/manual_annotations_patches/'))

# Every other source's files share the masks folder
for key in dfs:
    if key in ('landsat', 'manual'):
        continue
    merged[f'{key}_path'] = merged[f'{key}_path'].apply(_prepend_folder('data/masks_patches/'))


In [103]:
# Merge with scene-level metadata: each patch row picks up the metadata row whose
# productId equals its scene_id; patches without a match keep missing metadata values
metadata = pd.read_csv('labels/images202009.csv', sep=';')
final_merged = pd.merge(merged, metadata, left_on='scene_id', right_on='productId', how='left')

# DataFrame.to_csv has no `drop` argument (passing it raises TypeError); `index=False`
# is the option that keeps the row index out of the written file.
final_merged.to_csv("labels/labels.csv", index=False)