# Find duplicate granules in OPERA CSLC-S1 dataset

In [None]:
import leafmap
import pandas as pd
import geopandas as gpd
from datetime import datetime
import re

To search for and access the data, you will need to create an Earthdata login. You can register for an account at [urs.earthdata.nasa.gov](https://urs.earthdata.nasa.gov).

In [None]:
# Log in with NASA Earthdata credentials (see the registration link above).
leafmap.nasa_data_login()

## View available OPERA product metadata
A TSV of NASA Earthdata products is available in the [NASA-Earth-Data](https://github.com/opengeos/NASA-Earth-Data) repo. We filter it down to just the OPERA products.

In [None]:
# Read the tab-separated NASA Earthdata product catalog from the opengeos repo.
url = 'https://github.com/opengeos/NASA-Earth-Data/raw/main/nasa_earth_data.tsv'
earth_data_df = pd.read_csv(url, sep='\t')

# Keep only products whose ShortName mentions OPERA (case-insensitive).
# na=False counts a missing ShortName as "no match"; without it a NaN in the
# mask makes the boolean indexing raise instead of filtering.
opera_df = earth_data_df[
    earth_data_df['ShortName'].str.contains('OPERA', case=False, na=False)
]
opera_df

## Load all CSLC-S1 granules into a GeoDataFrame
For reference, as of Jan. 2024 there are ~225,000 CSLC-S1 granules, and it takes about 6 minutes to load them into the GeoDataFrame.

In [None]:
# Search the OPERA CSLC-S1 collection over the whole globe, from 2014-06-15
# through today's date, and get the matches back as a GeoDataFrame.
results, gdf = leafmap.nasa_data_search(
    short_name='OPERA_L2_CSLC-S1_V1',
    temporal=("2014-06-15", datetime.now().date().isoformat()),
    bounding_box=(-180.0, -90.0, 180, 90.0),
    cloud_hosted=True,
    return_gdf=True,
    count=-1,  # -1 asks for every matching granule
)

In [183]:
# Preview the last rows of the search results.
gdf.tail()

Unnamed: 0,size,concept-type,concept-id,revision-id,native-id,collection-concept-id,provider-id,format,revision-date,BeginningDateTime,...,Projects,ArchiveAndDistributionInformation,DayNightFlag,Identifiers,ProductionDateTime,Platforms,Name,URL,EntryTitle,geometry
223863,250.121507,granule,G2849544209-ASF,1,OPERA_L2_CSLC-S1_T135-288105-IW1_20240131T2245...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-02-01T23:40:25.998Z,2024-01-31T22:45:21Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T135-288105-IW1_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T135-288105-...,2024-02-01T23:08:14Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-75.07141 47.61408, -74.44043 47.696..."
223864,261.004582,granule,G2849544707-ASF,1,OPERA_L2_CSLC-S1_T135-288105-IW2_20240131T2245...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-02-01T23:40:31.959Z,2024-01-31T22:45:22Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T135-288105-IW2_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T135-288105-...,2024-02-01T23:22:37Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-73.97846 47.81257, -73.33305 47.889..."
223865,229.972458,granule,G2849545118-ASF,1,OPERA_L2_CSLC-S1_T135-288105-IW3_20240131T2245...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-02-01T23:40:36.143Z,2024-01-31T22:45:23Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T135-288105-IW3_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T135-288105-...,2024-02-01T23:37:07Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-72.82583 48.00699, -72.25928 48.068..."
223866,250.157903,granule,G2849544867-ASF,1,OPERA_L2_CSLC-S1_T135-288106-IW1_20240131T2245...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-02-01T23:40:33.607Z,2024-01-31T22:45:24Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T135-288106-IW1_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T135-288106-...,2024-02-01T23:09:48Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-75.11872 47.77942, -74.48927 47.861..."
223867,260.77182,granule,G2849545293-ASF,1,OPERA_L2_CSLC-S1_T135-288106-IW2_20240131T2245...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-02-01T23:40:38.195Z,2024-01-31T22:45:24Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T135-288106-IW2_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T135-288106-...,2024-02-01T23:24:14Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-74.02377 47.97781, -73.37483 48.055..."


### Make a list of 'native-id' from the files

In [184]:
# Pull every granule's native-id into a plain Python list and show a sample.
identifier_list = gdf['native-id'].to_list()
print(f'Total granules: {len(identifier_list)}')
print(identifier_list[:2])

Total granules: 223868
['OPERA_L2_CSLC-S1_T136-290041-IW2_20231004T001425Z_20240123T005841Z_S1A_VV_v1.0', 'OPERA_L2_CSLC-S1_T136-290041-IW3_20231004T001426Z_20240123T005841Z_S1A_VV_v1.0']


## Find duplicates

### Isolate the part of the file name that would be the same for duplicate granules

In [185]:
### Preview the part of the identifier that duplicate granules have in common
print(identifier_list[0][0:-29])   # the identifier minus its last 29 characters (for the sample name: everything through the first timestamp)

OPERA_L2_CSLC-S1_T136-290041-IW2_20231004T001425Z


### Create a set of duplicates and unique identifiers

In [186]:
# Track every key seen so far and, separately, the keys seen more than once.
# The key is the identifier with its trailing 29 characters removed.
unique_identifiers = set()
duplicate_identifiers = set()

for name in identifier_list:
    key = name[:-29]
    if key not in unique_identifiers:
        # First sighting of this key.
        unique_identifiers.add(key)
        continue
    # Seen before, so it has more than one version.
    duplicate_identifiers.add(key)

# List versions of both sets, for callers that need lists.
duplicate_identifiers_list = list(duplicate_identifiers)
unique_identifiers_list = list(unique_identifiers)

In [187]:
# Report the overall granule count and the number of keys that occur more than once.
print(f'Total CSLC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}: {len(identifier_list)}')
print(f'Granules with more than one version: {len(duplicate_identifiers)}')

Total CSLC-S1 granules as of 02-02-2024: 223868
Granules with more than one version: 23966


### Collect duplicates for each granule where duplicates exist

In [188]:
# Group full granule names under their shared key (the name minus its
# trailing 29 characters). Insertion order follows identifier_list.
granules_dictionary = {}
for granule in identifier_list:
    granules_dictionary.setdefault(granule[0:-29], []).append(granule)

# Keep only the groups that hold more than one granule.
duplicate_pairs = [
    group for group in granules_dictionary.values() if len(group) > 1
]

### Print the number of duplicates for each granule

In [189]:
# Total number of granules returned by the search.
print(f'Total CSLC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}: {len(identifier_list)}')

Total CSLC-S1 granules as of 02-02-2024: 223868


In [190]:
# Flatten the duplicate groups into a single list of granule names.
duplicates = [granule for pair in duplicate_pairs for granule in pair]
print(f'Total CSLC-S1 duplicate tiles as of {datetime.now().strftime("%d-%m-%Y")}: {len(duplicates)} granules')

Total CSLC-S1 duplicate tiles as of 02-02-2024: 48581 granules


In [191]:
# Groups that contain exactly two versions of a granule.
one_duplicate = [pair for pair in duplicate_pairs if len(pair) == 2]
print(f'Total CSLC-S1 duplicate tiles with 1 duplicate as of {datetime.now().strftime("%d-%m-%Y")}: {len(one_duplicate)} totaling {len(one_duplicate)*2} granules.')

Total CSLC-S1 duplicate tiles with 1 duplicate as of 02-02-2024: 23317 totaling 46634 granules.


In [192]:
# Groups that contain exactly three versions of a granule.
# The test is `== 3` rather than `> 2`: larger groups would otherwise be
# counted here too, and the `* 3` granule total below would be wrong.
two_duplicates = [pair for pair in duplicate_pairs if len(pair) == 3]
print(f'Total CSLC-S1 duplicate tiles with 2 duplicate as of {datetime.now().strftime("%d-%m-%Y")}: {len(two_duplicates)} totaling {len(two_duplicates)*3} granules.')

Total CSLC-S1 duplicate tiles with 2 duplicate as of 02-02-2024: 649 totaling 1947 granules.


In [193]:
# Groups that contain four or more versions of a granule (catch-all bucket).
three_duplicates = [pair for pair in duplicate_pairs if len(pair) > 3]

# Count the granules directly: multiplying the group count by 4 would
# undercount if any group holds more than four versions.
three_duplicates_granules = sum(len(pair) for pair in three_duplicates)
print(f'Total CSLC-S1 duplicate tiles with 3 duplicate as of {datetime.now().strftime("%d-%m-%Y")}: {len(three_duplicates)} totaling {three_duplicates_granules} granules.')

Total CSLC-S1 duplicate tiles with 3 duplicate as of 02-02-2024: 0 totaling 0 granules.


In [194]:
### Turn every granule name in each duplicate group into its .h5 URL
duplicate_urls = [
    [f'https://datapool.asf.alaska.edu/CSLC/OPERA-S1/{granule}.h5' for granule in pair]
    for pair in duplicate_pairs
]
    

### Add the url, burst IDs, and dates to the duplicate names

In [195]:
### Burst ID and date for each duplicate group
# Both are read at fixed character offsets from the first name in the group.
burst_ids = [pair[0][17:32] for pair in duplicate_pairs]
dates = [pair[0][33:41] for pair in duplicate_pairs]

In [196]:
# One row per duplicate group: its burst ID, its date, and the list of URLs.
duplicates_df = pd.DataFrame(
    dict(
        burst_id=burst_ids,
        date=dates,
        duplicates=duplicate_urls,
    )
)

### Make dataframe of all duplicates

In [197]:
# Spread each group's URL list across columns, one column per version.
# Building the frame from the list of lists avoids creating a Series per row.
duplicate_columns = pd.DataFrame(
    duplicates_df['duplicates'].tolist(), index=duplicates_df.index
)

# Name the columns from the actual group width instead of assuming exactly
# three: assigning a fixed list of five names raises a length mismatch
# whenever the largest group is not three. Later cells read duplicate_1
# through duplicate_3, so pad with empty columns up to three.
n_versions = max(3, duplicate_columns.shape[1])
duplicate_columns = duplicate_columns.reindex(columns=range(n_versions))
duplicate_columns.columns = [f'duplicate_{i + 1}' for i in range(n_versions)]

df_final = pd.concat([duplicates_df[['burst_id', 'date']], duplicate_columns], axis=1)

# Sort by burst_id
sorted_df = df_final.sort_values(by='burst_id')

In [198]:
# Preview the duplicate groups, sorted by burst ID.
sorted_df.head()

Unnamed: 0,burst_id,date,duplicate_1,duplicate_2,duplicate_3
6107,T001-000681-IW1,20231205,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,
6108,T001-000681-IW2,20231205,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,
6110,T001-000681-IW3,20231205,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,
6109,T001-000682-IW1,20231205,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,
6111,T001-000682-IW2,20231205,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,


### Add columns of interest from the original geodataframe for each duplicate and format it nicely (probably could be improved, but should work)

In [199]:
# `df2` is read by the merge cell below but was never assigned in this
# notebook, so a fresh kernel would raise NameError here. Its preview has the
# same columns and rows as `gdf`, so bind it to the granule GeoDataFrame.
# NOTE(review): confirm `gdf` is the intended source for `df2`.
df2 = gdf
df2.head()

Unnamed: 0,size,concept-type,concept-id,revision-id,native-id,collection-concept-id,provider-id,format,revision-date,BeginningDateTime,...,Projects,ArchiveAndDistributionInformation,DayNightFlag,Identifiers,ProductionDateTime,Platforms,Name,URL,EntryTitle,geometry
0,246.92954,granule,G2841527179-ASF,1,OPERA_L2_CSLC-S1_T136-290041-IW2_20231004T0014...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-01-23T01:46:29.099Z,2023-10-04T00:14:25Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T136-290041-IW2_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T136-290041-...,2024-01-23T01:14:00Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-90.86242 12.59866, -90.01808 12.762..."
1,217.356254,granule,G2841527159-ASF,1,OPERA_L2_CSLC-S1_T136-290041-IW3_20231004T0014...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-01-23T01:46:27.697Z,2023-10-04T00:14:26Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T136-290041-IW3_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T136-290041-...,2024-01-23T01:28:17Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-90.07809 12.81062, -89.33259 12.952..."
2,237.075946,granule,G2841527141-ASF,1,OPERA_L2_CSLC-S1_T136-290042-IW1_20231004T0014...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-01-23T01:46:25.885Z,2023-10-04T00:14:27Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T136-290042-IW1_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T136-290042-...,2024-01-23T01:00:03Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-91.64063 12.55924, -90.83033 12.718..."
3,246.630181,granule,G2841527201-ASF,1,OPERA_L2_CSLC-S1_T136-290042-IW2_20231004T0014...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-01-23T01:46:31.651Z,2023-10-04T00:14:28Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T136-290042-IW2_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T136-290042-...,2024-01-23T01:15:36Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-90.89615 12.76514, -90.05123 12.928..."
4,216.961326,granule,G2841527187-ASF,1,OPERA_L2_CSLC-S1_T136-290042-IW3_20231004T0014...,C2777443834-ASF,ASF,application/vnd.nasa.cmr.umm+json,2024-01-23T01:46:30.167Z,2023-10-04T00:14:29Z,...,[{'ShortName': 'SNWG/OPERA'}],[{'Name': 'OPERA_L2_CSLC-S1_T136-290042-IW3_20...,Unspecified,[{'Identifier': 'OPERA_L2_CSLC-S1_T136-290042-...,2024-01-23T01:29:41Z,"[{'ShortName': 'Sentinel-1A', 'Instruments': [...",UMM-G,https://cdn.earthdata.nasa.gov/umm/granule/v1.6.5,,"POLYGON ((-90.11125 12.97683, -89.36523 13.118..."


In [200]:
def extract_portion(url):
    """Return the file name, without its ``.h5`` extension, from a URL.

    The result is the run of non-slash characters immediately before the
    first ``.h5`` that the regex search finds. Returns ``None`` when ``url``
    is missing (None/NaN) or contains no such match.
    """
    if pd.isna(url):
        return None
    found = re.search(r'([^/]+)\.h5', url)
    return found.group(1) if found else None

# Build one wide table per duplicate group: the group's URLs plus the
# revision-date and ProductionDateTime of each version.
# NOTE(review): `df2` is not assigned in any visible cell; its preview has the
# same columns as `gdf` -- confirm it is the granule GeoDataFrame.

# Recover each version's granule name from its URL (None where the URL is missing).
# Note this adds columns to `sorted_df` in place.
sorted_df['extracted_portion_duplicate_1'] = sorted_df['duplicate_1'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_2'] = sorted_df['duplicate_2'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_3'] = sorted_df['duplicate_3'].apply(extract_portion)

# Look up the metadata for version 1, 2 and 3 separately by matching the
# extracted name against 'native-id'. These are inner joins, so a frame only
# keeps groups that actually have that version.
merged_df1 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_1', right_on='native-id', how='inner')
merged_df2 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_2', right_on='native-id', how='inner')
merged_df3 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_3', right_on='native-id', how='inner')

# Copy the two fields of interest into per-version column names so they do
# not collide (and get suffixed) when the three frames are joined below.
merged_df1['revision-date-1'] = merged_df1['revision-date']
merged_df2['revision-date-2'] = merged_df2['revision-date']
merged_df3['revision-date-3'] = merged_df3['revision-date']

merged_df1['ProductionDateTime-1'] = merged_df1['ProductionDateTime']
merged_df2['ProductionDateTime-2'] = merged_df2['ProductionDateTime']
merged_df3['ProductionDateTime-3'] = merged_df3['ProductionDateTime']

# Join the three per-version frames on the first version's URL. In the first
# join, overlapping columns from merged_df1 get '_df2' and those from
# merged_df2 get '_df1'. Left joins keep groups that have no third version.
merged_df = pd.merge(merged_df1, merged_df2, on='duplicate_1', how='left', suffixes=('_df2', '_df1'))
merged_df = pd.merge(merged_df, merged_df3, on='duplicate_1', how='left', suffixes=('_merged', '_df3'))

# Restore the plain column names from the '_df2' copies (the merged_df1 side).
merged_df['burst_id'] = merged_df['burst_id_df2']
merged_df['duplicate_2'] = merged_df['duplicate_2_df2']
merged_df['duplicate_3'] = merged_df['duplicate_3_df2']

# Note: 'merged' has no leading underscore, so it matches any column whose
# name ends in "merged".
suffixes_to_remove = ['_df1', '_df2', '_df3', '_df4','merged']

# Drop every column left over from the joins that still carries a suffix
for suffix in suffixes_to_remove:
    columns_to_drop = [col for col in merged_df.columns if (col.endswith(suffix))]
    merged_df = merged_df.drop(columns=columns_to_drop)


# Re-derive the first version's granule name to use as the join key below.
merged_df['extracted_portion_duplicate_1'] = merged_df['duplicate_1'].apply(extract_portion)

common_column = 'extracted_portion_duplicate_1'
column_to_include = 'BeginningDateTime'

# Bring in BeginningDateTime for the first version; the '_x'/'_y' column names
# used below are the suffixes pandas adds to overlapping columns in this merge
merged_df = pd.merge(merged_df, df2[['native-id', column_to_include]], left_on=common_column, right_on='native-id', how='left')

# Drop the temporary join key. The rename maps 'BeginningDateTime' to itself,
# so it changes nothing.
merged_df = merged_df.drop(columns=['extracted_portion_duplicate_1']).rename(columns={column_to_include: 'BeginningDateTime'})

# Use the value that came from df2 in the merge above (the '_y' side).
merged_df['BeginningDateTime'] = merged_df['BeginningDateTime_y']

# Remaining helper and metadata columns that are not part of the final table.
columns_to_drop = ['date','extracted_portion_duplicate_2', 'extracted_portion_duplicate_3',
       'size', 'concept-type', 'concept-id', 'revision-id', 'native-id_x',
       'provider-id', 'format', 'revision-date', 'BeginningDateTime_x',
       'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',
       'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',
       'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',
       'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',
       'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',
       'geometry', 'BeginningDateTime_y','native-id_y']

merged_df = merged_df.drop(columns=columns_to_drop)

# Specify the desired column order
desired_order = ['burst_id', 'duplicate_1', 'duplicate_2', 'duplicate_3','BeginningDateTime','revision-date-1','revision-date-2','revision-date-3','ProductionDateTime-1','ProductionDateTime-2','ProductionDateTime-3']

# Select the final columns in that order and preview the result
merged_df = merged_df[desired_order]
merged_df.head()

Unnamed: 0,burst_id,duplicate_1,duplicate_2,duplicate_3,BeginningDateTime,revision-date-1,revision-date-2,revision-date-3,ProductionDateTime-1,ProductionDateTime-2,ProductionDateTime-3
0,T001-000681-IW1,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,,2023-12-05T18:32:10Z,2023-12-06T13:40:41.565Z,2023-12-06T15:08:29.492Z,,2023-12-06T12:54:55Z,2023-12-06T14:22:29Z,
1,T001-000681-IW2,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,,2023-12-05T18:32:11Z,2023-12-06T13:40:35.107Z,2023-12-06T15:08:31.375Z,,2023-12-06T13:09:52Z,2023-12-06T14:37:31Z,
2,T001-000681-IW3,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,,2023-12-05T18:32:12Z,2023-12-06T13:40:38.838Z,2023-12-06T15:08:35.254Z,,2023-12-06T13:25:03Z,2023-12-06T14:52:51Z,
3,T001-000682-IW1,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,,2023-12-05T18:32:12Z,2023-12-06T13:40:32.558Z,2023-12-06T15:08:33.176Z,,2023-12-06T12:56:25Z,2023-12-06T14:23:59Z,
4,T001-000682-IW2,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,https://datapool.asf.alaska.edu/CSLC/OPERA-S1/...,,2023-12-05T18:32:13Z,2023-12-06T13:40:41.948Z,2023-12-06T15:08:32.132Z,,2023-12-06T13:11:24Z,2023-12-06T14:39:04Z,


### Output dataframe as csv

In [201]:
# Write the duplicates table to a CSV named with today's date (day-month-year).
date_stamp = datetime.now().strftime("%d-%m-%Y")
csv_file_path = f'CSLC-S1_duplicates_{date_stamp}.csv'
merged_df.to_csv(csv_file_path, index=False)

## Output duplicates as geojson

In [202]:
# Metadata columns to leave out of the exported file.
columns_to_drop = ['size', 'concept-type', 'concept-id', 'revision-id', 'native-id',
       'provider-id', 'format', 'revision-date', 'BeginningDateTime',
       'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',
       'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',
       'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',
       'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',
       'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',]

# Keep only the rows whose native-id is in the duplicates list, then drop the
# metadata columns listed above.
is_duplicate = gdf['native-id'].isin(duplicates)
duplicates_gdf = gdf.loc[is_duplicate].drop(columns=columns_to_drop)

# Write the result as GeoJSON, named with today's date (day-month-year).
duplicates_gdf.to_file(f'CSLC-S1_duplicates_{datetime.now().strftime("%d-%m-%Y")}.geojson', driver='GeoJSON')