## Script to clean wildfire data fetched from the NASA FIRMS API 
This dataset contains the dates, times and coordinates of detections that were rated with the fire confidence level 'high' by Visible Infrared Imaging Radiometer Suite (VIIRS) sensors.
1. Latitude and longitude are rounded to 2 decimal places.
2. Coordinates within a 0.2x0.2 degree tile on the same date are merged into a single point (the bottom-left corner of the tile, i.e. its lowest latitude and lowest longitude).

In [2]:
import pandas as pd
import numpy as np

# Load the raw fire-confidence export.
df = pd.read_csv('fire_confidence.csv')

# Round both coordinate columns to 2 decimal places.
for coordinate_column in ('latitude', 'longitude'):
    df[coordinate_column] = df[coordinate_column].round(2)

# Seed the *_clean columns with the rounded coordinates; the tile-uniting
# step overwrites them later.
df['latitude_clean'] = df['latitude']
df['longitude_clean'] = df['longitude']

# Use shorter names for the acquisition date / time columns.
df = df.rename(columns={'acq_date': 'date', 'acq_time': 'time'})


In [8]:
def coordinates_part_of_same_tile(lat1, lon1, lat2, lon2, delta=0.2):
    """Tell whether two coordinates lie within ``delta`` degrees of each other.

    The check is a proximity test on each axis separately: the absolute
    latitude difference and the absolute longitude difference must both be
    at most ``delta`` (boundaries included).

    Args:
        lat1: Latitude of the first point.
        lon1: Longitude of the first point.
        lat2: Latitude of the second point.
        lon2: Longitude of the second point.
        delta: Largest allowed difference per axis, in the same unit as the
            coordinates. Defaults to 0.2.

    Returns:
        A truthy value when both differences are within ``delta``, a falsy
        value otherwise.
    """
    latitude_gap = abs(lat1 - lat2)
    longitude_gap = abs(lon1 - lon2)
    return latitude_gap <= delta and longitude_gap <= delta

# Unite neighbouring detections: for every row, select all rows whose rounded
# latitude AND longitude are each within `tile_delta` degrees of that row, and
# give the whole selection the smallest latitude / longitude found in it.
#
# The selection mask is built with vectorised column arithmetic instead of a
# row-wise `df.apply(lambda ...)` per row. It produces the same boolean mask
# but avoids calling a Python lambda n*n times and copying a sub-DataFrame for
# every row.
#
# NOTE: rows handled later overwrite the *_clean values written for earlier
# rows, so the final values depend on the row order of df.
# NOTE(review): the selection does not look at the date column, while the
# notebook header talks about coordinates "on the same date" -- confirm which
# behaviour is intended.
tile_delta = 0.2  # same tolerance as the default `delta` of coordinates_part_of_same_tile

# Only the *_clean columns are written inside the loop, so the rounded source
# columns can be looked up once.
latitudes = df['latitude']
longitudes = df['longitude']

for lat, lon in zip(latitudes, longitudes):
    # rows within tile_delta of the current row on both axes
    in_same_tile = ((latitudes - lat).abs() <= tile_delta) & ((longitudes - lon).abs() <= tile_delta)

    # lowest left coordinate among the selected rows
    lowest_lat = latitudes[in_same_tile].min()
    lowest_lon = longitudes[in_same_tile].min()

    # assign the lowest left coordinate to every selected row
    print(f"uniting {lowest_lat} {lowest_lon} and {lat} {lon}")
    df.loc[in_same_tile, 'latitude_clean'] = lowest_lat
    df.loc[in_same_tile, 'longitude_clean'] = lowest_lon


# persist the cleaned coordinates
df.to_csv('fire_confidence_clean.csv', index=False)

In [4]:
# Show the cleaned frame in the notebook and write it to disk.
display(df)
df.to_csv('fire_confidence_clean.csv', index=False)

# Count the distinct (latitude_clean, longitude_clean) combinations.
clean_coordinate_columns = ['latitude_clean', 'longitude_clean']
grouped_df = df.groupby(clean_coordinate_columns)
num_groups = len(grouped_df)
print("Number of groups:", num_groups)

In [7]:
# Distinct (latitude_clean, longitude_clean) pairs.
# drop_duplicates() is used without inplace=True so that it returns a new
# DataFrame; calling it in place on a column selection of df can trigger
# pandas' SettingWithCopyWarning.
df_ll = df[['latitude_clean', 'longitude_clean']].drop_duplicates()
display(df_ll)

In [8]:
# # grouped_df = df.groupby('date')
# # processed_rows = []

# def coordinates_part_of_same_tile(lat_corner, lon_corner, lat, lon):
#     if lat_corner <= lat <= lat_corner + 0.2 and lon_corner <= lon <= lon_corner + 0.2:
#         return True
#     else:
#         return False

# def coordinates_same(lat_corner, lon_corner, lat, lon):
#     if lat == lat_corner and lon == lon_corner:
#         return True
#     else:
#         return False


# # # iterate over each date
# # for date, group in grouped_df:

# for row in df:

#     # initialize lowest coordinates from the first row of the group
#     lowest_lat = group.iloc[0]['latitude_clean']
#     lowest_lon = group.iloc[0]['longitude_clean']

#     # iterate over each entry for current date
#     for index, row in group.iterrows():
#         lat = row['latitude_clean']
#         lon = row['longitude_clean']

#         if coordinates_same (lowest_lat, lowest_lon, lat, lon):
#             continue

#         # check if current coordinates are part of the same tile
#         if coordinates_part_of_same_tile(lowest_lat, lowest_lon, lat, lon):
#             print(f"uniting {lowest_lat} {lowest_lon} and {lat} {lon} from {row['date']}")
#             df.loc[index, 'latitude_clean'] = lowest_lat
#             df.loc[index, 'longitude_clean'] = lowest_lon
#         elif coordinates_part_of_same_tile(lat, lon, lowest_lat, lowest_lon):
#             print(f"uniting {lowest_lat} {lowest_lon} and {lat} {lon}  from {row['date']}")
#             lowest_lat = lat
#             lowest_lon = lon # reassign lowest coordinates found so far

#         processed_rows.append(row)


# processed_df = pd.DataFrame(processed_rows)
# processed_df.to_csv('fire_confidence_clean.csv', index=False)
