## Script to clean wildfire data fetched from the NASA FIRMS API 
This dataset represents dates, times and coordinates rated with a fire confidence level 'high' based on Visible Infrared Imaging Radiometer Suite (VIIRS) sensors
1. Latitude and longitude are rounded to 2 decimal points 
2. Coordinates within a 0.2x0.2 degree tile on the same date are united to a single point (lowest bottom left of a tile)

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('fire_confidence.csv')

# round latitude and longitude to 2 decimal points
df['latitude'] = df['latitude'].round(2)
df['longitude'] = df['longitude'].round(2)

# create new columns latitude_clean and longitude_clean
df['latitude_clean'] = df['latitude']
df['longitude_clean'] = df['longitude']
df.rename(columns={'acq_date': 'date', 'acq_time' : 'time'}, inplace=True)


In [8]:
# determine if coordinates are within the same 0.2x0.2 degree tile
def coordinates_part_of_same_tile(lat1, lon1, lat2, lon2, delta=0.2):
    return abs(lat1 - lat2) <= delta and abs(lon1 - lon2) <= delta

# iterate over each row in the DataFrame to group coordinates
for index, row in df.iterrows():
    lat = row['latitude']
    lon = row['longitude']

    # find all rows that are within the same 0.2x0.2 degree tile
    in_same_tile = df.apply(lambda x: coordinates_part_of_same_tile(lat, lon, x['latitude'], x['longitude']), axis=1)

    #get the subset of the DataFrame that falls within the same tile
    tile_group = df[in_same_tile]

    # find the lowest left coordinate in this tile group
    lowest_lat = tile_group['latitude'].min()
    lowest_lon = tile_group['longitude'].min()

    # assign the lowest left coordinate to the latitude_clean and longitude_clean columns for the rows in this tile group
    print(f"uniting {lowest_lat} {lowest_lon} and {lat} {lon}")
    df.loc[in_same_tile, 'latitude_clean'] = lowest_lat
    df.loc[in_same_tile, 'longitude_clean'] = lowest_lon


df.to_csv('fire_confidence_clean.csv', index=False)

uniting 37.17 14.47 and 37.36 14.65
uniting 37.78 13.12 and 37.93 13.31
uniting 38.94 16.89 and 39.1 17.09
uniting 37.67 13.37 and 37.86 13.55
uniting 39.52 8.61 and 39.52 8.61
uniting 37.43 13.7 and 37.61 13.89
uniting 37.43 13.7 and 37.61 13.89
uniting 37.72 13.08 and 37.88 13.28
uniting 37.32 13.56 and 37.51 13.76
uniting 37.34 14.13 and 37.53 14.33
uniting 37.08 14.47 and 37.26 14.67
uniting 37.32 13.42 and 37.46 13.54
uniting 37.34 14.16 and 37.53 14.34
uniting 37.34 14.16 and 37.53 14.34
uniting 40.41 16.33 and 40.6 16.5
uniting 37.66 12.71 and 37.85 12.91
uniting 45.46 10.45 and 45.46 10.45
uniting 37.08 14.52 and 37.21 14.71
uniting 37.16 14.3 and 37.35 14.49
uniting 37.31 13.7 and 37.5 13.89
uniting 37.31 13.71 and 37.5 13.9
uniting 37.34 14.22 and 37.53 14.38
uniting 37.35 13.24 and 37.55 13.44
uniting 37.48 12.94 and 37.59 13.04
uniting 37.41 13.76 and 37.6 13.95
uniting 37.71 12.62 and 37.78 12.81
uniting 37.59 12.74 and 37.79 12.94
uniting 37.63 13.31 and 37.81 13.51
uniti

In [4]:
display(df)

df.to_csv('fire_confidence_clean.csv', index=False)

grouped_df = df.groupby(['latitude_clean', 'longitude_clean'])

num_groups = len(grouped_df)

print("Number of groups:", num_groups)

Unnamed: 0,latitude,longitude,date,time,confidence,latitude_clean,longitude_clean
0,37.36,14.65,2022-09-01,1244,h,37.34,14.59
1,37.93,13.31,2022-09-01,1244,h,37.72,13.29
2,39.10,17.09,2022-09-02,1045,h,39.09,16.87
3,37.86,13.55,2022-09-04,1149,h,37.72,13.29
4,39.52,8.61,2022-09-05,1130,h,39.34,8.61
...,...,...,...,...,...,...,...
1077,37.08,14.52,2024-06-09,1212,h,36.98,14.35
1078,40.38,18.08,2024-06-09,1212,h,40.03,18.00
1079,41.02,16.21,2024-06-12,1255,h,40.84,16.13
1080,41.02,16.21,2024-06-12,1255,h,40.84,16.13


Number of groups: 131


In [7]:
df_ll=df[['latitude_clean','longitude_clean']]
df_ll.drop_duplicates(inplace=True)
display(df_ll)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ll.drop_duplicates(inplace=True)


Unnamed: 0,latitude_clean,longitude_clean,date
0,37.34,14.59,2022-09-01
1,37.72,13.29,2022-09-01
2,39.09,16.87,2022-09-02
3,37.72,13.29,2022-09-04
4,39.34,8.61,2022-09-05
...,...,...,...
1074,40.43,17.20,2024-06-08
1077,36.98,14.35,2024-06-09
1078,40.03,18.00,2024-06-09
1079,40.84,16.13,2024-06-12


In [8]:
# # grouped_df = df.groupby('date')
# # processed_rows = []

# def coordinates_part_of_same_tile(lat_corner, lon_corner, lat, lon):
#     if lat_corner <= lat <= lat_corner + 0.2 and lon_corner <= lon <= lon_corner + 0.2:
#         return True
#     else:
#         return False

# def coordinates_same(lat_corner, lon_corner, lat, lon):
#     if lat == lat_corner and lon == lon_corner:
#         return True
#     else:
#         return False


# # # iterate over each date
# # for date, group in grouped_df:

# for row in df:

#     # initialize lowest coordinates from the first row of the group
#     lowest_lat = group.iloc[0]['latitude_clean']
#     lowest_lon = group.iloc[0]['longitude_clean']

#     # iterate over each entry for current date
#     for index, row in group.iterrows():
#         lat = row['latitude_clean']
#         lon = row['longitude_clean']

#         if coordinates_same (lowest_lat, lowest_lon, lat, lon):
#             continue

#         # check if current coordinates are part of the same tile
#         if coordinates_part_of_same_tile(lowest_lat, lowest_lon, lat, lon):
#             print(f"uniting {lowest_lat} {lowest_lon} and {lat} {lon} from {row['date']}")
#             df.loc[index, 'latitude_clean'] = lowest_lat
#             df.loc[index, 'longitude_clean'] = lowest_lon
#         elif coordinates_part_of_same_tile(lat, lon, lowest_lat, lowest_lon):
#             print(f"uniting {lowest_lat} {lowest_lon} and {lat} {lon}  from {row['date']}")
#             lowest_lat = lat
#             lowest_lon = lon # reassign lowest coordinates found so far

#         processed_rows.append(row)


# processed_df = pd.DataFrame(processed_rows)
# processed_df.to_csv('fire_confidence_clean.csv', index=False)


NameError: name 'group' is not defined