# Merge folder names for the two satellites and fire information

In [1]:
import pandas as pd
import ast
import os
import numpy as np
import csv
import sys
from shapely.geometry import Polygon, Point
from datetime import datetime, timedelta

project_path = os.environ.get('PROJECT_PATH')
sys.path.append(project_path)

from mine_seg_sat import dataset


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# open tiles_path.csv for both sentinel2 and landsat2 then save into two dataframes

# read the csv file
tiles_path_sentinel2 = pd.read_csv('../dataset_tables/sentinel2_tiles_path.csv')
# tiles_path_landsat2 = pd.read_csv('../dataset_tables/landsat2_tiles_path.csv')
fire_coords = pd.read_csv('../dataset_tables/cleaned_wildfire_infomation.csv')

In [3]:
# convert the string coordinates into list of coordinates
tiles_path_sentinel2['coordinates'] = tiles_path_sentinel2['coordinates'].apply(lambda x: [list(coord) for coord in ast.literal_eval(x)])
fire_coords['coordinates'] = fire_coords['coordinates'].apply(lambda x: [float(coord) for coord in ast.literal_eval(x)])

# Convert coordinates list in tiles_path_sentinel2 to Polygon
tiles_path_sentinel2['coordinates'] = tiles_path_sentinel2['coordinates'].apply(lambda x: Polygon(x))

# Convert coordinates list in fire_coords to Point
fire_coords['coordinates'] = fire_coords['coordinates'].apply(lambda x: Point(x))

# Convert date(int) to datetime(YYYY-MM-DD)
fire_coords['date'] = pd.to_datetime(fire_coords['date'], format='%Y%m%d')
tiles_path_sentinel2['date'] = pd.to_datetime(tiles_path_sentinel2['date'], format='%Y%m%d')

In [4]:
# Create a new column in tiles_path_sentinel2 called "if_fire" and set it to None
tiles_path_sentinel2['if_fire'] = False

# compare the columns of data and coordinates(list) in fire_coords
# with the columns of data and coordinates(Polygon) tiles_path_sentinel2
# if the coordinates of fire_coords is within the coordinates of tiles_path_sentinel2
# then set the if_fire column to True
for index, row in fire_coords.iterrows():
    for index2, row2 in tiles_path_sentinel2.iterrows():
        point = row['coordinates']
        polygon = row2['coordinates']
        date = row['date']
        date2 = row2['date']
        if polygon.contains(point) and timedelta(days=0) <= date - date2 <= timedelta(days=2):
            tiles_path_sentinel2.at[index2, 'if_fire'] = True
            print(f"fire time: {date} and tile time: {date2}.")

fire time: 2022-07-26 00:00:00 and tile time: 2022-07-25 00:00:00.
fire time: 2022-08-01 00:00:00 and tile time: 2022-07-30 00:00:00.
fire time: 2022-08-01 00:00:00 and tile time: 2022-08-01 00:00:00.
fire time: 2022-08-02 00:00:00 and tile time: 2022-08-01 00:00:00.
fire time: 2022-08-10 00:00:00 and tile time: 2022-08-09 00:00:00.
fire time: 2022-08-10 00:00:00 and tile time: 2022-08-09 00:00:00.
fire time: 2022-08-11 00:00:00 and tile time: 2022-08-09 00:00:00.
fire time: 2022-08-17 00:00:00 and tile time: 2022-08-16 00:00:00.
fire time: 2022-09-01 00:00:00 and tile time: 2022-08-31 00:00:00.
fire time: 2022-09-20 00:00:00 and tile time: 2022-09-20 00:00:00.
fire time: 2022-09-21 00:00:00 and tile time: 2022-09-20 00:00:00.
fire time: 2022-11-17 00:00:00 and tile time: 2022-11-17 00:00:00.


In [5]:
fire_coords

Unnamed: 0,date,coordinates,points
0,2022-04-07,POINT (-124.511617 49.345583),POINT (-124.511617 49.345583)
1,2022-05-26,POINT (-124.626617 49.288833),POINT (-124.626617 49.288833)
2,2022-05-29,POINT (-123.624117 49.461333),POINT (-123.624117 49.461333)
3,2022-06-28,POINT (-125.9002 50.107333),POINT (-125.9002 50.107333)
4,2022-06-28,POINT (-125.9647 50.1548),POINT (-125.9647 50.1548)
...,...,...,...
160,2022-11-17,POINT (-125.311517 49.42095),POINT (-125.311517 49.42095)
161,2022-11-19,POINT (-123.9036 49.529167),POINT (-123.9036 49.529167)
162,2022-11-20,POINT (-124.669383 49.289117),POINT (-124.669383 49.289117)
163,2022-11-29,POINT (-123.722233 48.6723),POINT (-123.722233 48.6723)


In [6]:
tiles_path_sentinel2

Unnamed: 0,path_name_sentinel2,satellite,imagery_id,date,tile_id,saving_path,coordinates,if_fire
0,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,2022-08-09,3_4,../prepared_dataset/20220809/3_4,POLYGON ((-125.7607154787941 50.29994118390697...,False
1,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,2022-08-09,3_3,../prepared_dataset/20220809/3_3,POLYGON ((-125.8683865696154 50.30289429674448...,False
2,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,2022-08-09,9_4,../prepared_dataset/20220809/9_4,"POLYGON ((-125.788519596963 49.88615910147051,...",False
3,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,2022-08-09,9_3,../prepared_dataset/20220809/9_3,POLYGON ((-125.89527020203685 49.8890694688280...,False
4,../datasets/sentinel2/S2B_9UYR_20220809_0_L2A/...,sentinel2,S2B_9UYR_20220809_0_L2A,2022-08-09,12_5,../prepared_dataset/20220809/12_5,POLYGON ((-125.69590068808976 49.6762660456515...,False
...,...,...,...,...,...,...,...,...
4185,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,sentinel2,S2B_10UCU_20221002_0_L2A,2022-10-02,3_12,../prepared_dataset/20221002/3_12,POLYGON ((-130.4608929423896 48.53647182827095...,False
4186,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,sentinel2,S2B_10UCU_20221002_0_L2A,2022-10-02,3_13,../prepared_dataset/20221002/3_13,POLYGON ((-130.35688465087512 48.5377448700640...,False
4187,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,sentinel2,S2B_10UCU_20221002_0_L2A,2022-10-02,0_13,../prepared_dataset/20221002/0_13,POLYGON ((-130.36245505099888 48.7449584028832...,False
4188,../datasets/sentinel2/S2B_10UCU_20221002_0_L2A...,sentinel2,S2B_10UCU_20221002_0_L2A,2022-10-02,1_13,../prepared_dataset/20221002/1_13,POLYGON ((-130.36059120428123 48.6758881108587...,False


In [7]:
# count the number of rows where if_fire is True
(tiles_path_sentinel2['if_fire'] == True).sum()


12

In [8]:
# Randomly select 5% of the rows in each month where 'if_fire' is False
# and merge these rows with the rows where 'if_fire' is True

tiles_path_sentinel2['month'] = tiles_path_sentinel2['date'].dt.month

# For rows where 'if_fire' is False, group by month and randomly select 5% of the rows in each group
df_false = tiles_path_sentinel2[tiles_path_sentinel2['if_fire'] == False].groupby('month').apply(lambda x: x.sample(frac=0.05))

# Get the rows where 'if_fire' is True
df_true = tiles_path_sentinel2[tiles_path_sentinel2['if_fire'] == True]

# Merge the two dataframes
df_sentinel2 = pd.concat([df_false, df_true])

df_sentinel2


  df_false = tiles_path_sentinel2[tiles_path_sentinel2['if_fire'] == False].groupby('month').apply(lambda x: x.sample(frac=0.05))


Unnamed: 0,path_name_sentinel2,satellite,imagery_id,date,tile_id,saving_path,coordinates,if_fire,month
"(7, 272)",../datasets/sentinel2/S2A_9UXR_20220725_0_L2A/...,sentinel2,S2A_9UXR_20220725_0_L2A,2022-07-25,8_12,../prepared_dataset/20220725/8_12,POLYGON ((-126.32028067944972 49.9687669997153...,False,7
"(7, 2164)",../datasets/sentinel2/S2B_9UYR_20220730_0_L2A/...,sentinel2,S2B_9UYR_20220730_0_L2A,2022-07-30,11_5,../prepared_dataset/20220730/11_5,POLYGON ((-125.69121414952446 49.7452287157063...,False,7
"(7, 321)",../datasets/sentinel2/S2A_9UXR_20220725_0_L2A/...,sentinel2,S2A_9UXR_20220725_0_L2A,2022-07-25,9_11,../prepared_dataset/20220725/9_11,POLYGON ((-126.43093758190167 49.9021850004495...,False,7
"(7, 780)",../datasets/sentinel2/S2A_9UYQ_20220725_0_L2A/...,sentinel2,S2A_9UYQ_20220725_0_L2A,2022-07-25,2_4,../prepared_dataset/20220725/2_4,POLYGON ((-125.81579145711164 49.4707196283744...,False,7
"(7, 3302)",../datasets/sentinel2/S2B_9UWS_20220730_0_L2A/...,sentinel2,S2B_9UWS_20220730_0_L2A,2022-07-30,9_12,../prepared_dataset/20220730/9_12,POLYGON ((-127.69186317006282 50.8223025373486...,False,7
...,...,...,...,...,...,...,...,...,...
2142,../datasets/sentinel2/S2B_9UYR_20220730_0_L2A/...,sentinel2,S2B_9UYR_20220730_0_L2A,2022-07-30,3_4,../prepared_dataset/20220730/3_4,POLYGON ((-125.7607154787941 50.29994118390697...,True,7
2389,../datasets/sentinel2/S2A_9UYR_20220920_0_L2A/...,sentinel2,S2A_9UYR_20220920_0_L2A,2022-09-20,8_7,../prepared_dataset/20220920/8_7,POLYGON ((-125.46333176774317 49.9457832286288...,True,9
3544,../datasets/sentinel2/S2A_9UYR_20220801_0_L2A/...,sentinel2,S2A_9UYR_20220801_0_L2A,2022-08-01,3_4,../prepared_dataset/20220801/3_4,POLYGON ((-125.7607154787941 50.29994118390697...,True,8
3555,../datasets/sentinel2/S2A_9UYR_20220801_0_L2A/...,sentinel2,S2A_9UYR_20220801_0_L2A,2022-08-01,3_2,../prepared_dataset/20220801/3_2,POLYGON ((-125.97607529975805 50.3057477994724...,True,8


In [9]:
df_sentinel2.drop(columns=['month'], inplace=True)

### Spliting data into training(60%), test(20%) and validation(20%) sets.

In [10]:
def split_data(df):
    train_size = int(len(df) * 0.6)
    test_size = int(len(df) * 0.2)

    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    df['split'] = 'val'

    df.loc[:train_size, 'split'] = 'train'
    df.loc[train_size:train_size+test_size, 'split'] = 'test'

    return df

df_true = df_sentinel2[df_sentinel2['if_fire'] == True]
df_false = df_sentinel2[df_sentinel2['if_fire'] == False]

df_sentinel2 = pd.concat([split_data(df_true), split_data(df_false)])

In [11]:
# move column split to the front
cols = list(df_sentinel2.columns)
cols = [cols[-1]] + cols[:-1]
df_sentinel2 = df_sentinel2[cols]

In [12]:
df_sentinel2.to_csv('../prepared_dataset/dataset_splits.csv', index=False)

### Given a list of directories, get a list of files inside, and move to the selected directory

For now, I will use the shorter list - `subfolders_with_keyword_list`.

In [13]:
# def get_list_of_files_in_directory(directory_name: str, keyword: str = ".tif") -> list:
#     return [f"{directory_name}/{f}" for f in os.listdir(directory_name) if f.endswith(keyword)]

# test_get_list_of_files_in_directory = get_list_of_files_in_directory(subfolders_with_keyword_list[0])
# print(test_get_list_of_files_in_directory)