# 3. Combine Dataset

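Merge each validation (bloom) file with the monthly-aggregated weather data of the preceding year, write one cleaned CSV per input file, and combine all of them into a single dataset.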

In [1]:
import pandas as pd
import datetime as dt
import os

In [2]:
# Input directories containing the validation CSV files.
VALIDATION_PATHS = [
    "../data/interim/representative_locations/validation/",
    "../data/interim/main_locations/validation",
]

# Input directories containing the monthly-aggregated weather CSV files.
# Entry i is processed together with VALIDATION_PATHS[i].
CLEANED_PATHS = [
    "../data/interim/representative_locations/weather/monthly_agg/",
    "../data/interim/main_locations/weather/monthly_agg",
]

# Output directories; the trailing slash matters wherever a file name is
# concatenated directly onto these strings.
INDIV_OUT_PATH = "../data/cleaned/individual/"
COMBINED_OUT_PATH = "../data/cleaned/"

In [3]:
def clean_df(v_path, c_path):
    """Join one validation CSV with its weather CSV, offset by one year.

    Each validation row is matched with the weather rows whose ``year``
    equals the validation ``year`` minus one. The join is an inner join, so
    rows without a match on either side are dropped.

    Args:
        v_path: Path to the validation CSV. It must provide the columns
            ``location``, ``bloom_doy``, ``year``, ``lat``, ``long``,
            ``bloom_date`` and ``alt``; any other column is ignored.
        c_path: Path to the weather CSV. It must provide ``location``
            (discarded before the join) and ``year`` (the join key).

    Returns:
        A DataFrame holding the validation columns followed by the
        remaining weather columns. ``bloom_date`` is converted to datetime
        and ``year`` (second column) is re-derived from ``bloom_date``.
    """
    bloom_columns = ['location', 'bloom_doy', 'year', 'lat', 'long', 'bloom_date', 'alt']

    # Keep only the expected validation columns and replace 'year' with the
    # helper key 'pred_year', i.e. the year whose weather gets attached.
    blooms = (
        pd.read_csv(v_path)[bloom_columns]
        .assign(pred_year=lambda frame: frame['year'] - 1)
        .drop(columns=['year'])
    )

    # The weather file's own 'location' column is dropped; the one from the
    # validation file is the one that survives the merge.
    weather = pd.read_csv(c_path).drop(columns=["location"])

    merged = blooms.merge(weather, left_on='pred_year', right_on='year', how='inner')

    # Both join keys have served their purpose once the merge is done.
    merged = merged.drop(columns=["pred_year", 'year'])

    # Rebuild 'year' from the parsed bloom date and place it right after
    # the first column.
    merged['bloom_date'] = pd.to_datetime(merged['bloom_date'])
    merged.insert(1, 'year', merged['bloom_date'].dt.year)

    return merged

In [4]:
def combine_files(validation_path, cleaned_path):
    """Build one merged DataFrame per CSV file found in ``validation_path``.

    Every ``*.csv`` file in ``validation_path`` is paired with the file of
    the same name in ``cleaned_path``; the pair is merged by ``clean_df``.

    Args:
        validation_path: Directory containing the validation CSV files.
        cleaned_path: Directory containing the weather CSV files, named
            identically to their validation counterparts.

    Returns:
        A dict mapping each processed file name to its merged DataFrame,
        filled in sorted file-name order. The dict is empty when either
        directory does not exist or when no file could be paired.
    """
    # Report a missing directory and return an empty result (rather than
    # None) so callers can iterate over the return value unconditionally.
    for path in (validation_path, cleaned_path):
        if not os.path.exists(path):
            print(f"Path '{path}' does not exist.")
            return {}

    df_dict = {}

    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # result, and anything concatenated from it, reproducible between runs.
    for filename in sorted(os.listdir(validation_path)):
        if not filename.endswith('.csv'):
            continue

        v_file_path = os.path.join(validation_path, filename)
        c_file_path = os.path.join(cleaned_path, filename)

        # A validation file without a weather counterpart is reported and
        # skipped, mirroring how missing directories are handled above,
        # instead of aborting the whole run with FileNotFoundError.
        if not os.path.isfile(c_file_path):
            print(f"Path '{c_file_path}' does not exist.")
            continue

        df_dict[filename] = clean_df(v_file_path, c_file_path)

    return df_dict

In [5]:
# Collect every merged frame first and concatenate once at the end: this
# avoids re-copying the growing result on each iteration and does not seed
# the concatenation with an empty DataFrame.
frames = []

for validation_path, cleaned_path in zip(VALIDATION_PATHS, CLEANED_PATHS):
    # Process the raw data. A missing input directory may yield None, which
    # is treated as "nothing to add" instead of crashing the loop below.
    df_dict = combine_files(validation_path, cleaned_path) or {}

    # Output the aggregated files, one CSV per input file name
    for file_name, merged_df in df_dict.items():
        merged_df.to_csv(os.path.join(INDIV_OUT_PATH, file_name), index=False)
        frames.append(merged_df)

# ignore_index=True already produces a fresh 0..n-1 index, so no separate
# reset_index() is needed. pd.concat() rejects an empty list, hence the guard.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [6]:
# Impute missing values: each numeric column is filled with its own mean,
# computed over the whole combined dataset.
feature_means = combined_df.mean(numeric_only=True)
combined_df = combined_df.fillna(feature_means)

In [7]:
# Persist the combined dataset as a single CSV, without the index column.
combined_csv_path = f"{COMBINED_OUT_PATH}combined.csv"
combined_df.to_csv(combined_csv_path, index=False)

### Create a map to show locations represented

In [8]:
import folium

mymap = folium.Map(zoom_start=5, tiles='OpenStreetMap')

# combined_df can hold several rows with the same coordinates (the merge
# produces a row per matched year), so reduce it to the distinct lat/long
# pairs first: one marker per place instead of a stack of identical ones.
# Rows lacking a coordinate are left out because they cannot be placed.
unique_coords = combined_df[['lat', 'long']].dropna().drop_duplicates()

# Add a marker for each distinct location in the dataset
for lat, lon in unique_coords.itertuples(index=False, name=None):
    folium.Marker(location=[lat, lon]).add_to(mymap)

# Save the map to an HTML file
mymap.save('../figures/included_locations.html')