# In [3]:
import pandas as pd
import datetime
from datetime import datetime
from datetime import timedelta

# When comparing dates between data sets, try these day offsets in order:
# the exact day first, then one day before/after, two days before/after, ...
# up to 39 days. Generating the list keeps it strictly nearest-first (the
# hand-written version tried +/-20 after +/-29 and +/-30 after +/-39).
days_delta_range = [0] + [sign * days for days in range(1, 40) for sign in (-1, 1)]
# If a dataset has fewer rows than this number, we will check its adjacent counties
optimial_dataset_size = 10

def filter_dataframe_by_value(df, column_name, value_to_find):
    """Return the rows of ``df`` whose ``column_name`` equals ``value_to_find``.

    The original row labels are kept; an empty frame is returned when nothing
    matches.
    """
    matches = df[column_name] == value_to_find
    return df.loc[matches]

def load_csv(target):
    """Read the CSV named by ``target['file_name']`` into a DataFrame.

    Prints the file name together with the shape of the loaded data and
    returns the frame.
    """
    csv_path = target['file_name']
    frame = pd.read_csv(csv_path)
    print(f"The file {csv_path} contains data {frame.shape}")
    return frame

def save_csv(target, target_dataset):
    """Write ``target_dataset`` back to ``target['file_name']``.

    Columns whose name matches the regex "Unname" (the index columns left
    over from an earlier save) are dropped from ``target_dataset`` in place
    before the frame is written, index included.
    """
    destination = target['file_name']
    leftover_index_columns = target_dataset.filter(regex="Unname").columns
    target_dataset.drop(columns=leftover_index_columns, inplace=True)
    target_dataset.to_csv(destination)
    print(f"Saved to {destination}")

def merge_dataframes_on_near_date(target_df, target,
                                  referral_df, referral,
                                  days_delta=None):
    """Copy referral values into ``target_df`` from the nearest referral date.

    For every row of ``target_df`` the day offsets are tried in order; the
    first offset whose date exists in ``referral_df`` wins and each column in
    ``referral['data_fields_to_be_copied']`` is copied from that referral row
    into the target row. Rows without any date inside the window are left
    untouched. ``target_df`` is modified in place.

    Args:
        target_df: DataFrame that receives the values.
        target: dict with 'date_column_name' and 'datetime_format' (strptime
            format of the target dates).
        referral_df: DataFrame that provides the values.
        referral: dict with 'date_column_name', 'datetime_format' (strftime
            format of the referral dates) and 'data_fields_to_be_copied'.
        days_delta: optional iterable of day offsets to try, in order.
            Defaults to the module-level ``days_delta_range``.

    Returns:
        The number of target rows that received referral data.
    """
    target_date_column = target['date_column_name']
    target_format = target['datetime_format']
    referral_date_column = referral['date_column_name']
    referral_format = referral['datetime_format']
    fields_to_copy = referral["data_fields_to_be_copied"]
    offsets = days_delta_range if days_delta is None else days_delta

    # The target column name might be different from the referral's.
    # Hard-coded for now.
    target_column_names = {"percent": "fmc"}

    # Dates stored as integers (e.g. 20200131) must be compared as integers.
    # is_integer_dtype covers every integer width on every platform, unlike a
    # comparison of the dtype with the string 'int'.
    dates_are_int = pd.api.types.is_integer_dtype(referral_df[referral_date_column])

    # One row per referral date (the first one in file order), indexed by the
    # date, so each candidate day is a lookup instead of a scan of the frame.
    # drop=False keeps the date column available in case it is copied too.
    first_row_by_date = referral_df.drop_duplicates(
        subset=referral_date_column, keep="first"
    ).set_index(referral_date_column, drop=False)

    merged_rows = 0
    # Iterate the date column itself: iterrows() would upcast an integer date
    # to float in an all-numeric frame and break the strptime() parsing.
    for row_label, raw_date in target_df[target_date_column].items():
        target_datetime = datetime.strptime(str(raw_date), target_format)

        for offset in offsets:
            candidate = (target_datetime + timedelta(days=offset)).strftime(referral_format)
            if dates_are_int:
                candidate = int(candidate)
            if candidate not in first_row_by_date.index:
                continue

            print(f"Add data to the original {target_datetime} <- {candidate}  delta days {offset}: ")
            for field in fields_to_copy:
                target_column_name = target_column_names.get(field, field)
                target_df.at[row_label, target_column_name] = first_row_by_date.at[candidate, field]
                print(f"{field} = {target_df.at[row_label, target_column_name]}")

            # Finished for this date: the nearest match has been used.
            merged_rows += 1
            break

    return merged_rows

def get_datasize_by_fips(referral_dataset, referral, county_fips_code):
    """Count the rows of ``referral_dataset`` that belong to one county.

    The county is matched on the column named by
    ``referral['fips_column_name']``. The count is printed together with
    ``referral['file_name']`` and returned.
    """
    fips_values = referral_dataset[referral['fips_column_name']]
    referral_count = len(referral_dataset[fips_values == county_fips_code])
    print(f"The county {county_fips_code} has {referral_count} rows of data in the {referral['file_name']}")  
    return referral_count

def count_adjacent_county(referral_dataset, referral,
                          adjacency_file="Datasets/county_adjacency.csv"):
    """Find the adjacent county with the most rows in the referral dataset.

    The adjacency CSV is read as blocks of rows: the row whose 'fips' equals
    ``referral['fips_code']`` opens the county's block, and the following rows
    with an empty 'county' cell continue it. Each row names one adjacent
    county in 'adjacent_county' and its FIPS code in the fourth column.

    When at least one adjacent county has data, ``referral`` is updated in
    place: 'referral_county_name' and 'fips_code' are replaced by those of the
    adjacent county with the most rows.

    Args:
        referral_dataset: DataFrame to count rows in.
        referral: dict with 'fips_code', 'fips_column_name', 'file_name' and
            'referral_county_name'.
        adjacency_file: path of the county adjacency CSV.

    Returns:
        The row count of the best adjacent county, or 0 when the county is
        not listed or none of its adjacent counties has data.
    """
    adjacency = pd.read_csv(adjacency_file)

    county_rows = (adjacency['fips'] == referral['fips_code']).to_numpy()
    if not county_rows.any():
        # The county is not listed in the adjacency file.
        return 0
    first_position = int(county_rows.argmax())

    max_count = 0
    max_county_name = ""
    max_county_fips = 0
    # NOTE(review): the county's own row is scanned as well, on the assumption
    # that it already carries the first adjacent county (as in the Census
    # county adjacency layout) - confirm against the CSV. Rows with no
    # adjacent county are skipped, so a layout without it is handled too.
    for position in range(first_position, len(adjacency)):
        row = adjacency.iloc[position]
        # A later row that names a county opens the next county's block.
        if position > first_position and not pd.isna(row['county']):
            break

        adjacent_county = row['adjacent_county']
        # Positional access must go through .iloc; row[3] would be a label lookup.
        adjacent_county_fips = row.iloc[3]  # 'adjacent_county_fips'
        if pd.isna(adjacent_county) or pd.isna(adjacent_county_fips):
            continue

        # Keep only the text before the first ', ' as the county name.
        adjacent_county = adjacent_county.split(', ')[0]
        referral_count = get_datasize_by_fips(referral_dataset, referral, adjacent_county_fips)
        if referral_count > max_count:
            max_count = referral_count
            max_county_name = adjacent_county
            max_county_fips = adjacent_county_fips

    # If we have an adjacent county with data, it replaces the referral county
    if max_count > 0:
        print(f"*** Using the adjacent county {max_county_name}, {max_county_fips} of {referral['referral_county_name']}. It has {max_count} rows of data in the {referral['file_name']}")  
        referral['referral_county_name'] = max_county_name
        referral['fips_code'] = max_county_fips
    # 0 means no adjacent county has any data
    return max_count

def merge_data_referral_to_target(target, referral):
    """Merge data from a referral dataset into a target dataset on disk.

    Loads both CSV files, picks the county to take referral data from, copies
    the referral values onto the nearest target dates and saves the target
    back to its original file.

    The requested county (``referral['fips_code']``) is used when it has at
    least ``optimial_dataset_size`` rows. Otherwise its adjacent counties are
    checked and the best one is used only if it has strictly more rows than
    the requested county; ``referral`` is updated in place in that case.

    Args:
        target: dict describing the target CSV ('file_name',
            'date_column_name', 'datetime_format').
        referral: dict describing the referral CSV ('file_name', 'fips_code',
            'fips_column_name', 'date_column_name', 'datetime_format',
            'data_fields_to_be_copied', 'referral_county_name').

    Returns:
        The number of rows in the saved target dataset, or 0 when neither the
        county nor its adjacent counties have any referral data (nothing is
        saved in that case).
    """
    # Load the target dataset
    target_dataset = load_csv(target)

    # Load the referral dataset
    referral_dataset = load_csv(referral)

    # Check how much data the requested county has in the referral dataset
    own_count = get_datasize_by_fips(referral_dataset, referral, referral['fips_code'])
    if own_count < optimial_dataset_size:
        # count_adjacent_county() overwrites these keys when it finds data,
        # so remember the requested county to be able to fall back to it.
        requested_county = {key: referral[key]
                            for key in ('fips_code', 'referral_county_name')
                            if key in referral}
        adjacent_count = count_adjacent_county(referral_dataset, referral)
        if adjacent_count <= own_count:
            # No adjacent county is better than the requested one.
            referral.update(requested_county)
            if own_count <= 0:
                # No data anywhere: give up
                print("!!! ### !!! Neither the given county nor its adjcent counties exist in the referral dataset. Exits the program.")
                return 0
            print(f"Keeping the county {referral['fips_code']}: no adjacent county has more than its {own_count} rows of data")

    # Create a new data frame containing only the rows of the selected county
    referral_dataset_fips_only = filter_dataframe_by_value(referral_dataset, referral['fips_column_name'], referral['fips_code'])
    print(f"referral_dataset with fips {referral['fips_code']} = {referral_dataset_fips_only.shape}")

    # Add the new data from the referral dataset to the target dataset
    merge_dataframes_on_near_date(target_dataset, target,
                                  referral_dataset_fips_only, referral)

    # Save the data to its original file
    save_csv(target, target_dataset)

    return target_dataset.shape[0]

def generate_csv_for_fips(target):
    """Create the per-county CSV that later merges fill in.

    Reads "Datasets/aggregated_wildfire.csv", keeps the rows whose 'FIPS'
    equals ``target["fips"]``, adds the empty feature columns tmax, tmin,
    tavg, prcp, aws and fmc, and saves the result to ``target['file_name']``.
    """
    fire_data = pd.read_csv("Datasets/aggregated_wildfire.csv")
    # Keep only the rows of the requested county
    firedata_fips = filter_dataframe_by_value(fire_data, 'FIPS', target["fips"])
    # New columns for the merged features. assign() returns a new frame, so
    # the columns are not set on a slice of fire_data (which pandas reports
    # as SettingWithCopyWarning).
    firedata_fips = firedata_fips.assign(
        tmax=pd.NA,
        tmin=pd.NA,
        tavg=pd.NA,
        prcp=pd.NA,
        aws=pd.NA,
        fmc=pd.NA,
    )
    save_csv(target, firedata_fips)

def drop_ana(target):
    """Reload the target CSV, discard every row with a missing value, and save it back."""
    complete_rows = load_csv(target).dropna()
    save_csv(target, complete_rows)