# Fake Data Natural Simulator

## Anomaly Maker

The introduce_anomalies function: This function takes two parameters: file_path, which is the path to the CSV file containing the groundwater time series data, and num_sites_to_consider, which is the number of unique sites to consider for introducing anomalies.

Load the dataset: The function reads the CSV file specified by file_path into a pandas DataFrame df.

Count unique sites: The function calculates the total number of unique sites in the dataset and prints it.

Check the number of sites to consider: It checks if the num_sites_to_consider is greater than the total number of unique sites. If so, it prints a message indicating that the number of sites to consider is too high, and the function returns.

Random site selection: If the num_sites_to_consider is valid, the function randomly selects num_sites_to_consider sites from the unique sites.

Data modification loop: The function iterates through the selected sites. For each site, it filters the data related to that site from the DataFrame. Then, it randomly selects 25 data points and multiplies their 'level' values by a random multiplier between 2 and 6.

Helper function: There is a helper function called has_max_modifications_for_year to keep track of how many data points have been modified for a given year.

Display modified data: The modified data is printed on the console, showing the changes made to the original data.

Save modified data: The modified data is saved to a new CSV file named "modified_groundwater_timeseries_data.csv".

__name__ == "__main__" block: This block is executed only when the script is run directly (not imported as a module). It prompts the user to input the number of sites to consider for introducing anomalies and calls the introduce_anomalies function with the provided input.

In [48]:
import pandas as pd
import random

def introduce_anomalies(file_path, num_sites_to_consider):
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Count the number of unique sites
    unique_sites = df['site'].unique()
    num_unique_sites = len(unique_sites)
    print(f"Total unique sites: {num_unique_sites}")

    if num_sites_to_consider > num_unique_sites:
        print("The number of sites to consider is greater than the total unique sites.")
        return

    # Randomly select sites to consider
    selected_sites = random.sample(unique_sites.tolist(), num_sites_to_consider)

    # Create a DataFrame to store the modified data
    modified_data = df.copy()

    # Helper function to check if there are already 3 modified data points for a year
    def has_max_modifications_for_year(year, modified_years):
        count = modified_years.count(year)
        return count >= 3

    modified_years = []

    for site in selected_sites:
        # Filter data for the selected site
        site_data = df[df['site'] == site]

        # Select 25 random level values and multiply them by a random number between 2 and 6
        random_indices = random.sample(range(len(site_data)), 25)
        for idx in random_indices:
            year = pd.to_datetime(site_data.iloc[idx]['date']).year
            if not has_max_modifications_for_year(year, modified_years):
                random_multiplier = random.uniform(2, 6)
                modified_data.loc[site_data.index[idx], 'level'] *= random_multiplier
                modified_years.append(year)
                print(f"Modified: Site - {site}, Date - {site_data.iloc[idx]['date']}")

    # # Display the modified data
    # modified_data = modified_data.round(2)
    # print("\nModified Data:")
    # print(modified_data[modified_data != df].dropna(how='all'))

    # Save the modified data to a new CSV file
    modified_data.to_csv("modified_groundwater_timeseries_data.csv", index=False)
    print("\nModified data has been saved to 'modified_groundwater_timeseries_data.csv'.")

if __name__ == "__main__":
    file_path = "groundwater_timeseries_data_Negative.csv"
    num_sites_to_consider = int(input("Enter the number of sites to consider for introducing anomalies: "))
    introduce_anomalies(file_path, num_sites_to_consider)


Enter the number of sites to consider for introducing anomalies:  255


Total unique sites: 3200
Modified: Site - Site_1307, Date - 2009-10-01
Modified: Site - Site_1307, Date - 2004-12-01
Modified: Site - Site_1307, Date - 2009-11-01
Modified: Site - Site_1307, Date - 1986-10-01
Modified: Site - Site_1307, Date - 1990-11-01
Modified: Site - Site_1307, Date - 1998-11-01
Modified: Site - Site_1307, Date - 2016-07-01
Modified: Site - Site_1307, Date - 2018-11-01
Modified: Site - Site_1307, Date - 2004-01-01
Modified: Site - Site_1307, Date - 1997-02-01
Modified: Site - Site_1307, Date - 2019-06-01
Modified: Site - Site_1307, Date - 2017-03-01
Modified: Site - Site_1307, Date - 2006-07-01
Modified: Site - Site_1307, Date - 1994-08-01
Modified: Site - Site_1307, Date - 2010-10-01
Modified: Site - Site_1307, Date - 2008-12-01
Modified: Site - Site_1307, Date - 2020-11-01
Modified: Site - Site_1307, Date - 1999-10-01
Modified: Site - Site_1307, Date - 2020-10-01
Modified: Site - Site_1307, Date - 2005-07-01
Modified: Site - Site_1307, Date - 2005-09-01
Modified:

## Gap Maker

In [50]:
import pandas as pd
import random

def delete_random_months(df, num_years):
    # Helper function to get random months from a year
    def get_random_months():
        return random.sample(range(1, 13), random.randint(1, 3))

    selected_years = random.sample(range(1985, 2023), num_years)

    for site in df['site'].unique():
        site_data = df[df['site'] == site]
        for year in selected_years:
            months_to_delete = get_random_months()
            for month in months_to_delete:
                idx_to_delete = site_data[(site_data['date'].dt.year == year) & (site_data['date'].dt.month == month)].index
                df.loc[idx_to_delete, 'level'] = None
                print(f"Deleted level values for Site: {site}, Year: {year}, Month: {month}")

    return df

def main():
    # Load the CSV file into a DataFrame
    file_path = "modified_groundwater_timeseries_data.csv"
    df = pd.read_csv(file_path, parse_dates=['date'])

    # Count the number of unique sites
    unique_sites = df['site'].unique()
    num_unique_sites = len(unique_sites)
    print(f"Total unique sites: {num_unique_sites}")

    # Select 15% of unique sites randomly
    num_sites_to_select = int(num_unique_sites * 0.01)
    selected_sites = random.sample(unique_sites.tolist(), num_sites_to_select)

    # Select 4 random years from each of the selected sites
    num_years_per_site = 4
    df_modified = delete_random_months(df[df['site'].isin(selected_sites)], num_years_per_site)

    # Create a new DataFrame for the modified data
    modified_data = df.copy()
    modified_data.loc[df_modified.index, 'level'] = df_modified['level']

    # Save the modified data to a new CSV file
    modified_data.to_csv("MGA_GWD.csv", index=False)
    print("\nModified data has been saved to 'MGA_GWD.csv'.")

if __name__ == "__main__":
    main()


Total unique sites: 3200
Deleted level values for Site: Site_31, Year: 2022, Month: 5
Deleted level values for Site: Site_31, Year: 2022, Month: 9
Deleted level values for Site: Site_31, Year: 2009, Month: 6
Deleted level values for Site: Site_31, Year: 2009, Month: 3
Deleted level values for Site: Site_31, Year: 2009, Month: 9
Deleted level values for Site: Site_31, Year: 1990, Month: 5
Deleted level values for Site: Site_31, Year: 1989, Month: 9
Deleted level values for Site: Site_31, Year: 1989, Month: 2
Deleted level values for Site: Site_31, Year: 1989, Month: 6
Deleted level values for Site: Site_99, Year: 2022, Month: 8
Deleted level values for Site: Site_99, Year: 2009, Month: 8
Deleted level values for Site: Site_99, Year: 1990, Month: 7
Deleted level values for Site: Site_99, Year: 1990, Month: 12
Deleted level values for Site: Site_99, Year: 1989, Month: 6
Deleted level values for Site: Site_99, Year: 1989, Month: 9
Deleted level values for Site: Site_99, Year: 1989, Month: 

## Checker

In [51]:
import pandas as pd

def find_missing_values(df):
    missing_values = df[df['level'].isnull()]
    print(f"\nTotal missing level values: {len(missing_values)}")
    if not missing_values.empty:
        print("\nMissing level values:")
        print(missing_values[['site', 'date']])

def find_anomalies(df):
    def is_anomaly(series):
        # Assuming anomalies are values below a certain threshold (e.g., -100)
        threshold = -100
        return series < threshold

    anomalies = df.groupby('site')['level'].apply(is_anomaly)
    anomaly_dates = df[anomalies].dropna()
    print("\nAnomaly dates and level values:")
    print(anomaly_dates[['site', 'date', 'level']])

def main():
    # Load the CSV file into a DataFrame
    file_path = "MGA_GWD.csv"
    df = pd.read_csv(file_path, parse_dates=['date'])

    # Find missing values
    find_missing_values(df)

    # Find anomalies
    find_anomalies(df)

if __name__ == "__main__":
    main()



Total missing level values: 254

Missing level values:
              site       date
13729      Site_31 1989-02-01
13733      Site_31 1989-06-01
13736      Site_31 1989-09-01
13744      Site_31 1990-05-01
13970      Site_31 2009-03-01
...            ...        ...
1437381  Site_3153 1990-10-01
1437606  Site_3153 2009-07-01
1437758  Site_3153 2022-03-01
1437760  Site_3153 2022-05-01
1437766  Site_3153 2022-11-01

[254 rows x 2 columns]


  anomaly_dates = df[anomalies].dropna()


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).