In [5]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
from scipy.interpolate import LinearNDInterpolator
import plotly.express as px
import joblib
import matplotlib.pyplot as plt

In [7]:
# Goes from collocated MSS files to monthly observations of MSS anomaly gridded to 1x1.
# NOTE: this cell calls calculate_grided_mss_anomaly_for_each_day_df_GDT and
# group_dataframe_temporally, which are defined in the helper cell further down;
# that cell has to be executed first on a fresh kernel.
directory = "C:/Users/syversk/Desktop/mss_collocated"
# os.listdir returns entries in arbitrary order. The month-boundary test below
# compares each file name with its neighbour, so the names must be sorted for
# files sharing the same 7-character prefix to be processed back to back.
files = sorted(os.listdir(directory))
df_list = []
for i in tqdm(range(len(files))):
    df = pd.read_csv(directory + "/" + files[i])
    if "Var5" in df:
        df = df.drop(['Var5'], axis=1)
    df = df.dropna()
    df = calculate_grided_mss_anomaly_for_each_day_df_GDT(df)
    df_list.append(df)
    # The first seven characters of the file name are used as the month key
    # (presumably a year-month prefix -- confirm against the file naming).
    # Write the accumulated month when the next file starts a new key, or when
    # this is the last file.
    is_last_file = i == len(files) - 1
    if is_last_file or files[i][0:7] != files[i + 1][0:7]:
        df = pd.concat(df_list)
        df = group_dataframe_temporally(df)
        df.to_csv("C:/Users/syversk/Desktop/GBDT/" + files[i][0:7] + ".csv", index=False)
        df_list = []

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1681/1681 [1:11:06<00:00,  2.54s/it]


In [6]:
def init_df(df, file_name=None):
    """Derive the ``delta`` column and write the full and region-reduced CSVs.

    The frame is passed through ``remove_extra`` and written to the
    ``mss_collocated`` folder. The ``"cr1"`` and ``"cr2"`` subsets returned by
    ``reduce_area_of_df`` are then written to the ``cr1`` and ``cr2`` folders.
    All three files share the same file name.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain the columns read by ``remove_extra`` and
        ``reduce_area_of_df``.
    file_name : str, optional
        Name of the CSV file to write in each output folder. When omitted, the
        notebook-level global ``file`` is used, which is how this function
        originally obtained the name.

    Returns
    -------
    None
    """
    if file_name is None:
        # Legacy behaviour: rely on a global named `file` set by the calling
        # cell. Prefer passing file_name explicitly to avoid hidden state.
        file_name = file
    df = remove_extra(df)
    df.to_csv("C:/Users/syversk/Desktop/mss_collocated/" + file_name, index=False)
    df_cr1 = reduce_area_of_df(df, "cr1")
    df_cr1.to_csv("C:/Users/syversk/Desktop/cr1/" + file_name, index=False)
    df_cr2 = reduce_area_of_df(df, "cr2")
    df_cr2.to_csv("C:/Users/syversk/Desktop/cr2/" + file_name, index=False)
    
def calculate_grided_mss_anomaly_for_each_day_df(df):
    """Compute MSS anomalies with the parametric model and average them per grid cell.

    Rows are kept only when ``3 < era_wind < 11``. For the remaining rows the
    anomaly of ``mss`` is computed with ``calculate_mss_anomaly`` (using
    ``self_trained=False``), once against ``era_wind`` and once against
    ``delta``. Coordinates are then rounded by ``make_data_grided`` and the two
    anomaly columns are averaged per ``(lon, lat)`` by
    ``group_dataframe_temporally``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``era_wind``, ``delta``, ``mss``, ``lon`` and ``lat``.

    Returns
    -------
    pandas.DataFrame
        Columns ``lon``, ``lat``, ``mss_anomaly_delta``, ``mss_anomaly_wind``.
    """
    in_range = (df['era_wind'] > 3) & (df['era_wind'] < 11)
    # Work on an independent copy: assigning columns to a filtered slice
    # triggers pandas' chained-assignment warning and may not stick.
    df = df[in_range].copy()

    # Iterate over the two needed columns directly instead of df.apply(axis=1).
    # This avoids building a Series per row and keeps working when the filter
    # leaves no rows (row-wise apply on an empty frame returns a DataFrame,
    # which cannot be assigned to a single column).
    for source_col, target_col in (('era_wind', 'mss_anomaly_wind'),
                                   ('delta', 'mss_anomaly_delta')):
        anomalies = [
            calculate_mss_anomaly(mss, predictor, self_trained=False)
            for mss, predictor in zip(df['mss'], df[source_col])
        ]
        df[target_col] = pd.Series(anomalies, index=df.index, dtype=float)

    df = make_data_grided(df)
    df = group_dataframe_temporally(df)
    return df

# Artifacts loaded with joblib, keyed by path, so that calling the function
# once per input file does not re-read the same model from disk every time.
# Re-running this cell resets the cache (needed if a file on disk is replaced).
_GDT_ARTIFACT_CACHE = {}


def _load_gdt_artifact(path):
    """Load a joblib artifact once per kernel session and reuse it afterwards."""
    if path not in _GDT_ARTIFACT_CACHE:
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load artifacts from a trusted source.
        _GDT_ARTIFACT_CACHE[path] = joblib.load(path)
    return _GDT_ARTIFACT_CACHE[path]


def calculate_grided_mss_anomaly_for_each_day_df_GDT(df, use_scaler=False, model=None, scaler=None):
    """Compute MSS anomalies with a stored regression model and average them per grid cell.

    Rows are kept only when ``3 < era_wind < 11``. The model predicts an MSS
    value from ``era_wind`` and, separately, from ``delta`` (each passed as a
    single-feature column). The anomaly is ``(mss - predicted) / predicted``.
    Coordinates are then rounded by ``make_data_grided`` and both anomaly
    columns are averaged per ``(lon, lat)`` by ``group_dataframe_temporally``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``era_wind``, ``delta``, ``mss``, ``lon`` and ``lat``.
    use_scaler : bool, default False
        When True, inputs are passed through ``scaler.transform`` before
        prediction and the default model file is ``model_full_data.pkl``;
        otherwise the default model file is ``model_full_dataFalse.pkl``.
    model : object, optional
        Object exposing ``predict``. Loaded (and cached) from the default file
        for the chosen mode when omitted.
    scaler : object, optional
        Object exposing ``transform``; only used when ``use_scaler`` is True.
        Loaded (and cached) from ``std_scaler.bin`` when omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``lon``, ``lat``, ``mss_anomaly_delta``, ``mss_anomaly_wind``.
    """
    in_range = (df['era_wind'] > 3) & (df['era_wind'] < 11)
    # Independent copy so the column assignments below do not write to a
    # slice of the caller's frame (pandas chained-assignment warning).
    df = df[in_range].copy()

    # The model is called with 2-D input of shape (n_rows, 1).
    wind = df["era_wind"].to_numpy().reshape(-1, 1)
    delta = df["delta"].to_numpy().reshape(-1, 1)

    if use_scaler:
        if model is None:
            model = _load_gdt_artifact("model_full_data.pkl")
        if scaler is None:
            scaler = _load_gdt_artifact('std_scaler.bin')
        # The same scaler is applied to both predictors, as in the original code.
        mss_wind = model.predict(scaler.transform(wind))
        mss_delta = model.predict(scaler.transform(delta))
    else:
        if model is None:
            model = _load_gdt_artifact("model_full_dataFalse.pkl")
        mss_wind = model.predict(wind)
        mss_delta = model.predict(delta)

    df['mss_anomaly_wind'] = (df['mss'] - mss_wind) / mss_wind
    df['mss_anomaly_delta'] = (df['mss'] - mss_delta) / mss_delta
    df = make_data_grided(df)
    df = group_dataframe_temporally(df)
    return df

def calculate_mss_anomaly(mss, wind_or_delta, self_trained = False):
    """Relative deviation of ``mss`` from a piecewise model of the predictor.

    The modelled value is linear in ``wind_or_delta`` up to and including 3.49
    and logarithmic above it. The anomaly is ``(mss - modelled) / modelled``.

    Parameters
    ----------
    mss : float
        Observed value.
    wind_or_delta : float
        Scalar predictor fed to the piecewise model.
    self_trained : bool, default False
        Selects which of the two hard-coded coefficient sets is used.

    Returns
    -------
    float
        The relative anomaly.
    """
    coefficients = (
        [0.00370243, 1.07334119, 1.33996524, 5.66160183, -1.01960507]
        if self_trained
        else [0.0035, 1, 0.62, 6, -3.39]
    )
    scale, lin_slope, lin_offset, log_slope, log_offset = coefficients

    if wind_or_delta <= 3.49:
        # Linear regime.
        modelled = scale * (lin_slope * wind_or_delta + lin_offset)
    else:
        # Logarithmic regime.
        modelled = scale * (log_slope * np.log(wind_or_delta) + log_offset)

    return (mss - modelled) / modelled

def reduce_area_of_df(df, cr):
    """Keep only the rows whose ``lat``/``lon`` fall inside a fixed box.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``lat`` and ``lon`` columns.
    cr : str
        ``"cr1"`` selects lat in [-25, -10] and lon in [105, 120]. Any other
        value selects lat in [10, 20] and lon in [128, 143].

    Returns
    -------
    pandas.DataFrame
        The rows inside the box (bounds inclusive), original index preserved.
    """
    if cr == "cr1":
        lat_min, lat_max, lon_min, lon_max = -25, -10, 105, 120
    else:
        lat_min, lat_max, lon_min, lon_max = 10, 20, 128, 143

    inside_box = (
        (df.lat >= lat_min) & (df.lat <= lat_max)
        & (df.lon >= lon_min) & (df.lon <= lon_max)
    )
    return df[inside_box]

def remove_extra(df):
    """Replace the wind/current component columns with a single ``delta`` column.

    ``delta`` is the magnitude of the vector difference between
    ``(era_u10, era_v10)`` and ``(oscar_u, oscar_v)``. The four component
    columns and ``oscar_current`` are dropped.

    The frame is modified in place and the same object is returned.
    """
    diff_u = df["era_u10"] - df["oscar_u"]
    diff_v = df["era_v10"] - df["oscar_v"]
    # Compute the magnitude before the component columns are dropped.
    magnitude = np.sqrt(diff_u**2 + diff_v**2)

    obsolete = ['era_u10', "oscar_u", "era_v10", "oscar_v", "oscar_current"]
    df.drop(columns=obsolete, inplace=True)
    df["delta"] = magnitude
    return df

# Snap coordinates to whole degrees (1x1 grid); the averaging itself is done by the groupby helpers below
def make_data_grided(df):
    """Round ``lon`` and ``lat`` to integers using Python's built-in ``round``.

    The frame is modified in place and the same object is returned. Exact
    halves follow ``round``'s round-half-to-even rule (2.5 -> 2, 3.5 -> 4).
    """
    for coordinate in ("lon", "lat"):
        df[coordinate] = df[coordinate].apply(round)
    return df

def interp_microplastics(df, micro_df=None):
    """Add linearly interpolated microplastic columns to ``df``.

    Two ``LinearNDInterpolator`` objects are built on the ``(lon, lat)`` points
    of the reference table, one for ``vansebillemodel_mass_log`` and one for
    ``vansebillemodel_abundance_log``. Both are evaluated at the ``lon``/``lat``
    of every row in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lon`` and ``lat``. Modified in place.
    micro_df : pandas.DataFrame, optional
        Reference table with ``lon``, ``lat``, ``vansebillemodel_mass_log`` and
        ``vansebillemodel_abundance_log``. When omitted it is read from
        ``micro_df.csv`` in the hard-coded Desktop directory.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the columns ``micro_mass`` and ``abundace`` added.
        Rows outside the convex hull of the reference points get NaN, which is
        ``LinearNDInterpolator``'s default fill value.
    """
    if micro_df is None:
        directory = "C:/Users/syversk/Desktop"
        micro_df = pd.read_csv(directory + "/" + "micro_df.csv")

    # The original code read the CSV into `micro_df` but then used an undefined
    # `mic_df`; every access below goes through `micro_df`.
    reference_points = list(zip(micro_df['lon'], micro_df['lat']))
    interp_micro_mass = LinearNDInterpolator(
        reference_points, micro_df['vansebillemodel_mass_log'].to_numpy())
    interp_micro_abu = LinearNDInterpolator(
        reference_points, micro_df['vansebillemodel_abundance_log'].to_numpy())

    query_lon = df['lon'].to_numpy()
    query_lat = df['lat'].to_numpy()
    df['micro_mass'] = interp_micro_mass(query_lon, query_lat)
    # NOTE(review): 'abundace' looks like a typo for 'abundance'; the column
    # name is kept as-is because downstream code may already rely on it.
    df['abundace'] = interp_micro_abu(query_lon, query_lat)
    return df

#Average all temporally
def group_dataframe_temporally(df):
    """Average both anomaly columns over all rows that share a ``(lon, lat)`` pair.

    Returns a new frame with the columns ``lon``, ``lat``,
    ``mss_anomaly_delta`` and ``mss_anomaly_wind``; every other column is dropped.
    """
    cell_keys = ['lon', 'lat']
    anomaly_columns = ['mss_anomaly_delta', 'mss_anomaly_wind']
    per_cell = df.groupby(cell_keys, as_index=False)
    return per_cell[anomaly_columns].mean()

def hours_to_date(d0, hours):
    """Return ``d0`` shifted by ``hours`` hours (``d0 + timedelta(hours=hours)``)."""
    offset = timedelta(hours=hours)
    return d0 + offset

#Average temporally to dates
def group_dataframe(df):
    """Average both anomaly columns per ``(lon, lat, calendar date)``.

    The ``time`` column is interpreted as hours counted from 1992-10-05 and is
    converted to ``datetime.date`` values through ``hours_to_date``. Adding a
    ``timedelta`` to a ``date`` keeps only whole days, so all rows of the same
    day end up in the same group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lon``, ``lat``, ``time``, ``mss_anomaly_delta`` and
        ``mss_anomaly_wind``. Its ``time`` column is overwritten in place with
        the converted dates.

    Returns
    -------
    pandas.DataFrame
        Columns ``lon``, ``lat``, ``date``, ``mss_anomaly_delta``,
        ``mss_anomaly_wind``.
    """
    # `date` is never imported in this notebook (only `datetime` and
    # `timedelta` are), so the original `date(1992, 10, 5)` raised NameError.
    # Build the same date object from the imported `datetime` class instead.
    d0 = datetime(1992, 10, 5).date()
    df["time"] = df["time"].apply(lambda time: hours_to_date(d0, time))
    df = df.groupby(['lon', 'lat', "time"], as_index=False)[['mss_anomaly_delta', 'mss_anomaly_wind']].mean()
    df = df.rename(columns={"time": "date"})
    return df

def reduce_based_on_sd(df):
    """Drop rows where any column lies three or more standard deviations from its mean.

    Z-scores are computed column-wise with ``scipy.stats.zscore``. A row is
    kept only if the absolute z-score is below 3 in every column.
    """
    within_three_sd = np.abs(stats.zscore(df)) < 3
    return df[within_three_sd.all(axis=1)]