In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
from scipy.interpolate import LinearNDInterpolator
import plotly.express as px
import joblib
import matplotlib.pyplot as plt
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from os import path

In [4]:
# Goes from collocated MSS files to monthly observations of MSS anomaly,
# gridded to 1x1 degree cells.
#
# NOTE: this cell calls calculate_grided_mss_anomaly_for_each_day_df and
# group_dataframe_temporally, which are defined in a cell further down in this
# notebook. That cell must be executed before this one.
#
# Each input file is reduced to per-cell means first, and the monthly value is
# then the mean of those per-file cell means (not weighted by sample count).
directory = "C:/Users/syversk/Desktop/mss_v3.0/"
collocated_dir = directory + "mss_collocated_v3.0_new"
monthly_dir = directory + "monthly_mss_ano/"
os.makedirs(monthly_dir, exist_ok=True)

# os.listdir returns names in arbitrary order. The month check below compares
# the 7-character prefix of neighbouring file names, so the list has to be
# sorted for all files sharing a prefix to be processed back to back.
files = sorted(os.listdir(collocated_dir))
df_list = []
for i in tqdm(range(len(files))):
    df = pd.read_csv(collocated_dir + "/" + files[i])
    # Keep only rows with 3 < era_wind < 11.
    df = df[(df['era_wind'] > 3) & (df['era_wind'] < 11)]
    if "Var5" in df:
        print("Var5")
        df = df.drop(['Var5'], axis=1)
    df = df.dropna()
    # Magnitude of the (era - oscar) vector difference, computed column-wise
    # (same formula as remove_extra) instead of one Python call per row.
    df = df.assign(
        delta=np.sqrt((df["era_u10"] - df["oscar_u"])**2 + (df["era_v10"] - df["oscar_v"])**2)
    )
    df = calculate_grided_mss_anomaly_for_each_day_df(df)
    df_list.append(df)

    # Flush the accumulated frames when the next file starts a new month
    # prefix, or when this is the last file.
    month = files[i][0:7]
    is_last_file = (i == len(files) - 1)
    if is_last_file or month != files[i + 1][0:7]:
        df_month = pd.concat(df_list)
        df_month = group_dataframe_temporally(df_month)
        df_month.to_csv(monthly_dir + month + ".csv", index=False)
        df_list = []

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1267/1267 [27:07:38<00:00, 77.08s/it]


In [None]:
# Prior to fitting the empirical relationship between wind and MSS:
# for every collocated file, write one reduced copy per calibration region.

directory = "C:/Users/syversk/Desktop/mss_v3.0/"
files = os.listdir(directory + "mss_collocated_v3.0_new")


def save_region(frame, folder, name):
    """Write ``frame`` as CSV (without the index) to ``directory + folder + name``."""
    frame.to_csv(directory + folder + name, index=False)


for file in tqdm(files):
    df = remove_extra(pd.read_csv(directory + "mss_collocated_v3.0_new" + "/" + file))

    # Rectangular calibration regions.
    df_cr1 = reduce_area_of_df(df, "cr1")
    save_region(df_cr1, "cr1/", file)
    df_cr2 = reduce_area_of_df(df, "cr2")
    save_region(df_cr2, "cr2/", file)

    # Polygon calibration regions.
    df_cr1_poly = reduce_area_of_df_poly(df, "north")
    save_region(df_cr1_poly, "cr1_north_pasific/", file)
    df_cr2_poly = reduce_area_of_df_poly(df, "south")
    save_region(df_cr2_poly, "cr2_south_pasific/", file)
    

In [5]:
def reduce_area_of_df_poly(input_df, cr):
    """Keep only the rows of ``input_df`` whose (lon, lat) lies inside a region polygon.

    The polygon outline is read from a CSV file on disk: the South Pacific
    file when ``cr == "south"``, the North Pacific file for any other value.
    The first two columns of that file are taken as the longitude and latitude
    of the polygon vertices.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with ``lon`` and ``lat`` columns. It is not modified.
    cr : str
        ``"south"`` selects the South Pacific outline; anything else selects
        the North Pacific outline.

    Returns
    -------
    pandas.DataFrame
        The rows of ``input_df`` strictly inside the polygon (``Polygon.contains``),
        with their original index labels.
    """
    directory = "C:/Users/syversk/Desktop/"
    if cr == "south":
        outline = pd.read_csv(directory + "low_microplastics_region_south_pacific.csv")
    else:
        outline = pd.read_csv(directory + "low_microplastics_region_north_pacific.csv")

    # Plus 360 because both regions are defined with negative longitudes.
    # NOTE(review): presumably input_df.lon is on a 0-360 scale - confirm.
    # .iloc selects the columns by position explicitly; the positional
    # fallback of row[1][0] on a labelled Series is deprecated in pandas.
    vertices = list(zip(outline.iloc[:, 0] + 360, outline.iloc[:, 1]))
    polygon = Polygon(vertices)

    # Build the membership mask without adding a helper column to the
    # caller's frame and without constructing a Series per row.
    inside = np.fromiter(
        (polygon.contains(Point(lon, lat))
         for lon, lat in zip(input_df["lon"], input_df["lat"])),
        dtype=bool,
        count=len(input_df),
    )
    # A pre-existing "in_poly" column never reached the output before either.
    return input_df[inside].drop(columns=["in_poly"], errors="ignore")

def calculate_grided_mss_anomaly_for_each_day_df(df):
    """Run the full anomaly pipeline on one frame and return per-cell means.

    Steps, in order: ``mss_katzberg`` with ``self_trained`` True then False,
    ``mss_anomaly_gbdt`` with ``pasific`` True then False, coordinate rounding
    via ``make_data_grided``, and averaging per (lon, lat) via
    ``group_dataframe_temporally``.
    """
    for self_trained in (True, False):
        df = mss_katzberg(df, self_trained)

    for pasific in (True, False):
        df = mss_anomaly_gbdt(df, pasific)

    gridded = make_data_grided(df)
    return group_dataframe_temporally(gridded)

def mss_katzberg(df, self_trained):
    """Add two anomaly columns computed row by row with ``calculate_mss_anomaly``.

    One column is driven by ``era_wind`` and one by ``delta``. The column
    names end in ``_refitted`` when ``self_trained`` is true and in
    ``_towards`` otherwise. ``df`` is modified in place and also returned.
    """
    # Input column -> output column, in the order the columns are added.
    if self_trained:
        targets = {"era_wind": "mss_ano_w_refitted", "delta": "mss_ano_d_refitted"}
    else:
        targets = {"era_wind": "mss_ano_w_towards", "delta": "mss_ano_d_towards"}

    for driver, target in targets.items():
        df[target] = df.apply(
            lambda row: calculate_mss_anomaly(row.mss, row[driver], self_trained),
            axis=1,
        )

    return df

def mss_anomaly_gbdt(df, pasific = True):
    """Add MSS anomaly columns relative to the prediction of a stored model.

    The model is loaded with ``joblib.load`` from a path relative to the
    working directory, then asked to predict once from ``era_wind`` and once
    from ``delta``. For each prediction ``p`` the stored anomaly is
    ``(mss - p) / p``. ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``era_wind``, ``delta`` and ``mss`` columns.
    pasific : bool
        Chooses which model file is loaded and which pair of output columns
        is written.
    """
    # The model is fed one observation per row and a single feature column.
    wind_feature = df["era_wind"].to_numpy().reshape(-1, 1)
    delta_feature = df["delta"].to_numpy().reshape(-1, 1)

    if pasific:
        # Model based on CR1 and CR2.
        model_path = "v3.0_results/model_cr_pasific.pkl"
        wind_column, delta_column = 'mss_ano_w_gdt_pasific', 'mss_ano_d_gdt_pasific'
    else:
        # Based on CR south and north Pacific, suggested by Mostafa.
        model_path = "v3.0_results/model_cr_towards.pkl"
        wind_column, delta_column = 'mss_ano_w_gdt_towards_cr', 'mss_ano_d_gdt_towards_cr'

    model = joblib.load(model_path)
    predicted_from_wind = model.predict(wind_feature)
    predicted_from_delta = model.predict(delta_feature)

    observed = df['mss']
    anomaly_from_wind = (observed - predicted_from_wind) / predicted_from_wind
    anomaly_from_delta = (observed - predicted_from_delta) / predicted_from_delta

    df[wind_column] = anomaly_from_wind
    df[delta_column] = anomaly_from_delta
    return df
    
def calculate_mss_anomaly(mss, wind_or_delta, self_trained):
    """Relative deviation of ``mss`` from a piecewise model of ``wind_or_delta``.

    With x = ``wind_or_delta`` the modelled value is
    ``c0 * (c1 * x + c2)`` for ``x <= 3.49`` and
    ``c0 * (c3 * ln(x) + c4)`` otherwise. The result is
    ``(mss - modelled) / modelled``.

    ``self_trained`` selects which of two coefficient sets is used.
    """
    if self_trained:
        # Based on CR south and north Pacific, suggested by Mostafa, v2.1.
        # A CR1/CR2-based set
        #   [0.00370243, 1.07334119, 1.33996524, 5.66160183, -1.01960507]
        # was listed first in the original but immediately overwritten by
        # the values below, so it was never used.
        coeffs = (0.00349525, 0.67100802, 3.28279911, 6.58959564, -2.53669967)
    else:
        coeffs = (0.0035, 1, 0.62, 6, -3.39)

    scale, lin_slope, lin_offset, log_slope, log_offset = coeffs

    if wind_or_delta <= 3.49:
        modelled = scale*(lin_slope*wind_or_delta + lin_offset)
    else:
        modelled = scale*(log_slope*np.log(wind_or_delta) + log_offset)

    return (mss - modelled)/modelled

def reduce_area_of_df(df, cr):
    """Return the rows of ``df`` that fall inside a rectangular lat/lon box.

    ``cr == "cr1"`` selects lat in [-25, -10] and lon in [105, 120];
    any other value selects lat in [10, 20] and lon in [128, 143].
    All bounds are inclusive.
    """
    if cr == "cr1":
        lat_min, lat_max, lon_min, lon_max = -25, -10, 105, 120
    else:
        lat_min, lat_max, lon_min, lon_max = 10, 20, 128, 143

    in_box = df.lat.between(lat_min, lat_max) & df.lon.between(lon_min, lon_max)
    return df[in_box]

def remove_extra(df):
    """Return a copy of ``df`` with the wind/current components collapsed into ``delta``.

    ``delta`` is the magnitude of the vector difference between
    (``era_u10``, ``era_v10``) and (``oscar_u``, ``oscar_v``). The four
    component columns, and ``Var5`` when present, are dropped; ``delta`` is
    appended as the last column.

    The input frame is never modified. The previous version dropped the
    component columns in place, which altered the caller's frame whenever
    ``Var5`` was absent.
    """
    rel_wind_current = np.sqrt(
        (df["era_u10"] - df["oscar_u"])**2 + (df["era_v10"] - df["oscar_v"])**2
    )
    return (
        df.drop(columns=["Var5"], errors="ignore")
        .drop(columns=["era_u10", "oscar_u", "era_v10", "oscar_v"])
        .assign(delta=rel_wind_current)
    )

# Snap coordinates to a 1x1 grid (the averaging itself happens in the group-by helpers)
def make_data_grided(df):
    """Round ``lon`` and ``lat`` to the nearest integer, in place, and return ``df``.

    Uses Python's built-in ``round``, so exact halves go to the even integer.
    """
    def to_cell(coordinate):
        return round(coordinate)

    for axis in ("lon", "lat"):
        df[axis] = df[axis].apply(to_cell)
    return df

def interp_microplastics(df, micro_df=None):
    """Add linearly interpolated microplastics values at each (lon, lat) of ``df``.

    Two columns are written to ``df`` (which is modified in place and also
    returned): ``micro_mass`` from ``vansebillemodel_mass_log`` and
    ``abundace`` from ``vansebillemodel_abundance_log``. The ``abundace``
    spelling is kept as-is because code outside this view may read that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``lon`` and ``lat`` columns.
    micro_df : pandas.DataFrame, optional
        Scattered source data with ``lon``, ``lat`` and the two
        ``vansebillemodel_*_log`` columns. When omitted it is read from
        ``micro_df.csv`` in the hard-coded desktop directory.

    Notes
    -----
    The previous version loaded the CSV into ``micro_df`` but then read from
    an undefined name ``mic_df``, so it always raised ``NameError``.
    ``LinearNDInterpolator`` returns NaN for query points outside the convex
    hull of the source points (scipy's default ``fill_value``).
    """
    if micro_df is None:
        directory = "C:/Users/syversk/Desktop"
        micro_df = pd.read_csv(directory + "/" + "micro_df.csv")

    points = list(zip(micro_df['lon'], micro_df['lat']))
    interp_micro_mass = LinearNDInterpolator(
        points, micro_df['vansebillemodel_mass_log'].to_numpy()
    )
    interp_micro_abu = LinearNDInterpolator(
        points, micro_df['vansebillemodel_abundance_log'].to_numpy()
    )

    query_lon = df['lon'].to_numpy()
    query_lat = df['lat'].to_numpy()
    df['micro_mass'] = interp_micro_mass(query_lon, query_lat)
    df['abundace'] = interp_micro_abu(query_lon, query_lat)
    return df

# Average every anomaly column over all rows that share a (lon, lat) pair
def group_dataframe_temporally(df):
    """Return the mean of the eight anomaly columns per (lon, lat) pair.

    The result has ``lon`` and ``lat`` as ordinary columns, followed by the
    anomaly columns in the order listed below.
    """
    anomaly_columns = [
        'mss_ano_w_gdt_pasific', 'mss_ano_d_gdt_pasific',
        'mss_ano_w_gdt_towards_cr', 'mss_ano_d_gdt_towards_cr',
        'mss_ano_w_refitted', 'mss_ano_d_refitted',
        "mss_ano_w_towards", "mss_ano_d_towards",
    ]
    per_cell = df.groupby(['lon', 'lat'], as_index=False)
    return per_cell[anomaly_columns].mean()

def hours_to_date(d0, hours):
    """Return ``d0`` shifted forward by ``hours`` hours."""
    return d0 + timedelta(hours=hours)

# Average per grid cell and calendar date
def group_dataframe(df):
    """Return the mean of the anomaly columns per (lon, lat, date).

    ``time`` is read as a number of hours since 1992-10-05. Adding it to a
    ``date`` keeps only the whole-day part, so all rows of one calendar day
    share a key. In the result the key column is named ``date``.

    The input frame is not modified. The previous version overwrote ``time``
    in the caller's frame, so a second call on the same frame failed.

    NOTE(review): the anomaly column names below (``mss_anomaly_*``) differ
    from the ``mss_ano_*`` names written by the other helpers in this
    notebook - confirm which frames this function is meant for.
    """
    # `date` is not among the notebook's top-level imports (only `datetime`
    # and `timedelta` are), so it is imported here to avoid a NameError.
    from datetime import date, timedelta

    d0 = date(1992, 10, 5)
    dated = df.assign(time=df["time"].apply(lambda hours: d0 + timedelta(hours=hours)))
    grouped = dated.groupby(['lon', 'lat', "time"], as_index=False)[[
        'mss_anomaly_delta_refitted', 'mss_anomaly_wind_refitted',
        'mss_anomaly_delta_towards', 'mss_anomaly_wind_towards',
        'mss_anomaly_delta_gdt', 'mss_anomaly_wind_gdt',
    ]].mean()
    return grouped.rename(columns={"time": "date"})

def reduce_based_on_sd(df):
    """Drop every row that has an absolute z-score of 3 or more in any column.

    Z-scores come from ``scipy.stats.zscore`` applied to the whole frame.
    """
    within_three_sd = np.abs(stats.zscore(df)) < 3
    keep_row = within_three_sd.all(axis=1)
    return df[keep_row]