# File Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from netCDF4 import dataset
import scipy as sp
import xarray as xr
import datetime
import pysmile
import pysmile_license
import pickle
from sklearn.cluster import KMeans
import folium
import scipy as sp

In [2]:
with open('Processed_data_and_output/clusters.json','rb') as fp:
    clust_dict = pickle.load(fp)

In [3]:
# See if I can plot the clusters
df_nanumea_TOB = pd.read_csv('Processed_data_and_output/modified_dataset.csv')

In [4]:
df_nanumea_TOB['cluster'] = [clust_dict[x] for x in df_nanumea_TOB['position']]

In [5]:
token = "pk.eyJ1Ijoic2hhbm5vbi1iZW5ndHNvbiIsImEiOiJja3F1Y2Q0dHEwMzYwMm9wYmtzYzk2bDZuIn0.5jGMyEiJdmXs1HL7x3ThPw" # your mapbox token
tileurl = 'https://api.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}@2x.png?access_token=' + str(token)
location = [-5.676533, 176.122640]


In [6]:
df_nanumea_TOB_one_year = df_nanumea_TOB[df_nanumea_TOB.year2==2009]

map_osm = folium.Map(location=location, zoom_start=15, tiles=tileurl, attr='Mapbox')
name = 'name'
c = 'black'

c_list = ['blue','white','green','orange','red','yellow','pink']
lgd_txt = '<span style="color: {col};">{txt}</span>'
feature_group = folium.FeatureGroup(name= lgd_txt.format( txt=name, col= c))

for c,(index,group) in zip(c_list,df_nanumea_TOB_one_year.groupby('cluster')):
    # plt.scatter(group.x,group.y)
    lgd_txt = '<span style="color: {col};">{txt}</span>'
    
    for index,row in group.iterrows():
        mk = folium.CircleMarker(location=[row.lat,row.lon],color=c,
        fill=True,
        fill_color=c,
        fill_opacity=0.7,radius=3)
        
        feature_group.add_child(mk)

    
map_osm.add_child(feature_group)


folium.LayerControl().add_to(map_osm) 

map_osm

In [7]:
# cluster = 1

# fig = plt.figure(figsize=(10,10*12/3))
# # ax1 = plt.subplot2grid((12,2),(i,ds))

# ax_dict = {}
# for window in np.arange(0,12,1):
#     for ds in [0,1]:
#         ax1 = plt.subplot2grid((12,2),(window-12*int(window/12),ds))
#         ax1.set_title(window)
#         ax_dict.update({
#             (window,ds):ax1
#         })

# i=0
# for window in np.arange(0,36,1): 
#     for ds in [0,1]:
#         # ax1 = plt.subplot2grid((12,2),(window-12*int(window/12),ds))
#         ax1 = ax_dict[(window-12*int(window/12),ds)]
#         cluster_dict = dirr_bounds_dict[window][ds][cluster]
#         df_test = cluster_dict['scatters']
#         df_mv_avg = cluster_dict['moving_avg']

#         ax1.scatter(df_test.dirr,df_test['corr'],alpha=0.05,c='k')
#         ax1.plot(df_mv_avg.index,df_mv_avg['corr'])
        
#         if window<12:
#             if (max_window_2_dict[cluster]==window)|(max_window_1_dict[cluster]==window):
#                 ax1.plot(avg_of_avg[(cluster,window)].index,avg_of_avg[(cluster,window)][0],c='r')
#             else:
#                 ax1.plot(avg_of_avg[(cluster,window)].index,avg_of_avg[(cluster,window)][0],c='k')
#         ax1.plot([0,360],[0,0],c='0.5',zorder=-1100)

#         ax1.set_ylim([-1,1])
#         ax1.set_ylabel('Correlation')
#         ax1.set_xlabel('Direction')
    
#     i+=1

# # plt.savefig('Processed_data_and_output/figures/integrated_freq_corrs_opts_identified.png')
# plt.show()

In [8]:
def find_coordinate_index(data_array, coordinate_name, value):
    """
    Find the index of a coordinate in an xarray equal to a particular value.
    
    Parameters:
        data_array (xarray.DataArray): The xarray data array.
        coordinate_name (str): The name of the coordinate to search.
        value: The value to search for in the coordinate.
    
    Returns:
        tuple or None: The index of the coordinate if found, or None if not found.
    """
    coordinate = data_array[coordinate_name]
    index = coordinate.to_index().get_loc(value, method='nearest')
    if isinstance(index, int):
        return (index,)
    elif isinstance(index, slice):
        return index.indices(len(coordinate))
    else:
        return None

# Load Dataset

In [9]:
# Load correlations matrix
xr_corr = xr.load_dataarray('Processed_data_and_output/correlations.nc')

# Load dataframe with shoreline change according to TOB and additional variables created
df_nanumea_TOB = pd.read_csv('Processed_data_and_output/modified_dataset.csv')

# Load raw spectral data
nanumea_spectra_dict = sp.io.loadmat('Preprocessed_datasets/spectra/Nanumea_spec.mat')

days_since_jan_1979 = (nanumea_spectra_dict['tm']-np.min(nanumea_spectra_dict['tm']))[0,:]

dates_list = []
for timedelta in days_since_jan_1979:
    dates_list.append(datetime.datetime(1979,1,1)+datetime.timedelta(days=int(timedelta)))
    
dirs_list = [x[0] for x in nanumea_spectra_dict['dirs']]
dirs_list = dirs_list-np.min(dirs_list)
frqs = [x[0] for x in nanumea_spectra_dict['frqs']]

xr_E_MM = xr.DataArray(data=nanumea_spectra_dict['E_MM'],coords=[dates_list,frqs,dirs_list],
                    dims=['time','freq','dirr'])

xr_E_95 = xr.DataArray(data=nanumea_spectra_dict['E_95'],coords=[dates_list,frqs,dirs_list],
                    dims=['time','freq','dirr'])

nanumea_dates = list(pd.read_csv('Preprocessed_datasets/Nanumea_dates.tsv',sep='\t'))
nanumea_dates = \
    [pd.to_datetime(x) for x in nanumea_dates]

nanumea_dict = {date.year:date for date in nanumea_dates}

with open('Processed_data_and_output/clusters.json','rb') as fp:
    clust_dict = pickle.load(fp)
    
list_of_clusts = np.unique(list(clust_dict.values()))

In [10]:
xr_corr

# Plot of the correlations with freq integrated over different time horizons

In [11]:
xr_corr_int = xr_corr.integrate('freq')

In [12]:
xr_corr_int.shape

(6169, 36, 12)

In [13]:
df_clusts = pd.DataFrame.from_dict(clust_dict,orient='index')
df_temp = df_clusts.diff()
df_temp = df_temp[df_temp[0]==1]
cluster_bounds = list(df_temp.index)
cluster_bounds = list(df_clusts.reset_index()[df_clusts.reset_index()['index'].isin(cluster_bounds)].index)

In [14]:
xr_corr_int = xr_corr.integrate('freq')
position = list(np.array(xr_corr.position))
xr_corr_int = xr_corr_int[list(clust_dict.keys()),:,:]
xr_corr_int['position'] = position

fig = plt.figure(figsize=(10,10))
plt.subplots_adjust(hspace=0.4)

i = 0
for window in [0,5,11]:#np.arange(0,12,1):
    for ds in [0,1]:
        ax1 = plt.subplot2grid((3,2),(i,ds))
        xr.plot.pcolormesh(xr_corr_int[:,:,window],cmap='bwr',vmin=-1,vmax=1,add_colorbar=True,ax=ax1)

        df_year = df_nanumea_TOB[df_nanumea_TOB.year2==np.min(df_nanumea_TOB.year2)]
        df_year = df_year.sort_values('position')
        
        for cluster_bound in cluster_bounds:
            ax1.plot([0,360],[cluster_bound,cluster_bound])
        ax1.set_ylabel('')
        ax1.set_yticks([])
    i+=1
            
plt.savefig('Processed_data_and_output/figures/integrated_freq_corrs_reordered.png')

ValueError: conflicting sizes for dimension 'position': length 199 on <this-array> and length 6169 on {'window_size': 'window_size', 'position': 'position', 'dirr': 'dirr'}

In [None]:
fig = plt.figure(figsize=(10,10))
plt.subplots_adjust(hspace=0.4)

i=0
for window in [0,5,11]:#np.arange(0,12,1):
    print(window)
    ax1 = plt.subplot2grid((3,2),(i,ds))
    xr.plot.pcolormesh(xr_corr_int[:,:,window],cmap='bwr',vmin=-1,vmax=1,add_colorbar=True,ax=ax1)

    df_year = df_nanumea_TOB[df_nanumea_TOB.year2==np.min(df_nanumea_TOB.year2)]
    df_year = df_year.sort_values('position')
    ax1.plot(df_year.shoreline_direction,df_year.position,c='k',lw=0.5)
    # ax1.set_xlabel('')
    i+=1

plt.savefig('Processed_data_and_output/figures/integrated_freq_corrs.png')

In [None]:
cluster_bounds_for_iter = [0]+cluster_bounds+[np.array(xr_corr_int.position).max()]

In [None]:
corr_threshold = 0.05

dirr_bounds_dict = {}
for window in np.arange(0,12,1):
    dataset_dict = {}
    for ds in [0,1]:
        cluster_dict = {}
        i = 0
        for cluster_bound_lower,cluster_bound_upper in zip(cluster_bounds_for_iter[:-1],cluster_bounds_for_iter[1:]):
            
            xr_test = xr_corr_int[cluster_bound_lower:cluster_bound_upper,:,window]
            df_test = xr_test.to_dataframe('corr')
            df_test = df_test.reset_index()
            df_mv_avg = df_test.groupby('dirr').mean()   
                
            #An alternative way to do this
            xr_test_mean = xr_test.mean(dim='position')
            xr_test_neg = xr_test_mean.where(xr_test_mean<0,0)
            xr_test_pos = xr_test_mean.where(xr_test_mean>0,0)
                
            cluster_dict.update({
                i:{
                    'scatters':df_test,
                    'moving_avg':df_mv_avg,
                    'pos_weights':xr_test_pos,
                    'neg_weights':xr_test_neg
                }
            })
            i+=1
        dataset_dict.update({
            ds:cluster_dict
        })
    dirr_bounds_dict.update({
        window:dataset_dict
    })


In [None]:
avg_of_avg = {}
avg_clust_diffs = {}

for cluster in list_of_clusts:
    for window in np.arange(0,12,1): 
        df_mv_avg_1 = dirr_bounds_dict[window][0][cluster]['moving_avg']
        
        df_mv_avg_1 = df_mv_avg_1[['corr']]
        df_mv_avg_1.columns = [1]
        
        df_mv_avg_avg = pd.DataFrame(df_mv_avg_1).mean(axis=1)
        avg_of_avg.update({
            (cluster,window):df_mv_avg_avg
        })
        
        avg_clust_diffs.update({
            (cluster,window):np.abs(df_mv_avg_avg).sum()
        })
        
max_window_1_dict = {}
for cluster in list_of_clusts:
    df_clust_deviation = pd.DataFrame.from_dict(avg_clust_diffs,orient='index').copy()
    df_clust_deviation['cluster'] = [x[0] for x in df_clust_deviation.index]
    df_clust_deviation['window'] = [x[1] for x in df_clust_deviation.index]
    df_clust_deviation = df_clust_deviation[df_clust_deviation.cluster==cluster]
    max_window_1 = df_clust_deviation.loc[df_clust_deviation[0]==np.max(df_clust_deviation[0]),'window'].values[0]
    max_window_1_dict.update({
        cluster:max_window_1
    })

# find opposing signal
max_window_2_dict = {}
for cluster in list_of_clusts:
    max_window_1 = max_window_1_dict[cluster].copy()
    max_diff = {}
    for window in np.arange(0,12,1):
        df_diff = avg_of_avg[(cluster,window)]-avg_of_avg[(cluster,max_window_1)]
        max_diff.update({
            np.abs(df_diff).sum():window
        })
    max_window_2 = max_diff[np.max(list(max_diff.keys()))]
    
    max_window_2_dict.update({
        cluster:max_window_2
    })

In [None]:
min_first_dict = {}
max_first_dict = {}

for cluster in list_of_clusts:
    if int(avg_of_avg[(cluster,max_window_1_dict[cluster])].idxmin())<int(avg_of_avg[(cluster,max_window_1_dict[cluster])].idxmax()):
        min_first_dict.update({
            cluster:max_window_1_dict[cluster]
        })
        max_first_dict.update({
            cluster:max_window_2_dict[cluster]
        })
    else:
        min_first_dict.update({
            cluster:max_window_2_dict[cluster]
        })
        max_first_dict.update({
            cluster:max_window_1_dict[cluster]
        })

In [None]:
cluster = 0

fig = plt.figure(figsize=(10,10*12/3))
# ax1 = plt.subplot2grid((12,2),(i,ds))

ax_dict = {}
for window in np.arange(0,12,1):
    for ds in [0,1]:
        ax1 = plt.subplot2grid((12,2),(window-12*int(window/12),ds))
        ax1.set_title(window)
        ax_dict.update({
            (window,ds):ax1
        })

i=0
for window in np.arange(0,12,1): 
    for ds in [0,1]:
        # ax1 = plt.subplot2grid((12,2),(window-12*int(window/12),ds))
        ax1 = ax_dict[(window-12*int(window/12),ds)]
        cluster_dict = dirr_bounds_dict[window][ds][cluster]
        df_test = cluster_dict['scatters']
        df_mv_avg = cluster_dict['moving_avg']

        for pos,group in df_test.groupby('position'):
            ax1.plot(group.dirr,group['corr'],alpha=0.05,c='k')
            
        ax1.plot(df_mv_avg.index,df_mv_avg['corr'])
        
        if window<12:
            if (max_window_2_dict[cluster]==window)|(max_window_1_dict[cluster]==window):
                # ax1.plot(avg_of_avg[(cluster,window)].index,avg_of_avg[(cluster,window)][0],c='r')
                ax1.plot(avg_of_avg[(cluster,window)],c='r')
            else:
                # ax1.plot(avg_of_avg[(cluster,window)].index,avg_of_avg[(cluster,window)][0],c='k')
                ax1.plot(avg_of_avg[(cluster,window)],c='k')
        ax1.plot([0,360],[0,0],c='0.5',zorder=-1100)

        ax1.set_ylim([-1,1])
        ax1.set_ylabel('Correlation')
        ax1.set_xlabel('Direction')
    
    i+=1

plt.savefig('Processed_data_and_output/figures/integrated_freq_corrs_opts_identified.png')
plt.show()

In [None]:
colour_dict = {
    0:'r',
    1:'b',
    2:'g',
    3:'orange',
    4:'y',
    5:'c'
}

fig = plt.figure(figsize=(10,10*12/3))

ax1 = plt.subplot2grid((6,2),(0,0))
ax2 = plt.subplot2grid((6,2),(1,0))
ax3 = plt.subplot2grid((6,2),(2,0))
ax4 = plt.subplot2grid((6,2),(3,0))
ax5 = plt.subplot2grid((6,2),(4,0))
ax6 = plt.subplot2grid((6,2),(5,0))
ax7 = plt.subplot2grid((6,2),(0,1))
ax8 = plt.subplot2grid((6,2),(1,1))
ax9 = plt.subplot2grid((6,2),(2,1))
ax10 = plt.subplot2grid((6,2),(3,1))
ax11 = plt.subplot2grid((6,2),(4,1))
ax12 = plt.subplot2grid((6,2),(5,1))

i=0
for cluster in list_of_clusts:
    for window,ax in zip(np.arange(0,12,1),[ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8,ax9,ax10,ax11,ax12]): 

        if (min_first_dict[cluster]==window):#
            # ax.plot(avg_of_avg[(cluster,window)].index,avg_of_avg[(cluster,window)][0],c=colour_dict[cluster])
            ax.plot(avg_of_avg[(cluster,window)],c=colour_dict[cluster])
        elif (max_first_dict[cluster]==window):
            # ax.plot(avg_of_avg[(cluster,window)].index,avg_of_avg[(cluster,window)][0],c=colour_dict[cluster],ls='--')
            ax.plot(avg_of_avg[(cluster,window)],c=colour_dict[cluster],ls='--')
        else:
            # ax.plot(avg_of_avg[(cluster,window)].index,avg_of_avg[(cluster,window)][0],c='k')
            ax.plot(avg_of_avg[(cluster,window)],c='k')

        ax.plot([0,360],[0,0],c='0.5',zorder=-1100)


        ax.set_ylim([-1,1])
        ax.set_ylabel('Correlation')
        ax.set_xlabel('Direction')


# plt.savefig('Processed_data_and_output/figures/integrated_freq_corrs_opts_identified.png')
plt.show()

# Visualise the clusters by the time lag

In [None]:
min_first_dict

In [None]:
from matplotlib.colors import ListedColormap

In [None]:
cmap = plt.cm.get_cmap('twilight_shifted')

new_cmap = ListedColormap(cmap(np.linspace(0,1,12)))
new_cmap

In [None]:
df_nanumea_TOB_one_year

# Create Features for BN

In [None]:
xr_E_MM_int = xr_E_MM.integrate('freq')

wave_node_values_per_year = {}
for year in np.unique(list(nanumea_dict.keys())):
    wave_node_values = {}
    for cluster in list_of_clusts:
        for month in [max_window_1_dict[cluster],max_window_2_dict[cluster]]:
            current_time_index = find_coordinate_index(xr_E_MM_int,'time',np.array(xr_E_MM_int[xr_E_MM_int.time>nanumea_dict[year]].time)[0])[0]-1
            # previous_time_index = current_time_index-month-1
            month+=1 # NEED TO HAVE THIS BECAUSE THE MONTH COUNT HERE STARTS AT ZERO NOT 1
            xr_E_MM_int_time_slice = xr_E_MM_int[(current_time_index-month):(current_time_index-(month-1)),:]
            xr_E_MM_int_time_slice = xr_E_MM_int_time_slice.mean('time')
            xr_E_MM_int_time_slice = xr_E_MM_int_time_slice.sortby('dirr')
                    
            df_directions = avg_of_avg[(cluster,max_window_1_dict[cluster])]
            min_directions = [int(round(x,0)) for x in list(df_directions[df_directions<0].index)]
            max_directions = [int(round(x,0)) for x in list(df_directions[df_directions>0].index)]
            
            mean_min = np.array(xr_E_MM_int_time_slice[[True if x in min_directions else False for x in list(np.array(xr_E_MM_int_time_slice.dirr))]]).mean()
            mean_max = np.array(xr_E_MM_int_time_slice[[True if x in max_directions else False for x in list(np.array(xr_E_MM_int_time_slice.dirr))]]).mean()
            
            wave_node_values.update({
                (month,cluster,'minn'):mean_min,
                (month,cluster,'maxx'):mean_max
            })

    # df_wave_node_values = pd.DataFrame.from_dict(wave_node_values,orient='index')
    wave_node_values_per_year.update({
        year:wave_node_values
    })
        
wave_node_values_per_clust = {}

df = pd.DataFrame.from_dict(wave_node_values_per_year)
for clust, group in df.reset_index().groupby('level_1'):
    wave_node_values_per_clust.update({
        clust:group.rename(columns={'level_0':'month','level_2':'MinOrMax'}).drop('level_1',axis=1).set_index(['month','MinOrMax']).T
    })

In [None]:
testing_features_dict = {}
for cluster in list_of_clusts:
    df = df_nanumea_TOB[df_nanumea_TOB[f'clust_{len(list_of_clusts)}']==cluster]
    df = df[['position','year2','intersect_distance']].set_index('year2')
    df_features = wave_node_values_per_clust[cluster]
    df = df.join(df_features)
    
    testing_features_dict.update({
        cluster:df
    })
    

In [None]:
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot2grid((2,2),(0,0))
ax2 = plt.subplot2grid((2,2),(0,1))
ax3 = plt.subplot2grid((2,2),(1,0))
ax4 = plt.subplot2grid((2,2),(1,1))


cluster = 3
df = testing_features_dict[cluster]
independent_variables = list(df.columns)
independent_variables = [x for x in independent_variables if (x!='intersect_distance')&(x!='position')]

for position,group in df.groupby('position'):
    group = group.sort_values('intersect_distance')
    group['intersect_distance'] = group['intersect_distance']-group['intersect_distance'].mean()
    for col,ax in zip(independent_variables,[ax1,ax2,ax3,ax4]):
        ax.scatter(group.intersect_distance,group[col],alpha=0.1,c='r')
        # else:
        #     ax2.scatter(group.intersect_distance,group[col],alpha=0.1,c='r')

for ax in [ax1,ax2,ax3,ax4]:
    ax.set_xlim([-10,10])

In [None]:
df.groupby('position').mean()

In [None]:
df.corr()

In [None]:
asdf

# Load Tairua Data

In [None]:
!pwd

In [2]:
df_tairua_data = pd.read_csv('Tairua_data/DL_shoreline_prediction/data/inputs_target.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Tairua_data/DL_shoreline_prediction/data/inputs_target.csv'

In [None]:
df_tairua_data = df_tairua_data.set_index('Datetime')

In [None]:
df_tairua_data.index = [datetime.datetime(int(x.split('-')[0]),int(x.split('-')[1]),int(x.split('-')[2])) for x in df_tairua_data.index]

In [None]:
# for index,row in df_tairua_data.iterrows():
#     index-datetime.timedelta(month=
    

In [None]:
df_tairua_data

# Setup and test in BN

In [None]:
pd.options.mode.chained_assignment = None
summary_results_dict = {}
true_positive_results_dict = {}
true_negative_results_dict = {}
false_positive_results_dict = {}
false_negative_results_dict = {}

individual_results_list = []


for cluster_selected in wave_node_values_per_clust.keys():
    for test_year in np.unique(df_nanumea_TOB.year2):
        if test_year==2005:
            continue
        df_waves_one_cluster = wave_node_values_per_clust[cluster_selected].copy()
        df_waves_one_cluster.columns = [y+str(x) for (x,y) in list(df_waves_one_cluster)]
        # df_waves_one_cluster = df_waves_one_cluster[[x for x in df_waves_one_cluster if (x in ['maxx'+str(x) for x in pos_selected_nodes_dict[cluster_selected]])|(x in ['minn'+str(x) for x in neg_selected_nodes_dict[cluster_selected]])]]
        df_nanumea_all_cluster = df_nanumea_TOB.set_index('position').join(pd.DataFrame.from_dict(clust_dict,orient='index'))[['intersect_distance','previous','year2',0]]
        df_nanumea_all_cluster = df_nanumea_all_cluster.reset_index()
        df_nanumea_all_cluster.columns = ['position','TOB','previous','year2','cluster']
        df_nanumea_one_cluster = df_nanumea_all_cluster[df_nanumea_all_cluster.cluster==cluster_selected]
        df_nanumea_one_cluster.drop('cluster',axis=1,inplace=True)
        df_nanumea_one_cluster = df_nanumea_one_cluster.set_index('year2').join(df_waves_one_cluster)
        df_nanumea_one_cluster = df_nanumea_one_cluster.reset_index().rename(columns={'index':'year2'}).set_index(['year2','position'])
        
        shoreline_threshold = 5
        # Discretised the results
        tob_list = []
        for index,row in df_nanumea_one_cluster.iterrows():
            if row.TOB<-shoreline_threshold:
                val = 'erosion'
            elif row.TOB>shoreline_threshold:
                val = 'accretion'
            else:
                val = 'stable'
            tob_list.append(val)
        df_nanumea_one_cluster['TOB'] = tob_list

        previous_list = []
        for index,row in df_nanumea_one_cluster.iterrows():
            if row.previous<-shoreline_threshold:
                val = 'erosion'
            elif row.previous>shoreline_threshold:
                val = 'accretion'
            else:
                val = 'stable'
            previous_list.append(val)
        df_nanumea_one_cluster['previous'] = previous_list

        parameter_cols = [x for x in list(df_nanumea_one_cluster) if ('minn'in x)|('maxx'in x)]


        p33 = df_nanumea_one_cluster[parameter_cols].quantile(0.33) #0.33
        p67 = df_nanumea_one_cluster[parameter_cols].quantile(0.67) #0.67

        for col in parameter_cols:
            val_list = []
            for index,row in df_nanumea_one_cluster.iterrows():
                if (row[col]<p33[col]):
                    val = "low"
                elif (row[col]>=p33[col])&(row[col]<=p67[col]):
                    val = "mid"
                elif (row[col]>=p67[col]):
                    val = "high"
                else:
                    print('binning error')

                val_list.append(val)
            df_nanumea_one_cluster[col] = val_list

        # Create pysmile network
        net = pysmile.Network()

        for col in list(df_nanumea_one_cluster):
            net.add_node(pysmile.NodeType.CPT,col)
            for val in np.unique(df_nanumea_one_cluster[col]):
                net.add_outcome(col,val)
            for delete_outcome in ['State0','State1']:
                net.delete_outcome(col,delete_outcome)

        for col in list(df_nanumea_one_cluster):
            if col!='TOB':
                net.add_arc('TOB',col)

        # format ready for pySMILE
        df_training = df_nanumea_one_cluster[df_nanumea_one_cluster.index.get_level_values(0)!=test_year]
        df_testing = df_nanumea_one_cluster[df_nanumea_one_cluster.index.get_level_values(0)==test_year]
        df_training.reset_index(drop=True,inplace=True)
        df_testing.reset_index(drop=True,inplace=True)

        # Training the BN
        df_training.to_csv('Processed_data_and_output/training_data.csv',index=False)
        ds_training = pysmile.learning.DataSet()
        ds_training.read_file('Processed_data_and_output/training_data.csv')

        em = pysmile.learning.EM()
        em.learn(data=ds_training,net=net,matching=ds_training.match_network(net))

        net.write_file(f"Processed_data_and_output/network_{cluster_selected}_{test_year}.xdsl")

        net.clear_all_evidence()

        node_id = net.get_node('TOB')

        children = net.get_child_ids('TOB')

        results_dict = {}
        for index,row in df_testing.iterrows():
            for child in children:
                child_outcomes = net.get_outcome_ids(child)
                child_evidence_dict = {}
                for child_outcome in child_outcomes:
                    if child_outcome==row[child]:
                        val=1
                    else:
                        val=0
                    child_evidence_dict.update({
                        child_outcome:val
                    })

                evidence_list = [child_evidence_dict[x] for x in child_outcomes]
                net.set_virtual_evidence(child,evidence_list)

            # Update beliefs
            net.update_beliefs()
            predictions = {outcome:val for outcome,val in zip(net.get_outcome_ids('TOB'),net.get_node_value('TOB'))}
            row['most_likely_state'] = max(predictions, key= lambda x: predictions[x])
            results_dict.update({
                index:row.append(pd.Series(predictions))
            })

        df_testing_results = pd.DataFrame.from_dict(results_dict).T
        df_testing_results['position'] = list(df_nanumea_one_cluster.reset_index().loc[df_nanumea_one_cluster.reset_index().year2==test_year,'position'])
        df_testing_results['year2'] = test_year
        
        individual_results_list.append(df_testing_results)
        
        for state in ['erosion','accretion','stable']:
            true_positive = len(df_testing_results[(df_testing_results['TOB']==state)&(df_testing_results['most_likely_state']==state)])
            true_negative = len(df_testing_results[(df_testing_results['TOB']!=state)&(df_testing_results['most_likely_state']!=state)])
            false_positive = len(df_testing_results[(df_testing_results['TOB']!=state)&(df_testing_results['most_likely_state']==state)])
            false_negative = len(df_testing_results[(df_testing_results['TOB']==state)&(df_testing_results['most_likely_state']!=state)])

            summary_results_dict.update({
                (state,test_year,cluster_selected):len(df_testing_results[df_testing_results['TOB']==df_testing_results['most_likely_state']])/len(df_testing_results)
            })

            true_positive_results_dict.update({
                (state,test_year,cluster_selected):true_positive
            })

            false_positive_results_dict.update({
                (state,test_year,cluster_selected):false_positive
            })

            true_negative_results_dict.update({
                (state,test_year,cluster_selected):true_negative
            })

            false_negative_results_dict.update({
                (state,test_year,cluster_selected):false_negative
            })
            


In [None]:
df_nanumea_one_cluster

In [None]:
df_testing_results = pd.DataFrame.from_dict(results_dict).T


In [None]:
df_true_positive_results = pd.DataFrame.from_dict(true_positive_results_dict,orient='index')
df_false_positive_results = pd.DataFrame.from_dict(false_positive_results_dict,orient='index')
df_true_negative_results = pd.DataFrame.from_dict(true_negative_results_dict,orient='index')
df_false_negative_results = pd.DataFrame.from_dict(false_negative_results_dict,orient='index')


In [None]:
# 
df_true_negative_results['state'] = [x[0] for x in df_true_negative_results.index]
df_true_negative_results['year'] = [x[1] for x in df_true_negative_results.index]
df_true_negative_results['clust'] = [x[2] for x in df_true_negative_results.index]

df_true_positive_results['state'] = [x[0] for x in df_true_positive_results.index]
df_true_positive_results['year'] = [x[1] for x in df_true_positive_results.index]
df_true_positive_results['clust'] = [x[2] for x in df_true_positive_results.index]

df_false_negative_results['state'] = [x[0] for x in df_false_negative_results.index]
df_false_negative_results['year'] = [x[1] for x in df_false_negative_results.index]
df_false_negative_results['clust'] = [x[2] for x in df_false_negative_results.index]

df_false_positive_results['state'] = [x[0] for x in df_false_positive_results.index]
df_false_positive_results['year'] = [x[1] for x in df_false_positive_results.index]
df_false_positive_results['clust'] = [x[2] for x in df_false_positive_results.index]


In [None]:
df_results = pd.DataFrame.from_dict(summary_results_dict,orient='index').reset_index()
df_results['state'] = [x[0] for x in df_results['index']]
df_results['year'] = [x[1] for x in df_results['index']]
df_results['clust'] = [x[2] for x in df_results['index']]
df_results.columns = ['index','accuracy','state','year','clust']
df_results.drop('index',axis=1,inplace=True)

In [None]:
df_results

In [None]:
round(df_results.groupby('year').mean(),2)[['accuracy']]
# Should compare to if 'stable' is not considered as wrong


In [None]:
round(df_results.groupby('clust').mean(),2)[['accuracy']]

In [None]:
# erosion
true_negative = df_true_negative_results.loc[(df_true_negative_results.state=='erosion'),0].sum()
true_positive = df_true_positive_results.loc[(df_true_positive_results.state=='erosion'),0].sum()
false_negative = df_false_negative_results.loc[(df_false_negative_results.state=='erosion'),0].sum()
false_positive = df_false_positive_results.loc[(df_false_positive_results.state=='erosion'),0].sum()

print('true negative',true_negative)
print('true positive',true_positive)
print('false negative',false_negative)
print('false positive',false_positive)
print('')
print('True Positive Rate')
print(true_positive/(true_positive+false_negative))
print('')
print('True Negative Rate')
print(true_negative/(true_negative+false_positive))
print('')
print('F-score')
print(true_positive/(true_positive+0.5*(false_positive+false_negative)))


In [None]:
# stable
true_negative = df_true_negative_results.loc[(df_true_negative_results.state=='stable'),0].sum()
true_positive = df_true_positive_results.loc[(df_true_positive_results.state=='stable'),0].sum()
false_negative = df_false_negative_results.loc[(df_false_negative_results.state=='stable'),0].sum()
false_positive = df_false_positive_results.loc[(df_false_positive_results.state=='stable'),0].sum()

print('true negative',true_negative)
print('true positive',true_positive)
print('false negative',false_negative)
print('false positive',false_positive)
print('')
print('True Positive Rate')
print(true_positive/(true_positive+false_negative))
print('')
print('True Negative Rate')
print(true_negative/(true_negative+false_positive))
print('')
print('F-score')
print(true_positive/(true_positive+0.5*(false_positive+false_negative)))


In [None]:
# accretion
true_negative = df_true_negative_results.loc[(df_true_negative_results.state=='accretion'),0].sum()
true_positive = df_true_positive_results.loc[(df_true_positive_results.state=='accretion'),0].sum()
false_negative = df_false_negative_results.loc[(df_false_negative_results.state=='accretion'),0].sum()
false_positive = df_false_positive_results.loc[(df_false_positive_results.state=='accretion'),0].sum()

print('true negative',true_negative)
print('true positive',true_positive)
print('false negative',false_negative)
print('false positive',false_positive)
print('')
print('True Positive Rate')
print(true_positive/(true_positive+false_negative))
print('')
print('True Negative Rate')
print(true_negative/(true_negative+false_positive))
print('')
print('F-score')
print(true_positive/(true_positive+0.5*(false_positive+false_negative)))
