Seasonal Causal network for log(increments)

- Augmented Dickey-Fuller (ADF) test
- Jarque-Bera residuals test
- GC statistics PDF and decumulative distribution
- Network building
- Synoptic grids
- Barplot of node strength
- Network degree across years
- Pairwise analysis
- Correlation heatmaps
  Spearman, Kendall-Tau
- Network analysis: clustering coefficient, mean distance, diameter


In [None]:
#ADF
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load the gauge records and parse the timestamp column
data = pd.read_csv('75gauges.csv')
data['DATETIME'] = pd.to_datetime(data['DATETIME'])

# Station IDs to include in the ADF analysis
# NOTE(review): this list contains 683, whereas the other cells of this
# notebook use 684 -- confirm which ID is intended.
stations_of_interest = [729, 750, 779, 718, 706, 695, 683, 764, 756]

# Keep only the stations of interest during June-August 2022.
# .copy() makes filtered_data an independent frame, so the fill below is
# guaranteed to modify it (an in-place fillna chained on a filtered slice can
# act on a temporary object and leave the NaNs in place).
filtered_data = data[(data['STATION'].isin(stations_of_interest)) &
                     (data['DATETIME'].dt.month.isin([6, 7, 8])) &
                     (data['DATETIME'].dt.year == 2022)].copy()

# Replace NaN values with zeros (if any exist)
filtered_data['VALUE'] = filtered_data['VALUE'].fillna(0)

def adf_test(series):
    """Run the Augmented Dickey-Fuller test on ``series``.

    Returns the string ``'Insufficient data'`` when fewer than 100 observations
    are supplied; otherwise a dict holding the ADF statistic, its p-value, the
    lag selected by AIC and the number of observations used.
    """
    if len(series) >= 100:
        outcome = sm.tsa.adfuller(series, autolag='AIC')
        labels = ('ADF Statistic', 'p-value', 'Used Lag', 'N observations')
        # The first four entries of the adfuller result map onto the labels.
        return {label: outcome[position] for position, label in enumerate(labels)}
    return 'Insufficient data'

def analyze_data(series):
    """Run the ADF test on three views of one station's series.

    The views are: the raw values, the values with zeros replaced by 1e-6, and
    the log increments ``log(x_t / x_{t-1})`` of the zero-replaced values.
    Returns a dict keyed ``'ADF_raw'``, ``'ADF_small_values'`` and
    ``'ADF_log_returns'`` with the corresponding ``adf_test`` results.
    """
    # Zeros become a tiny positive number so the ratio and logarithm are defined.
    nonzero = series.replace(0, 1e-6)
    log_increments = np.log(nonzero / nonzero.shift(1)).dropna()

    return {
        'ADF_raw': adf_test(series.dropna()),
        'ADF_small_values': adf_test(nonzero.dropna()),
        'ADF_log_returns': adf_test(log_increments),
    }

# Reshape to one column per station, indexed by timestamp
pivot_data = filtered_data.pivot(index='DATETIME', columns='STATION', values='VALUE')

# Run the ADF battery station by station, printing each result as it is produced
results = {}
for station, station_series in pivot_data.items():
    station_results = analyze_data(station_series)
    results[station] = station_results
    print(f"Results for Station {station}:")
    for test_name, outcome in station_results.items():
        print(f"{test_name}: {outcome}")


In [None]:
#Jarque-Bera residual test
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
from scipy.stats import jarque_bera, norm

# Read the 24-hour gauge records and the station coordinate table
gauges_data = pd.read_csv('75gauges24H.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Attach the coordinate columns to every gauge record (join key: STATION)
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse timestamps so the .dt accessors can be used later
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations taking part in the analysis
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Restrict the merged table to those stations
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

# Log increments of a series
def calculate_log_returns(group):
    """Return ``log(x_t) - log(x_{t-1})`` for each element of ``group``.

    The first element has no predecessor and is therefore NaN.
    """
    return np.log(group).diff()

# Granger causality test with manual residual calculation and plot
def granger_test_with_manual_residuals(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2`` and check residual normality.

    Parameters:
        dataframe: frame holding both series as columns.
        column1: label of the candidate cause.
        column2: label of the candidate effect.
        max_lag: lag used for both the Granger test and the residual regression.

    Returns:
        ``(f_statistic, p_value, normality)`` where the first two come from the
        ``ssr_ftest`` entry at lag ``max_lag`` and ``normality`` is ``"Normal"``
        or ``"Not normal"`` according to a Jarque-Bera test (5% level) on the
        residuals of an OLS fit of ``column2`` on lagged ``column1``.
        On any exception the error is printed and ``(0, 1, "Error")`` is
        returned, i.e. a non-significant result.
    """
    try:
        # statsmodels tests whether the SECOND column Granger-causes the FIRST,
        # so the candidate effect (column2) has to be passed first.
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)

        # Regress column2 on column1 lagged by max_lag.
        # NOTE(review): this regression has no own-lags of column2, so its
        # residuals are not those of the Granger regression itself -- confirm
        # this simplified model is what should be checked for normality.
        X = dataframe[[column1]].shift(max_lag).dropna()
        y = dataframe[column2].iloc[max_lag:]
        X = X[:len(y)]

        # Add constant to the model
        X = add_constant(X)

        # Fit OLS regression model and take its residuals
        residuals = OLS(y, X).fit().resid

        # Jarque-Bera test for normality on residuals
        _, jb_p_value = jarque_bera(residuals)
        normality = "Normal" if jb_p_value > 0.05 else "Not normal"

        # Plot residuals
        plot_residuals(residuals, normality, column1, column2)

        # Read the result at the requested lag rather than always at lag 1
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]

        return f_statistic, p_value, normality
    except Exception as e:
        print(f"Error in granger_test_with_manual_residuals: {e}")
        # p-value 1 (not 0) so a failed test can never look significant
        return 0, 1, "Error"

# Histogram of regression residuals with a fitted normal curve
def plot_residuals(residuals, normality, column1, column2):
    """Show a density histogram of ``residuals`` overlaid with a fitted normal PDF.

    ``normality`` is the verdict string shown in the title, and ``column1`` /
    ``column2`` are the cause and effect labels shown as ``column1 -> column2``.
    """
    plt.figure(figsize=(8, 6))

    # Density-normalised histogram so it is comparable with the PDF below
    plt.hist(residuals, bins=20, density=True, alpha=0.6, color='g', label="Residuals")

    # Normal distribution with mean and std estimated from the residuals,
    # evaluated over the x-range the histogram produced
    fitted_mean, fitted_std = norm.fit(residuals)
    left, right = plt.xlim()
    grid = np.linspace(left, right, 100)
    plt.plot(grid, norm.pdf(grid, fitted_mean, fitted_std), 'k', linewidth=2, label="Normal fit")

    plt.title(f'Residuals Distribution: {column1} -> {column2} ({normality})')
    plt.xlabel('Residuals')
    plt.ylabel('Density')
    plt.legend()

    plt.show()

# Loop over each year for analysis
record_years = merged_data_filtered['DATETIME'].dt.year
winter_rows = merged_data_filtered['DATETIME'].dt.month.isin([12, 1, 2])
small_quantity = 0.000001  # stand-in for zero so the logarithm is defined

for year in range(record_years.min(), record_years.max() + 1):
    # NOTE(review): this selects Jan, Feb and Dec of the SAME calendar year,
    # not a contiguous Dec-Feb season -- confirm this is intended.
    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of merged_data_filtered (SettingWithCopyWarning).
    data_year = merged_data_filtered[(record_years == year) & winter_rows].copy()

    # Replace zero values with a small quantity
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Per-station log increments; rows with any missing value are dropped
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year = data_year.dropna().reset_index(drop=True)

    # One column per station; timestamps missing for a station are set to 0
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    # Analysis for each ordered station pair
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 != station2:
                f_statistic, p_value, normality = granger_test_with_manual_residuals(pivot_data, station1, station2)
                print(f"Year: {year}, {station1} -> {station2}, F-statistic: {f_statistic}, p-value: {p_value}, Residuals: {normality}")


In [None]:
#GC-statistics plot 

import matplotlib.pyplot as plt

# Collect every positive F-statistic across years and station pairs
f_statistics_list = []

record_years = merged_data_filtered['DATETIME'].dt.year
winter_rows = merged_data_filtered['DATETIME'].dt.month.isin([12, 1, 2])
small_quantity = 0.000001  # stand-in for zero so the logarithm is defined

# Loop over each year and perform the analysis again, storing F-statistics
for year in range(record_years.min(), record_years.max() + 1):
    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of merged_data_filtered (SettingWithCopyWarning).
    data_year = merged_data_filtered[(record_years == year) & winter_rows].copy()

    # Replace zero values with a small quantity
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Per-station log increments; rows with any missing value are dropped
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year = data_year.dropna().reset_index(drop=True)

    # One column per station; timestamps missing for a station are set to 0
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    # Analysis for each ordered station pair.
    # Note: granger_test_with_manual_residuals also draws a residual plot on
    # every call, so this loop re-displays all of those figures.
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 != station2:
                f_statistic, p_value, normality = granger_test_with_manual_residuals(pivot_data, station1, station2)
                # Failed tests are reported with F = 0 and are skipped here
                if f_statistic > 0:
                    f_statistics_list.append(f_statistic)

# Plot the distribution of F-statistics
plt.figure(figsize=(10, 6))
plt.hist(f_statistics_list, bins=20, color='blue', alpha=0.7)
plt.title('Distribution of F-statistics from Granger Causality Tests')
plt.xlabel('F-statistic')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
# decumulative

import numpy as np
import matplotlib.pyplot as plt

# Sort the F-statistics list
f_statistics_sorted = np.sort(f_statistics_list)

# Empirical decumulative probability P(X >= x): the i-th smallest value
# (0-based) has n - i observations at or above it. Using 1 - i/n with i
# starting at 1 would assign probability 0 to the largest observation, which
# cannot be drawn on the logarithmic y-axis used below.
n_observations = len(f_statistics_sorted)
decumulative_prob = 1.0 - np.arange(n_observations) / n_observations

# Plot the decumulative probability on log-log axes
plt.figure(figsize=(10, 6))
plt.plot(f_statistics_sorted, decumulative_prob, marker='o', linestyle='-', color='blue', alpha=0.7)
plt.title('Decumulative Probability of F-statistics from Granger Causality Tests')
plt.xlabel('F-statistic')
plt.ylabel('Decumulative Probability')
plt.yscale('log')
plt.xscale('log')
plt.grid(True)
plt.show()

In [None]:
#PDF decades
#FSTATISTICS

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
import seaborn as sns

# Read the gauge records and the station coordinate table
gauges_data = pd.read_csv('75gauges.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Attach the coordinate columns to every gauge record (join key: STATION)
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse timestamps so the .dt accessors can be used later
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Function to perform Granger causality test and return F-statistic
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    Returns ``(f_statistic, p_value)`` from the ``ssr_ftest`` entry at lag
    ``max_lag`` when the p-value is below 0.0001, otherwise ``(0, 1)``.
    Any exception raised by the test is also reported as ``(0, 1)``.
    """
    try:
        # statsmodels tests whether the SECOND column Granger-causes the FIRST,
        # so the candidate effect (column2) has to be passed first.
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Read the result at the requested lag rather than always at lag 1
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return (f_statistic, p_value) if p_value < 0.0001 else (0, 1)
    except Exception:
        # Best effort: a failed test counts as "no link". Exception (not a bare
        # except) so KeyboardInterrupt / SystemExit still propagate.
        return 0, 1

# Split the data into two periods (earlier single-year variants kept for reference)
#period1_data = merged_data[(merged_data['DATETIME'].dt.year == 2003)]
#period2_data = merged_data[(merged_data['DATETIME'].dt.year == 2013)]

# Rows from December, January or February, split by calendar year:
# 2002-2012 inclusive and 2013-2023 inclusive
record_year = merged_data['DATETIME'].dt.year
winter_rows = merged_data['DATETIME'].dt.month.isin([12, 1, 2])

period1_data = merged_data[record_year.between(2002, 2012) & winter_rows]

period2_data = merged_data[record_year.between(2013, 2023) & winter_rows]


# Function to get F-statistics for eastward and westward links
def create_network_and_get_stats(data):
    """Collect the F-statistics of significant links, split by direction.

    ``data`` is pivoted to one ``VALUE`` column per station (missing entries
    set to 0) and ``granger_test`` is run on every ordered station pair. A
    positive F-statistic counts as a link; it is classed as eastward when the
    first station's ``EST`` coordinate is smaller than the second's, otherwise
    westward.

    Returns:
        ``(f_stats_eastward, f_stats_westward)`` -- two lists of F-statistics.
    """
    f_stats_eastward = []
    f_stats_westward = []

    pivot_data = data.pivot(index='DATETIME', columns='STATION', values='VALUE').fillna(0)

    # Look the EST coordinate up once per station instead of filtering the
    # coordinate table twice for every pair. The first row per station is
    # kept, matching a `.values[0]` lookup.
    easting = coordinates_data.drop_duplicates('STATION').set_index('STATION')['EST'].to_dict()

    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue
            f_statistic, _ = granger_test(pivot_data, station1, station2)
            if f_statistic > 0:
                if easting[station1] < easting[station2]:
                    f_stats_eastward.append(f_statistic)
                else:
                    f_stats_westward.append(f_statistic)

    return f_stats_eastward, f_stats_westward

# Get F-statistics for both periods
f_stats_eastward_period1, f_stats_westward_period1 = create_network_and_get_stats(period1_data)
f_stats_eastward_period2, f_stats_westward_period2 = create_network_and_get_stats(period2_data)


# Plot the kernel-density PDFs, one panel per period.
# Titles name the year ranges used to build period1_data / period2_data
# (they previously read "2003" and "2019)", left over from single-year runs).
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.kdeplot(f_stats_eastward_period1, label='Eastward ', bw_adjust=0.1,color='red')
sns.kdeplot(f_stats_westward_period1, label='Westward ', bw_adjust=0.1,color='blue')
plt.title('PDF of F-statistics 2002-2012')
plt.xlabel('F-statistic')
plt.ylabel('Density')
#plt.xlim(0,2000)
plt.legend()

plt.subplot(1, 2, 2)
sns.kdeplot(f_stats_eastward_period2, label='Eastward ', bw_adjust=0.1,color='red')
sns.kdeplot(f_stats_westward_period2, label='Westward ', bw_adjust=0.1,color='blue')
plt.title('PDF of F-statistics 2013-2023')
plt.xlabel('F-statistic')
plt.ylabel('Density')
#plt.xlim(0,2000)
plt.legend()

plt.tight_layout()
plt.savefig('1.f_statistics_plot.jpg', format='jpg', dpi=300)
plt.show()


In [None]:
# Plot the filled kernel-density PDFs on log-log axes, one panel per period.
# Titles name the year ranges the two periods cover (2002-2012 and 2013-2023)
# instead of the single years "2003" / "2019" left over from earlier runs.
plt.figure(figsize=(12, 6))

period_panels = (
    (1, f_stats_eastward_period1, f_stats_westward_period1, 'PDF of F-statistics 2002-2012'),
    (2, f_stats_eastward_period2, f_stats_westward_period2, 'PDF of F-statistics 2013-2023'),
)
for panel_index, eastward_stats, westward_stats, panel_title in period_panels:
    plt.subplot(1, 2, panel_index)
    sns.kdeplot(eastward_stats, label='Eastward ', fill=True, bw_adjust=0.2, color='red')
    sns.kdeplot(westward_stats, label='Westward ', fill=True, bw_adjust=0.2, color='blue')
    plt.title(panel_title)
    plt.xlabel('F-statistic')
    plt.ylabel('Density')
    plt.xscale('log')
    plt.yscale('log')
    # Same window on both panels so the two periods are directly comparable
    plt.ylim(0.0001, 0.1)
    plt.xlim(20, 1000)
    plt.legend()

plt.tight_layout()
plt.savefig('1.f_statistics_plotLOG.jpg', format='jpg', dpi=300)
plt.show()


In [None]:
#decumulative-decades

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
from scipy.stats import gaussian_kde

# Read the gauge records and the station coordinate table
gauges_data = pd.read_csv('75gauges.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Attach the coordinate columns to every gauge record (join key: STATION)
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse timestamps so the .dt accessors can be used later
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations taking part in the network
stations_of_interest = [779, 750, 684, 756, 706, 764, 729]

# Restrict the merged table to those stations
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    Returns the ``ssr_ftest`` F-statistic at lag ``max_lag`` when its p-value
    is below 0.05, otherwise 0. Any exception raised while running the test is
    printed and reported as 0 (no link).
    """
    try:
        # statsmodels tests whether the SECOND column Granger-causes the FIRST,
        # so the candidate effect (column2) has to be passed first.
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Read the result at the requested lag rather than always at lag 1
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return f_statistic if p_value < 0.05 else 0
    except Exception as e:
        print(f"Error in granger_test: {e}")
        return 0

# Initialize variables to find global maxima
global_max_f_statistic = 0
# Significant F-statistics keyed by (cause station, effect station, year).
# Keying by (station, year) alone would overwrite all but the last significant
# pair of each station-year and thin out the distributions built below.
station_year_data = {}

small_quantity = 0.000001  # stand-in for zero so the logarithm is defined


# Calculate log returns (defined once, outside the yearly loop)
def calculate_log_returns(group):
    """Return ``log(x_t) - log(x_{t-1})`` for each element of ``group``."""
    log_returns = np.log(group) - np.log(group.shift(1))
    return log_returns


record_years = merged_data_filtered['DATETIME'].dt.year
winter_rows = merged_data_filtered['DATETIME'].dt.month.isin([12, 1, 2])

# Loop over each year for analysis
for year in range(record_years.min(), record_years.max() + 1):
    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of merged_data_filtered (SettingWithCopyWarning).
    data_year = merged_data_filtered[(record_years == year) & winter_rows].copy()

    # Replace zero values with a small quantity
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Per-station log increments; rows with any missing value are dropped
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year = data_year.dropna().reset_index(drop=True)

    # One column per station; timestamps missing for a station are set to 0
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    # Analysis for each ordered station pair
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 != station2:
                f_statistic = granger_test(pivot_data, station1, station2)
                if f_statistic > 0:
                    station_year_data[(station1, station2, year)] = f_statistic

    # Update global maxima (default=0 covers the case of no links so far)
    year_max_f_statistic = max(station_year_data.values(), default=0)
    global_max_f_statistic = max(global_max_f_statistic, year_max_f_statistic)

# Separate the F-statistics into two intervals
f_statistics_2002_2012 = [f_stat for (_, _, year), f_stat in station_year_data.items() if 2002 <= year <= 2012]
f_statistics_2013_2023 = [f_stat for (_, _, year), f_stat in station_year_data.items() if 2013 <= year <= 2023]

# Two stacked panels: PDF on top, decumulative probability underneath
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))

# Define a function to plot PDF and decumulative probability
def plot_pdf_and_deprob(ax_pdf, ax_deprob, data, label, color):
    """Draw a Gaussian-KDE density of ``data`` and its decumulative curve.

    Parameters:
        ax_pdf: axes receiving the density curve.
        ax_deprob: axes receiving the decumulative curve.
        data: sequence of values (F-statistics here).
        label: legend label used for both curves.
        color: line colour used for both curves.

    The density is evaluated at 1000 evenly spaced points on ``[0, max(data)]``.
    The decumulative curve is one minus the running Riemann sum of that
    density. Nothing is drawn (a message is printed instead) when ``data`` has
    fewer than two values or all values are equal, because a Gaussian KDE
    cannot be fitted to such input.
    """
    values = np.asarray(data, dtype=float)
    if values.size < 2 or values.max() == values.min():
        print(f"Skipping '{label}': at least two distinct values are needed for a KDE")
        return

    kde = gaussian_kde(values)
    x = np.linspace(0, values.max(), 1000)
    pdf = kde(x)
    deprob = 1 - np.cumsum(pdf) * (x[1] - x[0])

    ax_pdf.plot(x, pdf, label=label, color=color)
    ax_deprob.plot(x, deprob, label=label, color=color)

# Draw both periods on the shared pair of axes
plot_pdf_and_deprob(axes[0], axes[1], f_statistics_2002_2012, '2002-2012', 'blue')
plot_pdf_and_deprob(axes[0], axes[1], f_statistics_2013_2023, '2013-2023', 'red')

# Both panels share the x-label, log-log scaling and a legend; only the title
# and y-label differ.
panel_specs = (
    (axes[0], 'PDF of F-statistics', 'Density'),
    (axes[1], 'Decumulative Probability of F-statistics', 'Decumulative Probability'),
)
for panel, heading, y_label in panel_specs:
    panel.set_title(heading)
    panel.set_xlabel('F-statistic')
    panel.set_ylabel(y_label)
    panel.set_yscale('log')
    panel.set_xscale('log')
    panel.legend()

plt.tight_layout()
plt.savefig('1.f_statistics_analysis.jpg', dpi=300)
plt.show()


In [None]:
# Network 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests

# Read the 24-hour gauge records and the station coordinate table
gauges_data = pd.read_csv('75gauges24H.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Attach the coordinate columns to every gauge record (join key: STATION)
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse timestamps so the .dt accessors can be used later
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations taking part in the network
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Restrict the merged table to those stations
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    Returns the ``ssr_ftest`` F-statistic at lag ``max_lag`` when its p-value
    is below 0.01, otherwise 0. Any exception raised while running the test is
    printed and reported as 0 (no link).
    """
    try:
        # statsmodels tests whether the SECOND column Granger-causes the FIRST,
        # so the candidate effect (column2) has to be passed first.
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Read the result at the requested lag rather than always at lag 1
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:
        print(f"Error in granger_test: {e}")
        return 0

# Log increments of a series
def calculate_log_returns(group):
    """Return ``log(x_t) - log(x_{t-1})`` for each element; the first is NaN."""
    logged = np.log(group)
    return logged - logged.shift(1)

# Function to create the plot with usual arrows, color bar, and size legend
def create_network_plot_with_usual_arrows_and_size_legend(year, links, coordinates, title, filename, outlink_strength, inlink_strength, inlink_count):
    """Draw the causal network on the stations' EST/NORD coordinates and save it.

    Parameters:
        year: accepted for interface compatibility; not used in the drawing.
        links: dict mapping ``(station1, station2)`` to an F-statistic; one
            arrow is drawn from station1 to station2 per entry.
        coordinates: frame with ``STATION``, ``EST``, ``NORD`` and ``LOCATION``
            columns.
        title: figure title.
        filename: path the figure is saved to (300 dpi).
        outlink_strength: accepted for interface compatibility; not used.
        inlink_strength: dict station -> summed incoming F-statistic; sets the
            node size (marker size 10 to 50).
        inlink_count: dict station -> number of incoming links; sets the node
            colour.

    Only stations listed in the module-level ``stations_of_interest`` are drawn.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Min-max normalise the in-link counts for the colour map
    inlink_min, inlink_max = min(inlink_count.values()), max(inlink_count.values())
    inlink_count_normalized = {station: (value - inlink_min) / (inlink_max - inlink_min) if inlink_max != inlink_min else 0 for station, value in inlink_count.items()}

    inlink_strength_min, inlink_strength_max = min(inlink_strength.values()), max(inlink_strength.values())
    # A zero span (e.g. no significant links at all) would otherwise cause a
    # division by zero; every node then gets the minimum size.
    strength_span = inlink_strength_max - inlink_strength_min

    # Plot the nodes
    for _, row in coordinates.iterrows():
        if row['STATION'] in stations_of_interest:
            station = row['STATION']
            relative_strength = (inlink_strength[station] - inlink_strength_min) / strength_span if strength_span else 0
            size = 10 + relative_strength * 40  # Node size proportional to strength
            color = plt.cm.coolwarm(inlink_count_normalized[station])  # Node color proportional to inlink count
            ax.plot(row['EST'], row['NORD'], 'o', markersize=size, color=color)  # circle for each node
            ax.text(row['EST'], row['NORD'], row['LOCATION'], fontsize=12, ha='right')

    # Plot the directional links with usual arrows
    for station1, station2 in links:
        coord1 = coordinates[coordinates['STATION'] == station1][['EST', 'NORD']].values[0]
        coord2 = coordinates[coordinates['STATION'] == station2][['EST', 'NORD']].values[0]

        # Arrow from station1 (tail) to station2 (head)
        ax.annotate("",
                    xy=(coord2[0], coord2[1]), xycoords='data',
                    xytext=(coord1[0], coord1[1]), textcoords='data',
                    arrowprops=dict(arrowstyle="-|>", color='black', lw=1))

    # Add a horizontal color bar for inlink count
    mappable = plt.cm.ScalarMappable(cmap=plt.cm.coolwarm, norm=plt.Normalize(vmin=inlink_min, vmax=inlink_max))
    mappable.set_array([])
    cbar = plt.colorbar(mappable, orientation='horizontal', ax=ax, pad=0.02)
    cbar.set_label('Number of links per node', fontsize=12)
    # Set integer ticks on the color bar
    integer_ticks = range(int(inlink_min), int(inlink_max) + 1)
    cbar.set_ticks(integer_ticks)
    cbar.set_ticklabels(integer_ticks)

    # Add a size legend
    #legend_sizes = [10, 25, 40, 55]
    #for size in legend_sizes:
     #   ax.plot([], [], 'o', markersize=size, color='gray', label=f'Strength: {inlink_strength_min + (size - 10) / 10 * (inlink_strength_max - inlink_strength_min):.2f}')
    #ax.legend(loc='lower right', title='Node Size (Strength)', fontsize=10)

    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')
    plt.savefig(filename, dpi=300)
    plt.show()

# Filter data for the year 2002 (January, February and December rows).
# .copy() gives an independent frame, so the column assignments below do not
# operate on a slice of merged_data_filtered (SettingWithCopyWarning).
year = 2002
data_year = merged_data_filtered[(merged_data_filtered['DATETIME'].dt.year == year) &
                                 (merged_data_filtered['DATETIME'].dt.month.isin([12, 1, 2]))].copy()

# Replace zero values with a small quantity so the logarithm is defined
small_quantity = 0.000001
data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

# Per-station log increments; rows with any missing value are dropped
data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
data_year = data_year.dropna().reset_index(drop=True)

# One column per station; timestamps missing for a station are set to 0
pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

# links maps (cause, effect) -> F-statistic; the per-station dicts accumulate
# the summed F-statistics going out / coming in and the number of incoming links
links = {}
outlink_strength = {station: 0 for station in stations_of_interest}
inlink_strength = {station: 0 for station in stations_of_interest}
inlink_count = {station: 0 for station in stations_of_interest}

# Analysis for each ordered station pair
for station1 in pivot_data.columns:
    for station2 in pivot_data.columns:
        if station1 != station2:
            f_statistic_out = granger_test(pivot_data, station1, station2)
            if f_statistic_out > 0:
                links[(station1, station2)] = f_statistic_out
                outlink_strength[station1] += f_statistic_out
                inlink_strength[station2] += f_statistic_out
                inlink_count[station2] += 1

# Create plot for the year 2002
title = f'Granger Causality Network for Winter {year} - 24h'
filename = f'gcnetwork24H_{year}.jpg'
create_network_plot_with_usual_arrows_and_size_legend(year, links, coordinates_data, title, filename, outlink_strength, inlink_strength, inlink_count)


In [None]:
# Network
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
import math
from matplotlib.patches import FancyArrowPatch

# Read the gauge records and the station coordinate table
gauges_data = pd.read_csv('75gauges.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Attach the coordinate columns to every gauge record (join key: STATION)
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse timestamps so the .dt accessors can be used later
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Station ID -> short label; insertion order is the order of the nodes
# around the circle
station_order = {
    756: "RG",
    764: "SR",
    684: "AG",
    695: "CL",
    718: "EN",
    706: "CT",
    779: "TP",
    750: "PA",
    729: "ME",
}

stations_of_interest = list(station_order)

# Restrict the merged table to those stations
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    Returns the ``ssr_ftest`` F-statistic at lag ``max_lag`` when its p-value
    is below 0.05, otherwise 0. Any exception raised while running the test is
    printed and reported as 0 (no link).
    """
    try:
        # statsmodels tests whether the SECOND column Granger-causes the FIRST,
        # so the candidate effect (column2) has to be passed first.
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Read the result at the requested lag rather than always at lag 1
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return f_statistic if p_value < 0.05 else 0
    except Exception as e:
        print(f"Error in granger_test: {e}")
        return 0

# Log increments of a series
def calculate_log_returns(group):
    """Return ``log(x_t) - log(x_{t-1})`` for each element; the first is NaN."""
    return np.log(group).diff()

# Network plot with the nodes on a unit circle and coloured directional arrows
def create_network_plot_circle(year, links, coordinates, station_order, title, filename):
    """Draw the causal network with stations evenly spaced on a unit circle.

    Parameters:
        year: accepted for interface compatibility; not used in the drawing.
        links: dict mapping ``(station1, station2)`` to an F-statistic; one
            arrow of line width ``F / 10`` is drawn per entry.
        coordinates: accepted for interface compatibility; not used.
        station_order: dict station ID -> label; its iteration order fixes the
            node positions.
        title: figure title.
        filename: path the figure is saved to (300 dpi).
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    ordered_ids = list(station_order)
    num_stations = len(ordered_ids)
    step = 2 * math.pi / num_stations

    # Node k sits at angle k*step - pi/2: the first node is at the bottom of
    # the circle (6 o'clock) and later nodes follow counter-clockwise.
    positions = {}
    slot_of = {}
    for slot, station in enumerate(ordered_ids):
        theta = slot * step - math.pi / 2
        node_xy = (math.cos(theta), math.sin(theta))
        positions[station] = node_xy
        slot_of[station] = slot
        ax.plot(*node_xy, 'bo', markersize=5)  # blue circle for each node
        ax.text(*node_xy, station_order[station], fontsize=12, ha='right', va='center')

    # Arrows: blue when the target is fewer than half a turn ahead of the
    # source in station order, red otherwise; thickness grows with F.
    for (station1, station2), f_stat in links.items():
        tail = positions[station1]
        head = positions[station2]
        steps_ahead = (slot_of[station2] - slot_of[station1]) % num_stations
        color = 'blue' if steps_ahead < num_stations // 2 else 'red'
        ax.add_patch(FancyArrowPatch(tail, head, color=color, arrowstyle='->', mutation_scale=10, lw=f_stat/10))

    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')
    plt.savefig(filename, dpi=300)
    plt.show()

# Loop over each year for analysis
record_years = merged_data_filtered['DATETIME'].dt.year
winter_rows = merged_data_filtered['DATETIME'].dt.month.isin([12, 1, 2])
small_quantity = 0.000001  # stand-in for zero so the logarithm is defined

for year in range(record_years.min(), record_years.max() + 1):
    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of merged_data_filtered (SettingWithCopyWarning).
    data_year = merged_data_filtered[(record_years == year) & winter_rows].copy()

    # Replace zero values with a small quantity
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Per-station log increments; rows with any missing value are dropped
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year = data_year.dropna().reset_index(drop=True)

    # One column per station; timestamps missing for a station are set to 0
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    # (cause, effect) -> F-statistic for every significant ordered pair
    links = {}
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 != station2:
                f_statistic_out = granger_test(pivot_data, station1, station2)
                if f_statistic_out > 0:
                    links[(station1, station2)] = f_statistic_out

    # Create plot for the year
    title = f'Granger Causality Network for Winter {year}'
    filename = f'11.gcnetwork_{year}.jpg'
    create_network_plot_circle(year, links, coordinates_data, station_order, title, filename)

In [None]:
#SMALL QUANITY SYNOPTIC GRID

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
from statsmodels.tsa.stattools import grangercausalitytests

# Read the 12-hour gauge records and the station coordinate table
gauges_data = pd.read_csv('75gauges12H.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Attach the coordinate columns to every gauge record (join key: STATION)
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse timestamps so the .dt accessors can be used later
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations taking part in the network
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Restrict the merged table to those stations
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    Returns the ``ssr_ftest`` F-statistic at lag ``max_lag`` when its p-value
    is below 0.01, otherwise 0. Any exception raised while running the test is
    printed and reported as 0 (no link).
    """
    try:
        # statsmodels tests whether the SECOND column Granger-causes the FIRST,
        # so the candidate effect (column2) has to be passed first.
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Read the result at the requested lag rather than always at lag 1
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:
        print(f"Error in granger_test: {e}")
        return 0

# Initialize variables to find global maxima
global_max_out_links = 0
global_max_in_links = 0
global_max_f_statistic_out = 0
global_max_f_statistic_in = 0
# (station, year) -> (summed F-statistic, number of links), outgoing / incoming
station_year_data_out = {}
station_year_data_in = {}

small_quantity = 0.000001  # stand-in for zero so the logarithm is defined


# Calculate log returns (defined once, outside the yearly loop)
def calculate_log_returns(group):
    """Return ``log(x_t) - log(x_{t-1})`` for each element of ``group``."""
    log_returns = np.log(group) - np.log(group.shift(1))
    return log_returns


record_years = merged_data_filtered['DATETIME'].dt.year
winter_rows = merged_data_filtered['DATETIME'].dt.month.isin([12, 1, 2])

# Loop over each year for analysis
for year in range(record_years.min(), record_years.max() + 1):
    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of merged_data_filtered (SettingWithCopyWarning).
    data_year = merged_data_filtered[(record_years == year) & winter_rows].copy()

    # Replace zero values with a small quantity
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Per-station log increments; rows with any missing value are dropped
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year = data_year.dropna().reset_index(drop=True)

    # One column per station; timestamps missing for a station are set to 0
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    temp_out_links_count = {station: 0 for station in pivot_data.columns}
    temp_in_links_count = {station: 0 for station in pivot_data.columns}
    temp_f_statistics_sum_out = {station: 0 for station in pivot_data.columns}
    temp_f_statistics_sum_in = {station: 0 for station in pivot_data.columns}

    # Each ordered pair is tested once: a significant station1 -> station2 link
    # is an out-link of station1 and an in-link of station2. Testing both
    # directions for every ordered pair would run every test twice.
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue
            f_statistic = granger_test(pivot_data, station1, station2)
            if f_statistic > 0:
                temp_out_links_count[station1] += 1
                temp_f_statistics_sum_out[station1] += f_statistic
                temp_in_links_count[station2] += 1
                temp_f_statistics_sum_in[station2] += f_statistic

    # Record the yearly totals for stations that have at least one link
    for station in pivot_data.columns:
        if temp_out_links_count[station] > 0:
            station_year_data_out[(station, year)] = (temp_f_statistics_sum_out[station], temp_out_links_count[station])
        if temp_in_links_count[station] > 0:
            station_year_data_in[(station, year)] = (temp_f_statistics_sum_in[station], temp_in_links_count[station])

    # Update global maxima; default=0 keeps a year without data (no stations
    # in the pivot) from raising on an empty sequence.
    year_max_out_links = max(temp_out_links_count.values(), default=0)
    year_max_in_links = max(temp_in_links_count.values(), default=0)
    year_max_f_statistic_out = max(temp_f_statistics_sum_out.values(), default=0)
    year_max_f_statistic_in = max(temp_f_statistics_sum_in.values(), default=0)

    global_max_out_links = max(global_max_out_links, year_max_out_links)
    global_max_in_links = max(global_max_in_links, year_max_in_links)
    global_max_f_statistic_out = max(global_max_f_statistic_out, year_max_f_statistic_out)
    global_max_f_statistic_in = max(global_max_f_statistic_in, year_max_f_statistic_in)

# Station ID -> short label. The insertion order is the row order on the
# synoptic grid's y-axis (the first entry is drawn at y = 0).
# Original author tag: "ESTOVEST" -- presumably Italian est/ovest (east/west),
# i.e. the stations are sorted along the east-west direction; TODO confirm.
station_order = {
    779: "TP", 750: "PA", 684: "AG",
    695: "CL", 718: "EN", 756: "RG",
    706: "CT", 764: "SR", 729: "ME",
}

def create_visualization(station_year_data, global_max_links, global_max_f_statistic, title, filename, fontsize=12):
    """Plot one circle per (station, year) on a year-by-station grid and save the figure.

    Circle colour encodes the number of links and the circle radius grows with
    the summed F-statistic. The row of each station comes from the module-level
    ``station_order``; the year axis spans the years present in the module-level
    ``merged_data_filtered``.

    Args:
        station_year_data: mapping ``(station, year) -> (f_stat, links)``.
        global_max_links: upper end of the colour scale and of the colour-bar ticks.
        global_max_f_statistic: F-statistic that maps to the largest circle.
        title: figure title.
        filename: path the figure is written to (300 dpi).
        fontsize: font size of the title.

    Raises:
        KeyError: if a station in ``station_year_data`` is missing from ``station_order``.
    """
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.set_facecolor('white')  # Set background to white

    # Scales for circle size and colour
    max_node_size = 0.2  # Size assigned to the largest F-statistic
    color_norm = plt.Normalize(0, global_max_links)  # Normalize link count
    color_map = plt.cm.coolwarm

    # Row of each station, built once instead of rebuilding a list for every circle.
    station_row = {station: row for row, station in enumerate(station_order)}

    # Draw each station-year as a circle on the grid
    for (station, year), (f_stat, links) in station_year_data.items():
        size = (f_stat / global_max_f_statistic) * max_node_size
        circle = Circle(
            (year, station_row[station]),
            np.sqrt(size),  # radius is the square root of the scaled size
            color=color_map(color_norm(links)),
            alpha=0.6,
        )
        ax.add_patch(circle)

    # Colour bar with one tick per possible link count
    mappable = plt.cm.ScalarMappable(cmap=color_map, norm=color_norm)
    mappable.set_array([])
    cbar = plt.colorbar(mappable, ax=ax, orientation='vertical', pad=0.02)
    cbar.set_label('Number of Links', fontsize=12)
    cbar.set_ticks(range(0, global_max_links + 1))

    # Year range of the data, computed once and reused for ticks and limits
    data_years = merged_data_filtered['DATETIME'].dt.year
    first_year = data_years.min()
    last_year = data_years.max()

    # Axis labels, ticks, and limits
    ax.set_xlabel('Year')
    ax.set_ylabel('Station ID')
    ax.set_xticks(np.arange(first_year, last_year + 1))
    ax.set_yticks(np.arange(len(station_order)))
    ax.set_yticklabels(station_order.values())
    ax.set_xlim(first_year - 1, last_year + 1)
    ax.set_ylim(-1, len(station_order))

    plt.grid(False)
    plt.title(title, fontsize=fontsize)
    plt.savefig(filename, dpi=300)
    plt.show()

# Render the out-link grid, then the in-link grid, each to its own image file.
create_visualization(
    station_year_data=station_year_data_out,
    global_max_links=global_max_out_links,
    global_max_f_statistic=global_max_f_statistic_out,
    title='Winter Outlinks - 12h',
    filename="1.12Hgrid_visualization9gauges.jpg",
    fontsize=14,
)
create_visualization(
    station_year_data=station_year_data_in,
    global_max_links=global_max_in_links,
    global_max_f_statistic=global_max_f_statistic_in,
    title='Winter Inlinks - 12h',
    filename="1.12Hingrid_visualization9gauges.jpg",
    fontsize=14,
)


In [None]:
#Synoptic grid

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
from statsmodels.tsa.stattools import grangercausalitytests

# Read the station coordinates and the gauge series, then join them per station.
coordinates_data = pd.read_csv('merged_COO.csv')
gauges_data = pd.read_csv('75gauges.csv')
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse the timestamps so the .dt accessors can be used on them.
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations that make up the network.
stations_of_interest = [779, 750, 684, 756, 706, 764, 729]

# Keep only the rows of those stations.
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

def granger_test(dataframe, column1, column2, max_lag=1):
    """Return the Granger F-statistic for ``column1`` -> ``column2``, or 0 if not significant.

    ``grangercausalitytests`` tests whether the series in the SECOND column of
    its input Granger-causes the series in the FIRST column. The columns are
    therefore passed as ``[column2, column1]`` so that a significant result
    means "column1 Granger-causes column2", i.e. a link leaving ``column1``.

    Args:
        dataframe: table holding both series as columns.
        column1: label of the candidate cause series.
        column2: label of the candidate effect series.
        max_lag: lag order whose SSR-based F-test is reported.

    Returns:
        The F-statistic when its p-value is below 0.05, otherwise 0. Any error
        raised while testing is printed and reported as 0 as well.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Results are keyed by lag: read the requested lag, not a fixed lag 1.
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return f_statistic if p_value < 0.05 else 0
    except Exception as e:  # best effort: a pair that cannot be tested counts as "no link"
        print(f"Error in granger_test: {e}")
        return 0

# Running maxima over all years (they scale colours and circle sizes in the plots)
# and the per-(station, year) results consumed by the plotting code.
global_max_out_links = 0
global_max_in_links = 0
global_max_f_statistic_out = 0
global_max_f_statistic_in = 0
station_year_data_out = {}
station_year_data_in = {}

# Zeros are replaced by this value so that the logarithm below is defined.
small_quantity = 0.000001


def calculate_log_returns(group):
    """Return log(x_t) - log(x_{t-1}) for one station's series (the first value is NaN)."""
    return np.log(group) - np.log(group.shift(1))


record_years = merged_data_filtered['DATETIME'].dt.year
record_months = merged_data_filtered['DATETIME'].dt.month

# Loop over each year for analysis
for year in range(record_years.min(), record_years.max() + 1):
    # NOTE(review): months 12, 1 and 2 are taken from the SAME calendar year, so each
    # "winter" joins January-February with the December that follows them — confirm
    # this is the intended season definition.
    # .copy() gives an independent frame: assigning columns to a filtered slice of
    # merged_data_filtered triggers pandas' SettingWithCopyWarning.
    data_year = merged_data_filtered[(record_years == year) & record_months.isin([12, 1, 2])].copy()

    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    # NOTE(review): dropna() without a subset also drops rows with a NaN in any other
    # column (including the merged coordinate columns) — verify that is acceptable.
    data_year.dropna(inplace=True)
    data_year.reset_index(drop=True, inplace=True)

    # One column per station, one row per timestamp; gaps are filled with 0.
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    temp_out_links_count = {station: 0 for station in pivot_data.columns}
    temp_in_links_count = {station: 0 for station in pivot_data.columns}
    temp_f_statistics_sum_out = {station: 0 for station in pivot_data.columns}
    temp_f_statistics_sum_in = {station: 0 for station in pivot_data.columns}

    # One test per ordered pair: a significant result is an out-link of station1 and,
    # equivalently, an in-link of station2 (the original ran every test twice).
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue
            f_statistic = granger_test(pivot_data, station1, station2)
            if f_statistic > 0:
                temp_out_links_count[station1] += 1
                temp_f_statistics_sum_out[station1] += f_statistic
                temp_in_links_count[station2] += 1
                temp_f_statistics_sum_in[station2] += f_statistic

    # Record only stations that have at least one link this year.
    for station in pivot_data.columns:
        if temp_out_links_count[station] > 0:
            station_year_data_out[(station, year)] = (temp_f_statistics_sum_out[station], temp_out_links_count[station])
        if temp_in_links_count[station] > 0:
            station_year_data_in[(station, year)] = (temp_f_statistics_sum_in[station], temp_in_links_count[station])

    # Update global maxima; default=0 covers a year without any seasonal data,
    # where the per-station dictionaries are empty.
    year_max_out_links = max(temp_out_links_count.values(), default=0)
    year_max_in_links = max(temp_in_links_count.values(), default=0)
    year_max_f_statistic_out = max(temp_f_statistics_sum_out.values(), default=0)
    year_max_f_statistic_in = max(temp_f_statistics_sum_in.values(), default=0)

    global_max_out_links = max(global_max_out_links, year_max_out_links)
    global_max_in_links = max(global_max_in_links, year_max_in_links)
    global_max_f_statistic_out = max(global_max_f_statistic_out, year_max_f_statistic_out)
    global_max_f_statistic_in = max(global_max_f_statistic_in, year_max_f_statistic_in)

# Station ID -> two-letter label shown on the y axis of the grid.
# The insertion order matters: create_visualization places each station on the
# row given by its position in this dictionary.
# NOTE(review): the original trailing marker read "NORDSUD" (Italian "nord-sud"),
# presumably meaning the stations are listed along the north-south direction — confirm.
station_order = {
    756: "RG",
    764: "SR",
    684: "AG",
    706: "CT",
    779: "TP",
    750: "PA",
     729: "ME"
}  # north-south ordering (presumed)

# Function to create visualization of the 2D grid
def create_visualization(station_year_data, global_max_links, global_max_f_statistic, title, filename):
    """Plot one circle per (station, year) on a year-by-station grid and save the figure.

    Circle colour encodes the number of links and the circle radius grows with
    the summed F-statistic. The row of each station comes from the module-level
    ``station_order``; the year axis spans the years present in the module-level
    ``merged_data_filtered``.

    Args:
        station_year_data: mapping ``(station, year) -> (f_stat, links)``.
        global_max_links: upper end of the colour scale.
        global_max_f_statistic: F-statistic that maps to the largest circle.
        title: figure title.
        filename: path the figure is written to (300 dpi).

    Raises:
        KeyError: if a station in ``station_year_data`` is missing from ``station_order``.
    """
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.set_facecolor('white')  # Set background to white

    # Scales for circle size and colour
    max_node_size = 0.2  # Size assigned to the largest F-statistic
    color_norm = plt.Normalize(0, global_max_links)  # Normalize link count
    color_map = plt.cm.coolwarm

    # Row of each station, built once instead of rebuilding a list for every circle.
    station_row = {station: row for row, station in enumerate(station_order)}

    # Draw each station-year as a circle on the grid
    for (station, year), (f_stat, links) in station_year_data.items():
        size = (f_stat / global_max_f_statistic) * max_node_size
        circle = Circle(
            (year, station_row[station]),
            np.sqrt(size),  # radius is the square root of the scaled size
            color=color_map(color_norm(links)),
            alpha=0.6,
        )
        ax.add_patch(circle)

    # Year range of the data, computed once and reused for ticks and limits
    data_years = merged_data_filtered['DATETIME'].dt.year
    first_year = data_years.min()
    last_year = data_years.max()

    # Axis labels, ticks, and limits
    ax.set_xlabel('Year')
    ax.set_ylabel('Station ID')
    ax.set_xticks(np.arange(first_year, last_year + 1))
    ax.set_yticks(np.arange(len(station_order)))
    ax.set_yticklabels(station_order.values())
    ax.set_xlim(first_year - 1, last_year + 1)
    ax.set_ylim(-1, len(station_order))

    # Colour bar for the link count
    mappable = plt.cm.ScalarMappable(cmap=color_map, norm=color_norm)
    mappable.set_array([])
    cbar = plt.colorbar(mappable, ax=ax)
    cbar.set_label('Number of Links')

    # Size legend at 1/3, 2/3 and 3/3 of the largest F-statistic. It is skipped when
    # no link was significant (maximum of 0), which would otherwise divide by zero.
    # NOTE: scatter marker sizes are in points^2 while the circles are drawn in data
    # units, so the legend is indicative rather than an exact scale.
    if global_max_f_statistic > 0:
        for fraction in (1, 2, 3):
            legend_value = global_max_f_statistic * fraction / 3
            marker_size = np.sqrt((legend_value / global_max_f_statistic) * max_node_size) * 1000
            ax.scatter([], [], c='k', alpha=0.6, s=marker_size, label=f'F-stat: {legend_value:.2f}')
        ax.legend(scatterpoints=1, frameon=False, labelspacing=1, title='Circle Sizes')

    plt.grid(False)
    plt.title(title)
    plt.savefig(filename, dpi=300)
    plt.show()

# Create visualizations for outlinks and inlinks.
# Each figure needs its own file name: with a shared name the in-link image
# would overwrite the out-link image.
create_visualization(station_year_data_out, global_max_out_links, global_max_f_statistic_out, 'Winter Outlinks', "1.9gauges.jpg")
create_visualization(station_year_data_in, global_max_in_links, global_max_f_statistic_in, 'Winter Inlinks', "1.in9gauges.jpg")


In [None]:
# Barplot node strength
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests

# Read the station coordinates and the 24-hour gauge series, then join them per station.
# Other gauge files this cell has been pointed at: 'combined_data_9gauges.csv',
# '75gauges.csv', '75gauges6H.csv', '75gauges12H.csv'.
coordinates_data = pd.read_csv('merged_COO.csv')
gauges_data = pd.read_csv('75gauges24H.csv')
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse the timestamps so the .dt accessors can be used on them.
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations that make up the network.
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows of those stations.
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

def granger_test(dataframe, column1, column2, max_lag=1):
    """Return the Granger F-statistic for ``column1`` -> ``column2``, or 0 if not significant.

    ``grangercausalitytests`` tests whether the series in the SECOND column of
    its input Granger-causes the series in the FIRST column. The columns are
    therefore passed as ``[column2, column1]`` so that a significant result
    means "column1 Granger-causes column2", i.e. a link leaving ``column1``.

    Args:
        dataframe: table holding both series as columns.
        column1: label of the candidate cause series.
        column2: label of the candidate effect series.
        max_lag: lag order whose SSR-based F-test is reported.

    Returns:
        The F-statistic when its p-value is below 0.01, otherwise 0. Any error
        raised while testing is printed and reported as 0 as well.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Results are keyed by lag: read the requested lag, not a fixed lag 1.
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:  # best effort: a pair that cannot be tested counts as "no link"
        print(f"Error in granger_test: {e}")
        return 0

# Running maxima over all years and the per-(station, year) results consumed by
# the plotting code.
global_max_out_links = 0
global_max_in_links = 0
global_max_f_statistic_out = 0
global_max_f_statistic_in = 0
station_year_data_out = {}
station_year_data_in = {}

# Zeros are replaced by this value so that the logarithm below is defined.
small_quantity = 0.000001


def calculate_log_returns(group):
    """Return log(x_t) - log(x_{t-1}) for one station's series (the first value is NaN)."""
    return np.log(group) - np.log(group.shift(1))


record_years = merged_data_filtered['DATETIME'].dt.year
record_months = merged_data_filtered['DATETIME'].dt.month

# Loop over each year for analysis (September-November of that year)
for year in range(record_years.min(), record_years.max() + 1):
    # .copy() gives an independent frame: assigning columns to a filtered slice of
    # merged_data_filtered triggers pandas' SettingWithCopyWarning.
    data_year = merged_data_filtered[(record_years == year) & record_months.isin([9, 10, 11])].copy()

    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    # NOTE(review): dropna() without a subset also drops rows with a NaN in any other
    # column (including the merged coordinate columns) — verify that is acceptable.
    data_year.dropna(inplace=True)
    data_year.reset_index(drop=True, inplace=True)

    # One column per station, one row per timestamp; gaps are filled with 0.
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    temp_out_links_count = {station: 0 for station in pivot_data.columns}
    temp_in_links_count = {station: 0 for station in pivot_data.columns}
    temp_f_statistics_sum_out = {station: 0 for station in pivot_data.columns}
    temp_f_statistics_sum_in = {station: 0 for station in pivot_data.columns}

    # One test per ordered pair: a significant result is an out-link of station1 and,
    # equivalently, an in-link of station2 (the original ran every test twice).
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue
            f_statistic = granger_test(pivot_data, station1, station2)
            if f_statistic > 0:
                temp_out_links_count[station1] += 1
                temp_f_statistics_sum_out[station1] += f_statistic
                temp_in_links_count[station2] += 1
                temp_f_statistics_sum_in[station2] += f_statistic

    # Record only stations that have at least one link this year.
    for station in pivot_data.columns:
        if temp_out_links_count[station] > 0:
            station_year_data_out[(station, year)] = (temp_f_statistics_sum_out[station], temp_out_links_count[station])
        if temp_in_links_count[station] > 0:
            station_year_data_in[(station, year)] = (temp_f_statistics_sum_in[station], temp_in_links_count[station])

    # Update global maxima; default=0 covers a year without any seasonal data,
    # where the per-station dictionaries are empty.
    year_max_out_links = max(temp_out_links_count.values(), default=0)
    year_max_in_links = max(temp_in_links_count.values(), default=0)
    year_max_f_statistic_out = max(temp_f_statistics_sum_out.values(), default=0)
    year_max_f_statistic_in = max(temp_f_statistics_sum_in.values(), default=0)

    global_max_out_links = max(global_max_out_links, year_max_out_links)
    global_max_in_links = max(global_max_in_links, year_max_in_links)
    global_max_f_statistic_out = max(global_max_f_statistic_out, year_max_f_statistic_out)
    global_max_f_statistic_in = max(global_max_f_statistic_in, year_max_f_statistic_in)

# Station ID -> two-letter label; create_grid_bar_plot uses the labels as bar names.
station_order = {
    756: "RG",
    764: "SR",
    684: "AG",
    695: "CL",
    718: "EN",
    706: "CT",
    779: "TP",
    750: "PA",
    729: "ME"
}

# Bar colour for each station label (three colour groups: orange, blue, red).
# NOTE(review): the meaning of the three groups is not stated in this file — confirm.
station_colors = {
    "CT": "orange",
    "EN": "orange",
    "CL": "orange",
    "ME": "blue",
    "TP": "blue",
    "PA": "blue",
    "AG": "red",
    "RG": "red",
    "SR": "red"
}

# Function to create grid of bar plots
def create_grid_bar_plot(station_year_data, title_prefix, filename, y_max=280):
    """Save a grid of per-year bar charts of each station's total F-statistic.

    One panel is drawn per year, filling the grid column by column with 11
    rows per column. Within a panel the stations are sorted by decreasing
    F-statistic; stations without an entry for that year get a bar of height 0.
    Labels and colours come from the module-level ``station_order`` and
    ``station_colors``.

    Args:
        station_year_data: mapping ``(station, year) -> (f_stat, links)``.
        title_prefix: text placed before the year in each panel title.
        filename: path the figure is written to (300 dpi).
        y_max: upper limit of every panel's y axis.
    """
    years = sorted({year for _, year in station_year_data})
    # Stations in first-seen order; every panel shows all of them.
    all_stations = list(dict.fromkeys(station for station, _ in station_year_data))

    n_rows = 11
    # Two columns as a minimum; extra columns are added when there are more
    # than 22 years, which previously raised an IndexError.
    n_cols = max(2, -(-len(years) // n_rows))
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10 * n_cols, 50), squeeze=False)
    fig.subplots_adjust(hspace=0.5, wspace=0.3)

    for i, year in enumerate(years):
        # F-statistic of each station in this year (0 when the station has no entry)
        totals = {station: station_year_data.get((station, year), (0, 0))[0] for station in all_stations}
        ranked = sorted(totals.items(), key=lambda item: -item[1])

        labels = [station_order[station] for station, _ in ranked]
        heights = [total for _, total in ranked]
        colors = [station_colors[label] for label in labels]

        ax = axes[i % n_rows, i // n_rows]

        # Bars sit at fixed numeric positions so the tick labels can be attached
        # to explicitly set ticks.
        positions = list(range(len(labels)))
        ax.bar(positions, heights, color=colors, alpha=0.6)
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.set_xlabel('Station')
        ax.set_ylabel('Total F-statistic')
        ax.set_ylim(0, y_max)
        ax.set_title(f'{title_prefix} {year}')

    # Hide the panels that received no year.
    for i in range(len(years), n_rows * n_cols):
        axes[i % n_rows, i // n_rows].set_visible(False)

    plt.savefig(filename, dpi=300)
    plt.show()

# Create grid bar plots for outlinks and inlinks.
# The analysis loop of this cell selects September-November, so the panels are
# titled "Autumn".
create_grid_bar_plot(station_year_data_out, 'Autumn Outlinks Total GC-statistics - 24h', "4.grid24H_outlinks.jpg")
create_grid_bar_plot(station_year_data_in, 'Autumn Inlinks Total F-statistics 24h', "4.grid24H_inlinks.jpg")

In [None]:
# Network degree across years
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import grangercausalitytests

# Read the station coordinates and the 24-hour gauge series, then join them per station.
# Other gauge files this cell has been pointed at: 'combined_data_9gauges.csv',
# '75gauges.csv', '75gauges6H.csv', '75gauges12H.csv'.
coordinates_data = pd.read_csv('merged_COO.csv')
gauges_data = pd.read_csv('75gauges24H.csv')
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse the timestamps so the .dt accessors can be used on them.
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations that make up the network.
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows of those stations.
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

def granger_test(dataframe, column1, column2, max_lag=1):
    """Return the Granger F-statistic for ``column1`` -> ``column2``, or 0 if not significant.

    ``grangercausalitytests`` tests whether the series in the SECOND column of
    its input Granger-causes the series in the FIRST column. The columns are
    therefore passed as ``[column2, column1]`` so that a significant result
    means "column1 Granger-causes column2", i.e. a link from ``column1`` to
    ``column2``.

    Args:
        dataframe: table holding both series as columns.
        column1: label of the candidate cause series.
        column2: label of the candidate effect series.
        max_lag: lag order whose SSR-based F-test is reported.

    Returns:
        The F-statistic when its p-value is below 0.01, otherwise 0. Any error
        raised while testing is printed and reported as 0 as well.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Results are keyed by lag: read the requested lag, not a fixed lag 1.
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:  # best effort: a pair that cannot be tested counts as "no link"
        print(f"Error in granger_test: {e}")
        return 0

def calculate_log_returns(group):
    """Return the first difference of the natural log of ``group``.

    Element t is log(x_t) - log(x_{t-1}); the first element has no predecessor
    and is NaN.
    """
    return np.log(group).diff()

# Function to create the plot with nodes and directional arrows
def create_network_plot(year, links, coordinates, title, filename):
    """Draw the stations and the directed links between them, then save the figure.

    Stations listed in the module-level ``stations_of_interest`` are drawn as
    labelled points at their (EST, NORD) coordinates; every key
    ``(station1, station2)`` of ``links`` is drawn as an arrow from station1 to
    station2.

    Args:
        year: year the network refers to (not used for drawing; kept for callers).
        links: mapping ``(station1, station2) -> F-statistic``; only the keys are used.
        coordinates: table with ``STATION``, ``EST``, ``NORD`` and ``LOCATION`` columns.
        title: figure title.
        filename: path the figure is written to.

    Raises:
        KeyError: if a linked station has no row in ``coordinates``.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Single pass over the coordinates: draw the nodes and remember each station's
    # position so the link loop does not have to filter the table again.
    position = {}
    node_columns = coordinates[['STATION', 'EST', 'NORD', 'LOCATION']]
    for station, est, nord, location in node_columns.itertuples(index=False, name=None):
        position.setdefault(station, (est, nord))  # the first row of a station wins
        if station in stations_of_interest:
            ax.plot(est, nord, 'bo', markersize=2)  # blue circle for each node
            ax.text(est, nord, location, fontsize=12, ha='right')

    # Directed links: the arrow head is at station2
    for station1, station2 in links:
        start = position[station1]
        end = position[station2]
        ax.annotate("",
                    xy=end, xycoords='data',
                    xytext=start, textcoords='data',
                    arrowprops=dict(arrowstyle="->", color='blue', lw=1))

    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')
    plt.savefig(filename)
    plt.show()

# Per-year number of significant links and per-year sum of their F-statistics
total_links = {}
sum_statistics = {}

# Zeros are replaced by this value so that the logarithm is defined.
small_quantity = 0.000001

record_years = merged_data_filtered['DATETIME'].dt.year
record_months = merged_data_filtered['DATETIME'].dt.month

# Loop over each year for analysis (September-November of that year)
for year in range(record_years.min(), record_years.max() + 1):
    # .copy() gives an independent frame: assigning columns to a filtered slice of
    # merged_data_filtered triggers pandas' SettingWithCopyWarning.
    data_year = merged_data_filtered[(record_years == year) & record_months.isin([9, 10, 11])].copy()

    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    # NOTE(review): dropna() without a subset also drops rows with a NaN in any other
    # column (including the merged coordinate columns) — verify that is acceptable.
    data_year.dropna(inplace=True)
    data_year.reset_index(drop=True, inplace=True)

    # One column per station, one row per timestamp; gaps are filled with 0.
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    # Significant links of this year: (station1, station2) -> F-statistic
    links = {}
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue
            f_statistic_out = granger_test(pivot_data, station1, station2)
            if f_statistic_out > 0:
                links[(station1, station2)] = f_statistic_out

    total_links[year] = len(links)
    sum_statistics[year] = sum(links.values())

    # Network map for the year; the months selected above are autumn months,
    # so the title says "Autumn".
    title = f'Granger Causality Network for Autumn {year}'
    filename = f'0.gcnetwork_{year}.jpg'
    create_network_plot(year, links, coordinates_data, title, filename)

# One-column table (indexed by year) with the yearly number of links.
# The per-year F-statistic sums in sum_statistics are not plotted.
years = sorted(total_links)
heatmap_data = pd.DataFrame(
    {
        'Total Links': [total_links[year] for year in years],
    },
    index=years,
)

# Draw the table as a single strip of square cells, coloured from 0 to the largest count.
max_val = heatmap_data.to_numpy().max()
plt.figure(figsize=(12, 2))  # wide and short so the heatmap reads as a strip
sns.heatmap(
    heatmap_data.transpose(),
    annot=False,
    cmap='coolwarm',
    cbar_kws={'label': 'Value'},
    vmin=0,
    vmax=max_val,
    square=True,
)
plt.title('Autumn Granger Causality Network Links - 24h')
plt.xlabel('Year')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
#plt.savefig('4.total24H_Links.jpg', dpi=300)
plt.show()

In [None]:
# Barplot of network degree and statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests

# Read the station coordinates and the gauge series, then join them per station.
coordinates_data = pd.read_csv('merged_COO.csv')
gauges_data = pd.read_csv('75gauges.csv')
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse the timestamps so the .dt accessors can be used on them.
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations that make up the network.
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows of those stations.
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

def granger_test(dataframe, column1, column2, max_lag=1):
    """Return the Granger F-statistic for ``column1`` -> ``column2``, or 0 if not significant.

    ``grangercausalitytests`` tests whether the series in the SECOND column of
    its input Granger-causes the series in the FIRST column. The columns are
    therefore passed as ``[column2, column1]`` so that a significant result
    means "column1 Granger-causes column2", i.e. a link leaving ``column1``.

    Args:
        dataframe: table holding both series as columns.
        column1: label of the candidate cause series.
        column2: label of the candidate effect series.
        max_lag: lag order whose SSR-based F-test is reported.

    Returns:
        The F-statistic when its p-value is below 0.01, otherwise 0. Any error
        raised while testing is printed and reported as 0 as well.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Results are keyed by lag: read the requested lag, not a fixed lag 1.
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:  # best effort: a pair that cannot be tested counts as "no link"
        print(f"Error in granger_test: {e}")
        return 0

# Running maxima over all years and the per-(station, year) results consumed by
# the plotting code.
global_max_out_links = 0
global_max_in_links = 0
global_max_f_statistic_out = 0
global_max_f_statistic_in = 0
station_year_data_out = {}
station_year_data_in = {}

# Zeros are replaced by this value so that the logarithm below is defined.
small_quantity = 0.000001


def calculate_log_returns(group):
    """Return log(x_t) - log(x_{t-1}) for one station's series (the first value is NaN)."""
    return np.log(group) - np.log(group.shift(1))


record_years = merged_data_filtered['DATETIME'].dt.year
record_months = merged_data_filtered['DATETIME'].dt.month

# Loop over each year for analysis (September-November of that year)
for year in range(record_years.min(), record_years.max() + 1):
    # .copy() gives an independent frame: assigning columns to a filtered slice of
    # merged_data_filtered triggers pandas' SettingWithCopyWarning.
    data_year = merged_data_filtered[(record_years == year) & record_months.isin([9, 10, 11])].copy()

    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    # NOTE(review): dropna() without a subset also drops rows with a NaN in any other
    # column (including the merged coordinate columns) — verify that is acceptable.
    data_year.dropna(inplace=True)
    data_year.reset_index(drop=True, inplace=True)

    # One column per station, one row per timestamp; gaps are filled with 0.
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    temp_out_links_count = {station: 0 for station in pivot_data.columns}
    temp_in_links_count = {station: 0 for station in pivot_data.columns}
    temp_f_statistics_sum_out = {station: 0 for station in pivot_data.columns}
    temp_f_statistics_sum_in = {station: 0 for station in pivot_data.columns}

    # One test per ordered pair: a significant result is an out-link of station1 and,
    # equivalently, an in-link of station2 (the original ran every test twice).
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue
            f_statistic = granger_test(pivot_data, station1, station2)
            if f_statistic > 0:
                temp_out_links_count[station1] += 1
                temp_f_statistics_sum_out[station1] += f_statistic
                temp_in_links_count[station2] += 1
                temp_f_statistics_sum_in[station2] += f_statistic

    # Record only stations that have at least one link this year.
    for station in pivot_data.columns:
        if temp_out_links_count[station] > 0:
            station_year_data_out[(station, year)] = (temp_f_statistics_sum_out[station], temp_out_links_count[station])
        if temp_in_links_count[station] > 0:
            station_year_data_in[(station, year)] = (temp_f_statistics_sum_in[station], temp_in_links_count[station])

    # Update global maxima; default=0 covers a year without any seasonal data,
    # where the per-station dictionaries are empty.
    year_max_out_links = max(temp_out_links_count.values(), default=0)
    year_max_in_links = max(temp_in_links_count.values(), default=0)
    year_max_f_statistic_out = max(temp_f_statistics_sum_out.values(), default=0)
    year_max_f_statistic_in = max(temp_f_statistics_sum_in.values(), default=0)

    global_max_out_links = max(global_max_out_links, year_max_out_links)
    global_max_in_links = max(global_max_in_links, year_max_in_links)
    global_max_f_statistic_out = max(global_max_f_statistic_out, year_max_f_statistic_out)
    global_max_f_statistic_in = max(global_max_f_statistic_in, year_max_f_statistic_in)

# Station ID -> two-letter label; create_bar_plot uses the labels as bar names.
station_order = {
    756: "RG",
    764: "SR",
    684: "AG",
    695: "CL",
    718: "EN",
    706: "CT",
    779: "TP",
    750: "PA",
    729: "ME"
}

# Function to create bar plot visualization
def create_bar_plot(station_year_data, title, filename):
    """Plot per-station totals over all years: link counts as bars, F-statistics as a line.

    Stations are sorted by decreasing total number of links, ties broken by
    decreasing total F-statistic. Labels come from the module-level
    ``station_order``.

    Args:
        station_year_data: mapping ``(station, year) -> (f_stat, links)``.
        title: figure title.
        filename: path the figure is written to (300 dpi).
    """
    # Sum the F-statistics and link counts of each station over the years
    station_totals = {}
    for (station, _year), (f_stat, links) in station_year_data.items():
        totals = station_totals.setdefault(station, {'total_f_stat': 0, 'total_links': 0})
        totals['total_f_stat'] += f_stat
        totals['total_links'] += links

    ranked = sorted(
        station_totals.items(),
        key=lambda item: (-item[1]['total_links'], -item[1]['total_f_stat']),
    )

    stations = [station_order[station] for station, _ in ranked]
    total_links = [totals['total_links'] for _, totals in ranked]
    total_f_stat = [totals['total_f_stat'] for _, totals in ranked]

    # Both series share fixed numeric x positions so the tick labels can be
    # attached to explicitly set ticks.
    positions = list(range(len(stations)))

    fig, ax1 = plt.subplots(figsize=(15, 10))

    color = 'tab:blue'
    ax1.set_xlabel('Station')
    ax1.set_ylabel('Total Links', color=color)
    ax1.bar(positions, total_links, color=color, alpha=0.6)
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.set_xticks(positions)
    ax1.set_xticklabels(stations, rotation=45, ha='right')

    ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
    color = 'tab:red'
    ax2.set_ylabel('Total F-statistic', color=color)
    ax2.plot(positions, total_f_stat, color=color, marker='o')
    ax2.tick_params(axis='y', labelcolor=color)

    plt.title(title)
    plt.savefig(filename, dpi=300)
    plt.show()

# Draw the out-link chart, then the in-link chart, each saved to its own file.
create_bar_plot(
    station_year_data=station_year_data_out,
    title='Autumn Outlinks Strength',
    filename="0.bar_plot_outlinks.jpg",
)
create_bar_plot(
    station_year_data=station_year_data_in,
    title='Autumn Inlinks Strength',
    filename="0.bar_plot_inlinks.jpg",
)


In [None]:
#edges direction west-east
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import grangercausalitytests

# Read the station coordinates and the 24-hour gauge series, then join them per station.
# Other gauge files this cell has been pointed at: 'combined_data_9gauges.csv',
# '75gauges.csv', '75gauges6H.csv', '75gauges12H.csv'.
coordinates_data = pd.read_csv('merged_COO.csv')
gauges_data = pd.read_csv('75gauges24H.csv')
merged_data = gauges_data.merge(coordinates_data, on='STATION')

# Parse the timestamps so the .dt accessors can be used on them.
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations that make up the network.
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows of those stations.
merged_data_filtered = merged_data.loc[merged_data['STATION'].isin(stations_of_interest)]

def granger_test(dataframe, column1, column2, max_lag=1):
    """Return the Granger F-statistic for ``column1`` -> ``column2``, or 0 if not significant.

    ``grangercausalitytests`` tests whether the series in the SECOND column of
    its input Granger-causes the series in the FIRST column. The columns are
    therefore passed as ``[column2, column1]`` so that a significant result
    means "column1 Granger-causes column2", i.e. a link from ``column1`` to
    ``column2``.

    Args:
        dataframe: table holding both series as columns.
        column1: label of the candidate cause series.
        column2: label of the candidate effect series.
        max_lag: lag order whose SSR-based F-test is reported.

    Returns:
        The F-statistic when its p-value is below 0.01, otherwise 0. Any error
        raised while testing is printed and reported as 0 as well.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # Results are keyed by lag: read the requested lag, not a fixed lag 1.
        f_statistic, p_value = gc_test[max_lag][0]['ssr_ftest'][:2]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:  # best effort: a pair that cannot be tested counts as "no link"
        print(f"Error in granger_test: {e}")
        return 0

def calculate_log_returns(group):
    """Return the first difference of the natural log of ``group``.

    Element t is log(x_t) - log(x_{t-1}); the first element has no predecessor
    and is NaN.
    """
    return np.log(group).diff()

# Per-year number of significant links pointing eastward / westward.
# A year appears in a dictionary only if it has at least one link in that direction.
eastward_links = {}
westward_links = {}

# Zeros are replaced by this value so that the logarithm is defined.
small_quantity = 0.000001

# Easting (EST) of every station, looked up once instead of filtering the
# coordinates table for each link; the first row of a station wins.
station_easting = {}
for station, est in coordinates_data[['STATION', 'EST']].itertuples(index=False, name=None):
    station_easting.setdefault(station, est)

record_years = merged_data_filtered['DATETIME'].dt.year
record_months = merged_data_filtered['DATETIME'].dt.month

# Loop over each year for analysis (June-August of that year)
for year in range(record_years.min(), record_years.max() + 1):
    # .copy() gives an independent frame: assigning columns to a filtered slice of
    # merged_data_filtered triggers pandas' SettingWithCopyWarning.
    data_year = merged_data_filtered[(record_years == year) & record_months.isin([6, 7, 8])].copy()

    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    # NOTE(review): dropna() without a subset also drops rows with a NaN in any other
    # column (including the merged coordinate columns) — verify that is acceptable.
    data_year.dropna(inplace=True)
    data_year.reset_index(drop=True, inplace=True)

    # One column per station, one row per timestamp; gaps are filled with 0.
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    # Significant links of this year: (station1, station2) -> F-statistic
    links = {}
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue
            f_statistic_out = granger_test(pivot_data, station1, station2)
            if f_statistic_out <= 0:
                continue
            links[(station1, station2)] = f_statistic_out

            # Eastward when station2 lies east of station1; equal eastings are
            # counted as westward.
            if station_easting[station1] < station_easting[station2]:
                eastward_links[year] = eastward_links.get(year, 0) + 1
            else:
                westward_links[year] = westward_links.get(year, 0) + 1

# Years that have at least one link in either direction, with the yearly counts
years = sorted(set(westward_links) | set(eastward_links))
number_of_westward_links = [westward_links.get(year, 0) for year in years]
number_of_eastward_links = [eastward_links.get(year, 0) for year in years]

# Plot the number of westward and eastward links over the years
plt.figure(figsize=(10, 6))

# Westward links: solid red line
plt.plot(years, number_of_westward_links, marker='o', linestyle='-', color='r', label='Westward Links')

# Eastward links: dashed blue line
plt.plot(years, number_of_eastward_links, marker='o', linestyle='--', color='b', label='Eastward Links')

# The analysis loop of this cell selects June-August, so the title names summer.
plt.title('Summer no of Granger Causality Links over Years (East vs West) - 24h', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Links', fontsize=14)

# Add grid and legend
plt.grid(True)
plt.legend(loc='upper right')

# Show the plot
plt.xticks(rotation=45)
plt.tight_layout()
#plt.savefig('4.24Heast_west_links_over_years.jpg', dpi=300)
plt.show()



In [None]:
# seasonal network degree
import matplotlib.pyplot as plt

# Years present in either direction's dictionary, and the sum of both
# directions for each of those years (a missing year counts as 0)
years = sorted(set(westward_links) | set(eastward_links))
total_links = [eastward_links.get(yr, 0) + westward_links.get(yr, 0) for yr in years]

# Single orange line with the total per year
plt.figure(figsize=(10, 6))
plt.plot(
    years,
    total_links,
    marker='o',
    linestyle='-',
    color='#FFA500',
)

# Title and axis labels
plt.title('Summer no of Granger Causality Links over Years - 24h', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Total Number of Links', fontsize=14)

# Grid only; the plot has a single unlabeled series, so no legend is drawn
plt.grid(True)
#plt.legend(loc='upper right')

# Rotate the year labels, tighten the layout, write the JPG and display
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('3.24Htotal_links_over_years.jpg', dpi=300)
plt.show()


In [None]:
#tabula
import pandas as pd
import matplotlib.pyplot as plt

# Build the table column by column: the years, the per-direction values and
# their sum. Insertion order fixes the column order of the DataFrame.
data = {'Autumn - 24h': years}
data['Eastward Links'] = [eastward_links.get(yr, 0) for yr in years]
data['Westward Links'] = [westward_links.get(yr, 0) for yr in years]
data['Total Links'] = [
    east + west
    for east, west in zip(data['Eastward Links'], data['Westward Links'])
]

df = pd.DataFrame(data)

# Render the DataFrame as a matplotlib table on an axes with hidden frame/ticks
fig, ax = plt.subplots(figsize=(10, 8))
ax.axis('tight')
ax.axis('off')

table = ax.table(
    cellText=df.to_numpy(),
    colLabels=df.columns,
    cellLoc='center',
    loc='center',
)

# Use a fixed font size instead of matplotlib's automatic one
table.auto_set_font_size(False)
table.set_fontsize(10)

# Write the table image, then display it
plt.savefig('4.24Hlinks_table.jpg', bbox_inches='tight', dpi=300)
plt.show()


In [None]:
#max statistics
# Largest per-year value stored for each direction (0 when the dictionary is empty).
# NOTE(review): eastward_links / westward_links are keyed by year. In the cell of
# this notebook that fills them with `+= 1`, the values are link counts per year,
# not F-statistics - confirm which cell populated them before trusting the
# "F-statistic" labels printed below.
max_f_statistic_eastward = max(eastward_links.values(), default=0)
max_f_statistic_westward = max(westward_links.values(), default=0)

# Larger of the two directional maxima
max_f_statistic = max(max_f_statistic_eastward, max_f_statistic_westward)

# len() of a year-keyed dictionary is the number of years that have an entry
# for that direction, not the number of links.
num_eastward_links = len(eastward_links)
num_westward_links = len(westward_links)

# Print the maximum values and the number of years with links
print(f"Maximum F-statistic (Eastward): {max_f_statistic_eastward}")
print(f"Maximum F-statistic (Westward): {max_f_statistic_westward}")
print(f"Overall Maximum F-statistic: {max_f_statistic}")
print(f"Number of years with Eastward links: {num_eastward_links}")
print(f"Number of years with Westward links: {num_westward_links}")


In [None]:
# Largest value stored in each direction's year-keyed dictionary; an empty
# dictionary yields 0.
# NOTE(review): whether these values are F-statistics or link counts depends
# on which cell last filled eastward_links / westward_links - confirm.
max_f_statistic_eastward = max(eastward_links.values(), default=0)
max_f_statistic_westward = max(westward_links.values(), default=0)

# Larger of the two directional maxima
max_f_statistic = max(max_f_statistic_eastward, max_f_statistic_westward)

# Report the three maxima
print(f"Maximum F-statistic (Eastward): {max_f_statistic_eastward}")
print(f"Maximum F-statistic (Westward): {max_f_statistic_westward}")
print(f"Overall Maximum F-statistic: {max_f_statistic}")


In [None]:
# Plot seasonal max strength across years
import matplotlib.pyplot as plt

# Example F-statistic values per season (replace with your real data).
# Each list holds one value per entry of scale_temporali, in the same order.
dati_f_statistics = {
    'Winter': [312.3, 514.3, 498.2, 232.5, 62.7],
    'Spring': [584.4, 531.2, 401.3, 328.6, 85.5],
    'Summer': [1946.9, 1781.9, 953.5, 597.0, 922.0],
    'Autumn': [506.1, 702.8, 439.5, 192.5, 143.7],
}

# Time granularities shown on the x axis
scale_temporali = ['10 min', '1 Hr', '6 Hr', '12 Hr', '24 Hr']

# Line/marker colour of each season
colori = {
    'Winter': 'b',
    'Spring': 'g',
    'Summer': '#FFA500',
    'Autumn': '#FF0000',
}

# Marker shape of each individual point (the legend below labels
# 'o' as 2002-2012 and '^' as 2013-2023)
markers = {
    'Winter': ['^', 'o', '^', 'o', 'o'],
    'Spring': ['^', '^', '^', '^', 'o'],
    'Summer': ['^', '^', 'o', 'o', 'o'],
    'Autumn': ['o', '^', '^', 'o', 'o'],
}

# East/West flag of each point: True is East (filled marker),
# False is West (hollow marker)
east_west = {
    'Winter': [True, False, False, False, False],
    'Spring': [False, False, False, False, False],
    'Summer': [False, False, False, False, False],
    'Autumn': [False, False, False, False, False],
}

# Marker sizes for the two marker shapes
size_o_markers = 10
size_caret_markers = 12

# Larger-than-default figure
fig, ax = plt.subplots(figsize=(10, 7))

# For every season: draw each point with its own marker shape/fill,
# then join the season's points with a thin line
for stagione, y_values in dati_f_statistics.items():
    season_colour = colori[stagione]
    point_specs = zip(scale_temporali, y_values, markers[stagione], east_west[stagione])

    for x_label, y_value, marker_shape, is_east in point_specs:
        ax.plot(
            x_label,
            y_value,
            marker=marker_shape,
            color=season_colour,
            # Filled for East, transparent for West
            markerfacecolor=season_colour if is_east else 'none',
            markersize=size_o_markers if marker_shape == 'o' else size_caret_markers,
        )

    ax.plot(scale_temporali, y_values, color=season_colour, linewidth=1)

# Empty plots that only create the legend entries for the seasons (line, no marker)
for season_name, season_colour in colori.items():
    ax.plot([], [], color=season_colour, linewidth=1, label=season_name)

# Empty plots that explain the marker shapes (2002-2012 and 2013-2023)
ax.plot([], [], marker='o', color='black', linestyle='None', markersize=size_o_markers, label='2002-2012')
ax.plot([], [], marker='^', color='black', linestyle='None', markersize=size_caret_markers, label='2013-2023')

# Empty plots that explain filled (East) versus hollow (West) markers
ax.plot([], [], marker='s', color='black', markerfacecolor='black', linestyle='None', label='East')
ax.plot([], [], marker='s', color='black', markerfacecolor='none', linestyle='None', label='West')

# Title and axis labels with larger fonts
plt.title('F-statistics max - WE', fontsize=20)
plt.xlabel('Time scales', fontsize=16)
plt.ylabel('F-statistics', fontsize=16)

# Larger tick labels, x labels rotated
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

# Legend placed outside the plotting area, with a larger font
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=14)

# The grid is switched off
plt.grid(False)

# Adjust the layout before saving
plt.tight_layout()

plt.savefig('F-statistics max - WE.jpg', format='jpg', dpi=300)

# Display the figure
plt.show()


In [None]:
#Pairwise analysis

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import grangercausalitytests

# Read the gauge table and the coordinates table from CSV
gauges_data = pd.read_csv('75gauges.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Join the two tables on their shared STATION column and parse the timestamps
merged_data = gauges_data.merge(coordinates_data, on='STATION')
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Station IDs that form the network
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows whose STATION is one of the stations of interest
merged_data_filtered = merged_data[
    merged_data['STATION'].isin(stations_of_interest)
]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    ``grangercausalitytests`` checks whether the series in the SECOND column
    of its input Granger-causes the series in the FIRST column. The columns
    are therefore passed as ``[column2, column1]`` so that the value returned
    for ``(column1, column2)`` describes the direction column1 -> column2,
    which is how the callers in this cell store and draw the link.

    Args:
        dataframe: DataFrame holding both series as columns.
        column1: Label of the candidate cause.
        column2: Label of the candidate effect.
        max_lag: Passed to ``grangercausalitytests`` as ``maxlag``. The
            result is always read at lag 1.

    Returns:
        The lag-1 ``ssr_ftest`` F-statistic when its p-value is below 0.01,
        otherwise 0. If the test raises, the error is printed and 0 is
        returned.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # ssr_ftest is a tuple whose first two entries are (F-statistic, p-value)
        f_statistic = gc_test[1][0]['ssr_ftest'][0]
        p_value = gc_test[1][0]['ssr_ftest'][1]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:
        # Best effort: a failing pair is reported and treated as "no link"
        print(f"Error in granger_test: {e}")
        return 0

# Function to calculate log returns
def calculate_log_returns(group):
    """Return the first difference of the natural logarithm of ``group``.

    Element ``t`` of the result is ``log(group[t]) - log(group[t-1])``; the
    first element has no predecessor and is therefore NaN.
    """
    return np.log(group).diff()

# Function to create the plot with nodes and directional arrows
def create_network_plot(year, links, coordinates, title, filename):
    """Draw the stations and the directed links between them, save and show the figure.

    Relies on the module-level ``stations_of_interest`` to decide which rows
    of ``coordinates`` are drawn as nodes.

    Args:
        year: Not used inside the function; kept so existing calls still work.
        links: Mapping whose keys are ``(station1, station2)`` pairs. One
            arrow is drawn from station1 to station2 per key; the mapped
            values are not used for drawing.
        coordinates: DataFrame with ``STATION``, ``EST``, ``NORD`` and
            ``LOCATION`` columns.
        title: Title placed on the axes.
        filename: Path handed to ``plt.savefig``.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Nodes: a small blue dot and the LOCATION text for each station of interest
    for _, row in coordinates.iterrows():
        if row['STATION'] in stations_of_interest:
            ax.plot(row['EST'], row['NORD'], 'bo', markersize=2)
            ax.text(row['EST'], row['NORD'], row['LOCATION'], fontsize=12, ha='right')

    def _position(station):
        # (EST, NORD) of the first coordinates row that matches the station
        return coordinates[coordinates['STATION'] == station][['EST', 'NORD']].values[0]

    # Links: arrow tail at station1, arrow head at station2
    for station1, station2 in links:
        coord1 = _position(station1)
        coord2 = _position(station2)
        ax.annotate("",
                    xy=(coord2[0], coord2[1]), xycoords='data',
                    xytext=(coord1[0], coord1[1]), textcoords='data',
                    arrowprops=dict(arrowstyle="->", color='blue', lw=1))

    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')
    plt.savefig(filename)
    plt.show()
    # pyplot keeps every figure it creates until it is closed; this function is
    # called once per year, so release the figure explicitly.
    plt.close(fig)

# Per-year F-statistics for the station pair 750 / 779, one dictionary per order
f_stats_pa_ct = {}
f_stats_ct_pa = {}

# Loop over each year for analysis
for year in range(merged_data_filtered['DATETIME'].dt.year.min(), merged_data_filtered['DATETIME'].dt.year.max() + 1):
    # Rows of this year that fall in December, January or February.
    # .copy() makes data_year an independent frame, so the column assignments
    # below do not write into a slice of merged_data_filtered.
    # NOTE(review): the three months come from the same calendar year, so
    # January/February precede December within each yearly sample - confirm
    # this is the intended definition of the season.
    data_year = merged_data_filtered[(merged_data_filtered['DATETIME'].dt.year == year) &
                                     (merged_data_filtered['DATETIME'].dt.month.isin([12, 1, 2]))].copy()

    # Replace zero values with a small quantity so the logarithm is finite
    small_quantity = 0.000001
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Log returns computed separately for each station.
    # dropna removes every row that has a missing value in any column.
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year.dropna(inplace=True)

    # Renumber the rows after dropping
    data_year.reset_index(drop=True, inplace=True)

    # One column per station indexed by timestamp; gaps are filled with 0
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    links = {}

    # Test every ordered pair of distinct stations
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue

            f_statistic_out = granger_test(pivot_data, station1, station2)
            if f_statistic_out > 0:
                links[(station1, station2)] = f_statistic_out

            # Record the 750 / 779 pair in both orders, including the 0
            # returned when the test is not significant
            if station1 == 750 and station2 == 779:
                f_stats_pa_ct.setdefault(year, f_statistic_out)
            if station1 == 779 and station2 == 750:
                f_stats_ct_pa.setdefault(year, f_statistic_out)

    # Network plot for the year; the title names the season selected by the
    # month filter above (December-February)
    title = f'Granger Causality Network for Winter {year}'
    filename = f'0.gcnetwork_{year}.jpg'
    create_network_plot(year, links, coordinates_data, title, filename)

# One row per recorded year, one column per direction of the 750 / 779 pair
years = sorted(f_stats_pa_ct)
heatmap_data = pd.DataFrame(
    {
        'PA-TP': [f_stats_pa_ct[yr] for yr in years],
        'TP-PA': [f_stats_ct_pa[yr] for yr in years],
    },
    index=years,
)

# Wide, short figure so the transposed heatmap looks like two strips;
# the colour scale runs from 0 to the largest value in the table
plt.figure(figsize=(12, 2))
max_val = max(heatmap_data.to_numpy().ravel())
sns.heatmap(
    heatmap_data.T,
    annot=False,
    cmap='coolwarm',
    cbar_kws={'label': 'F-statistic'},
    vmin=0,
    vmax=max_val,
    square=True,
)
plt.title('Winter Granger Causality F-statistics')
plt.xlabel('Year')
plt.ylabel('Direction')
plt.yticks(rotation=0)
plt.xticks(rotation=45)
plt.savefig('1.PATP.jpg', dpi=300)
plt.show()

In [None]:
# Edges direction north-south strength
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import grangercausalitytests

# Read the gauge table and the coordinates table from CSV.
# The commented-out lines are the alternative gauge files; enable exactly one.
#gauges_data = pd.read_csv('combined_data_9gauges.csv')
#gauges_data = pd.read_csv('75gauges.csv')
#gauges_data = pd.read_csv('75gauges6H.csv')
#gauges_data = pd.read_csv('75gauges12H.csv')
gauges_data = pd.read_csv('75gauges24H.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Join the two tables on their shared STATION column and parse the timestamps
merged_data = gauges_data.merge(coordinates_data, on='STATION')
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Station IDs that form the network
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows whose STATION is one of the stations of interest
merged_data_filtered = merged_data[
    merged_data['STATION'].isin(stations_of_interest)
]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    ``grangercausalitytests`` checks whether the series in the SECOND column
    of its input Granger-causes the series in the FIRST column. The columns
    are therefore passed as ``[column2, column1]`` so that the value returned
    for ``(column1, column2)`` describes the direction column1 -> column2,
    which is how the callers in this cell store, classify and draw the link.

    Args:
        dataframe: DataFrame holding both series as columns.
        column1: Label of the candidate cause.
        column2: Label of the candidate effect.
        max_lag: Passed to ``grangercausalitytests`` as ``maxlag``. The
            result is always read at lag 1.

    Returns:
        The lag-1 ``ssr_ftest`` F-statistic when its p-value is below 0.01,
        otherwise 0. If the test raises, the error is printed and 0 is
        returned.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # ssr_ftest is a tuple whose first two entries are (F-statistic, p-value)
        f_statistic = gc_test[1][0]['ssr_ftest'][0]
        p_value = gc_test[1][0]['ssr_ftest'][1]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:
        # Best effort: a failing pair is reported and treated as "no link"
        print(f"Error in granger_test: {e}")
        return 0

# Function to calculate log returns
def calculate_log_returns(group):
    """Return the first difference of the natural logarithm of ``group``.

    Element ``t`` of the result is ``log(group[t]) - log(group[t-1])``; the
    first element has no predecessor and is therefore NaN.
    """
    return np.log(group).diff()

# Function to create the plot with nodes and directional arrows
def create_network_plot(year, links, coordinates, title, filename):
    """Draw the stations and the directed links between them, save and show the figure.

    Relies on the module-level ``stations_of_interest`` to decide which rows
    of ``coordinates`` are drawn as nodes.

    Args:
        year: Not used inside the function; kept so existing calls still work.
        links: Mapping whose keys are ``(station1, station2)`` pairs. One
            arrow is drawn from station1 to station2 per key; the mapped
            values are not used for drawing.
        coordinates: DataFrame with ``STATION``, ``EST``, ``NORD`` and
            ``LOCATION`` columns.
        title: Title placed on the axes.
        filename: Path handed to ``plt.savefig``.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Nodes: a small blue dot and the LOCATION text for each station of interest
    for _, row in coordinates.iterrows():
        if row['STATION'] in stations_of_interest:
            ax.plot(row['EST'], row['NORD'], 'bo', markersize=2)
            ax.text(row['EST'], row['NORD'], row['LOCATION'], fontsize=12, ha='right')

    def _position(station):
        # (EST, NORD) of the first coordinates row that matches the station
        return coordinates[coordinates['STATION'] == station][['EST', 'NORD']].values[0]

    # Links: arrow tail at station1, arrow head at station2
    for station1, station2 in links:
        coord1 = _position(station1)
        coord2 = _position(station2)
        ax.annotate("",
                    xy=(coord2[0], coord2[1]), xycoords='data',
                    xytext=(coord1[0], coord1[1]), textcoords='data',
                    arrowprops=dict(arrowstyle="->", color='blue', lw=1))

    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')
    plt.savefig(filename)
    plt.show()
    # pyplot keeps every figure it creates until it is closed; this function is
    # called once per year, so release the figure explicitly.
    plt.close(fig)

# Per-year sum of the F-statistics of all significant links, split by whether
# the link points towards a larger (northward) or not larger (southward) NORD
northward_links = {}
southward_links = {}

# Loop over each year for analysis
for year in range(merged_data_filtered['DATETIME'].dt.year.min(), merged_data_filtered['DATETIME'].dt.year.max() + 1):
    # Rows of this year that fall in September, October or November.
    # .copy() makes data_year an independent frame, so the column assignments
    # below do not write into a slice of merged_data_filtered.
    data_year = merged_data_filtered[(merged_data_filtered['DATETIME'].dt.year == year) &
                                     (merged_data_filtered['DATETIME'].dt.month.isin([9, 10, 11]))].copy()

    # Replace zero values with a small quantity so the logarithm is finite
    small_quantity = 0.000001
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Log returns computed separately for each station.
    # dropna removes every row that has a missing value in any column.
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year.dropna(inplace=True)

    # Renumber the rows after dropping
    data_year.reset_index(drop=True, inplace=True)

    # One column per station indexed by timestamp; gaps are filled with 0
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    links = {}

    # Test every ordered pair of distinct stations
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue

            f_statistic_out = granger_test(pivot_data, station1, station2)
            if f_statistic_out <= 0:
                continue
            links[(station1, station2)] = f_statistic_out

            # Classify the link by comparing the NORD coordinate of both stations;
            # equal NORD values fall into the southward bucket
            coord1 = coordinates_data[coordinates_data['STATION'] == station1][['EST', 'NORD']].values[0]
            coord2 = coordinates_data[coordinates_data['STATION'] == station2][['EST', 'NORD']].values[0]
            if coord1[1] < coord2[1]:
                northward_links[year] = northward_links.get(year, 0) + f_statistic_out
            else:
                southward_links[year] = southward_links.get(year, 0) + f_statistic_out

    # Network plot for the year; the title names the season selected by the
    # month filter above (September-November)
    title = f'Granger Causality Network for Autumn {year}'
    filename = f'0.gcnetwork_{year}.jpg'
    create_network_plot(year, links, coordinates_data, title, filename)

# ListedColormap is not imported at the top of this cell
from matplotlib.colors import ListedColormap

# Years that have an entry in either north/south dictionary. These are the
# dictionaries plotted below, so the year axis is built from them rather than
# from the east/west dictionaries of another cell.
years = sorted(set(northward_links) | set(southward_links))

# One row per year; a year missing from one direction contributes 0
heatmap_data = pd.DataFrame({
    'Northward': [northward_links.get(year, 0) for year in years],
    'Southward': [southward_links.get(year, 0) for year in years]
}, index=years)

# Cells equal to zero are masked out of the colour mapping
mask = heatmap_data == 0

# 'coolwarm' sampled at 256 points with black prepended as the first colour
cmap = sns.color_palette("coolwarm", as_cmap=True)
cmap_with_black = ListedColormap(['black'] + list(cmap(np.linspace(0, 1, 256))))
# Masked cells are painted with the colormap's "bad" colour, which is
# transparent by default; make it black so zero cells really appear black.
cmap_with_black.set_bad('black')

# Wide, short figure so the transposed heatmap looks like two strips;
# the colour scale runs from 0 to the largest value in the table
plt.figure(figsize=(12, 2))
max_val = max(heatmap_data.values.flatten())

sns.heatmap(heatmap_data.T, mask=mask.T, annot=False, cmap=cmap_with_black, cbar_kws={'label': 'F-statistic'}, vmin=0, vmax=max_val, square=True)

# The north/south loop of this cell selects months 9-11, so the title names autumn
plt.title('Autumn Granger Causality F-statistics - 24h')
plt.xlabel('Year')
plt.ylabel('Direction')
plt.yticks(rotation=0)
plt.xticks(rotation=45)
#plt.savefig('3.24HNS.jpg', dpi=300)
plt.show()


#heatmap_data = pd.DataFrame({
#    'Northward': [northward_links[year] for year in years],
#    'Southward': [southward_links[year] for year in years]
#}, index=years)

# Plot the heatmap with squared cells and explicit normalization
#plt.figure(figsize=(12, 2))  # Adjust the figure size to make it look like strips
#max_val = max(heatmap_data.values.flatten())
#sns.heatmap(heatmap_data.T, annot=False, cmap='coolwarm', cbar_kws={'label': 'F-statistic'}, vmin=0, vmax=max_val, square=True)
#plt.title('Autumn Granger Causality F-statistics - 12h')
#plt.xlabel('Year')
#plt.ylabel('Direction')
#plt.yticks(rotation=0)
#plt.xticks(rotation=45) 
#plt.savefig('4.12HNS.jpg', dpi=300)
#plt.show()


In [None]:
# Largest per-year value stored for each direction (0 when the dictionary is empty).
# NOTE(review): northward_links / southward_links are keyed by year. When they
# are filled by the north/south strength loop of this notebook, each value is
# the sum of the F-statistics of that year's links, so these maxima are yearly
# totals rather than the F-statistic of a single link - confirm the labels.
max_f_statistic_northward = max(northward_links.values(), default=0)
max_f_statistic_southward = max(southward_links.values(), default=0)
max_f_statistic = max(max_f_statistic_northward, max_f_statistic_southward)

# len() of a year-keyed dictionary is the number of years that have an entry
# for that direction, not the number of links.
num_northward_links = len(northward_links)
num_southward_links = len(southward_links)

# Print the results
print(f"Maximum F-statistic (Northward): {max_f_statistic_northward}")
print(f"Maximum F-statistic (Southward): {max_f_statistic_southward}")
print(f"Overall Maximum F-statistic: {max_f_statistic}")
print(f"Number of years with Northward links: {num_northward_links}")
print(f"Number of years with Southward links: {num_southward_links}")

In [None]:
# number of edges across years
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import grangercausalitytests

# Read the gauge table and the coordinates table from CSV.
# The commented-out lines are the alternative gauge files; enable exactly one.
#gauges_data = pd.read_csv('combined_data_9gauges.csv')
#gauges_data = pd.read_csv('75gauges.csv')
#gauges_data = pd.read_csv('75gauges6H.csv')
#gauges_data = pd.read_csv('75gauges12H.csv')
gauges_data = pd.read_csv('75gauges24H.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Join the two tables on their shared STATION column and parse the timestamps
merged_data = gauges_data.merge(coordinates_data, on='STATION')
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Station IDs that form the network
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows whose STATION is one of the stations of interest
merged_data_filtered = merged_data[
    merged_data['STATION'].isin(stations_of_interest)
]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    ``grangercausalitytests`` checks whether the series in the SECOND column
    of its input Granger-causes the series in the FIRST column. The columns
    are therefore passed as ``[column2, column1]`` so that the value returned
    for ``(column1, column2)`` describes the direction column1 -> column2,
    which is how the callers in this cell store, classify and draw the link.

    Args:
        dataframe: DataFrame holding both series as columns.
        column1: Label of the candidate cause.
        column2: Label of the candidate effect.
        max_lag: Passed to ``grangercausalitytests`` as ``maxlag``. The
            result is always read at lag 1.

    Returns:
        The lag-1 ``ssr_ftest`` F-statistic when its p-value is below 0.01,
        otherwise 0. If the test raises, the error is printed and 0 is
        returned.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # ssr_ftest is a tuple whose first two entries are (F-statistic, p-value)
        f_statistic = gc_test[1][0]['ssr_ftest'][0]
        p_value = gc_test[1][0]['ssr_ftest'][1]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:
        # Best effort: a failing pair is reported and treated as "no link"
        print(f"Error in granger_test: {e}")
        return 0

# Function to calculate log returns
def calculate_log_returns(group):
    """Return the first difference of the natural logarithm of ``group``.

    Element ``t`` of the result is ``log(group[t]) - log(group[t-1])``; the
    first element has no predecessor and is therefore NaN.
    """
    return np.log(group).diff()

# Function to create the plot with nodes and directional arrows
def create_network_plot(year, links, coordinates, title, filename):
    """Draw the stations and the directed links between them, save and show the figure.

    Relies on the module-level ``stations_of_interest`` to decide which rows
    of ``coordinates`` are drawn as nodes.

    Args:
        year: Not used inside the function; kept so existing calls still work.
        links: Mapping whose keys are ``(station1, station2)`` pairs. One
            arrow is drawn from station1 to station2 per key; the mapped
            values are not used for drawing.
        coordinates: DataFrame with ``STATION``, ``EST``, ``NORD`` and
            ``LOCATION`` columns.
        title: Title placed on the axes.
        filename: Path handed to ``plt.savefig``.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Nodes: a small blue dot and the LOCATION text for each station of interest
    for _, row in coordinates.iterrows():
        if row['STATION'] in stations_of_interest:
            ax.plot(row['EST'], row['NORD'], 'bo', markersize=2)
            ax.text(row['EST'], row['NORD'], row['LOCATION'], fontsize=12, ha='right')

    def _position(station):
        # (EST, NORD) of the first coordinates row that matches the station
        return coordinates[coordinates['STATION'] == station][['EST', 'NORD']].values[0]

    # Links: arrow tail at station1, arrow head at station2
    for station1, station2 in links:
        coord1 = _position(station1)
        coord2 = _position(station2)
        ax.annotate("",
                    xy=(coord2[0], coord2[1]), xycoords='data',
                    xytext=(coord1[0], coord1[1]), textcoords='data',
                    arrowprops=dict(arrowstyle="->", color='blue', lw=1))

    ax.set_title(title)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')
    plt.savefig(filename)
    plt.show()
    # pyplot keeps every figure it creates until it is closed; this function is
    # called once per year, so release the figure explicitly.
    plt.close(fig)

# Per-year number of significant links, split by whether the link points
# towards a larger (northward) or not larger (southward) NORD coordinate
northward_links_count = {}
southward_links_count = {}

# Loop over each year for analysis
for year in range(merged_data_filtered['DATETIME'].dt.year.min(), merged_data_filtered['DATETIME'].dt.year.max() + 1):
    # Rows of this year that fall in September, October or November.
    # .copy() makes data_year an independent frame, so the column assignments
    # below do not write into a slice of merged_data_filtered.
    data_year = merged_data_filtered[(merged_data_filtered['DATETIME'].dt.year == year) &
                                     (merged_data_filtered['DATETIME'].dt.month.isin([9, 10, 11]))].copy()

    # Replace zero values with a small quantity so the logarithm is finite
    small_quantity = 0.000001
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Log returns computed separately for each station.
    # dropna removes every row that has a missing value in any column.
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year.dropna(inplace=True)

    # Renumber the rows after dropping
    data_year.reset_index(drop=True, inplace=True)

    # One column per station indexed by timestamp; gaps are filled with 0
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    links = {}

    # Test every ordered pair of distinct stations
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue

            f_statistic_out = granger_test(pivot_data, station1, station2)
            if f_statistic_out <= 0:
                continue
            links[(station1, station2)] = f_statistic_out

            # Classify the link by comparing the NORD coordinate of both stations;
            # equal NORD values fall into the southward bucket
            coord1 = coordinates_data[coordinates_data['STATION'] == station1][['EST', 'NORD']].values[0]
            coord2 = coordinates_data[coordinates_data['STATION'] == station2][['EST', 'NORD']].values[0]
            if coord1[1] < coord2[1]:
                northward_links_count[year] = northward_links_count.get(year, 0) + 1
            else:
                southward_links_count[year] = southward_links_count.get(year, 0) + 1

    # Network plot for the year; the title names the season selected by the
    # month filter above (September-November)
    title = f'Granger Causality Network for Autumn {year}'
    filename = f'0.gcnetwork_{year}.jpg'
    create_network_plot(year, links, coordinates_data, title, filename)

# ListedColormap is not imported at the top of this cell
from matplotlib.colors import ListedColormap

# Years that have an entry in either link-count dictionary filled by the loop
# of this cell. The heatmap is labelled "Number of Links", so it is built from
# the *_count dictionaries rather than from the F-statistic dictionaries.
years = sorted(set(northward_links_count) | set(southward_links_count))

# One row per year; a year missing from one direction contributes 0
heatmap_data = pd.DataFrame({
    'Northward': [northward_links_count.get(year, 0) for year in years],
    'Southward': [southward_links_count.get(year, 0) for year in years]
}, index=years)

# Zero-cell mask and black-prefixed colormap.
# They are defined here but not passed to the heatmap call below.
mask = heatmap_data == 0
cmap = sns.color_palette("coolwarm", as_cmap=True)
cmap_with_black = ListedColormap(['black'] + list(cmap(np.linspace(0, 1, 256))))

# Wide, short figure so the transposed heatmap looks like two strips;
# the colour scale runs from 0 to the largest count in the table
plt.figure(figsize=(12, 2))
max_val = max(heatmap_data.values.flatten())
sns.heatmap(heatmap_data.T, annot=False, cmap='coolwarm', cbar_kws={'label': 'Number of Links'}, vmin=0, vmax=max_val, square=True)
plt.title('Autumn Granger Causality Number of Links - 24h')
plt.xlabel('Year')
plt.ylabel('Direction')
plt.yticks(rotation=0)
plt.xticks(rotation=45)
plt.savefig('44.24HNS_Links.jpg', dpi=300)
plt.show()


In [None]:
#Spearman correlation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
import seaborn as sns

# Read the gauge table and the coordinates table from CSV
gauges_data = pd.read_csv('75gauges.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Join the two tables on their shared STATION column and parse the timestamps
merged_data = gauges_data.merge(coordinates_data, on='STATION')
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Station IDs that form the network
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows whose STATION is one of the stations of interest
merged_data_filtered = merged_data[
    merged_data['STATION'].isin(stations_of_interest)
]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    ``grangercausalitytests`` checks whether the series in the SECOND column
    of its input Granger-causes the series in the FIRST column. The columns
    are therefore passed as ``[column2, column1]`` so that the value returned
    for ``(column1, column2)`` describes the direction column1 -> column2.

    Args:
        dataframe: DataFrame holding both series as columns.
        column1: Label of the candidate cause.
        column2: Label of the candidate effect.
        max_lag: Passed to ``grangercausalitytests`` as ``maxlag``. The
            result is always read at lag 1.

    Returns:
        The lag-1 ``ssr_ftest`` F-statistic when its p-value is below 0.01,
        otherwise 0. If the test raises, the error is printed and 0 is
        returned.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # ssr_ftest is a tuple whose first two entries are (F-statistic, p-value)
        f_statistic = gc_test[1][0]['ssr_ftest'][0]
        p_value = gc_test[1][0]['ssr_ftest'][1]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:
        # Best effort: a failing pair is reported and treated as "no link"
        print(f"Error in granger_test: {e}")
        return 0

# Function to calculate log returns
def calculate_log_returns(group):
    """Return the first difference of the natural logarithm of ``group``.

    Element ``t`` of the result is ``log(group[t]) - log(group[t-1])``; the
    first element has no predecessor and is therefore NaN.
    """
    return np.log(group).diff()

# One adjacency matrix of F-statistics per analysed year, with the matching year list
adjacency_matrices = []
years = []

# Fixed row/column position of every station, shared by all years. pivot()
# returns its columns in sorted order, so sorted positions equal the previous
# enumerate() positions whenever all stations are present; when a station is
# missing in some year, the remaining stations keep their positions and the
# yearly matrices stay comparable cell by cell.
station_position = {station: position for position, station in enumerate(sorted(stations_of_interest))}

# Loop over each year for analysis
for year in range(merged_data_filtered['DATETIME'].dt.year.min(), merged_data_filtered['DATETIME'].dt.year.max() + 1):
    # Rows of this year that fall in December, January or February.
    # .copy() makes data_year an independent frame, so the column assignments
    # below do not write into a slice of merged_data_filtered.
    data_year = merged_data_filtered[(merged_data_filtered['DATETIME'].dt.year == year) &
                                     (merged_data_filtered['DATETIME'].dt.month.isin([12, 1, 2]))].copy()

    # Replace zero values with a small quantity so the logarithm is finite
    small_quantity = 0.000001
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Log returns computed separately for each station.
    # dropna removes every row that has a missing value in any column.
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year.dropna(inplace=True)

    # Renumber the rows after dropping
    data_year.reset_index(drop=True, inplace=True)

    # One column per station indexed by timestamp; gaps are filled with 0
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    adjacency_matrix = np.zeros((len(stations_of_interest), len(stations_of_interest)))

    # Test every ordered pair of distinct stations; non-significant pairs stay 0
    for station1 in pivot_data.columns:
        i = station_position[station1]
        for station2 in pivot_data.columns:
            if station1 == station2:
                continue
            j = station_position[station2]
            f_statistic_out = granger_test(pivot_data, station1, station2)
            if f_statistic_out > 0:
                adjacency_matrix[i, j] = f_statistic_out

    adjacency_matrices.append(adjacency_matrix)
    years.append(year)

# Pearson and Spearman correlation between every pair of yearly adjacency matrices.
# Each matrix is flattened once. For Spearman the flattened values are replaced
# by their average ranks, so tied entries (for example the zeros of absent
# links) share one rank instead of receiving arbitrary distinct ranks.
flat_matrices = [matrix.flatten() for matrix in adjacency_matrices]
ranked_matrices = [pd.Series(flat).rank(method='average').to_numpy() for flat in flat_matrices]

pearson_correlations = np.zeros((len(years), len(years)))
spearman_correlations = np.zeros((len(years), len(years)))

for i in range(len(years)):
    for j in range(len(years)):
        # Pearson on the raw values; Spearman is Pearson on the tie-aware ranks
        pearson_correlations[i, j] = np.corrcoef(flat_matrices[i], flat_matrices[j])[0, 1]
        spearman_correlations[i, j] = np.corrcoef(ranked_matrices[i], ranked_matrices[j])[0, 1]

# Draw, save and show one year-by-year heatmap per correlation matrix:
# first Pearson, then Spearman
for correlation_matrix, heatmap_title, output_file in (
    (pearson_correlations, 'Winter Pearson Correlation', '1.pearson_correlation_heatmap.jpg'),
    (spearman_correlations, 'Winter Spearman Correlation', '1.spearman_correlation_heatmap.jpg'),
):
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        correlation_matrix,
        xticklabels=years,
        yticklabels=years,
        annot=False,
        cmap='coolwarm',
        cbar=True,
    )
    plt.title(heatmap_title)
    plt.xlabel('Year')
    plt.ylabel('Year')
    plt.savefig(output_file, dpi=300)
    plt.show()


In [None]:
# Kendall-tau correlation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import grangercausalitytests
import seaborn as sns
from scipy.stats import kendalltau
from scipy.spatial.distance import cosine, euclidean
from fastdtw import fastdtw

# Read the gauge table and the coordinates table from CSV
gauges_data = pd.read_csv('75gauges.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Join the two tables on their shared STATION column and parse the timestamps
merged_data = gauges_data.merge(coordinates_data, on='STATION')
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Station IDs that form the network
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows whose STATION is one of the stations of interest
merged_data_filtered = merged_data[
    merged_data['STATION'].isin(stations_of_interest)
]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1):
    """Test whether ``column1`` Granger-causes ``column2``.

    ``grangercausalitytests`` checks whether the series in the SECOND column
    of its input Granger-causes the series in the FIRST column. The columns
    are therefore passed as ``[column2, column1]`` so that the value returned
    for ``(column1, column2)`` describes the direction column1 -> column2.

    Args:
        dataframe: DataFrame holding both series as columns.
        column1: Label of the candidate cause.
        column2: Label of the candidate effect.
        max_lag: Passed to ``grangercausalitytests`` as ``maxlag``. The
            result is always read at lag 1.

    Returns:
        The lag-1 ``ssr_ftest`` F-statistic when its p-value is below 0.01,
        otherwise 0. If the test raises, the error is printed and 0 is
        returned.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column2, column1]], maxlag=max_lag)
        # ssr_ftest is a tuple whose first two entries are (F-statistic, p-value)
        f_statistic = gc_test[1][0]['ssr_ftest'][0]
        p_value = gc_test[1][0]['ssr_ftest'][1]
        return f_statistic if p_value < 0.01 else 0
    except Exception as e:
        # Best effort: a failing pair is reported and treated as "no link"
        print(f"Error in granger_test: {e}")
        return 0

# Function to calculate log returns
def calculate_log_returns(group):
    """Return the difference between each log-value and the previous one.

    ``group`` must offer ``.shift`` (e.g. a pandas Series). The first element
    of the result has no predecessor and is therefore NaN.
    """
    logged = np.log(group)
    return logged - logged.shift(1)

adjacency_matrices = []
years = []

# Fixed station -> matrix position. Indexing by the position of a station in
# that year's pivot (as before) shifts rows/columns whenever a station has no
# data in a given year, which makes matrices of different years incomparable.
station_position = {
    station: position
    for position, station in enumerate(sorted(set(stations_of_interest)))
}

# Stand-in for zero readings so that the logarithm is defined
small_quantity = 0.000001

row_years = merged_data_filtered['DATETIME'].dt.year
# NOTE(review): December is taken from the same calendar year as January and
# February, so each "winter" joins months of two different cold seasons.
# Confirm this is intended.
row_in_season = merged_data_filtered['DATETIME'].dt.month.isin([12, 1, 2])

# Loop over each year for analysis
for year in range(row_years.min(), row_years.max() + 1):
    # .copy() gives an independent frame, so the column assignments below do
    # not act on a slice of merged_data_filtered (chained assignment).
    data_year = merged_data_filtered[(row_years == year) & row_in_season].copy()

    # Replace zero values with a small quantity
    data_year['VALUE'] = data_year['VALUE'].replace(0, small_quantity)

    # Calculate log returns per station
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year.dropna(inplace=True)
    data_year.reset_index(drop=True, inplace=True)

    # One column per station, one row per timestamp
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    adjacency_matrix = np.zeros((len(stations_of_interest), len(stations_of_interest)))

    # Entry [station1, station2] holds the F-statistic of
    # granger_test(pivot_data, station1, station2); statsmodels tests whether
    # the second column Granger-causes the first.
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 != station2:
                f_statistic_out = granger_test(pivot_data, station1, station2)
                if f_statistic_out > 0:
                    adjacency_matrix[station_position[station1], station_position[station2]] = f_statistic_out

    adjacency_matrices.append(adjacency_matrix)
    years.append(year)

# Initialize matrices for correlations
kendall_correlations = np.zeros((len(years), len(years)))

# Compare every pair of yearly adjacency matrices (flattened to vectors) with
# Kendall's tau; the diagonal is set to 1 without running the test.
flattened_matrices = [matrix.flatten() for matrix in adjacency_matrices]
for row, _ in enumerate(years):
    for col, _ in enumerate(years):
        if row == col:
            kendall_correlations[row, col] = 1
            continue
        tau = kendalltau(flattened_matrices[row], flattened_matrices[col])[0]
        kendall_correlations[row, col] = tau

# Year-by-year heatmap of the Kendall correlations
plt.figure(figsize=(12, 8))
sns.heatmap(
    kendall_correlations,
    xticklabels=years,
    yticklabels=years,
    annot=False,
    cmap='coolwarm',
    cbar=True,
)
plt.title('Winter Kendall Correlation')
plt.xlabel('Year')
plt.ylabel('Year')
plt.savefig('1.kendall_correlation_heatmap.jpg', dpi=300)
plt.show()

In [None]:
#replace 0 with mean value
#MEAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
from statsmodels.tsa.stattools import grangercausalitytests

# Load data
# Read the gauge readings and the station coordinates
gauges_data = pd.read_csv('75gauges.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Join the two tables on the station id, then parse the timestamps
merged_data = pd.merge(gauges_data, coordinates_data, on='STATION')
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations that make up the nodes of the network
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows that belong to those stations
is_selected_station = merged_data['STATION'].isin(stations_of_interest)
merged_data_filtered = merged_data[is_selected_station]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1, alpha=0.05):
    """Return the Granger-causality F-statistic for a column pair, or 0.

    Runs statsmodels' ``grangercausalitytests`` on
    ``dataframe[[column1, column2]]``. statsmodels tests whether the series in
    the *second* column (``column2``) Granger-causes the series in the *first*
    column (``column1``).

    Args:
        dataframe: Table holding both series as columns.
        column1: Label of the candidate *effect* series.
        column2: Label of the candidate *cause* series.
        max_lag: Forwarded to statsmodels as ``maxlag``. The result reported is
            the one for the largest lag that was tested.
        alpha: Significance level applied to the ``ssr_ftest`` p-value.

    Returns:
        The ``ssr_ftest`` F-statistic when its p-value is below ``alpha``,
        otherwise 0. Also 0 (after printing the error) if the test raises.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column1, column2]], maxlag=max_lag)
        # Results are keyed by lag. Read the largest lag actually tested rather
        # than always lag 1, so a non-default max_lag is honoured.
        f_statistic, p_value = gc_test[max(gc_test)][0]['ssr_ftest'][:2]
        return f_statistic if p_value < alpha else 0
    except Exception as e:
        print(f"Error in granger_test: {e}")
        return 0

# Initialize variables to find global maxima
global_max_out_links = 0
global_max_in_links = 0
global_max_f_statistic_out = 0
global_max_f_statistic_in = 0
# (station, year) -> (sum of significant F-statistics, number of significant links)
station_year_data_out = {}
station_year_data_in = {}


def calculate_log_returns(group):
    """Return the difference between each log-value and the previous one."""
    log_returns = np.log(group) - np.log(group.shift(1))
    return log_returns


# Calendar fields of every row, computed once instead of on each iteration
row_years = merged_data_filtered['DATETIME'].dt.year
row_in_season = merged_data_filtered['DATETIME'].dt.month.isin([3, 4, 5])

# Loop over each year for analysis
for year in range(row_years.min(), row_years.max() + 1):
    # .copy() gives an independent frame, so the column assignments below do
    # not act on a slice of merged_data_filtered (chained assignment).
    data_year = merged_data_filtered[(row_years == year) & row_in_season].copy()

    # Mean of the non-zero readings of each station in the selected months
    station_means = data_year[data_year['VALUE'] != 0].groupby('STATION')['VALUE'].mean()

    # Replace zero readings with the station's mean. A station whose readings
    # are all zero has no mean: map() yields NaN for it (the previous row-wise
    # lookup raised KeyError) and those rows are removed by dropna() below.
    data_year['VALUE'] = data_year['VALUE'].where(
        data_year['VALUE'] != 0,
        data_year['STATION'].map(station_means),
    )

    # Calculate log returns per station
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year.dropna(inplace=True)
    data_year.reset_index(drop=True, inplace=True)

    # Nothing to test in a year without usable readings for these months;
    # without this guard the max() calls below fail on empty dicts.
    if data_year.empty:
        continue

    # One column per station, one row per timestamp
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    stations = list(pivot_data.columns)
    temp_out_links_count = {station: 0 for station in stations}
    temp_in_links_count = {station: 0 for station in stations}
    temp_f_statistics_sum_out = {station: 0 for station in stations}
    temp_f_statistics_sum_in = {station: 0 for station in stations}

    # Each ordered pair is consulted twice below (once as "out", once as
    # "in"), so run every test once and reuse the result.
    pair_f_statistics = {
        (first, second): granger_test(pivot_data, first, second)
        for first in stations
        for second in stations
        if first != second
    }

    # NOTE(review): granger_test(df, a, b) asks whether b Granger-causes a
    # (statsmodels tests second column -> first), so the "out" tally of
    # station1 counts links pointing *into* it. Confirm the intended direction.
    for station1 in stations:
        for station2 in stations:
            if station1 == station2:
                continue
            f_statistic_out = pair_f_statistics[(station1, station2)]
            f_statistic_in = pair_f_statistics[(station2, station1)]

            if f_statistic_out > 0:
                temp_out_links_count[station1] += 1
                temp_f_statistics_sum_out[station1] += f_statistic_out
                station_year_data_out[(station1, year)] = (temp_f_statistics_sum_out[station1], temp_out_links_count[station1])

            if f_statistic_in > 0:
                temp_in_links_count[station1] += 1
                temp_f_statistics_sum_in[station1] += f_statistic_in
                station_year_data_in[(station1, year)] = (temp_f_statistics_sum_in[station1], temp_in_links_count[station1])

    # Update global maxima
    year_max_out_links = max(temp_out_links_count.values())
    year_max_in_links = max(temp_in_links_count.values())
    year_max_f_statistic_out = max(temp_f_statistics_sum_out.values())
    year_max_f_statistic_in = max(temp_f_statistics_sum_in.values())

    global_max_out_links = max(global_max_out_links, year_max_out_links)
    global_max_in_links = max(global_max_in_links, year_max_in_links)
    global_max_f_statistic_out = max(global_max_f_statistic_out, year_max_f_statistic_out)
    global_max_f_statistic_in = max(global_max_f_statistic_in, year_max_f_statistic_in)

# Define the station order dictionary
# Maps station id -> short label. The insertion order fixes the row (y
# position) of each station in the grid plot and the values are used as the
# y tick labels.
station_order = {  
    756: "RG",
    764: "SR",
    684: "AG",
    695: "CL",
    718: "EN",
    706: "CT",
    779: "TP",
    750: "PA",  
    729: "ME"    
} # Tagged "NOEDSUD" by the author - presumably "NORD-SUD" (north-to-south ordering); TODO confirm


# Alternative ordering kept for reference, tagged "ESTOVEST" by the author
# (presumably east-to-west; TODO confirm). Swap it in for the dict above to
# change the row order of the plot.
#station_order = {
#    779: "TP",
#    750: "PA",
#    684: "AG",
#    695: "CL",
#    718: "EN",
#    756: "RG",
#    706: "CT",
#    764: "SR",
#    729: "ME"
    
#}

# Function to create visualization of the 2D grid
def create_visualization(station_year_data, global_max_links, global_max_f_statistic, title, filename):
    """Draw a year-by-station grid of circles, save it and show it.

    Args:
        station_year_data: Mapping ``(station, year) -> (f_stat, links)``.
        global_max_links: Upper end of the colour scale (link count).
        global_max_f_statistic: F-statistic sum that maps to the largest circle.
        title: Figure title.
        filename: Path the figure is saved to (300 dpi).

    Reads the module-level ``station_order`` (row order and tick labels) and
    ``merged_data_filtered`` (range of years on the x axis).
    """
    max_node_size = 0.2  # squared radius of the largest circle, in data units
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.set_facecolor('white')

    link_norm = plt.Normalize(0, global_max_links)
    palette = plt.cm.coolwarm
    row_of_station = list(station_order.keys())

    # One circle per (station, year): radius grows with the F-statistic sum,
    # colour encodes the number of links.
    for (station, year), (f_stat, links) in station_year_data.items():
        row = row_of_station.index(station)
        scaled_size = (f_stat / global_max_f_statistic) * max_node_size
        ax.add_patch(
            Circle((year, row), np.sqrt(scaled_size), color=palette(link_norm(links)), alpha=0.6)
        )

    # Axes: one tick per year and one labelled row per station
    all_years = merged_data_filtered['DATETIME'].dt.year
    first_year, last_year = all_years.min(), all_years.max()
    n_rows = len(station_order)

    ax.set_xlabel('Year')
    ax.set_ylabel('Station ID')
    ax.set_xticks(np.arange(first_year, last_year + 1))
    ax.set_yticks(np.arange(n_rows))
    ax.set_yticklabels(station_order.values())
    ax.set_xlim(first_year - 1, last_year + 1)
    ax.set_ylim(-1, n_rows)

    plt.grid(False)
    plt.title(title)
    plt.savefig(filename, dpi=300)
    plt.show()

# Create visualizations for outlinks and inlinks
# Out-link grid first, then the in-link grid
create_visualization(
    station_year_data=station_year_data_out,
    global_max_links=global_max_out_links,
    global_max_f_statistic=global_max_f_statistic_out,
    title='Outlinks Grid Visualization',
    filename="3.MEANoutgrid_visualization9gauges.jpg",
)
create_visualization(
    station_year_data=station_year_data_in,
    global_max_links=global_max_in_links,
    global_max_f_statistic=global_max_f_statistic_in,
    title='Inlinks Grid Visualization',
    filename="3.MEANingrid_visualization9gauges.jpg",
)


In [None]:
#Natural time
#NATURAL
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
from statsmodels.tsa.stattools import grangercausalitytests

# Load data
# Read the gauge readings and the station coordinates
gauges_data = pd.read_csv('75gauges.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Join the two tables on the station id, then parse the timestamps
merged_data = pd.merge(gauges_data, coordinates_data, on='STATION')
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Stations that make up the nodes of the network
stations_of_interest = [779, 750, 684, 695, 718, 756, 706, 764, 729]

# Keep only the rows that belong to those stations
is_selected_station = merged_data['STATION'].isin(stations_of_interest)
merged_data_filtered = merged_data[is_selected_station]

# Granger causality test function
def granger_test(dataframe, column1, column2, max_lag=1, alpha=0.05):
    """Return the Granger-causality F-statistic for a column pair, or 0.

    Runs statsmodels' ``grangercausalitytests`` on
    ``dataframe[[column1, column2]]``. statsmodels tests whether the series in
    the *second* column (``column2``) Granger-causes the series in the *first*
    column (``column1``).

    Args:
        dataframe: Table holding both series as columns.
        column1: Label of the candidate *effect* series.
        column2: Label of the candidate *cause* series.
        max_lag: Forwarded to statsmodels as ``maxlag``. The result reported is
            the one for the largest lag that was tested.
        alpha: Significance level applied to the ``ssr_ftest`` p-value.

    Returns:
        The ``ssr_ftest`` F-statistic when its p-value is below ``alpha``,
        otherwise 0. Also 0 (after printing the error) if the test raises.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column1, column2]], maxlag=max_lag)
        # Results are keyed by lag. Read the largest lag actually tested rather
        # than always lag 1, so a non-default max_lag is honoured.
        f_statistic, p_value = gc_test[max(gc_test)][0]['ssr_ftest'][:2]
        return f_statistic if p_value < alpha else 0
    except Exception as e:
        print(f"Error in granger_test: {e}")
        return 0

# Initialize variables to find global maxima
global_max_out_links = 0
global_max_in_links = 0
global_max_f_statistic_out = 0
global_max_f_statistic_in = 0
# (station, year) -> (sum of significant F-statistics, number of significant links)
station_year_data_out = {}
station_year_data_in = {}


def calculate_log_returns(group):
    """Return the difference between each log-value and the previous one."""
    log_returns = np.log(group) - np.log(group.shift(1))
    return log_returns


# Row filters that do not depend on the year, computed once
row_years = merged_data_filtered['DATETIME'].dt.year
row_in_season = merged_data_filtered['DATETIME'].dt.month.isin([6, 7, 8, 9, 10, 11])
# Zero readings are removed, so each log return spans consecutive *non-zero*
# readings of a station rather than consecutive timestamps.
row_is_nonzero = merged_data_filtered['VALUE'] != 0

# Loop over each year for analysis
for year in range(row_years.min(), row_years.max() + 1):
    # .copy() gives an independent frame, so the column assignment below does
    # not act on a slice of merged_data_filtered (chained assignment).
    data_year = merged_data_filtered[(row_years == year) & row_in_season & row_is_nonzero].copy()

    # Calculate log returns per station
    data_year['LOG_RETURN'] = data_year.groupby('STATION')['VALUE'].transform(calculate_log_returns)
    data_year.dropna(inplace=True)
    data_year.reset_index(drop=True, inplace=True)

    # Nothing to test in a year without usable readings for these months;
    # without this guard the max() calls below fail on empty dicts.
    if data_year.empty:
        continue

    # One column per station, one row per timestamp
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='LOG_RETURN').fillna(0)

    stations = list(pivot_data.columns)
    temp_out_links_count = {station: 0 for station in stations}
    temp_in_links_count = {station: 0 for station in stations}
    temp_f_statistics_sum_out = {station: 0 for station in stations}
    temp_f_statistics_sum_in = {station: 0 for station in stations}

    # Each ordered pair is consulted twice below (once as "out", once as
    # "in"), so run every test once and reuse the result.
    pair_f_statistics = {
        (first, second): granger_test(pivot_data, first, second)
        for first in stations
        for second in stations
        if first != second
    }

    # NOTE(review): granger_test(df, a, b) asks whether b Granger-causes a
    # (statsmodels tests second column -> first), so the "out" tally of
    # station1 counts links pointing *into* it. Confirm the intended direction.
    for station1 in stations:
        for station2 in stations:
            if station1 == station2:
                continue
            f_statistic_out = pair_f_statistics[(station1, station2)]
            f_statistic_in = pair_f_statistics[(station2, station1)]

            if f_statistic_out > 0:
                temp_out_links_count[station1] += 1
                temp_f_statistics_sum_out[station1] += f_statistic_out
                station_year_data_out[(station1, year)] = (temp_f_statistics_sum_out[station1], temp_out_links_count[station1])

            if f_statistic_in > 0:
                temp_in_links_count[station1] += 1
                temp_f_statistics_sum_in[station1] += f_statistic_in
                station_year_data_in[(station1, year)] = (temp_f_statistics_sum_in[station1], temp_in_links_count[station1])

    # Update global maxima
    year_max_out_links = max(temp_out_links_count.values())
    year_max_in_links = max(temp_in_links_count.values())
    year_max_f_statistic_out = max(temp_f_statistics_sum_out.values())
    year_max_f_statistic_in = max(temp_f_statistics_sum_in.values())

    global_max_out_links = max(global_max_out_links, year_max_out_links)
    global_max_in_links = max(global_max_in_links, year_max_in_links)
    global_max_f_statistic_out = max(global_max_f_statistic_out, year_max_f_statistic_out)
    global_max_f_statistic_in = max(global_max_f_statistic_in, year_max_f_statistic_in)

# Define the station order dictionary
# Maps station id -> short label. The insertion order fixes the row (y
# position) of each station in the grid plot and the values are used as the
# y tick labels.
station_order = {  
    756: "RG",
    764: "SR",
    684: "AG",
    695: "CL",
    718: "EN",
    706: "CT",
    779: "TP",
    750: "PA",  
     729: "ME"    
} # Tagged "NOEDSUD" by the author - presumably "NORD-SUD" (north-to-south ordering); TODO confirm


# Alternative ordering kept for reference, tagged "ESTOVEST" by the author
# (presumably east-to-west; TODO confirm). Swap it in for the dict above to
# change the row order of the plot.
#station_order = {
#    779: "TP",
#    750: "PA",
#    684: "AG",
#    695: "CL",
#    718: "EN",
#    756: "RG",
#    706: "CT",
#    764: "SR",
#    729: "ME"
    
#}

# Function to create visualization of the 2D grid
def create_visualization(station_year_data, global_max_links, global_max_f_statistic, title, filename):
    """Draw a year-by-station grid of circles, save it and show it.

    Args:
        station_year_data: Mapping ``(station, year) -> (f_stat, links)``.
        global_max_links: Upper end of the colour scale (link count).
        global_max_f_statistic: F-statistic sum that maps to the largest circle.
        title: Figure title.
        filename: Path the figure is saved to (300 dpi).

    Reads the module-level ``station_order`` (row order and tick labels) and
    ``merged_data_filtered`` (range of years on the x axis).
    """
    max_node_size = 0.2  # squared radius of the largest circle, in data units
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.set_facecolor('white')

    link_norm = plt.Normalize(0, global_max_links)
    palette = plt.cm.coolwarm
    row_of_station = list(station_order.keys())

    # One circle per (station, year): radius grows with the F-statistic sum,
    # colour encodes the number of links.
    for (station, year), (f_stat, links) in station_year_data.items():
        row = row_of_station.index(station)
        scaled_size = (f_stat / global_max_f_statistic) * max_node_size
        ax.add_patch(
            Circle((year, row), np.sqrt(scaled_size), color=palette(link_norm(links)), alpha=0.6)
        )

    # Axes: one tick per year and one labelled row per station
    all_years = merged_data_filtered['DATETIME'].dt.year
    first_year, last_year = all_years.min(), all_years.max()
    n_rows = len(station_order)

    ax.set_xlabel('Year')
    ax.set_ylabel('Station ID')
    ax.set_xticks(np.arange(first_year, last_year + 1))
    ax.set_yticks(np.arange(n_rows))
    ax.set_yticklabels(station_order.values())
    ax.set_xlim(first_year - 1, last_year + 1)
    ax.set_ylim(-1, n_rows)

    plt.grid(False)
    plt.title(title)
    plt.savefig(filename, dpi=300)
    plt.show()

# Create visualizations for outlinks and inlinks
# Out-link grid first, then the in-link grid
create_visualization(
    station_year_data=station_year_data_out,
    global_max_links=global_max_out_links,
    global_max_f_statistic=global_max_f_statistic_out,
    title='Outlinks Grid Visualization',
    filename="4.1NATUoutgrid_visualization9gauges.jpg",
)
create_visualization(
    station_year_data=station_year_data_in,
    global_max_links=global_max_in_links,
    global_max_f_statistic=global_max_f_statistic_in,
    title='Inlinks Grid Visualization',
    filename="4.1NATUingrid_visualization9gauges.jpg",
)


In [None]:
# Clustering coefficient,mean distance, diameter
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests

# Function to perform Granger causality test
def granger_test(dataframe, column1, column2, max_lag=1, alpha=0.0001):
    """Return the Granger-causality F-statistic for a column pair, or 0.

    Runs statsmodels' ``grangercausalitytests`` on
    ``dataframe[[column1, column2]]``. statsmodels tests whether the series in
    the *second* column (``column2``) Granger-causes the series in the *first*
    column (``column1``).

    Args:
        dataframe: Table holding both series as columns.
        column1: Label of the candidate *effect* series.
        column2: Label of the candidate *cause* series.
        max_lag: Forwarded to statsmodels as ``maxlag``. The result reported is
            the one for the largest lag that was tested.
        alpha: Significance level applied to the ``ssr_ftest`` p-value.

    Returns:
        The ``ssr_ftest`` F-statistic when its p-value is below ``alpha``,
        otherwise 0. Also 0 if the test raises.
    """
    try:
        gc_test = grangercausalitytests(dataframe[[column1, column2]], maxlag=max_lag)
        # Results are keyed by lag. Read the largest lag actually tested rather
        # than always lag 1, so a non-default max_lag is honoured.
        f_statistic, p_value = gc_test[max(gc_test)][0]['ssr_ftest'][:2]
        return f_statistic if p_value < alpha else 0
    except Exception:
        # Best effort: a failed test counts as "no link". Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return 0

# Load data
# Read the gauge readings and the station coordinates
gauges_data = pd.read_csv('75gauges.csv')
coordinates_data = pd.read_csv('merged_COO.csv')

# Join the two tables on the station id, then parse the timestamps
merged_data = pd.merge(gauges_data, coordinates_data, on='STATION')
merged_data['DATETIME'] = pd.to_datetime(merged_data['DATETIME'])

# Every calendar year between the first and the last reading, inclusive
observed_years = merged_data['DATETIME'].dt.year
years = range(observed_years.min(), observed_years.max() + 1)

# Per-year network metrics (one list entry per year) and per-year degree lists
global_clustering_coefficients, global_mean_distances, global_network_diameters = [], [], []
degree_distributions = {}

# Main analysis loop
for year in years:
    # Filter for the specific year and months
    data_year = merged_data[(merged_data['DATETIME'].dt.year == year) & (merged_data['DATETIME'].dt.month.isin([9, 10, 11]))]
    pivot_data = data_year.pivot(index='DATETIME', columns='STATION', values='VALUE').fillna(0)

    # Undirected graph for this year: an edge joins two stations as soon as
    # the test is significant in either direction.
    # NOTE(review): stations without any significant link never become nodes,
    # so the metrics below ignore them. Confirm this is intended.
    G = nx.Graph()

    # Build the network from every station pair
    for station1 in pivot_data.columns:
        for station2 in pivot_data.columns:
            if station1 != station2:
                f_statistic = granger_test(pivot_data, station1, station2)
                if f_statistic > 0:
                    G.add_edge(station1, station2)

    # Calculate clustering coefficient, mean path length, and diameter
    if G.number_of_nodes() == 0:
        # No significant link this year: the graph is empty, for which
        # average_clustering divides by zero and is_connected raises.
        clustering_coefficient = float('nan')
        mean_distance = diameter = float('inf')
    else:
        clustering_coefficient = nx.average_clustering(G)
        if nx.is_connected(G):
            mean_distance = nx.average_shortest_path_length(G)
            diameter = nx.diameter(G)
        else:
            # Path-based metrics are reported as infinite when the graph is disconnected
            mean_distance = diameter = float('inf')

    global_clustering_coefficients.append(clustering_coefficient)
    global_mean_distances.append(mean_distance)
    global_network_diameters.append(diameter)

    # Compute degree distribution for this year's network
    degrees = [degree for node, degree in G.degree()]
    degree_distributions[year] = degrees

# Plotting global network properties across years
# One panel per metric, side by side: (values, panel title, y-axis label)
metric_panels = (
    (global_clustering_coefficients, 'Autumn Clustering Coefficient Across Years', 'Clustering Coefficient'),
    (global_mean_distances, 'Autumn Mean Distance Across Years', 'Mean Distance'),
    (global_network_diameters, 'Autumn Network Diameter Across Years', 'Diameter'),
)

plt.figure(figsize=(15, 5))
for panel_number, (metric_values, panel_title, y_label) in enumerate(metric_panels, start=1):
    plt.subplot(1, 3, panel_number)
    plt.plot(years, metric_values, marker='o')
    plt.title(panel_title)
    plt.xlabel('Year')
    plt.ylabel(y_label)
    plt.grid(True)
plt.tight_layout()
plt.savefig("4.network_analysis.jpg", dpi=300)