todo:
- create the new `relative` function, then import it and use it in the final output
- repeat for absolute

final output:
- new functions absolute and relative used
- with random generated data
- without the years loop
- using the old structure

potential additions:
- add relative matrix index recognition - able to work with multi-level index as well as single-level index

In [372]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import networkx as nx
import os
from tqdm import tqdm
from networkx.algorithms.community import louvain_communities
from networkx.algorithms.community.quality import modularity
from sklearn.metrics.pairwise import cosine_distances

from functions import calculate_monthly_velocities_cosine, get_similarities, get_matrix, get_month, author_mapping
from ipynb.fs.full.example_data_gen import random_data_gen
#from ipynb.fs.full.absolute_relative import compute_biadjacency_matrix, compute_relative_matrix

In [373]:
# Build a synthetic interaction table; the preview below shows the columns
# id1, id2 and timestamp.
df = random_data_gen()

In [374]:
df.head()

Unnamed: 0,id1,id2,timestamp
0,L1_1,L2_8,2020-01-05 13:26:21.535344
1,L1_2,L2_11,2020-01-05 16:25:51.236232
2,L1_1,L2_0,2020-01-06 03:43:39.408620
3,L1_1,L2_15,2020-01-07 18:07:43.140132
4,L1_2,L2_15,2020-01-10 10:15:18.473481


### Get Statistics

In [375]:
# import pickle
# variables = pickle.load(open("recovered_variables.pkl", "rb"))
# globals().update(variables)  # restores them into current session

In [376]:
def get_unique_entities(df, layer=None):
    """
    Return the distinct values found in one layer (ID column) of ``df``.

    Args:
        df: DataFrame holding the interaction records.
        layer: Name of the column to inspect. When omitted, the module-level
            ``L1`` setting is looked up at call time.
    Returns:
        The result of ``df[layer].unique()`` (unique values in order of
        first appearance).
    """
    # Resolve the default lazily: a signature default of ``L1`` is evaluated
    # when the ``def`` statement runs and raises NameError if the settings
    # cell has not been executed yet.
    if layer is None:
        layer = L1
    return df[layer].unique()

def get_unique_timeframes(df):
    """
    Return the distinct timeframe labels present in ``df``.

    A MultiIndex takes precedence: its second level (position 1) is read as
    the timeframe level. Otherwise the 'timeframe' column is used.

    Raises:
        ValueError: if ``df`` has neither a MultiIndex nor a 'timeframe' column.
    """
    if not isinstance(df.index, pd.MultiIndex):
        # Flat index: the labels have to come from the 'timeframe' column.
        if 'timeframe' not in df.columns:
            raise ValueError("DataFrame must have either a MultiIndex with a timeframe level or a 'timeframe' column.")
        return df['timeframe'].unique()

    timeframe_level = df.index.get_level_values(1)
    return timeframe_level.unique()

def get_timeframe(df, timeframe=TIMEFRAME, timestamp_col=TIMESTAMP_COL):
    """
    Filter ``df`` to the rows whose period-converted timestamp equals ``timeframe``.

    Args:
        df: DataFrame with a datetime-like column named ``timestamp_col``.
        timeframe: Passed to ``.dt.to_period(...)`` as the frequency AND used
            as the value the resulting periods are compared against.
        timestamp_col: Name of the timestamp column.
    Returns:
        The rows of ``df`` selected by that comparison.
    """
    # NOTE(review): the same `timeframe` argument serves as both the period
    # frequency and the comparison target. With a frequency string such as 'D'
    # the mask presumably never matches - confirm the intended contract
    # (get_timeframe_data filters on the precomputed 'timeframe' labels instead).
    # NOTE(review): the defaults are evaluated when this `def` runs, so
    # TIMEFRAME and TIMESTAMP_COL must already exist at definition time.
    return df[df[timestamp_col].dt.to_period(timeframe) == timeframe]

def bin_timestamps(df, timestamp_col=None, timeframe=None):
    """
    Add a 'timeframe' column that labels every row with the period containing
    its timestamp.

    Each label is the string "<period start> to <period end>".

    Args:
        df: DataFrame with a datetime-like column ``timestamp_col``. It is
            modified in place (the 'timeframe' column is added or overwritten).
        timestamp_col: Name of the timestamp column. When omitted, the
            module-level ``TIMESTAMP_COL`` setting is looked up at call time.
        timeframe: Period frequency passed to ``.dt.to_period`` (e.g. 'D').
            When omitted, the module-level ``TIMEFRAME`` setting is looked up
            at call time.
    Returns:
        The same ``df`` object, to allow chaining or display.
    """
    # Resolve defaults lazily: signature defaults referring to the settings
    # are evaluated when the ``def`` runs and raise NameError if the settings
    # cell has not been executed yet.
    if timestamp_col is None:
        timestamp_col = TIMESTAMP_COL
    if timeframe is None:
        timeframe = TIMEFRAME

    periods = df[timestamp_col].dt.to_period(timeframe)
    df['timeframe'] = periods.apply(lambda r: str(r.start_time) + " to " + str(r.end_time))

    return df

def get_timeframe_data(df, timeframe_value, timeframe_col='timeframe'):
    """
    Select the rows of ``df`` that belong to a single timeframe.

    Args:
        df: DataFrame containing the column ``timeframe_col``.
        timeframe_value: Label to match, for example
            '2020-01-01 00:00:00 to 2021-01-01 00:00:00'.
        timeframe_col: Column holding the timeframe labels.
    Returns:
        The rows whose ``timeframe_col`` entry equals ``timeframe_value``.
    """
    in_timeframe = df[timeframe_col] == timeframe_value
    return df.loc[in_timeframe]

In [377]:
def compute_biadjacency_matrix(df, layer1=None, layer2=None):
    """
    Count interactions per (layer1, timeframe) row and layer2 column.

    Args:
        df: Input DataFrame (e.g., from random_data_gen) that already has a
            'timeframe' column.
        layer1: Name of the first ID column. When omitted, the module-level
            ``L1`` setting is looked up at call time.
        layer2: Name of the second ID column. When omitted, the module-level
            ``L2`` setting is looked up at call time.
    Returns:
        biadjacency_matrix: DataFrame indexed by (layer1, 'timeframe') with one
        column per layer2 value; each cell holds the number of matching rows,
        with 0 where a combination does not occur.
    """
    # Resolve defaults lazily: signature defaults referring to the settings
    # are evaluated when the ``def`` runs and raise NameError if the settings
    # cell has not been executed yet.
    if layer1 is None:
        layer1 = L1
    if layer2 is None:
        layer2 = L2

    grouped_df = df.groupby([layer1, layer2, 'timeframe']).size().reset_index(name='counts')
    # Combinations absent from the data come out of the pivot as NaN.
    biadjacency_matrix = grouped_df.pivot(index=[layer1, 'timeframe'], columns=layer2, values='counts').fillna(0)

    return biadjacency_matrix

In [378]:
def compute_relative_matrix(biadjacency_matrix, unique_entities, distance_func=None):
    """
    Compute a relative distance matrix for each (id1, timeframe) in biadjacency_matrix against all unique_entities at the same timeframe.

    Parameters:
        biadjacency_matrix: pd.DataFrame, indexed by (id1, timeframe), columns are id2s, values are counts.
            The index is assumed to hold each (id1, timeframe) pair once.
        unique_entities: list or array of id1 values to compare against
        distance_func: function taking two (1, n_features) arrays and returning a
            2-D result whose [0, 0] entry is the distance (default: cosine_distances)

    Returns:
        pd.DataFrame of floats with the same index as biadjacency_matrix and one
        column per entity, columns sorted. A cell is 0 when the column entity
        has no row in that timeframe.
    """
    if distance_func is None:
        # Documented default; imported here so the block has no hard
        # dependency when a custom function is supplied.
        from sklearn.metrics.pairwise import cosine_distances
        distance_func = cosine_distances

    entities = list(unique_entities)
    index = biadjacency_matrix.index

    # Pull every row vector out once instead of doing a .loc lookup per cell.
    row_vectors = {
        key: vector.reshape(1, -1)
        for key, vector in zip(index, biadjacency_matrix.to_numpy())
    }

    # A float buffer pre-filled with 0 keeps the result numeric; filling an
    # empty DataFrame cell by cell yields object dtype with mixed ints/floats.
    # NOTE(review): 0 for "entity absent in this timeframe" is indistinguishable
    # from a true distance of 0 - kept for compatibility with existing output.
    distances = np.zeros((len(index), len(entities)), dtype=float)
    for i, (id1, timeframe) in enumerate(index):
        row_entity_vector = row_vectors[(id1, timeframe)]
        for j, col_entity in enumerate(entities):
            col_entity_vector = row_vectors.get((col_entity, timeframe))
            if col_entity_vector is not None:
                distances[i, j] = distance_func(row_entity_vector, col_entity_vector)[0, 0]

    relative_matrix = pd.DataFrame(distances, index=index, columns=entities)
    relative_matrix = relative_matrix.reindex(sorted(relative_matrix.columns), axis=1)

    return relative_matrix

In [379]:
# declare user preferences and variables
# These names are referenced by the helper functions defined in earlier cells,
# so they must exist in the session before those helpers can be used.
TIMEFRAME = 'D'  # period frequency handed to .dt.to_period when binning timestamps
TIMESTAMP_COL = 'timestamp'  # column holding each record's timestamp
L1 = 'id1'  # first ID column (layer 1)
L2 = 'id2'  # second ID column (layer 2)

In [380]:
# generate random data (this rebinds `df`, replacing the frame created earlier)
df = random_data_gen()
df

Unnamed: 0,id1,id2,timestamp
0,L1_1,L2_3,2020-01-02 08:28:51.745964
1,L1_1,L2_0,2020-01-06 05:36:48.329855
2,L1_1,L2_2,2020-01-06 10:45:39.016520
3,L1_1,L2_16,2020-01-13 06:52:26.990090
4,L1_1,L2_4,2020-01-13 11:58:01.293252
...,...,...,...
995,L1_0,L2_18,2024-12-22 09:08:26.675234
996,L1_1,L2_1,2024-12-23 18:15:37.339583
997,L1_0,L2_2,2024-12-24 12:56:19.736023
998,L1_0,L2_10,2024-12-27 10:29:36.434814


In [381]:
# Label each record with the timeframe its timestamp falls into.
# bin_timestamps adds the 'timeframe' column to `df` in place and returns the
# same frame, which is what this cell displays.
bin_timestamps(df)

Unnamed: 0,id1,id2,timestamp,timeframe
0,L1_1,L2_3,2020-01-02 08:28:51.745964,2020-01-02 00:00:00 to 2020-01-02 23:59:59.999...
1,L1_1,L2_0,2020-01-06 05:36:48.329855,2020-01-06 00:00:00 to 2020-01-06 23:59:59.999...
2,L1_1,L2_2,2020-01-06 10:45:39.016520,2020-01-06 00:00:00 to 2020-01-06 23:59:59.999...
3,L1_1,L2_16,2020-01-13 06:52:26.990090,2020-01-13 00:00:00 to 2020-01-13 23:59:59.999...
4,L1_1,L2_4,2020-01-13 11:58:01.293252,2020-01-13 00:00:00 to 2020-01-13 23:59:59.999...
...,...,...,...,...
995,L1_0,L2_18,2024-12-22 09:08:26.675234,2024-12-22 00:00:00 to 2024-12-22 23:59:59.999...
996,L1_1,L2_1,2024-12-23 18:15:37.339583,2024-12-23 00:00:00 to 2024-12-23 23:59:59.999...
997,L1_0,L2_2,2024-12-24 12:56:19.736023,2024-12-24 00:00:00 to 2024-12-24 23:59:59.999...
998,L1_0,L2_10,2024-12-27 10:29:36.434814,2024-12-27 00:00:00 to 2024-12-27 23:59:59.999...


In [382]:
# # compute biadjacency matrix
# biadjacency_matrix = compute_biadjacency_matrix(df)
# biadjacency_matrix

In [383]:
# compute unique entities for the relative matrix - these are the columns of the matrix
# (no layer is passed, so get_unique_entities falls back to its default layer)
unique_entities = get_unique_entities(df)
unique_entities

array(['L1_1', 'L1_0', 'L1_2'], dtype=object)

In [384]:
# # compute relative matrix
# relative_matrix = compute_relative_matrix(biadjacency_matrix, unique_entities, cosine_distances)
# relative_matrix

1. do a loop to segment data according to the user-specified timeframe.
2. create biadjacency matrices based on the segmented data from 1.
3. based on biadjacency matrices from 2. create relative distance matrices
4. concatenate the relative distance matrices from 3. together

In [385]:
# Distinct timeframe labels in `df`; these drive the per-timeframe split below.
unique_timeframes = get_unique_timeframes(df)
#unique_timeframes

In [386]:
# STEP 2
# Split the records into one sub-frame per timeframe label, keyed by that label.
timeframe_data = {
    label: get_timeframe_data(df, timeframe_value=label)
    for label in unique_timeframes
}

In [387]:
# biadjacency_matrices = {}

# for timerange, t_data in timeframe_data.items():
#     biadjacency_matrices[timerange] = compute_biadjacency_matrix(t_data)

In [388]:
# # Number of timeframes (keys) in the dictionary
# print("Number of timeframes:", len(biadjacency_matrices))

# # Shape of the first biadjacency matrix (for the first timeframe)
# first_key = next(iter(biadjacency_matrices))
# print("Shape of first biadjacency matrix:", biadjacency_matrices[first_key].shape)

In [389]:
# biadjacency_matrices[first_key]

In [390]:
# Build one biadjacency matrix per timeframe slice and stack them row-wise into
# a single large DataFrame. id2 columns missing from a slice come out of the
# concat as NaN and are zero-filled.
biadjacency_matrices = pd.concat(
    map(compute_biadjacency_matrix, timeframe_data.values()),
    axis=0,
).fillna(0)

biadjacency_matrices

Unnamed: 0_level_0,id2,L2_3,L2_0,L2_2,L2_16,L2_4,L2_13,L2_14,L2_5,L2_12,L2_9,L2_11,L2_17,L2_1,L2_15,L2_7,L2_18,L2_19,L2_8,L2_10,L2_6
id1,timeframe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
L1_1,2020-01-02 00:00:00 to 2020-01-02 23:59:59.999999999,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_1,2020-01-06 00:00:00 to 2020-01-06 23:59:59.999999999,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_1,2020-01-13 00:00:00 to 2020-01-13 23:59:59.999999999,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_0,2020-01-17 00:00:00 to 2020-01-17 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_1,2020-01-17 00:00:00 to 2020-01-17 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L1_0,2024-12-22 00:00:00 to 2024-12-22 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
L1_1,2024-12-23 00:00:00 to 2024-12-23 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_0,2024-12-24 00:00:00 to 2024-12-24 23:59:59.999999999,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L1_0,2024-12-27 00:00:00 to 2024-12-27 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [391]:
# Distance from every (id1, timeframe) row to each entity within the same
# timeframe, using cosine distance. compute_relative_matrix writes 0 where the
# other entity has no row in that timeframe.
relative_matrix = compute_relative_matrix(biadjacency_matrices, unique_entities, cosine_distances)
relative_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,L1_0,L1_1,L1_2
id1,timeframe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
L1_1,2020-01-02 00:00:00 to 2020-01-02 23:59:59.999999999,0,0.0,0
L1_1,2020-01-06 00:00:00 to 2020-01-06 23:59:59.999999999,0,0.0,0
L1_1,2020-01-13 00:00:00 to 2020-01-13 23:59:59.999999999,0,0.0,0
L1_0,2020-01-17 00:00:00 to 2020-01-17 23:59:59.999999999,0.0,1.0,0
L1_1,2020-01-17 00:00:00 to 2020-01-17 23:59:59.999999999,1.0,0.0,0
...,...,...,...,...
L1_0,2024-12-22 00:00:00 to 2024-12-22 23:59:59.999999999,0.0,0,0
L1_1,2024-12-23 00:00:00 to 2024-12-23 23:59:59.999999999,0,0.0,0
L1_0,2024-12-24 00:00:00 to 2024-12-24 23:59:59.999999999,0.0,0,0
L1_0,2024-12-27 00:00:00 to 2024-12-27 23:59:59.999999999,0.0,1.0,0


In [392]:
# # STEP 7
# for t in relative_matrix.index[1].unique():
#     filtered = merged_similarities[merged_similarities.index.str.contains(f'_{month}', case=False)].sort_index()
#     std_per_column = filtered.std(axis=0, skipna=True)
#     mean_per_column = filtered.mean(axis=0, skipna=True)
#     avg_std_per_month[month] = std_per_column.mean()
#     avg_mean_per_month[month] = mean_per_column.mean()

# # STEP 8
# # Calculate velocities using cosine distance
# velocities_cosine_full = calculate_monthly_velocities_cosine(merged_similarities, list(month_mapping.values()))

# # Convert the velocities_cosine dictionary into a DataFrame
# velocities_df_full = pd.concat(velocities_cosine_full, axis=0)

# # Reset the index to make the month pairs a column
# velocities_df_full.reset_index(inplace=True)

# # Rename the columns for clarity
# velocities_df_full.columns = ['Month Pair', 'Node', 'Velocity']

# # Pivot the DataFrame to make month pairs the column names
# velocities_df_full = velocities_df_full.pivot(index='Node', columns='Month Pair', values='Velocity')

# # Reorder the columns
# velocities_df_full = velocities_df_full[column_order]

# # Reset the index to make it more readable
# velocities_df_full.reset_index(inplace=False)

# # Prepare velocity means and stds for the same x-axis
# veloc_means_arr = np.array([velocities_df_full[col].mean(skipna=True) for col in column_order])
# veloc_stds_arr = np.array([velocities_df_full[col].std(skipna=True) for col in column_order])


# # STEP 9
# modularity_per_month = {}

# for month in list(month_mapping.values()):
#     # Create the graph for the month
#     filtered = merged_similarities[merged_similarities.index.str.contains(f'_{month}', case=False)].sort_index()
#     filtered.index = filtered.index.str.replace(r'_[^_]+$', '', regex=True)

#     filtered = filtered.loc[filtered.index, filtered.index]
#     G = nx.from_pandas_adjacency(filtered)
    
#     # Louvain communities and modularity
#     communities = louvain_communities(G, weight='weight', seed=42) # weight='weight' as values from the filtered variable
#     mod = modularity(G, communities, weight='weight')
#     modularity_per_month[month] = mod

# consecutive_modularity_averages = []
# consecutive_month_pairs = []

# months = list(modularity_per_month.keys())
# modularities = list(modularity_per_month.values())

# for i in range(len(months) - 1):
#     avg = (modularities[i] + modularities[i + 1]) / 2
#     consecutive_modularity_averages.append(avg)
#     consecutive_month_pairs.append(f"{months[i]}-{months[i+1]}")

# # Store stats for this year, now including avg_std_per_month and avg_mean_per_month
# year_stats[year] = (
#     veloc_means_arr,
#     veloc_stds_arr,
#     consecutive_modularity_averages,
#     avg_std_per_month,
#     avg_mean_per_month
# )

# # STEP 10
# # After the loop, create a DataFrame
# stats_df = pd.DataFrame([
# {
#     'year': year,
#     'veloc_means_arr': veloc_means_arr,
#     'veloc_stds_arr': veloc_stds_arr,
#     'consecutive_modularity_averages': mod_avgs,
#     'sim_avg_std_per_month': avg_std_per_month,
#     'sim_avg_mean_per_month': avg_mean_per_month
# }
# for year, (veloc_means_arr, veloc_stds_arr, mod_avgs, avg_std_per_month, avg_mean_per_month) in year_stats.items()
# ])

In [393]:
# stats_df.head()

In [394]:
# stats_checkpoint_path = "stats_checkpoint.parquet"

# # load data from checkpoint or save
# if os.path.exists(stats_checkpoint_path):
#     print("Loading stats_df from checkpoint...")
#     stats_df = pd.read_parquet(stats_checkpoint_path)
# else:
#     print("Saving stats_df to checkpoint...")
#     stats_df.to_parquet(stats_checkpoint_path, index=False)