todo:
- create a new function `relative`, then import it and integrate it into the final output
- repeat for absolute

final output:
- new functions absolute and relative used
- with random generated data
- without the years loop
- using the old structure

potential additions:
- make sure both compute_relative_matrix and compute_absolute_matrix work with a multi-level index as well as a regular one
- simplify compute_relative_matrix?

In [1000]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import networkx as nx
import os
from tqdm import tqdm
from networkx.algorithms.community import louvain_communities
from networkx.algorithms.community.quality import modularity
from sklearn.metrics.pairwise import cosine_distances, distance_metrics, manhattan_distances, euclidean_distances, haversine_distances, manhattan_distances, nan_euclidean_distances
from sklearn.decomposition import PCA

#from functions import calculate_monthly_velocities_cosine, get_similarities, get_matrix, get_month, author_mapping
from ipynb.fs.full.example_data_gen import random_data_gen
#from ipynb.fs.full.absolute_relative import compute_biadjacency_matrix, compute_relative_matrix

In [954]:
# import pickle
# variables = pickle.load(open("recovered_variables.pkl", "rb"))
# globals().update(variables)  # restores them into current session

In [955]:
#distances = list(distance_metrics().keys())

In [956]:
# declare user preferences and variables
TIMEFRAME = 'Y'  # pandas period frequency alias used when binning timestamps
TIMESTAMP_COL = 'timestamp'  # default name of the datetime column
L1 = 'id1'  # default name of the first-layer ID column
L2 = 'id2'  # default name of the second-layer ID column

ABSOLUTE = True  # True: build the absolute matrix; False: build the relative matrix
DISTANCE_FUNCTION = cosine_distances  # default pairwise distance function

SUBSET = True  # True: keep only the last 10000 rows of the real dataset
REAL_DATASET = True  # True: load the parquet checkpoint; False: use random_data_gen()

### Define functions

In [957]:
def get_unique_entities(df, layer=L1):
    """Return the distinct values found in column ``layer`` of ``df``."""
    entity_column = df[layer]
    return entity_column.unique()

def get_unique_timeframes(df):
    """
    Return the distinct timeframe labels of ``df``.

    For a MultiIndex frame the labels are taken from index level 1; otherwise
    they are read from the 'timeframe' column.

    Raises:
        ValueError: if ``df`` has neither a MultiIndex nor a 'timeframe' column.
    """
    frame_index = df.index

    # MultiIndex takes precedence over a 'timeframe' column.
    if isinstance(frame_index, pd.MultiIndex):
        level_values = frame_index.get_level_values(1)
        return level_values.unique()

    if 'timeframe' not in df.columns:
        raise ValueError("DataFrame must have either a MultiIndex with a timeframe level or a 'timeframe' column.")

    return df['timeframe'].unique()

def get_timeframe(df, timeframe=TIMEFRAME, timestamp_col=TIMESTAMP_COL, period=None):
    """
    Return the rows of df whose timestamp falls into one period.

    Parameters:
        df: DataFrame with a datetime column named timestamp_col
        timeframe: pandas frequency alias used to convert the timestamps to periods (e.g. 'Y')
        timestamp_col: name of the timestamp column
        period: the period to keep, e.g. '2022' or pd.Period('2022', freq='Y').
            When omitted, the periods are compared against `timeframe` itself,
            exactly as before this argument existed.

    Returns:
        The subset of df whose period equals the requested one.
    """
    periods = df[timestamp_col].dt.to_period(timeframe)

    # NOTE(review): without `period`, the comparison target is the frequency
    # alias itself (e.g. 'Y'), which is not a period value and presumably
    # matches no rows - pass `period` explicitly to select data.
    target = timeframe if period is None else period

    return df[periods == target]

def bin_timestamps(df, timestamp_col = TIMESTAMP_COL, timeframe = TIMEFRAME):
    """
    Add a 'timeframe' column that labels each row with the period of its timestamp.

    Each value of ``timestamp_col`` is converted to a period of frequency
    ``timeframe`` and labelled "<period start> to <period end>". The column is
    written onto ``df`` itself and the same object is returned.
    """

    def _period_label(period):
        # e.g. "2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999"
        return str(period.start_time) + " to " + str(period.end_time)

    periods = df[timestamp_col].dt.to_period(timeframe)
    df['timeframe'] = periods.apply(_period_label)

    return df

def get_timeframe_data(df, timeframe_value, timeframe_col='timeframe'):
    """
    Select the rows of df that belong to a single timeframe.

    Keeps every row whose ``timeframe_col`` entry equals ``timeframe_value``,
    e.g. '2020-01-01 00:00:00 to 2021-01-01 00:00:00'.
    """
    in_timeframe = df[timeframe_col] == timeframe_value
    return df[in_timeframe]

In [958]:
def compute_biadjacency_matrix(df, layer1=L1, layer2=L2, timeframe_col='timeframe'):
    """
    Count the layer1-layer2 interactions per timeframe and pivot them into a matrix.

    Args:
        df: Input DataFrame (e.g., from random_data_gen) containing the columns
            layer1, layer2 and timeframe_col
        layer1: Name of the first ID column
        layer2: Name of the second ID column
        timeframe_col: Name of the column holding the timeframe label of each row
    Returns:
        biadjacency_matrix: DataFrame indexed by (layer1, timeframe_col) with the
            layer2 values as columns and the number of matching rows as values;
            combinations that never occur are 0
    """
    # one row per (layer1, layer2, timeframe) combination with its number of occurrences
    grouped_df = df.groupby([layer1, layer2, timeframe_col]).size().reset_index(name='counts')

    # spread the layer2 values into columns; absent combinations become NaN and are set to 0
    biadjacency_matrix = grouped_df.pivot(index=[layer1, timeframe_col], columns=layer2, values='counts').fillna(0)

    return biadjacency_matrix

In [959]:
def compute_relative_matrix(biadjacency_matrix, unique_entities, distance_function = DISTANCE_FUNCTION):
    """
    Compute a relative distance matrix for each (id1, timeframe) in biadjacency_matrix
    against all unique_entities at the same timeframe.

    Parameters:
        biadjacency_matrix: pd.DataFrame, indexed by (id1, timeframe), columns are id2s, values are counts
        unique_entities: list or array of id1 values to compare against
        distance_function: pairwise distance function taking two 2-D arrays X and Y and
            returning an array of shape (len(X), len(Y))

    Returns:
        pd.DataFrame of floats with the same index as biadjacency_matrix and one column
        per entity in unique_entities (sorted). Entry [(id1, timeframe), entity] is the
        distance between the two entities' rows for that timeframe.
        NOTE(review): an entity with no row in that timeframe gets 0, which cannot be
        told apart from a true zero distance - confirm this is the intended fill value.
    """
    columns = sorted(unique_entities)
    index = biadjacency_matrix.index
    values = biadjacency_matrix.to_numpy()

    # Missing (entity, timeframe) pairs keep this initial 0.
    distances = np.zeros((len(index), len(columns)), dtype=float)

    # Positional lookups replace repeated label-based .loc access.
    rows_by_timeframe = {}
    row_of = {}
    for position, (entity, timeframe) in enumerate(index):
        rows_by_timeframe.setdefault(timeframe, []).append(position)
        row_of[(entity, timeframe)] = position

    for timeframe, rows in rows_by_timeframe.items():
        # Output columns whose entity has a row in this timeframe, with that row's position.
        column_positions = []
        partner_rows = []
        for column_position, entity in enumerate(columns):
            partner = row_of.get((entity, timeframe))
            if partner is not None:
                column_positions.append(column_position)
                partner_rows.append(partner)

        if not column_positions:
            continue

        # One batched call per timeframe instead of one call per matrix cell.
        block = distance_function(values[rows], values[partner_rows])
        distances[np.ix_(rows, column_positions)] = block

    return pd.DataFrame(distances, index=index, columns=columns)

In [960]:
def compute_absolute_matrix(df, layer1=L1, layer2=L2, timeframe_col = 'timeframe', distance_function = DISTANCE_FUNCTION):
    """
    Compute the pairwise distances between all (layer1, timeframe) count vectors.

    Expects a dataframe with a column timeframe_col that contains the binned
    timeframe label of each row.

    Parameters:
        df: input DataFrame with the columns layer1, layer2 and timeframe_col
        layer1: name of the first ID column
        layer2: name of the second ID column
        timeframe_col: name of the timeframe column
        distance_function: pairwise distance function applied to the count matrix

    Returns:
        Square DataFrame whose rows and columns are both indexed by
        (layer1, timeframe_col); each entry is the distance between the two
        corresponding rows of the count matrix.
    """
    # occurrences of every (layer1, layer2, timeframe) combination
    grouped_df = df.groupby([layer1, layer2, timeframe_col]).size().reset_index(name='counts')

    # Pivot on the same columns that were grouped on, so that non-default
    # layer/timeframe column names work.
    pivot_df = grouped_df.pivot(index=[layer1, timeframe_col], columns=layer2, values='counts').fillna(0)

    absolute_matrix = pd.DataFrame(distance_function(pivot_df), index=pivot_df.index, columns=pivot_df.index)

    return absolute_matrix

### Generate distance matrix

In [961]:
# generate random data
#df = random_data_gen()
#df

In [976]:
# Load the working DataFrame `df`: either the real dataset from a parquet
# checkpoint or randomly generated example data.
if REAL_DATASET:

    checkpoint_path = "df_checkpoint.parquet"

    # Fail with a clear message instead of leaving `df` unassigned.
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint file not found: {checkpoint_path}")

    print("Loading df from checkpoint...")
    df = pd.read_parquet(checkpoint_path)

    # drop the unused columns and rename the rest to the generic names
    # (timestamp, id1, id2) used throughout this notebook
    df = df.drop(columns=['original_post_id', 'author_name', 'author_username'])
    df = df.rename(columns={'tweet_created_at':'timestamp', 'author_id':'id1', 'retweeter_id':'id2'})

    if SUBSET:
        # copy so that columns added later are written to an independent frame
        df = df.tail(10000).copy()
        print('Working with subset of real data')

    else:
        print('Working with full real data')

else:
    print('Working with randomly generated data')
    # generate random data
    df = random_data_gen()

Loading df from checkpoint...
Working with subset of real data


In [977]:
# quick sanity check: dimensions and first rows of the loaded data
print(df.shape)
display(df.head())

(10000, 3)


Unnamed: 0,timestamp,id1,id2
29225029,2022-01-02 09:43:40,1683455144,1474810730578944004
29225030,2022-01-02 09:43:40,1683455144,2175541491
29225031,2022-01-02 09:43:40,1683455144,1476873714050506753
29225032,2022-01-02 09:43:40,1683455144,861666300
29225033,2022-01-02 09:43:40,1683455144,1425194624033902594


In [978]:
# allocate retweets to timeframes, based on timestamps
# (bin_timestamps adds a 'timeframe' column holding each row's period label)
df = bin_timestamps(df)
df

Unnamed: 0,timestamp,id1,id2,timeframe
29225029,2022-01-02 09:43:40,1683455144,1474810730578944004,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999...
29225030,2022-01-02 09:43:40,1683455144,2175541491,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999...
29225031,2022-01-02 09:43:40,1683455144,1476873714050506753,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999...
29225032,2022-01-02 09:43:40,1683455144,861666300,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999...
29225033,2022-01-02 09:43:40,1683455144,1425194624033902594,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999...
...,...,...,...,...
29235024,2021-12-31 23:00:00,8873182,754395750499491840,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999...
29235025,2021-12-31 23:00:00,8873182,1354014924301852673,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999...
29235026,2021-12-31 23:00:00,8873182,1089500969052848128,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999...
29235027,2021-12-31 23:00:00,8873182,1283784692,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999...


In [979]:
# # compute biadjacency matrix
# biadjacency_matrix = compute_biadjacency_matrix(df)
# biadjacency_matrix

In [980]:
# compute unique entities for the relative matrix - these are the columns of the matrix
# (distinct values of the default layer column L1)
unique_entities = get_unique_entities(df)
unique_entities

array([         1683455144,           150725695,           425752285,
                  10228272,           138736601,           543774554,
                 910827588,  900766265245130753,           395218906,
       1059361525109010433,           133790890,           289400495,
                 420351046,           399004979,            13514762,
                 132880191,           472966889,  732817452569141248,
                   8873182,            29416653,            85626417,
       1072167411984551936,           331617619,            61765111,
                 454365633,           500882938,            78567974,
                   5893702,          1935534786,            24184892,
                1904010924,           828717014,          2446734732,
                  18935802,           322933929,            25676606,
                  86390214,           425686235,          3091392485,
                  56341776,           291294443,          2983707267,
                1024

In [981]:
# # compute relative matrix
# relative_matrix = compute_relative_matrix(biadjacency_matrix, unique_entities, cosine_distances)
# relative_matrix

1. do a loop to segment data according to the user-specified timeframe.
2. create biadjacency matrices based on the segmented data from 1.
3. based on biadjacency matrices from 2. create relative distance matrices
4. concatenate the relative distance matrices from 3. together

In [982]:
# distinct timeframe labels present in df (read from its 'timeframe' column)
unique_timeframes = get_unique_timeframes(df)
#unique_timeframes

In [983]:
# STEP 2
# split df into one sub-frame per timeframe label
timeframe_data = {
    label: get_timeframe_data(df, timeframe_value=label)
    for label in unique_timeframes
}

In [984]:
# biadjacency_matrices = {}

# for timerange, t_data in timeframe_data.items():
#     biadjacency_matrices[timerange] = compute_biadjacency_matrix(t_data)

In [985]:
# # Number of timeframes (keys) in the dictionary
# print("Number of timeframes:", len(biadjacency_matrices))

# # Shape of the first biadjacency matrix (for the first timeframe)
# first_key = next(iter(biadjacency_matrices))
# print("Shape of first biadjacency matrix:", biadjacency_matrices[first_key].shape)

In [986]:
# biadjacency_matrices[first_key]

In [987]:
# Build one biadjacency matrix per timeframe and stack them row-wise into a
# single large DataFrame.
biadjacency_matrices = pd.concat(
    list(map(compute_biadjacency_matrix, timeframe_data.values())),
    axis=0,
)
# id2 columns missing from a timeframe come out of the concat as NaN; treat them as 0
biadjacency_matrices = biadjacency_matrices.fillna(0)

biadjacency_matrices.head()

Unnamed: 0_level_0,id2,1450171,3426491,5404582,6073442,6494072,6566022,7622952,7645092,8339422,8882322,...,1469727868523409412,1469888267197767680,1470533332089942020,1470900202840199174,1472336698717655044,1474025356273274885,1474025607033991189,1474439891656265734,1476494421063618560,1476830072032186394
id1,timeframe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5893702,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8873182,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10228272,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13514762,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14060262,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [988]:
# Build the distance matrix selected by the ABSOLUTE switch.
if ABSOLUTE:
    # distances between all (id1, timeframe) rows; uses the default DISTANCE_FUNCTION
    absolute_matrix = compute_absolute_matrix(df)
    display(absolute_matrix.head())

else:
    # pass the configured distance function so that both branches honour DISTANCE_FUNCTION
    relative_matrix = compute_relative_matrix(biadjacency_matrices, unique_entities, DISTANCE_FUNCTION)
    display(relative_matrix.head())

Unnamed: 0_level_0,id1,5893702,8873182,8873182,10228272,13514762,14060262,18935802,24184892,25676606,29416653,...,1935534786,2416067982,2446734732,2983707267,3091392485,732817452569141248,900766265245130753,930822249707376640,1059361525109010433,1072167411984551936
Unnamed: 0_level_1,timeframe,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,...,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999
id1,timeframe,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
5893702,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.0,0.997126,0.994309,1.0,1.0,0.932918,0.955958,1.0,1.0,0.989318,...,1.0,1.0,0.96,1.0,0.97137,0.974825,0.995951,0.978483,1.0,1.0
8873182,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,0.997126,0.0,0.57865,1.0,0.949796,1.0,1.0,1.0,0.979433,0.855903,...,0.927855,0.99713,0.990802,1.0,1.0,1.0,0.698348,1.0,1.0,1.0
8873182,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.994309,0.57865,0.0,1.0,0.957019,1.0,1.0,1.0,1.0,0.886534,...,0.925182,1.0,0.986342,1.0,1.0,1.0,0.719825,1.0,1.0,1.0
10228272,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
13514762,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,1.0,0.949796,0.957019,1.0,0.0,1.0,0.98614,1.0,1.0,0.954619,...,1.0,0.997054,1.0,1.0,1.0,0.996039,0.887232,1.0,0.995719,1.0


### Perform dimensionality reduction

In [None]:
# take the biadjacency matrix and reduce dimensionality using e.g. PCA
# use 2 components
# NOTE(review): both result frames are built without an index, so the
# (id1, timeframe) row labels are replaced by a default integer index - confirm intended.
pca = PCA(n_components=2)
biadjacency_matrix_pca = pd.DataFrame(pca.fit_transform(biadjacency_matrices))


# do the same on the absolute matrix
# (refits the same `pca` object, so afterwards it holds the fit on absolute_matrix)
# NOTE(review): absolute_matrix is only assigned when ABSOLUTE is True.
absolute_matrix_pca = pd.DataFrame(pca.fit_transform(absolute_matrix))

In [999]:
# first rows of the 2-component PCA projection of the biadjacency matrix
biadjacency_matrix_pca.head()

Unnamed: 0,0,1
0,-2.961069,-0.522591
1,5.033376,1.789363
2,-3.241747,-0.634009
3,-0.51795,-1.111916
4,-2.928263,-0.502268


### Get Statistics

In [None]:
# # STEP 7
# for t in relative_matrix.index[1].unique():
#     filtered = merged_similarities[merged_similarities.index.str.contains(f'_{month}', case=False)].sort_index()
#     std_per_column = filtered.std(axis=0, skipna=True)
#     mean_per_column = filtered.mean(axis=0, skipna=True)
#     avg_std_per_month[month] = std_per_column.mean()
#     avg_mean_per_month[month] = mean_per_column.mean()

# # STEP 8
# # Calculate velocities using cosine distance
# velocities_cosine_full = calculate_monthly_velocities_cosine(merged_similarities, list(month_mapping.values()))

# # Convert the velocities_cosine dictionary into a DataFrame
# velocities_df_full = pd.concat(velocities_cosine_full, axis=0)

# # Reset the index to make the month pairs a column
# velocities_df_full.reset_index(inplace=True)

# # Rename the columns for clarity
# velocities_df_full.columns = ['Month Pair', 'Node', 'Velocity']

# # Pivot the DataFrame to make month pairs the column names
# velocities_df_full = velocities_df_full.pivot(index='Node', columns='Month Pair', values='Velocity')

# # Reorder the columns
# velocities_df_full = velocities_df_full[column_order]

# # Reset the index to make it more readable
# velocities_df_full.reset_index(inplace=False)

# # Prepare velocity means and stds for the same x-axis
# veloc_means_arr = np.array([velocities_df_full[col].mean(skipna=True) for col in column_order])
# veloc_stds_arr = np.array([velocities_df_full[col].std(skipna=True) for col in column_order])


# # STEP 9
# modularity_per_month = {}

# for month in list(month_mapping.values()):
#     # Create the graph for the month
#     filtered = merged_similarities[merged_similarities.index.str.contains(f'_{month}', case=False)].sort_index()
#     filtered.index = filtered.index.str.replace(r'_[^_]+$', '', regex=True)

#     filtered = filtered.loc[filtered.index, filtered.index]
#     G = nx.from_pandas_adjacency(filtered)
    
#     # Louvain communities and modularity
#     communities = louvain_communities(G, weight='weight', seed=42) # weight='weight' as values from the filtered variable
#     mod = modularity(G, communities, weight='weight')
#     modularity_per_month[month] = mod

# consecutive_modularity_averages = []
# consecutive_month_pairs = []

# months = list(modularity_per_month.keys())
# modularities = list(modularity_per_month.values())

# for i in range(len(months) - 1):
#     avg = (modularities[i] + modularities[i + 1]) / 2
#     consecutive_modularity_averages.append(avg)
#     consecutive_month_pairs.append(f"{months[i]}-{months[i+1]}")

# # Store stats for this year, now including avg_std_per_month and avg_mean_per_month
# year_stats[year] = (
#     veloc_means_arr,
#     veloc_stds_arr,
#     consecutive_modularity_averages,
#     avg_std_per_month,
#     avg_mean_per_month
# )

# # STEP 10
# # After the loop, create a DataFrame
# stats_df = pd.DataFrame([
# {
#     'year': year,
#     'veloc_means_arr': veloc_means_arr,
#     'veloc_stds_arr': veloc_stds_arr,
#     'consecutive_modularity_averages': mod_avgs,
#     'sim_avg_std_per_month': avg_std_per_month,
#     'sim_avg_mean_per_month': avg_mean_per_month
# }
# for year, (veloc_means_arr, veloc_stds_arr, mod_avgs, avg_std_per_month, avg_mean_per_month) in year_stats.items()
# ])

In [None]:
# stats_df.head()

In [None]:
# stats_checkpoint_path = "stats_checkpoint.parquet"

# # load data from checkpoint or save
# if os.path.exists(stats_checkpoint_path):
#     print("Loading stats_df from checkpoint...")
#     stats_df = pd.read_parquet(stats_checkpoint_path)
# else:
#     print("Saving stats_df to checkpoint...")
#     stats_df.to_parquet(stats_checkpoint_path, index=False)