In [1]:
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq
from sklearn.metrics.pairwise import cosine_distances, manhattan_distances

from ipynb.fs.full.example_data_gen import random_data_gen
from functions import get_absolute_matrix, get_relative_matrix

In [2]:
# Notebook-wide switches, read by the data-loading cell below.
# True  -> read the real data from the local parquet checkpoint.
# False -> use the synthetic frame produced by random_data_gen().
REAL_DATASET: bool = False
# Only consulted when REAL_DATASET is True: keep just the last 10000 rows.
SUBSET: bool = True

### Load data

In [3]:
# Build the working frame `df` with the columns id1, id2 and timestamp,
# either from the real-data checkpoint or from the random generator.
if REAL_DATASET:

    # Nothing in this notebook writes the checkpoint, so it has to exist already.
    checkpoint_path = "df_checkpoint.parquet"

    if not os.path.exists(checkpoint_path):
        # Fail here with an actionable message instead of silently leaving `df`
        # undefined and hitting a NameError in a later cell.
        raise FileNotFoundError(
            f"REAL_DATASET is True but checkpoint '{checkpoint_path}' was not found; "
            "create the checkpoint first or set REAL_DATASET = False."
        )

    print("Loading df from checkpoint...")
    df = (
        pd.read_parquet(checkpoint_path)
        # drop the columns that are removed before the analysis
        .drop(columns=['original_post_id', 'author_name', 'author_username'])
        # map the real-data column names onto the generic id1 / id2 / timestamp schema
        .rename(columns={'tweet_created_at':'timestamp', 'author_id':'id1', 'retweeter_id':'id2'})
    )

    if SUBSET:
        # keep only the last 10000 rows of the checkpoint for faster iteration
        df = df.tail(10000)
        print('Working with subset of real data')
    else:
        print('Working with full real data')

else:
    print('Working with randomly generated data')
    # generate random data and rename its id columns to the shared schema
    df = random_data_gen().rename(columns={'L1_id':'id1', 'L2_id':'id2'})

Working with randomly generated data


In [4]:
# Preview the first rows to check that the id1 / id2 / timestamp columns are present.
df.head()

Unnamed: 0,id1,id2,timestamp
0,L1_0,L2_10,2020-01-04 15:03:51.443816
1,L1_0,L2_3,2020-01-05 10:53:58.356795
2,L1_1,L2_7,2020-01-05 16:38:25.657112
3,L1_1,L2_17,2020-01-07 18:48:19.784832
4,L1_1,L2_10,2020-01-09 01:36:22.757205


In [5]:
# Cosine distances computed with timeframe='M'. Judging by the rendered output,
# this yields one id1 x id1 distance block per calendar month.
# NOTE(review): get_relative_matrix lives in functions.py; confirm the exact semantics there.
get_relative_matrix(df, timeframe = 'M', distance_function=cosine_distances)

Unnamed: 0_level_0,id1,L1_0,L1_1,L1_2
id1,timeframe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
L1_0,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,0.000000,0.393220,1.000000
L1_1,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,0.393220,0.000000,1.000000
L1_2,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,1.000000,1.000000,0.000000
L1_0,2020-02-01 00:00:00 to 2020-02-29 23:59:59.999999999,0.000000,0.666667,0.855662
L1_1,2020-02-01 00:00:00 to 2020-02-29 23:59:59.999999999,0.666667,0.000000,0.711325
L1_1,...,...,...,...
L1_1,2024-11-01 00:00:00 to 2024-11-30 23:59:59.999999999,1.000000,0.000000,0.833333
L1_2,2024-11-01 00:00:00 to 2024-11-30 23:59:59.999999999,0.823223,0.833333,0.000000
L1_0,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999999999,0.000000,0.519616,0.839872
L1_1,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999999999,0.519616,0.000000,1.000000


In [6]:
# Manhattan distances computed with timeframe='Y'. Judging by the rendered output,
# rows and columns are both indexed by (id1, yearly window), so every id/year
# pair is compared with every other.
# NOTE(review): get_absolute_matrix lives in functions.py; confirm the exact semantics there.
get_absolute_matrix(df, timeframe='Y', distance_function=manhattan_distances)

Unnamed: 0_level_0,id1,L1_0,L1_0,L1_0,L1_0,L1_0,L1_1,L1_1,L1_1,L1_1,L1_1,L1_2,L1_2,L1_2,L1_2,L1_2
Unnamed: 0_level_1,timeframe,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999
id1,timeframe,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
L1_0,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,0.0,40.0,39.0,30.0,35.0,41.0,30.0,40.0,44.0,34.0,50.0,25.0,43.0,42.0,48.0
L1_0,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,40.0,0.0,35.0,42.0,29.0,43.0,38.0,30.0,28.0,30.0,44.0,31.0,41.0,52.0,44.0
L1_0,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,39.0,35.0,0.0,29.0,28.0,40.0,29.0,39.0,31.0,17.0,53.0,30.0,40.0,39.0,35.0
L1_0,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,30.0,42.0,29.0,0.0,33.0,37.0,34.0,44.0,40.0,34.0,56.0,33.0,47.0,40.0,50.0
L1_0,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,35.0,29.0,28.0,33.0,0.0,42.0,29.0,31.0,37.0,27.0,41.0,22.0,34.0,37.0,43.0
L1_1,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,41.0,43.0,40.0,37.0,42.0,0.0,39.0,41.0,43.0,37.0,47.0,38.0,38.0,41.0,39.0
L1_1,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,30.0,38.0,29.0,34.0,29.0,39.0,0.0,42.0,38.0,28.0,52.0,23.0,41.0,32.0,38.0
L1_1,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,40.0,30.0,39.0,44.0,31.0,41.0,42.0,0.0,32.0,36.0,32.0,31.0,43.0,46.0,40.0
L1_1,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,44.0,28.0,31.0,40.0,37.0,43.0,38.0,32.0,0.0,34.0,42.0,33.0,49.0,46.0,52.0
L1_1,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,34.0,30.0,17.0,34.0,27.0,37.0,28.0,36.0,34.0,0.0,52.0,29.0,37.0,46.0,36.0
