In [2]:
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq
from sklearn.metrics.pairwise import cosine_distances, manhattan_distances

from ipynb.fs.full.example_data_gen import random_data_gen
from functions import get_absolute_matrix, get_relative_matrix

In [3]:
# Data-source switch: True loads the real data from the parquet checkpoint,
# False uses randomly generated example data instead.
REAL_DATASET = False
# Only consulted when REAL_DATASET is True: keep just the last 10000 rows
# of the real data rather than the full frame.
SUBSET = True

### Load data

In [4]:
# Build the working frame `df` with columns id1 / id2 / timestamp, either from
# the real parquet checkpoint or from the random example-data generator.
if REAL_DATASET:

    checkpoint_path = "df_checkpoint.parquet"

    # The real data is only available through the checkpoint. Fail loudly when
    # it is missing instead of silently leaving `df` undefined (or stale from a
    # previous kernel run) for the cells below.
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(
            f"REAL_DATASET is True but checkpoint '{checkpoint_path}' was not found"
        )

    print("Loading df from checkpoint...")
    df = (
        pd.read_parquet(checkpoint_path)
        # discard the post id and author name/username columns
        .drop(columns=['original_post_id', 'author_name', 'author_username'])
        # use the same id1 / id2 / timestamp names as the random-data branch
        .rename(columns={'tweet_created_at':'timestamp', 'author_id':'id1', 'retweeter_id':'id2'})
    )

    if SUBSET:
        # keep only the last 10000 rows of the real data
        df = df.tail(10000)
        print('Working with subset of real data')
    else:
        print('Working with full real data')

else:
    print('Working with randomly generated data')
    # generate random data and rename its id columns to id1 / id2
    df = random_data_gen()
    df = df.rename(columns={'L1_id':'id1', 'L2_id':'id2'})

Working with randomly generated data


In [5]:
# Preview the first rows to confirm the id1 / id2 / timestamp columns are present.
df.head()

Unnamed: 0,id1,id2,timestamp
0,L1_0,L2_4,2020-01-01 14:26:04.411584
1,L1_2,L2_13,2020-01-04 05:46:58.040960
2,L1_2,L2_4,2020-01-12 01:44:24.654522
3,L1_2,L2_17,2020-01-12 14:31:49.907284
4,L1_2,L2_4,2020-01-17 08:42:30.877029


In [6]:
# Relative matrix with the helper's default settings. In the displayed output
# each row is an (id1, timeframe) pair and each column an id1 value.
# NOTE(review): the defaults appear to be yearly timeframes; the distance
# function used by default is not visible here - confirm in functions.py.
get_relative_matrix(df)

Unnamed: 0_level_0,id1,L1_0,L1_1,L1_2
id1,timeframe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
L1_0,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,0.0,0.171781,0.242395
L1_1,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,0.171781,0.0,0.361376
L1_2,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,0.242395,0.361376,0.0
L1_0,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,0.0,0.16931,0.244901
L1_1,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,0.16931,0.0,0.191353
L1_2,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,0.244901,0.191353,0.0
L1_0,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.0,0.271628,0.3304
L1_1,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.271628,0.0,0.175634
L1_2,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.3304,0.175634,0.0
L1_0,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,0.0,0.146262,0.196305


In [7]:
# Absolute matrix using timeframe='Y' and Manhattan distance. In the displayed
# output both rows and columns are (id1, timeframe) pairs.
# NOTE(review): 'Y' presumably means calendar-year windows, matching the
# timeframe labels in the output - confirm in functions.py.
get_absolute_matrix(df, timeframe='Y', distance_function=manhattan_distances)

Unnamed: 0_level_0,id1,L1_0,L1_0,L1_0,L1_0,L1_0,L1_1,L1_1,L1_1,L1_1,L1_1,L1_2,L1_2,L1_2,L1_2,L1_2
Unnamed: 0_level_1,timeframe,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999
id1,timeframe,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
L1_0,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,0.0,33.0,31.0,34.0,36.0,33.0,27.0,31.0,42.0,42.0,34.0,40.0,39.0,36.0,37.0
L1_0,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,33.0,0.0,44.0,37.0,47.0,32.0,36.0,40.0,37.0,27.0,43.0,49.0,42.0,45.0,46.0
L1_0,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,31.0,44.0,0.0,43.0,37.0,34.0,32.0,38.0,37.0,43.0,45.0,41.0,46.0,39.0,36.0
L1_0,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,34.0,37.0,43.0,0.0,32.0,37.0,29.0,31.0,36.0,34.0,34.0,44.0,37.0,40.0,25.0
L1_0,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,36.0,47.0,37.0,32.0,0.0,43.0,37.0,39.0,42.0,44.0,44.0,40.0,45.0,42.0,33.0
L1_1,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,33.0,32.0,34.0,37.0,43.0,0.0,42.0,32.0,41.0,41.0,47.0,47.0,38.0,49.0,40.0
L1_1,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,27.0,36.0,32.0,29.0,37.0,42.0,0.0,36.0,37.0,35.0,29.0,35.0,42.0,35.0,30.0
L1_1,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,31.0,40.0,38.0,31.0,39.0,32.0,36.0,0.0,31.0,43.0,33.0,37.0,34.0,41.0,36.0
L1_1,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,42.0,37.0,37.0,36.0,42.0,41.0,37.0,31.0,0.0,36.0,32.0,48.0,37.0,46.0,41.0
L1_1,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,42.0,27.0,43.0,34.0,44.0,41.0,35.0,43.0,36.0,0.0,40.0,46.0,45.0,46.0,49.0
