In [4]:
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq
from sklearn.metrics.pairwise import cosine_distances, manhattan_distances

from functions import get_absolute_matrix, get_relative_matrix, random_data_gen

In [5]:
# Run configuration for the data-loading cell below.
# REAL_DATASET: True  -> read the real data from the parquet checkpoint;
#               False -> use the frame produced by random_data_gen().
REAL_DATASET: bool = False
# SUBSET: only consulted when REAL_DATASET is True; if set, the loading cell
# keeps just the last 10000 rows of the real data instead of the full frame.
SUBSET: bool = True

In [6]:
# Build the working frame `df` used by every cell below, either from the
# cached real dataset (parquet checkpoint) or from the synthetic generator.
# Both branches rename their source columns to the shared names `id1` / `id2`
# (the real-data branch also renames `tweet_created_at` to `timestamp`).
if REAL_DATASET:
    checkpoint_path = "df_checkpoint.parquet"

    # Fail loudly when the checkpoint is missing. Previously this case fell
    # through silently, leaving `df` undefined (or stale from an earlier kernel
    # run), so later cells either crashed with a NameError or used wrong data.
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(
            f"Checkpoint file '{checkpoint_path}' not found. "
            "Create it first or set REAL_DATASET = False to use generated data."
        )

    print("Loading df from checkpoint...")
    df = (
        pd.read_parquet(checkpoint_path)
        # Drop columns that are not needed downstream.
        .drop(columns=['original_post_id', 'author_name', 'author_username'])
        # Normalise to the column names the matrix helpers are called with.
        .rename(columns={'tweet_created_at': 'timestamp', 'author_id': 'id1', 'retweeter_id': 'id2'})
    )

    if SUBSET:
        # Keep only the last 10000 rows so the rest of the notebook runs quickly.
        df = df.tail(10000)
        print('Working with subset of real data')
    else:
        print('Working with full real data')

else:
    print('Working with randomly generated data')
    # Generate random data and align its id column names with the real dataset.
    df = random_data_gen().rename(columns={'L1_id': 'id1', 'L2_id': 'id2'})

Working with randomly generated data


In [7]:
# Preview the first rows of the working frame to sanity-check its columns.
df.head()

Unnamed: 0,id1,id2,timestamp
0,L1_1,L2_0,2020-01-02 01:04:25.679659
1,L1_1,L2_7,2020-01-05 02:00:20.138697
2,L1_1,L2_8,2020-01-05 15:43:24.833179
3,L1_1,L2_4,2020-01-11 00:37:16.896025
4,L1_1,L2_8,2020-01-11 16:56:24.714127


In [8]:
# Relative matrix with the helper's default arguments.
# NOTE(review): judging from the displayed output this is one id1-by-id1 matrix
# per yearly timeframe — confirm the exact semantics in functions.py.
get_relative_matrix(df)

Unnamed: 0_level_0,id1,L1_0,L1_1,L1_2
id1,timeframe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
L1_0,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,0.0,0.238736,0.413229
L1_1,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,0.238736,0.0,0.379644
L1_2,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,0.413229,0.379644,0.0
L1_0,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,0.0,0.214938,0.123611
L1_1,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,0.214938,0.0,0.147453
L1_2,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,0.123611,0.147453,0.0
L1_0,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.0,0.172115,0.27669
L1_1,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.172115,0.0,0.208591
L1_2,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,0.27669,0.208591,0.0
L1_0,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,0.0,0.11473,0.247674


In [9]:
# Absolute matrix using timeframe='Y' and scikit-learn's manhattan_distances.
# NOTE(review): the displayed output is indexed by (id1, timeframe) on both
# axes, i.e. presumably every id1/year pair compared with every other —
# confirm the exact semantics in functions.py.
get_absolute_matrix(df, timeframe='Y', distance_function=manhattan_distances)

Unnamed: 0_level_0,id1,L1_0,L1_0,L1_0,L1_0,L1_0,L1_1,L1_1,L1_1,L1_1,L1_1,L1_2,L1_2,L1_2,L1_2,L1_2
Unnamed: 0_level_1,timeframe,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999
id1,timeframe,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
L1_0,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,0.0,38.0,40.0,44.0,33.0,37.0,44.0,42.0,39.0,46.0,58.0,36.0,41.0,40.0,38.0
L1_0,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,38.0,0.0,32.0,44.0,43.0,37.0,40.0,42.0,37.0,46.0,56.0,30.0,43.0,40.0,38.0
L1_0,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,40.0,32.0,0.0,34.0,37.0,37.0,46.0,38.0,37.0,48.0,56.0,44.0,49.0,46.0,48.0
L1_0,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,44.0,44.0,34.0,0.0,45.0,47.0,44.0,36.0,31.0,36.0,60.0,40.0,37.0,50.0,36.0
L1_0,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,33.0,43.0,37.0,45.0,0.0,48.0,47.0,41.0,42.0,51.0,43.0,37.0,44.0,37.0,39.0
L1_1,2020-01-01 00:00:00 to 2020-12-31 23:59:59.999999999,37.0,37.0,37.0,47.0,48.0,0.0,45.0,35.0,42.0,37.0,53.0,51.0,50.0,39.0,39.0
L1_1,2021-01-01 00:00:00 to 2021-12-31 23:59:59.999999999,44.0,40.0,46.0,44.0,47.0,45.0,0.0,34.0,39.0,40.0,44.0,34.0,31.0,44.0,34.0
L1_1,2022-01-01 00:00:00 to 2022-12-31 23:59:59.999999999,42.0,42.0,38.0,36.0,41.0,35.0,34.0,0.0,27.0,24.0,46.0,34.0,35.0,34.0,26.0
L1_1,2023-01-01 00:00:00 to 2023-12-31 23:59:59.999999999,39.0,37.0,37.0,31.0,42.0,42.0,39.0,27.0,0.0,43.0,53.0,37.0,30.0,41.0,33.0
L1_1,2024-01-01 00:00:00 to 2024-12-31 23:59:59.999999999,46.0,46.0,48.0,36.0,51.0,37.0,40.0,24.0,43.0,0.0,52.0,36.0,47.0,48.0,36.0
