In [36]:
import sys
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.metrics.pairwise import cosine_distances



In [37]:
#!{sys.executable} -m pip install pandas numpy scikit-learn

In [38]:
def random_data_gen(num_rows = 1000, n_layer_1 = 3, n_layer_2 = 20, start_dt = datetime(2020, 1, 1), end_dt = datetime(2025, 1, 1), seed = None):
    """Generate a synthetic event log with two categorical ids and a timestamp.

    Parameters
    ----------
    num_rows : int
        Number of rows (events) to generate.
    n_layer_1 : int
        Number of distinct ``L1_id`` values; ids are ``"L1_0"`` .. ``"L1_<n_layer_1 - 1>"``.
    n_layer_2 : int
        Number of distinct ``L2_id`` values; ids are ``"L2_0"`` .. ``"L2_<n_layer_2 - 1>"``.
    start_dt, end_dt : datetime
        Bounds of the interval the timestamps are drawn from (uniformly).
    seed : int or None
        When given, a private ``np.random.RandomState(seed)`` is used, so the
        same seed always yields the same frame. When None (the default), the
        global numpy RNG is used, exactly as before this parameter existed.

    Returns
    -------
    pd.DataFrame
        Columns ``L1_id``, ``L2_id`` and ``timestamp``. Timestamps are sorted
        ascending; the ids are drawn independently of the timestamps.
    """
    # The np.random module and a RandomState instance expose the same
    # randint/uniform API, so one code path serves both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    id1 = ["L1_" + str(x) for x in rng.randint(n_layer_1, size=num_rows)]
    id2 = ["L2_" + str(x) for x in rng.randint(n_layer_2, size=num_rows)]

    # Sample in POSIX-timestamp space, then convert back to datetimes.
    start_u = start_dt.timestamp()
    end_u = end_dt.timestamp()
    random_ts = rng.uniform(start_u, end_u, num_rows)
    dt = sorted(datetime.fromtimestamp(ts) for ts in random_ts)

    df = pd.DataFrame({
        'L1_id': id1,
        'L2_id': id2,
        'timestamp': dt
    })
    return df

In [39]:
# Seed numpy's global RNG before generating the synthetic data so that this
# frame, and every table derived from it below, is reproducible on
# Restart Kernel -> Run All.
np.random.seed(42)
df = random_data_gen()
df


Unnamed: 0,L1_id,L2_id,timestamp
0,L1_1,L2_19,2020-01-01 07:24:37.958029
1,L1_1,L2_19,2020-01-01 09:04:05.790815
2,L1_2,L2_14,2020-01-01 14:34:08.441235
3,L1_1,L2_6,2020-01-07 09:05:17.870064
4,L1_1,L2_6,2020-01-08 02:20:18.024705
...,...,...,...
995,L1_1,L2_7,2024-12-26 05:58:13.137806
996,L1_0,L2_8,2024-12-26 09:50:40.447717
997,L1_1,L2_0,2024-12-29 00:54:53.917082
998,L1_0,L2_17,2024-12-30 21:25:14.212302


In [40]:
# Label every row with the calendar month that contains its timestamp,
# rendered as "<first instant of the month> to <last instant of the month>".
month_of_event = df['timestamp'].dt.to_period('M')
df['timeframe'] = month_of_event.apply(
    lambda month: " to ".join([str(month.start_time), str(month.end_time)])
)
df

Unnamed: 0,L1_id,L2_id,timestamp,timeframe
0,L1_1,L2_19,2020-01-01 07:24:37.958029,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999...
1,L1_1,L2_19,2020-01-01 09:04:05.790815,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999...
2,L1_2,L2_14,2020-01-01 14:34:08.441235,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999...
3,L1_1,L2_6,2020-01-07 09:05:17.870064,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999...
4,L1_1,L2_6,2020-01-08 02:20:18.024705,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999...
...,...,...,...,...
995,L1_1,L2_7,2024-12-26 05:58:13.137806,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999...
996,L1_0,L2_8,2024-12-26 09:50:40.447717,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999...
997,L1_1,L2_0,2024-12-29 00:54:53.917082,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999...
998,L1_0,L2_17,2024-12-30 21:25:14.212302,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999...


In [41]:
# Count the events in each (L1_id, L2_id, timeframe) group and return the
# result as a flat frame with the tally in a 'counts' column.
group_keys = ['L1_id', 'L2_id', 'timeframe']
events_per_group = df.groupby(group_keys).size()
grouped_df = events_per_group.rename('counts').reset_index()
grouped_df

Unnamed: 0,L1_id,L2_id,timeframe,counts
0,L1_0,L2_0,2020-05-01 00:00:00 to 2020-05-31 23:59:59.999...,1
1,L1_0,L2_0,2020-08-01 00:00:00 to 2020-08-31 23:59:59.999...,1
2,L1_0,L2_0,2021-04-01 00:00:00 to 2021-04-30 23:59:59.999...,1
3,L1_0,L2_0,2021-12-01 00:00:00 to 2021-12-31 23:59:59.999...,1
4,L1_0,L2_0,2022-03-01 00:00:00 to 2022-03-31 23:59:59.999...,1
...,...,...,...,...
861,L1_2,L2_9,2023-12-01 00:00:00 to 2023-12-31 23:59:59.999...,1
862,L1_2,L2_9,2024-01-01 00:00:00 to 2024-01-31 23:59:59.999...,1
863,L1_2,L2_9,2024-03-01 00:00:00 to 2024-03-31 23:59:59.999...,1
864,L1_2,L2_9,2024-04-01 00:00:00 to 2024-04-30 23:59:59.999...,1


In [42]:
# Reshape to wide form: one row per (L1_id, timeframe) pair and one column per
# L2_id, holding the event count for that combination.
counts_wide = grouped_df.pivot(
    index=['L1_id', 'timeframe'],
    columns='L2_id',
    values='counts',
)
# Combinations that never occurred come out of the pivot as NaN; treat them as
# a count of zero.
pivot_df = counts_wide.fillna(0)
pivot_df

Unnamed: 0_level_0,L2_id,L2_0,L2_1,L2_10,L2_11,L2_12,L2_13,L2_14,L2_15,L2_16,L2_17,L2_18,L2_19,L2_2,L2_3,L2_4,L2_5,L2_6,L2_7,L2_8,L2_9
L1_id,timeframe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
L1_0,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
L1_0,2020-02-01 00:00:00 to 2020-02-29 23:59:59.999999999,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0
L1_0,2020-03-01 00:00:00 to 2020-03-31 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
L1_0,2020-04-01 00:00:00 to 2020-04-30 23:59:59.999999999,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
L1_0,2020-05-01 00:00:00 to 2020-05-31 23:59:59.999999999,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L1_2,2024-08-01 00:00:00 to 2024-08-31 23:59:59.999999999,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
L1_2,2024-09-01 00:00:00 to 2024-09-30 23:59:59.999999999,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
L1_2,2024-10-01 00:00:00 to 2024-10-31 23:59:59.999999999,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
L1_2,2024-11-01 00:00:00 to 2024-11-30 23:59:59.999999999,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# For each timeframe, compute the pairwise cosine distance between the L1_id
# count vectors observed in that timeframe, then stack the per-timeframe
# matrices into one frame indexed by (L1_id, timeframe).
per_timeframe_matrices = {}

# Iterate over the timeframes actually present in the index, in sorted order
# (the same order the sorted MultiIndex level yields).
timeframes = pivot_df.index.get_level_values('timeframe').unique().sort_values()

for tf in timeframes:
    tmp_df = pivot_df.xs(tf, level='timeframe')
    per_timeframe_matrices[tf] = pd.DataFrame(
        cosine_distances(tmp_df), index=tmp_df.index, columns=tmp_df.index
    )

# Concatenate once instead of growing a frame inside the loop: the dict keys
# become the outer 'timeframe' index level, which is then swapped behind L1_id
# so the layout matches pivot_df's (L1_id, timeframe) index. swaplevel only
# reorders the levels; the rows stay grouped by timeframe.
relative_cosine_dist_matrices = (
    pd.concat(per_timeframe_matrices, names=['timeframe', 'L1_id'])
    .swaplevel(0, 1)
)
relative_cosine_dist_matrices

Unnamed: 0_level_0,L1_id,L1_0,L1_1,L1_2
L1_id,timeframe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
L1_0,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,0.000000,0.773545,0.634852
L1_1,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,0.773545,0.000000,0.875965
L1_2,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,0.634852,0.875965,0.000000
L1_0,2020-02-01 00:00:00 to 2020-02-29 23:59:59.999999999,0.000000,0.803884,1.000000
L1_1,2020-02-01 00:00:00 to 2020-02-29 23:59:59.999999999,0.803884,0.000000,0.683772
L1_1,...,...,...,...
L1_1,2024-11-01 00:00:00 to 2024-11-30 23:59:59.999999999,1.000000,0.000000,1.000000
L1_2,2024-11-01 00:00:00 to 2024-11-30 23:59:59.999999999,1.000000,1.000000,0.000000
L1_0,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999999999,0.000000,1.000000,0.776393
L1_1,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999999999,1.000000,0.000000,1.000000


In [None]:
#df.set_index(pd.Index(['2023-01-01'] * len(df), name='date'), append=True)

L1_id,L1_0,L1_1,L1_2
L1_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
L1_0,0.0,0.773545,0.634852
L1_1,0.773545,0.0,0.875965
L1_2,0.634852,0.875965,0.0


In [43]:
# Pairwise cosine distance between every (L1_id, timeframe) row of pivot_df,
# labelled on both axes with pivot_df's own row index.
row_labels = pivot_df.index
pairwise_distances = cosine_distances(pivot_df)
cosine_dist_matrix = pd.DataFrame(
    pairwise_distances,
    index=row_labels,
    columns=row_labels,
)
cosine_dist_matrix

Unnamed: 0_level_0,L1_id,L1_0,L1_0,L1_0,L1_0,L1_0,L1_0,L1_0,L1_0,L1_0,L1_0,...,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2
Unnamed: 0_level_1,timeframe,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,2020-02-01 00:00:00 to 2020-02-29 23:59:59.999999999,2020-03-01 00:00:00 to 2020-03-31 23:59:59.999999999,2020-04-01 00:00:00 to 2020-04-30 23:59:59.999999999,2020-05-01 00:00:00 to 2020-05-31 23:59:59.999999999,2020-06-01 00:00:00 to 2020-06-30 23:59:59.999999999,2020-07-01 00:00:00 to 2020-07-31 23:59:59.999999999,2020-08-01 00:00:00 to 2020-08-31 23:59:59.999999999,2020-09-01 00:00:00 to 2020-09-30 23:59:59.999999999,2020-10-01 00:00:00 to 2020-10-31 23:59:59.999999999,...,2024-03-01 00:00:00 to 2024-03-31 23:59:59.999999999,2024-04-01 00:00:00 to 2024-04-30 23:59:59.999999999,2024-05-01 00:00:00 to 2024-05-31 23:59:59.999999999,2024-06-01 00:00:00 to 2024-06-30 23:59:59.999999999,2024-07-01 00:00:00 to 2024-07-31 23:59:59.999999999,2024-08-01 00:00:00 to 2024-08-31 23:59:59.999999999,2024-09-01 00:00:00 to 2024-09-30 23:59:59.999999999,2024-10-01 00:00:00 to 2024-10-31 23:59:59.999999999,2024-11-01 00:00:00 to 2024-11-30 23:59:59.999999999,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999999999
L1_id,timeframe,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
L1_0,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,0.000000,0.773545,1.000000,1.000000,0.817426,1.000000,1.000000,0.845697,0.483602,1.000000,...,0.833333,0.691393,0.727834,0.817426,0.422650,0.817426,1.000000,0.833333,0.591752,1.000000
L1_0,2020-02-01 00:00:00 to 2020-02-29 23:59:59.999999999,0.773545,0.000000,0.306625,0.679744,0.503861,1.000000,1.000000,0.685515,0.736883,0.901942,...,0.773545,0.475858,0.722650,0.627896,0.509710,0.751931,1.000000,0.886772,1.000000,0.861325
L1_0,2020-03-01 00:00:00 to 2020-03-31 23:59:59.999999999,1.000000,0.306625,0.000000,0.711325,0.776393,0.811018,0.711325,0.622036,0.841886,1.000000,...,0.795876,0.622036,0.833333,0.776393,0.823223,0.776393,1.000000,0.897938,1.000000,1.000000
L1_0,2020-04-01 00:00:00 to 2020-04-30 23:59:59.999999999,1.000000,0.679744,0.711325,0.000000,0.483602,0.345346,1.000000,0.781782,0.817426,0.795876,...,1.000000,0.781782,0.807550,1.000000,0.795876,1.000000,1.000000,0.882149,0.711325,1.000000
L1_0,2020-05-01 00:00:00 to 2020-05-31 23:59:59.999999999,0.817426,0.503861,0.776393,0.483602,0.000000,0.661938,1.000000,0.492907,0.717157,0.841886,...,1.000000,0.661938,0.701858,0.600000,0.841886,0.600000,0.830969,0.634852,0.776393,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L1_2,2024-08-01 00:00:00 to 2024-08-31 23:59:59.999999999,0.817426,0.751931,0.776393,1.000000,0.600000,1.000000,1.000000,0.661938,0.717157,0.841886,...,0.817426,0.492907,0.552786,0.400000,0.841886,0.000000,0.661938,0.269703,0.776393,0.776393
L1_2,2024-09-01 00:00:00 to 2024-09-30 23:59:59.999999999,1.000000,1.000000,1.000000,1.000000,0.830969,0.714286,1.000000,0.714286,0.760954,0.866369,...,0.845697,0.714286,0.622036,0.830969,1.000000,0.661938,0.000000,0.614242,0.433053,1.000000
L1_2,2024-10-01 00:00:00 to 2024-10-31 23:59:59.999999999,0.833333,0.886772,0.897938,0.882149,0.634852,1.000000,1.000000,0.768545,0.483602,0.855662,...,1.000000,0.845697,0.387628,0.452277,0.639156,0.269703,0.614242,0.000000,0.591752,0.693814
L1_2,2024-11-01 00:00:00 to 2024-11-30 23:59:59.999999999,0.591752,1.000000,1.000000,0.711325,0.776393,0.622036,1.000000,0.811018,0.683772,0.823223,...,1.000000,1.000000,0.500000,0.776393,0.823223,0.776393,0.433053,0.591752,0.000000,1.000000


L1_id,L1_id,timeframe,L1_0,L1_0,L1_0,L1_0,L1_0,L1_0,L1_0,L1_0,...,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2,L1_2
timeframe,Unnamed: 1_level_1,Unnamed: 2_level_1,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999999999,2020-02-01 00:00:00 to 2020-02-29 23:59:59.999999999,2020-03-01 00:00:00 to 2020-03-31 23:59:59.999999999,2020-04-01 00:00:00 to 2020-04-30 23:59:59.999999999,2020-05-01 00:00:00 to 2020-05-31 23:59:59.999999999,2020-06-01 00:00:00 to 2020-06-30 23:59:59.999999999,2020-07-01 00:00:00 to 2020-07-31 23:59:59.999999999,2020-08-01 00:00:00 to 2020-08-31 23:59:59.999999999,...,2024-03-01 00:00:00 to 2024-03-31 23:59:59.999999999,2024-04-01 00:00:00 to 2024-04-30 23:59:59.999999999,2024-05-01 00:00:00 to 2024-05-31 23:59:59.999999999,2024-06-01 00:00:00 to 2024-06-30 23:59:59.999999999,2024-07-01 00:00:00 to 2024-07-31 23:59:59.999999999,2024-08-01 00:00:00 to 2024-08-31 23:59:59.999999999,2024-09-01 00:00:00 to 2024-09-30 23:59:59.999999999,2024-10-01 00:00:00 to 2024-10-31 23:59:59.999999999,2024-11-01 00:00:00 to 2024-11-30 23:59:59.999999999,2024-12-01 00:00:00 to 2024-12-31 23:59:59.999999999
0,L1_0,2020-01-01 00:00:00 to 2020-01-31 23:59:59.999...,0.000000,0.773545,1.000000,1.000000,0.817426,1.000000,1.000000,0.845697,...,0.833333,0.691393,0.727834,0.817426,0.422650,0.817426,1.000000,0.833333,0.591752,1.000000
1,L1_0,2020-02-01 00:00:00 to 2020-02-29 23:59:59.999...,0.773545,0.000000,0.306625,0.679744,0.503861,1.000000,1.000000,0.685515,...,0.773545,0.475858,0.722650,0.627896,0.509710,0.751931,1.000000,0.886772,1.000000,0.861325
2,L1_0,2020-03-01 00:00:00 to 2020-03-31 23:59:59.999...,1.000000,0.306625,0.000000,0.711325,0.776393,0.811018,0.711325,0.622036,...,0.795876,0.622036,0.833333,0.776393,0.823223,0.776393,1.000000,0.897938,1.000000,1.000000
3,L1_0,2020-04-01 00:00:00 to 2020-04-30 23:59:59.999...,1.000000,0.679744,0.711325,0.000000,0.483602,0.345346,1.000000,0.781782,...,1.000000,0.781782,0.807550,1.000000,0.795876,1.000000,1.000000,0.882149,0.711325,1.000000
4,L1_0,2020-05-01 00:00:00 to 2020-05-31 23:59:59.999...,0.817426,0.503861,0.776393,0.483602,0.000000,0.661938,1.000000,0.492907,...,1.000000,0.661938,0.701858,0.600000,0.841886,0.600000,0.830969,0.634852,0.776393,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,L1_2,2024-08-01 00:00:00 to 2024-08-31 23:59:59.999...,0.817426,0.751931,0.776393,1.000000,0.600000,1.000000,1.000000,0.661938,...,0.817426,0.492907,0.552786,0.400000,0.841886,0.000000,0.661938,0.269703,0.776393,0.776393
176,L1_2,2024-09-01 00:00:00 to 2024-09-30 23:59:59.999...,1.000000,1.000000,1.000000,1.000000,0.830969,0.714286,1.000000,0.714286,...,0.845697,0.714286,0.622036,0.830969,1.000000,0.661938,0.000000,0.614242,0.433053,1.000000
177,L1_2,2024-10-01 00:00:00 to 2024-10-31 23:59:59.999...,0.833333,0.886772,0.897938,0.882149,0.634852,1.000000,1.000000,0.768545,...,1.000000,0.845697,0.387628,0.452277,0.639156,0.269703,0.614242,0.000000,0.591752,0.693814
178,L1_2,2024-11-01 00:00:00 to 2024-11-30 23:59:59.999...,0.591752,1.000000,1.000000,0.711325,0.776393,0.622036,1.000000,0.811018,...,1.000000,1.000000,0.500000,0.776393,0.823223,0.776393,0.433053,0.591752,0.000000,1.000000
