# Imports

In [1]:
import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt

# Loading the raw network

In [2]:
# Load the raw tab-separated edge list. The first line of the file is skipped
# and the four columns are named positionally. Later cells reduce this frame
# to the (timestamp, u, v) link-stream layout.
cols = ['u', 'v', 'weight', 'timestamp']
df = pd.read_csv(
    'out.dnc-temporalGraph',
    sep='\t',
    names=cols,
    skiprows=1,
)
df

Unnamed: 0,u,v,weight,timestamp
0,419,465,1,1463507482
1,869,453,1,1462337903
2,943,1151,1,1463167636
3,943,217,1,1463167636
4,943,841,1,1463167636
...,...,...,...,...
39259,691,691,1,1462516415
39260,1287,1550,1,1463727590
39261,601,1952,1,1462350250
39262,1876,1287,1,1461579818


In [3]:
# The link stream only needs (timestamp, u, v), so the weight column is dropped.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises a KeyError.
df = df.drop(columns=["weight"], errors="ignore")
df

Unnamed: 0,u,v,timestamp
0,419,465,1463507482
1,869,453,1462337903
2,943,1151,1463167636
3,943,217,1463167636
4,943,841,1463167636
...,...,...,...
39259,691,691,1462516415
39260,1287,1550,1463727590
39261,601,1952,1462350250
39262,1876,1287,1461579818


In [4]:
# Put the timestamp first so that every row reads as (timestamp, u, v)
column_order = ['timestamp', 'u', 'v']
df = df[column_order]
df

Unnamed: 0,timestamp,u,v
0,1463507482,419,465
1,1462337903,869,453
2,1463167636,943,1151
3,1463167636,943,217
4,1463167636,943,841
...,...,...,...
39259,1462516415,691,691
39260,1463727590,1287,1550
39261,1462350250,601,1952
39262,1461579818,1876,1287


In [5]:
# Store the timestamps as integers
timestamp_as_int = df['timestamp'].astype(int)
df['timestamp'] = timestamp_as_int
df

Unnamed: 0,timestamp,u,v
0,1463507482,419,465
1,1462337903,869,453
2,1463167636,943,1151
3,1463167636,943,217
4,1463167636,943,841
...,...,...,...
39259,1462516415,691,691
39260,1463727590,1287,1550
39261,1462350250,601,1952
39262,1461579818,1876,1287


In [6]:
# Count how many links (rows) occur at each distinct timestamp.
# groupby(...).size() is the built-in per-group row count; it replaces the
# slower hand-written apply(lambda x: x.shape[0]) and yields the same Series
# (indexed by timestamp, integer counts).
timestamps_grouped = df.groupby('timestamp').size()
print(timestamps_grouped)

timestamp
1379298633    1
1421071197    1
1421246251    1
1421247394    1
1422010302    1
             ..
1464155763    1
1464155888    8
1464156450    2
1464156466    8
1464168388    1
Length: 19383, dtype: int64


In [8]:
# Start from an empty frame; the datetime version of the timestamps is added
# to it as a column in a later cell.
df_time = pd.DataFrame()

In [9]:
# Interpret the integer timestamps as seconds since the Unix epoch and keep
# the resulting datetimes in their own frame
timestamps_as_datetime = pd.to_datetime(df['timestamp'], unit='s')
df_time['timestamp'] = timestamps_as_datetime
df_time

Unnamed: 0,timestamp
0,2016-05-17 17:51:22
1,2016-05-04 04:58:23
2,2016-05-13 19:27:16
3,2016-05-13 19:27:16
4,2016-05-13 19:27:16
...,...
39259,2016-05-06 06:33:35
39260,2016-05-20 06:59:50
39261,2016-05-04 08:24:10
39262,2016-04-25 10:23:38


In [10]:
# Length of the observation window: latest timestamp minus earliest timestamp
earliest = df_time['timestamp'].min()
latest = df_time['timestamp'].max()
time_span = latest - earliest
time_span

Timedelta('982 days 06:55:55')

In [11]:
# Number of whole days contained in the time span (the fractional day is
# discarded by the floor division; the result stays a float)
SECONDS_PER_DAY = 86400
total_days = time_span.total_seconds() // SECONDS_PER_DAY
print("Total number of days:", total_days)

Total number of days: 982.0


In [12]:
# How many distinct points in time does the link stream contain?
timestamp_column = df_time['timestamp']
unique_timestamps = timestamp_column.nunique()

print("Total different timestamps:", unique_timestamps)

Total different timestamps: 19383


# Remove direction, sort by `t`, `u`, `v`, remove self-loops and duplicates, then save the network

## Removing direction and sorting the link stream by `t`, `u`, `v`

In [13]:
# Make every link undirected by storing its endpoints in ascending order:
# rows where u > v get their two endpoints exchanged.
mask = df['u'] > df['v']
swapped_endpoints = df.loc[mask, ['v', 'u']].values  # raw array, so assignment is positional
df.loc[mask, ['u', 'v']] = swapped_endpoints

# Order the link stream chronologically, breaking ties by u and then by v
sort_keys = ['timestamp', 'u', 'v']
df.sort_values(by=sort_keys, inplace=True)

# Every row of df now satisfies u <= v
df

Unnamed: 0,timestamp,u,v
30437,1379298633,547,607
27750,1421071197,1625,1625
27651,1421246251,1625,1625
29071,1421247394,1,993
5722,1422010302,56,1159
...,...,...,...
5866,1464156466,1037,1876
39181,1464156466,1037,1876
5867,1464156466,1037,1952
39182,1464156466,1037,1952


In [14]:
# Sanity checks on the link stream held in `df`.
#
# The checks run on `df` itself. Testing a freshly sorted copy for being
# sorted by timestamp can never fail, so it would not detect an unsorted `df`.

# Sorted copy, kept as a notebook-level name; the checks below do not use it
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# Endpoint order: every link must satisfy u <= v (vectorized comparison
# instead of a row-by-row apply)
if not (df['u'] <= df['v']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Chronological order: timestamps of `df` must be non-decreasing in row order
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Removing self-loops

In [19]:
# Boolean flag per row: True where a link connects a node to itself
self_loops = df['u'] == df['v']

if not self_loops.any():
    print("No self-loops found.")
else:
    # Show the offending rows before discarding them
    print("Self-loops:")
    print(df[self_loops])

    # Keep only the links that join two distinct nodes
    df_no_self_loops = df[~self_loops]

    # Show the result and continue with it as the working frame
    print("\nDataFrame without self-loops:")
    print(df_no_self_loops)
    df = df_no_self_loops

Self-loops:
        timestamp     u     v
27750  1421071197  1625  1625
27651  1421246251  1625  1625
28113  1422036537  1625  1625
28395  1422098530  1625  1625
27751  1422136953  1226  1226
...           ...   ...   ...
29753  1464117806  1625  1625
30038  1464117806  1625  1625
6015   1464120805  1625  1625
29146  1464120805  1625  1625
30453  1464120805  1625  1625

[1843 rows x 3 columns]

DataFrame without self-loops:
        timestamp     u     v
30437  1379298633   547   607
29071  1421247394     1   993
5722   1422010302    56  1159
31199  1423271713   415  1092
2711   1423560787   643  1159
...           ...   ...   ...
5866   1464156466  1037  1876
39181  1464156466  1037  1876
5867   1464156466  1037  1952
39182  1464156466  1037  1952
32689  1464168388  1377  1641

[37421 rows x 3 columns]


## Removing duplicate rows

In [20]:
# Flag every row that has at least one exact copy; keep=False marks all
# members of a duplicate group, not only the repeats.
is_duplicated = df.duplicated(keep=False)
duplicate_count = is_duplicated.sum()

# Build the de-duplicated frame unconditionally (first occurrence of each row
# is kept). A later cell assigns `df_no_duplicates` to `df`, so the name must
# exist even when the data contains no duplicates.
df_no_duplicates = df.drop_duplicates()

if duplicate_count > 0:
    # Print all duplicate rows
    print("Duplicate rows:")
    print(df[is_duplicated])

    # Display the DataFrame without duplicates
    print("\nDataFrame without duplicates:")
    print(df_no_duplicates)
else:
    print("No duplicates found.")

Duplicate rows:
        timestamp     u     v
29456  1443611467   993  1882
29551  1443611467   993  1882
29026  1443612186   993  1882
29175  1443612186   993  1882
29533  1443612186   993  1882
...           ...   ...   ...
39184  1464156466  1037  1377
5866   1464156466  1037  1876
39181  1464156466  1037  1876
5867   1464156466  1037  1952
39182  1464156466  1037  1952

[9481 rows x 3 columns]

DataFrame without duplicates:
        timestamp     u     v
30437  1379298633   547   607
29071  1421247394     1   993
5722   1422010302    56  1159
31199  1423271713   415  1092
2711   1423560787   643  1159
...           ...   ...   ...
5868   1464156466  1037  1159
5869   1464156466  1037  1377
5866   1464156466  1037  1876
5867   1464156466  1037  1952
32689  1464168388  1377  1641

[31725 rows x 3 columns]


In [21]:
# From here on, work with the de-duplicated link stream
df = df_no_duplicates
df

Unnamed: 0,timestamp,u,v
30437,1379298633,547,607
29071,1421247394,1,993
5722,1422010302,56,1159
31199,1423271713,415,1092
2711,1423560787,643,1159
...,...,...,...
5868,1464156466,1037,1159
5869,1464156466,1037,1377
5866,1464156466,1037,1876
5867,1464156466,1037,1952


In [22]:
# Sanity checks on the cleaned link stream held in `df`.
#
# The checks run on `df` itself. Testing a freshly sorted copy for being
# sorted by timestamp can never fail, so it would not detect an unsorted `df`.

# Sorted copy, kept as a notebook-level name; the checks below do not use it
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# Endpoint order: every link must satisfy u <= v (vectorized comparison
# instead of a row-by-row apply)
if not (df['u'] <= df['v']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Chronological order: timestamps of `df` must be non-decreasing in row order
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    # The closing parenthesis was missing here, which made the cell a SyntaxError
    print("DataFrame is sorted by 'timestamp'.")

DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Saving the link stream

In [24]:
# Write the link stream to dnc.txt as space-separated rows in the frame's
# column order, with neither a header line nor the index column.
df.to_csv('dnc.txt', sep=' ', header=False, index=False)