# Imports

In [1]:
import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt

# Loading the raw network

In [2]:
# Load the raw edge list from 'out.topology'; later cells reshape it into (t, u, v) rows
cols = ['u', 'v', 'weight', 'timestamp']
df = pd.read_csv(
    'out.topology',
    sep=" ",
    names=cols,
    skiprows=1,  # skip the first line of the file (presumably a header line -- TODO confirm)
)
df

Unnamed: 0,u,v,weight,timestamp
0,1,2,1,1266192000
1,3,4,1,1266474818
2,5,6,1,1266213618
3,7,8,1,1266238515
4,9,10,1,1266192000
...,...,...,...,...
171398,160,466,1,1266204900
171399,670,32351,1,1266192000
171400,1794,9116,1,1266192000
171401,53,11520,1,1266192000


In [3]:
# Drop the edge weight: only timestamp, u and v are kept by the following cells.
# Rebinding (instead of inplace=True) together with errors="ignore" makes re-running this
# cell a no-op rather than a KeyError once "weight" has already been removed.
df = df.drop(columns=["weight"], errors="ignore")
df

Unnamed: 0,u,v,timestamp
0,1,2,1266192000
1,3,4,1266474818
2,5,6,1266213618
3,7,8,1266238515
4,9,10,1266192000
...,...,...,...
171398,160,466,1266204900
171399,670,32351,1266192000
171400,1794,9116,1266192000
171401,53,11520,1266192000


In [4]:
# Reorder the columns to (timestamp, u, v).
# .copy() yields an independent frame, so the in-place column assignments made by later
# cells are not treated by pandas as writes to a slice of the previous frame.
df = df[['timestamp', 'u', 'v']].copy()
df

Unnamed: 0,timestamp,u,v
0,1266192000,1,2
1,1266474818,3,4
2,1266213618,5,6
3,1266238515,7,8
4,1266192000,9,10
...,...,...,...
171398,1266204900,160,466
171399,1266192000,670,32351
171400,1266192000,1794,9116
171401,1266192000,53,11520


In [5]:
# Store timestamps as 64-bit integers. A bare `int` follows the platform's default integer
# width (which can be 32-bit on some platforms), so the width is spelled out explicitly.
df['timestamp'] = df['timestamp'].astype('int64')
df

Unnamed: 0,timestamp,u,v
0,1266192000,1,2
1,1266474818,3,4
2,1266213618,5,6
3,1266238515,7,8
4,1266192000,9,10
...,...,...,...
171398,1266204900,160,466
171399,1266192000,670,32351
171400,1266192000,1794,9116
171401,1266192000,53,11520


In [6]:
# Check how many occurrences (rows) we have for every timestamp.
# groupby(...).size() is the built-in per-group row count; it avoids calling a Python
# lambda once per timestamp group.
timestamps_grouped = df.groupby('timestamp').size()
print(timestamps_grouped)

timestamp
1266192000    67010
1266192001       60
1266192002       22
1266192003       13
1266192004       11
              ...  
1268207999       15
1268208000       21
1268208001        3
1268208002        3
1268208004        1
Length: 32824, dtype: int64


In [8]:
# Empty frame that will receive the datetime version of the timestamps
df_time = pd.DataFrame()

In [9]:
# Interpret the integer timestamps as seconds since the Unix epoch
timestamps_as_datetime = pd.to_datetime(df['timestamp'], unit='s')
df_time['timestamp'] = timestamps_as_datetime
df_time

Unnamed: 0,timestamp
0,2010-02-15 00:00:00
1,2010-02-18 06:33:38
2,2010-02-15 06:00:18
3,2010-02-15 12:55:15
4,2010-02-15 00:00:00
...,...
171398,2010-02-15 03:35:00
171399,2010-02-15 00:00:00
171400,2010-02-15 00:00:00
171401,2010-02-15 00:00:00


In [10]:
# Time elapsed between the earliest and the latest link
earliest = df_time['timestamp'].min()
latest = df_time['timestamp'].max()
time_span = latest - earliest
time_span

Timedelta('23 days 08:00:04')

In [11]:
# Whole days contained in the time span (the remainder of the last, partial day is discarded)
SECONDS_PER_DAY = 86400
total_days = time_span.total_seconds() // SECONDS_PER_DAY
print("Total number of days:", total_days)

Total number of days: 23.0


In [12]:
# Number of distinct instants at which at least one link occurs
timestamp_column = df_time['timestamp']
unique_timestamps = timestamp_column.nunique()
print("Total different timestamps:", unique_timestamps)

Total different timestamps: 32824


# Remove direction, sort by `t`, `u`, `v`, remove self-loops and duplicates, then save the network

## Removing direction and sorting the link stream by `t`, `u`, `v`

In [13]:
# Make every link undirected by storing its endpoints in increasing order:
# rows where u > v get their two endpoints exchanged.
mask = df['u'].gt(df['v'])
swapped_endpoints = df.loc[mask, ['v', 'u']].to_numpy()
df.loc[mask, ['u', 'v']] = swapped_endpoints

# Order the link stream chronologically, breaking ties by u and then by v
df.sort_values(['timestamp', 'u', 'v'], inplace=True)

# From here on every row satisfies u <= v
df

Unnamed: 0,timestamp,u,v
0,1266192000,1,2
39,1266192000,1,73
162,1266192000,1,257
10133,1266192000,1,314
110776,1266192000,1,361
...,...,...,...
58624,1268208001,785,3237
130464,1268208002,160,22264
25934,1268208002,304,412
62313,1268208002,325,3178


In [14]:
# Sorted copy, kept under its original name in case other cells rely on it
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# Every link must be stored with u <= v.
# Vectorised comparison on `df` itself instead of a row-by-row apply.
if not (df['v'] >= df['u']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
    # Handle the problem as needed, e.g., raise an exception or exit the script
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# The chronological check is run on `df`, not on the sorted copy: a frame that was just
# sorted by timestamp is monotonic by construction, so testing it says nothing about `df`.
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
    # Handle the problem as needed, e.g., raise an exception or exit the script
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


In [15]:
# The link stream is expected to be in chronological order
df['timestamp'].is_monotonic_increasing

True

In [16]:
# Not expected to hold globally: rows are ordered by timestamp first,
# so u is only ordered among rows sharing the same timestamp
df['u'].is_monotonic_increasing

False

In [17]:
# Not expected to hold globally: v is the last sort key,
# so it is only ordered among rows sharing the same timestamp and u
df['v'].is_monotonic_increasing

False

## Removing self-loops

In [18]:
# A self-loop is a link whose two endpoints are the same node
self_loops = df['u'].eq(df['v'])

if not self_loops.any():
    print("No self-loops found.")
else:
    # Show the offending rows
    print("Self-loops:")
    print(df.loc[self_loops])

    # Keep only the links that join two distinct nodes
    df_no_self_loops = df.loc[~self_loops]

    # Show the result and continue with it
    print("\nDataFrame without self-loops:")
    print(df_no_self_loops)
    df = df_no_self_loops

No self-loops found.


## Removing duplicate rows

In [19]:
# Flag every row that has at least one identical copy (keep=False marks all copies,
# not only the repeated ones), and count the flagged rows.
is_duplicated = df.duplicated(keep=False)
duplicate_count = is_duplicated.sum()

# Always defined, so the following cell can rebind `df` to it even when nothing had to be
# removed (previously it only existed when duplicates were found).
df_no_duplicates = df.drop_duplicates()

if duplicate_count > 0:
    # Print all duplicate rows
    print("Duplicate rows:")
    print(df[is_duplicated])

    # Display the DataFrame without duplicates
    print("\nDataFrame without duplicates:")
    print(df_no_duplicates)
else:
    print("No duplicates found.")

Duplicate rows:
         timestamp    u     v
108682  1266192000    1   806
143933  1266192000    1   806
25878   1266192000    1  8821
73851   1266192000    1  8821
17554   1266192000    3  4496
...            ...  ...   ...
163761  1268207998  306   325
39939   1268208000    5  3049
71061   1268208000    5  3049
101348  1268208000  105  3049
131758  1268208000  105  3049

[31677 rows x 3 columns]

DataFrame without duplicates:
         timestamp     u      v
0       1266192000     1      2
39      1266192000     1     73
162     1266192000     1    257
10133   1266192000     1    314
110776  1266192000     1    361
...            ...   ...    ...
58624   1268208001   785   3237
130464  1268208002   160  22264
25934   1268208002   304    412
62313   1268208002   325   3178
144079  1268208004  7105  25235

[154841 rows x 3 columns]


In [20]:
# Continue with the de-duplicated link stream
df = df_no_duplicates
df

Unnamed: 0,timestamp,u,v
0,1266192000,1,2
39,1266192000,1,73
162,1266192000,1,257
10133,1266192000,1,314
110776,1266192000,1,361
...,...,...,...
58624,1268208001,785,3237
130464,1268208002,160,22264
25934,1268208002,304,412
62313,1268208002,325,3178


In [21]:
# Sorted copy, kept under its original name in case other cells rely on it
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# Every link must be stored with u <= v.
# Vectorised comparison on `df` itself instead of a row-by-row apply.
if not (df['v'] >= df['u']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
    # Handle the problem as needed, e.g., raise an exception or exit the script
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# The chronological check is run on `df`, not on the sorted copy: a frame that was just
# sorted by timestamp is monotonic by construction, so testing it says nothing about `df`.
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
    # Handle the problem as needed, e.g., raise an exception or exit the script
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Saving the link stream

In [24]:
# Write the link stream as space-separated rows, without a header line or an index column
df.to_csv('internettopology.txt', sep=' ', header=False, index=False)  # Stop here