# Imports

In [1]:
import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt

# Loading the raw network

In [2]:
# Load the raw edge list from 'out.munmun_digg_reply': space-separated,
# first line skipped, columns named u, v, weight, timestamp.
cols = ['u', 'v', 'weight', 'timestamp']
df = pd.read_csv(
    'out.munmun_digg_reply',
    names=cols,
    sep=' ',
    skiprows=1,
)
df

Unnamed: 0,u,v,weight,timestamp
0,1,2,1,1225229529
1,3,2,1,1225369697
2,4,2,1,1225376543
3,5,2,1,1225397251
4,6,2,1,1225398672
...,...,...,...,...
87622,2342,24221,1,1226530877
87623,23647,2342,1,1226531828
87624,1423,2342,1,1226531875
87625,3739,30387,1,1226531096


In [3]:
# The weight column is not part of the (timestamp, u, v) link stream.
# Rebinding instead of inplace=True, together with errors="ignore", keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because "weight" is already gone.
df = df.drop(columns=["weight"], errors="ignore")
df

Unnamed: 0,u,v,timestamp
0,1,2,1225229529
1,3,2,1225369697
2,4,2,1225376543
3,5,2,1225397251
4,6,2,1225398672
...,...,...,...
87622,2342,24221,1226530877
87623,23647,2342,1226531828
87624,1423,2342,1226531875
87625,3739,30387,1226531096


In [4]:
# Reorder the columns to (timestamp, u, v).
# .copy() makes the result an independent frame rather than a selection of
# the previous one, so assigning into its columns afterwards does not
# trigger pandas' SettingWithCopyWarning.
df = df[['timestamp', 'u', 'v']].copy()
df

Unnamed: 0,timestamp,u,v
0,1225229529,1,2
1,1225369697,3,2
2,1225376543,4,2
3,1225397251,5,2
4,1225398672,6,2
...,...,...,...
87622,1226530877,2342,24221
87623,1226531828,23647,2342
87624,1226531875,1423,2342
87625,1226531096,3739,30387


In [5]:
# Cast the timestamp column to an integer dtype; u and v are left untouched.
df = df.astype({'timestamp': int})
df

Unnamed: 0,timestamp,u,v
0,1225229529,1,2
1,1225369697,3,2
2,1225376543,4,2
3,1225397251,5,2
4,1225398672,6,2
...,...,...,...
87622,1226530877,2342,24221
87623,1226531828,23647,2342
87624,1226531875,1423,2342
87625,1226531096,3739,30387


In [6]:
# Check how many occurrences (links) we have for every timestamp.
# groupby(...).size() is the built-in equivalent of applying
# `lambda x: x.shape[0]` to each group, without one Python call per group.
timestamps_grouped = df.groupby('timestamp').size()
print(timestamps_grouped)

timestamp
1225229529    1
1225274307    1
1225287486    1
1225288438    1
1225290183    1
             ..
1226532116    1
1226532281    1
1226532332    1
1226532552    1
1226536024    1
Length: 83943, dtype: int64


In [8]:
# Start from an empty frame; columns are added to it afterwards.
df_time = pd.DataFrame()

In [9]:
# Interpret the integer timestamps as seconds (unit='s') and store the
# resulting datetimes in df_time['timestamp'].
df_time = df_time.assign(timestamp=pd.to_datetime(df['timestamp'], unit='s'))
df_time

Unnamed: 0,timestamp
0,2008-10-28 21:32:09
1,2008-10-30 12:28:17
2,2008-10-30 14:22:23
3,2008-10-30 20:07:31
4,2008-10-30 20:31:12
...,...
87622,2008-11-12 23:01:17
87623,2008-11-12 23:17:08
87624,2008-11-12 23:17:55
87625,2008-11-12 23:04:56


In [10]:
# Span between the earliest and the latest datetime in df_time.
stamps = df_time['timestamp']
time_span = stamps.max() - stamps.min()
time_span

Timedelta('15 days 02:54:55')

In [11]:
# Whole days covered by the span: floor of total seconds / seconds per day.
SECONDS_PER_DAY = 86400
total_days = time_span.total_seconds() // SECONDS_PER_DAY
print("Total number of days:", total_days)

Total number of days: 15.0


In [12]:
# Number of distinct (non-missing) datetimes in the stream.
timestamp_column = df_time['timestamp']
unique_timestamps = timestamp_column.nunique(dropna=True)

print("Total different timestamps:", unique_timestamps)

Total different timestamps: 83943


# Remove direction, sort by `t`, `u`, `v`, remove self-loops and duplicates, then save the network

## Removing direction and sorting the link stream by `t`, `u`, `v`

In [13]:
# Drop the direction of each link by storing its endpoints as
# (smaller, larger): rows with u > v get their two endpoints exchanged.
mask = df['u'] > df['v']
lower_endpoint = df['v'].where(mask, df['u'])
upper_endpoint = df['u'].where(mask, df['v'])

# Order the stream by timestamp, then by u, then by v.
df = (
    df.assign(u=lower_endpoint, v=upper_endpoint)
      .sort_values(by=['timestamp', 'u', 'v'])
)

# Every row of df now has u <= v
df

Unnamed: 0,timestamp,u,v
0,1225229529,1,2
42,1225274307,51,52
77,1225287486,91,92
104,1225288438,124,125
28,1225290183,34,35
...,...,...,...
85469,1226532116,451,1141
80227,1226532281,4688,25607
80323,1226532332,879,25607
69185,1226532552,14939,26373


In [14]:
# Sorted copy, kept under its original name in case another cell uses it.
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# The checks below inspect `df` itself. Testing the freshly sorted copy
# could never fail on the timestamp order, so it would not tell us anything
# about the frame that is actually used and saved.

# Check that every link is stored with u <= v (vectorised comparison
# instead of a row-wise apply).
if not (df['v'] >= df['u']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
    # Handle the problem as needed, e.g., raise an exception or exit the script
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Check that the rows of df are in non-decreasing timestamp order.
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
    # Handle the problem as needed, e.g., raise an exception or exit the script
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Removing self-loops

In [19]:
# Boolean marker for rows whose two endpoints are the same node.
self_loops = df['u'].eq(df['v'])

if not self_loops.any():
    print("No self-loops found.")
else:
    # Show the rows that are self-loops
    print("Self-loops:")
    print(df.loc[self_loops])

    # Keep only the rows with two distinct endpoints
    df_no_self_loops = df.loc[~self_loops]

    # Show what is left and continue with it as df
    print("\nDataFrame without self-loops:")
    print(df_no_self_loops)
    df = df_no_self_loops

Self-loops:
        timestamp      u      v
318    1225333227    361    361
1179   1225366656   1210   1210
1237   1225366729    274    274
1217   1225368616   1247   1247
1706   1225370070   1674   1674
...           ...    ...    ...
81488  1226528392  29092  29092
49529  1226528543  21598  21598
87599  1226529494   3764   3764
87613  1226530253   4792   4792
87611  1226530331  21539  21539

[1424 rows x 3 columns]

DataFrame without self-loops:
        timestamp      u      v
0      1225229529      1      2
42     1225274307     51     52
77     1225287486     91     92
104    1225288438    124    125
28     1225290183     34     35
...           ...    ...    ...
85469  1226532116    451   1141
80227  1226532281   4688  25607
80323  1226532332    879  25607
69185  1226532552  14939  26373
31587  1226536024    626  16099

[86203 rows x 3 columns]


## Removing duplicate rows

In [20]:
# Mark every row that has an identical twin (keep=False flags all
# occurrences, not only the repeats); computed once and reused below.
duplicate_rows = df.duplicated(keep=False)
duplicate_count = duplicate_rows.sum()

# Always build the de-duplicated frame so that `df_no_duplicates` exists
# whether or not duplicates were found (previously it was only defined in
# the duplicate branch).
df_no_duplicates = df.drop_duplicates()

if duplicate_count > 0:
    # Print all duplicate rows
    print("Duplicate rows:")
    print(df[duplicate_rows])

    # Display the DataFrame without duplicates
    print("\nDataFrame without duplicates:")
    print(df_no_duplicates)
else:
    print("No duplicates found.")

# Continue with the de-duplicated stream. When there are no duplicates this
# has the same rows as before; when there are, they are now actually removed
# instead of only being reported.
df = df_no_duplicates

No duplicates found.


In [22]:
# Display the current link stream. The assignment below is commented out,
# so this cell does not modify `df`.
#df = df_no_duplicates
df

Unnamed: 0,timestamp,u,v
0,1225229529,1,2
42,1225274307,51,52
77,1225287486,91,92
104,1225288438,124,125
28,1225290183,34,35
...,...,...,...
85469,1226532116,451,1141
80227,1226532281,4688,25607
80323,1226532332,879,25607
69185,1226532552,14939,26373


In [23]:
# Sorted copy, kept under its original name in case another cell uses it.
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# Re-validate `df` itself before it is saved. Testing the freshly sorted
# copy could never fail on the timestamp order, so it would say nothing
# about the frame that is actually written out.

# Check that every link is stored with u <= v (vectorised comparison
# instead of a row-wise apply).
if not (df['v'] >= df['u']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
    # Handle the problem as needed, e.g., raise an exception or exit the script
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Check that the rows of df are in non-decreasing timestamp order.
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
    # Handle the problem as needed, e.g., raise an exception or exit the script
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Saving the link stream

In [25]:
# Write df to 'digg.txt' as space-separated values, with neither a header
# row nor the index column. Stop here.
df.to_csv('digg.txt', sep=' ', header=False, index=False)