# Imports

In [1]:
import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt

# Loading the raw network

In [2]:
# Load the raw UCI edge list ('out.opsahl-ucsocial'); later cells reshape it
# into the uci.txt link-stream layout (t, u, v).
# The first two lines of the file are skipped (presumably header/comment
# lines - verify against the file) and the four space-separated fields are
# named explicitly.
cols = ['u', 'v', 'weight', 'timestamp']
df = pd.read_csv(
    'out.opsahl-ucsocial',
    sep=" ",
    names=cols,
    skiprows=2,
)
df

Unnamed: 0,u,v,weight,timestamp
0,1,2,1,1082008561
1,3,4,1,1082123439
2,5,2,1,1082381991
3,6,7,1,1082407219
4,8,7,1,1082407356
...,...,...,...,...
59830,1899,1847,1,1098744248
59831,1899,1097,1,1098744436
59832,1899,277,1,1098744603
59833,1878,1624,1,1098744711


In [3]:
# Drop the 'weight' column: only (timestamp, u, v) is kept for the link stream.
# A new frame is returned instead of mutating in place, and errors="ignore"
# makes the cell safe to re-run once 'weight' has already been removed
# (the in-place version raises KeyError on a second execution).
df = df.drop(columns=["weight"], errors="ignore")
df

Unnamed: 0,u,v,timestamp
0,1,2,1082008561
1,3,4,1082123439
2,5,2,1082381991
3,6,7,1082407219
4,8,7,1082407356
...,...,...,...
59830,1899,1847,1098744248
59831,1899,1097,1098744436
59832,1899,277,1098744603
59833,1878,1624,1098744711


In [4]:
# Reorder the columns to the (t, u, v) layout: timestamp first, then the two
# endpoints of each link.
df = df.loc[:, ['timestamp', 'u', 'v']]
df

Unnamed: 0,timestamp,u,v
0,1082008561,1,2
1,1082123439,3,4
2,1082381991,5,2
3,1082407219,6,7
4,1082407356,8,7
...,...,...,...
59830,1098744248,1899,1847
59831,1098744436,1899,1097
59832,1098744603,1899,277
59833,1098744711,1878,1624


In [5]:
# Store timestamps as explicit 64-bit integers.
# `astype(int)` resolves to the platform default integer, which is 32-bit on
# some platforms (e.g. Windows with NumPy < 2); pinning 'int64' keeps the
# dtype identical everywhere. `assign` builds a new frame rather than writing
# into the existing one, so re-running the cell is harmless.
df = df.assign(timestamp=df['timestamp'].astype('int64'))
df

Unnamed: 0,timestamp,u,v
0,1082008561,1,2
1,1082123439,3,4
2,1082381991,5,2
3,1082407219,6,7
4,1082407356,8,7
...,...,...,...
59830,1098744248,1899,1847
59831,1098744436,1899,1097
59832,1098744603,1899,277
59833,1098744711,1878,1624


In [6]:
# Check how many occurrences (rows) we have for every timestamp.
# `groupby(...).size()` counts the rows of each group directly; it yields the
# same Series as applying `lambda x: x.shape[0]` per group, without calling a
# Python function once per timestamp.
timestamps_grouped = df.groupby('timestamp').size()
print(timestamps_grouped)

timestamp
1082008561    1
1082123439    1
1082381991    1
1082407219    1
1082407356    1
             ..
1098744248    1
1098744436    1
1098744603    1
1098744711    1
1098744742    1
Length: 58911, dtype: int64


In [8]:
# Separate, initially empty frame that receives the datetime-converted
# timestamps, so `df` itself keeps its integer timestamps.
df_time = pd.DataFrame()

In [9]:
# Interpret the integer timestamps as seconds since the Unix epoch and store
# the resulting datetimes in df_time under the same column name.
timestamps_as_datetime = pd.to_datetime(df['timestamp'], unit='s')
df_time = df_time.assign(timestamp=timestamps_as_datetime)
df_time

Unnamed: 0,timestamp
0,2004-04-15 05:56:01
1,2004-04-16 13:50:39
2,2004-04-19 13:39:51
3,2004-04-19 20:40:19
4,2004-04-19 20:42:36
...,...
59830,2004-10-25 22:44:08
59831,2004-10-25 22:47:16
59832,2004-10-25 22:50:03
59833,2004-10-25 22:51:51


In [10]:
# Total time covered by the data: latest datetime minus earliest datetime.
event_times = df_time['timestamp']
earliest_event = event_times.min()
latest_event = event_times.max()
time_span = latest_event - earliest_event
time_span

Timedelta('193 days 16:56:21')

In [11]:
# Number of whole days in the time span.
# `Timedelta.days` gives the integer day component directly, replacing the
# manual seconds / 86400 arithmetic (which produced a float such as 193.0).
total_days = time_span.days
print("Total number of days:", total_days)

Total number of days: 193.0


In [12]:
# Count how many distinct timestamp values occur in the data.
unique_timestamps = df_time['timestamp'].nunique()

print(f"Total different timestamps: {unique_timestamps}")

Total different timestamps: 58911


# Remove direction, sort by `t`, `u`, `v`, remove self-loops and duplicates, then save the network

## Removing direction and sorting the link stream by `t`, `u`, `v`

In [13]:
# Remove direction: store each link with its endpoints ordered so that
# u <= v (u = smaller endpoint, v = larger endpoint), then order the link
# stream by timestamp, u, v.
# The result is built as a new frame (no in-place .loc write, no
# inplace=True sort), so the cell can be re-run safely and frames shown by
# earlier cells are not silently modified.
endpoints = df[['u', 'v']]
df = (
    df.assign(u=endpoints.min(axis=1), v=endpoints.max(axis=1))
      .sort_values(by=['timestamp', 'u', 'v'])
)

# Invariant after removing direction: every row satisfies u <= v.
assert (df['u'] <= df['v']).all(), "found a row with u > v"

# Now df contains the sorted DataFrame with 'u' <= 'v'
df

Unnamed: 0,timestamp,u,v
0,1082008561,1,2
1,1082123439,3,4
2,1082381991,2,5
3,1082407219,6,7
4,1082407356,7,8
...,...,...,...
59830,1098744248,1847,1899
59831,1098744436,1097,1899
59832,1098744603,277,1899
59833,1098744711,1624,1878


In [14]:
# Sorted copy by timestamp, u, and v (kept under its original name).
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# The checks below inspect `df` itself. Testing the freshly sorted copy would
# always report the timestamps as sorted, whatever state `df` is in.

# Check that every link has its endpoints ordered (u <= v); vectorised
# comparison instead of a row-by-row apply.
if not (df['v'] >= df['u']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Check that the rows of `df` are in non-decreasing timestamp order.
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Removing self-loops

In [18]:
# Flag links whose two endpoints are the same node.
self_loops = df['u'].eq(df['v'])

if not self_loops.any():
    print("No self-loops found.")
else:
    # Show the offending rows.
    print("Self-loops:")
    print(df.loc[self_loops])

    # Keep only the rows that are not self-loops.
    df_no_self_loops = df.loc[~self_loops]

    # Show the filtered frame and continue working with it.
    print("\nDataFrame without self-loops:")
    print(df_no_self_loops)
    df = df_no_self_loops

No self-loops found.


## Removing duplicate rows

In [19]:
# Mark every row that has at least one identical twin (all copies flagged),
# computed once and reused for both the count and the listing.
duplicated_rows = df.duplicated(keep=False)
duplicate_count = duplicated_rows.sum()

# Always build the de-duplicated frame (first occurrence of each row kept),
# so `df_no_duplicates` exists even when no duplicates are found; it used to
# be assigned only inside the "duplicates found" branch.
df_no_duplicates = df.drop_duplicates()

if duplicate_count > 0:
    # Print all duplicate rows
    print("Duplicate rows:")
    print(df[duplicated_rows])

    # Display the DataFrame without duplicates
    print("\nDataFrame without duplicates:")
    print(df_no_duplicates)
else:
    print("No duplicates found.")

Duplicate rows:
        timestamp    u    v
965    1082846205   97  228
966    1082846205   97  228
2126   1083031338  260  338
2127   1083031338  260  338
5293   1083367240  308  398
...           ...  ...  ...
59602  1097939561    3  372
59608  1097939561    3  372
59604  1097939561    3  800
59606  1097939561    3  800
59607  1097939561    3  800

[79 rows x 3 columns]

DataFrame without duplicates:
        timestamp     u     v
0      1082008561     1     2
1      1082123439     3     4
2      1082381991     2     5
3      1082407219     6     7
4      1082407356     7     8
...           ...   ...   ...
59830  1098744248  1847  1899
59831  1098744436  1097  1899
59832  1098744603   277  1899
59833  1098744711  1624  1878
59834  1098744742  1624  1878

[59795 rows x 3 columns]


In [20]:
# Continue with the de-duplicated frame as the working link stream and
# display it.
df = df_no_duplicates
df

Unnamed: 0,timestamp,u,v
0,1082008561,1,2
1,1082123439,3,4
2,1082381991,2,5
3,1082407219,6,7
4,1082407356,7,8
...,...,...,...
59830,1098744248,1847,1899
59831,1098744436,1097,1899
59832,1098744603,277,1899
59833,1098744711,1624,1878


In [21]:
# Sorted copy by timestamp, u, and v (kept under its original name).
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# The checks below inspect `df` itself, the frame that is written to disk.
# Testing the freshly sorted copy would always report the timestamps as
# sorted, whatever state `df` is in.

# Check that every link has its endpoints ordered (u <= v); vectorised
# comparison instead of a row-by-row apply.
if not (df['v'] >= df['u']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Check that the rows of `df` are in non-decreasing timestamp order.
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Saving the link stream

In [23]:
# Write the link stream as space-separated "timestamp u v" rows, with no
# header line and no index column.
df.to_csv('ucimessages.txt', sep=' ', header=False, index=False)  # Stop here