# Imports

In [1]:
import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt

# Loading the raw network

In [4]:
# Read the raw `out.opsahl-ucforum` edge list; the goal of this notebook is to
# turn it into the uci.txt link-stream format (t, u, v).
# Rows are space-separated "u v weight timestamp"; the first two lines of the
# file are skipped (presumably header/metadata lines — TODO confirm).
cols = ['u', 'v', 'weight', 'timestamp']
df = pd.read_csv(
    'out.opsahl-ucforum',
    sep=" ",
    names=cols,
    skiprows=2,
)
df

Unnamed: 0,u,v,weight,timestamp
0,1,1,1,1084560796
1,2,1,1,1084560848
2,3,2,1,1084560940
3,4,1,1,1084561204
4,5,1,1,1084561337
...,...,...,...,...
33715,597,17,1,1098748293
33716,597,84,1,1098748697
33717,395,43,1,1098750918
33718,354,229,1,1098772822


In [5]:
# Turn every id in `u` into the string "top_<id>" and every id in `v` into
# "bottom_<id>", so the two columns no longer share the same integer labels.
# NOTE: not idempotent — running this cell twice prefixes the labels twice.
df['u'] = df['u'].map("top_{}".format)
df['v'] = df['v'].map("bottom_{}".format)
df

Unnamed: 0,u,v,weight,timestamp
0,top_1,bottom_1,1,1084560796
1,top_2,bottom_1,1,1084560848
2,top_3,bottom_2,1,1084560940
3,top_4,bottom_1,1,1084561204
4,top_5,bottom_1,1,1084561337
...,...,...,...,...
33715,top_597,bottom_17,1,1098748293
33716,top_597,bottom_84,1,1098748697
33717,top_395,bottom_43,1,1098750918
33718,top_354,bottom_229,1,1098772822


In [6]:
# The link stream saved at the end only keeps (timestamp, u, v); discard the weight.
df = df.drop(columns=["weight"])
df

Unnamed: 0,u,v,timestamp
0,top_1,bottom_1,1084560796
1,top_2,bottom_1,1084560848
2,top_3,bottom_2,1084560940
3,top_4,bottom_1,1084561204
4,top_5,bottom_1,1084561337
...,...,...,...
33715,top_597,bottom_17,1098748293
33716,top_597,bottom_84,1098748697
33717,top_395,bottom_43,1098750918
33718,top_354,bottom_229,1098772822


In [7]:
# Put the timestamp first so the column order is (timestamp, u, v).
df = df.loc[:, ['timestamp', 'u', 'v']]
df

Unnamed: 0,timestamp,u,v
0,1084560796,top_1,bottom_1
1,1084560848,top_2,bottom_1
2,1084560940,top_3,bottom_2
3,1084561204,top_4,bottom_1
4,1084561337,top_5,bottom_1
...,...,...,...
33715,1098748293,top_597,bottom_17
33716,1098748697,top_597,bottom_84
33717,1098750918,top_395,bottom_43
33718,1098772822,top_354,bottom_229


In [8]:
# Make sure the timestamp column holds integers (epoch seconds, as the later
# pd.to_datetime(..., unit='s') call treats them).
df = df.astype({'timestamp': int})
df

Unnamed: 0,timestamp,u,v
0,1084560796,top_1,bottom_1
1,1084560848,top_2,bottom_1
2,1084560940,top_3,bottom_2
3,1084561204,top_4,bottom_1
4,1084561337,top_5,bottom_1
...,...,...,...
33715,1098748293,top_597,bottom_17
33716,1098748697,top_597,bottom_84
33717,1098750918,top_395,bottom_43
33718,1098772822,top_354,bottom_229


In [9]:
# Check how many occurrences (rows) we have for every timestamp.
# GroupBy.size() returns the per-group row count directly, as an int64 Series
# indexed by timestamp; it replaces a Python lambda that was called once per
# group just to read `shape[0]`.
timestamps_grouped = df.groupby('timestamp').size()
print(timestamps_grouped)

timestamp
1084560796    1
1084560848    1
1084560940    1
1084561204    1
1084561337    1
             ..
1098748293    1
1098748697    1
1098750918    1
1098772822    1
1098772901    1
Length: 33515, dtype: int64


In [11]:
# Empty frame that will hold the datetime version of the timestamps;
# its `timestamp` column is assigned in the following cell.
df_time = pd.DataFrame()

In [12]:
# Interpret the integer timestamps as seconds since the Unix epoch and store
# the resulting datetimes in df_time (row index is inherited from df).
event_times = pd.to_datetime(df['timestamp'], unit='s')
df_time['timestamp'] = event_times
df_time

Unnamed: 0,timestamp
0,2004-05-14 18:53:16
1,2004-05-14 18:54:08
2,2004-05-14 18:55:40
3,2004-05-14 19:00:04
4,2004-05-14 19:02:17
...,...
33715,2004-10-25 23:51:33
33716,2004-10-25 23:58:17
33717,2004-10-26 00:35:18
33718,2004-10-26 06:40:22


In [14]:
# Total time covered by the data: latest event minus earliest event.
datetime_column = df_time['timestamp']
time_span = datetime_column.max() - datetime_column.min()
time_span

Timedelta('164 days 11:48:25')

In [15]:
# Number of whole days in the span: floor-divide its length in seconds by the
# 86400 seconds of a day (the result stays a float, e.g. 164.0).
total_days = time_span.total_seconds() // 86400
print(f"Total number of days: {total_days}")

Total number of days: 164.0


In [16]:
# How many distinct timestamps occur in the data.
unique_timestamps = df_time['timestamp'].nunique()
print(f"Total different timestamps: {unique_timestamps}")

Total different timestamps: 33515


# Remove direction, order by `t`, `u`, `v`, remove self-loops and duplicates, then save the network

## Removing direction and sorting the link stream by `t`, `u`, `v`

In [17]:
# Make every link undirected by storing its endpoints in a canonical order:
# whenever u > v the two labels are exchanged. The labels are strings, so the
# comparison is lexicographic; with the "top_"/"bottom_" prefixes this places
# the "bottom_*" endpoint in `u`.
mask = df['u'] > df['v']
# A plain array is assigned so the values are written by position; assigning a
# DataFrame would re-align on the column labels and undo the swap.
swapped_endpoints = df.loc[mask, ['v', 'u']].to_numpy()
df.loc[mask, ['u', 'v']] = swapped_endpoints

# Order the link stream by time, breaking ties by u and then v.
df = df.sort_values(by=['timestamp', 'u', 'v'])

# df is now sorted and satisfies u <= v on every row
df

Unnamed: 0,timestamp,u,v
0,1084560796,bottom_1,top_1
1,1084560848,bottom_1,top_2
2,1084560940,bottom_2,top_3
3,1084561204,bottom_1,top_4
4,1084561337,bottom_1,top_5
...,...,...,...
33715,1098748293,bottom_17,top_597
33716,1098748697,bottom_84,top_597
33717,1098750918,bottom_43,top_395
33718,1098772822,bottom_229,top_354


In [18]:
# Sanity check of the link stream held in `df`.
#
# The checks below run on `df` itself. Checking a freshly sorted copy (as was
# done before) can never report an unsorted timestamp column, because the copy
# has just been sorted by timestamp.

# Sorted copy, kept under its existing name in case other cells rely on it.
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# Check that the endpoints of every link are in canonical order (u <= v).
# Vectorised comparison instead of a row-by-row apply.
if not (df['v'] >= df['u']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Check that the rows of df are in non-decreasing timestamp order.
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


In [19]:
# True when the rows of df are in non-decreasing timestamp order.
df['timestamp'].is_monotonic_increasing

True

## Removing self-loops

In [22]:
# Flag the links whose two endpoints carry the same label.
self_loops = df['u'].eq(df['v'])

if not self_loops.any():
    print("No self-loops found.")
else:
    # Show the offending rows before discarding them.
    print("Self-loops:")
    print(df.loc[self_loops])

    # Keep only the rows that are not self-loops.
    df_no_self_loops = df.loc[~self_loops]

    print("\nDataFrame without self-loops:")
    print(df_no_self_loops)
    df = df_no_self_loops

No self-loops found.


## Removing duplicate rows

In [23]:
# Detect rows that are exact copies of another row (same timestamp, u and v).
# keep=False marks every member of a duplicate group, not only the repeats.
duplicate_mask = df.duplicated(keep=False)
duplicate_count = duplicate_mask.sum()

# Always bind df_no_duplicates, even when nothing is removed: the following
# cell assigns it back to df and would otherwise fail with a NameError (or
# silently pick up a stale value from an earlier run) when there are no
# duplicates. drop_duplicates() keeps the first row of each duplicate group.
df_no_duplicates = df.drop_duplicates()

if duplicate_count > 0:
    # Print all duplicate rows
    print("Duplicate rows:")
    print(df[duplicate_mask])

    # Display the DataFrame without duplicates
    print("\nDataFrame without duplicates:")
    print(df_no_duplicates)
else:
    print("No duplicates found.")

Duplicate rows:
        timestamp           u        v
8054   1085565680  bottom_369   top_34
8055   1085565680  bottom_369   top_34
13646  1086013008  bottom_102  top_115
13647  1086013008  bottom_102  top_115
16966  1086290084   bottom_63  top_213
16967  1086290084   bottom_63  top_213
29906  1093197097  bottom_191  top_227
29907  1093197097  bottom_191  top_227

DataFrame without duplicates:
        timestamp           u        v
0      1084560796    bottom_1    top_1
1      1084560848    bottom_1    top_2
2      1084560940    bottom_2    top_3
3      1084561204    bottom_1    top_4
4      1084561337    bottom_1    top_5
...           ...         ...      ...
33715  1098748293   bottom_17  top_597
33716  1098748697   bottom_84  top_597
33717  1098750918   bottom_43  top_395
33718  1098772822  bottom_229  top_354
33719  1098772901  bottom_229  top_354

[33716 rows x 3 columns]


In [24]:
# Continue with the de-duplicated frame produced by the previous cell.
df = df_no_duplicates
df

Unnamed: 0,timestamp,u,v
0,1084560796,bottom_1,top_1
1,1084560848,bottom_1,top_2
2,1084560940,bottom_2,top_3
3,1084561204,bottom_1,top_4
4,1084561337,bottom_1,top_5
...,...,...,...
33715,1098748293,bottom_17,top_597
33716,1098748697,bottom_84,top_597
33717,1098750918,bottom_43,top_395
33718,1098772822,bottom_229,top_354


In [25]:
# Re-run the sanity check on `df` after duplicate removal.
#
# The checks below run on `df` itself. Checking a freshly sorted copy (as was
# done before) can never report an unsorted timestamp column, because the copy
# has just been sorted by timestamp.

# Sorted copy, kept under its existing name in case other cells rely on it.
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# Check that the endpoints of every link are in canonical order (u <= v).
# Vectorised comparison instead of a row-by-row apply.
if not (df['v'] >= df['u']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Check that the rows of df are in non-decreasing timestamp order.
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Saving the link stream

In [28]:
# Write the link stream as space-separated "timestamp u v" lines, with no
# header row and no index column.
# NOTE(review): the file is named 'ucimovies.txt' although the input is
# 'out.opsahl-ucforum' — confirm this is the intended output name.
df.to_csv(
    'ucimovies.txt',
    sep=' ',
    header=False,
    index=False,
)