# Imports

In [1]:
import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt

# Loading the raw network

In [3]:
# Read the raw Bitcoin Alpha edge list: tab-separated, columns named
# u, v, weight, timestamp; the first line of the file is skipped.
# Later cells reduce this to the (t, u, v) link-stream format.
cols = ['u', 'v', 'weight', 'timestamp']
df = pd.read_csv(
    'out.soc-sign-bitcoinalpha',
    sep="\t",
    names=cols,
    skiprows=1,
)
df

Unnamed: 0,u,v,weight,timestamp
0,1,2,10,1407470400
1,3,2,10,1376539200
2,4,2,10,1369713600
3,5,2,10,1350014400
4,6,2,10,1347854400
...,...,...,...,...
24181,885,1251,10,1364270400
24182,1251,885,10,1364270400
24183,885,1250,10,1364270400
24184,1250,885,10,1364270400


In [4]:
# Discard the edge weight; only (timestamp, u, v) is needed for the link stream.
# Rebinding instead of dropping in place, together with errors="ignore",
# lets this cell be re-run without raising KeyError once the column is gone.
df = df.drop(columns=["weight"], errors="ignore")
df

Unnamed: 0,u,v,timestamp
0,1,2,1407470400
1,3,2,1376539200
2,4,2,1369713600
3,5,2,1350014400
4,6,2,1347854400
...,...,...,...
24181,885,1251,1364270400
24182,1251,885,1364270400
24183,885,1250,1364270400
24184,1250,885,1364270400


In [5]:
# Put the columns in link-stream order: timestamp first, then the two endpoints
df = df.loc[:, ['timestamp', 'u', 'v']]
df

Unnamed: 0,timestamp,u,v
0,1407470400,1,2
1,1376539200,3,2
2,1369713600,4,2
3,1350014400,5,2
4,1347854400,6,2
...,...,...,...
24181,1364270400,885,1251
24182,1364270400,1251,885
24183,1364270400,885,1250
24184,1364270400,1250,885


In [6]:
# Make sure the timestamp column holds integers
df = df.astype({'timestamp': int})
df

Unnamed: 0,timestamp,u,v
0,1407470400,1,2
1,1376539200,3,2
2,1369713600,4,2
3,1350014400,5,2
4,1347854400,6,2
...,...,...,...
24181,1364270400,885,1251
24182,1364270400,1251,885
24183,1364270400,885,1250
24184,1364270400,1250,885


In [7]:
# Count how many rows (links) occur at each distinct timestamp.
# GroupBy.size() gives the per-group row count directly, without calling a
# Python lambda once per group.
timestamps_grouped = df.groupby('timestamp').size()
print(timestamps_grouped)

timestamp
1289192400    4
1289365200    5
1289451600    1
1289538000    5
1289624400    2
             ..
1452747600    3
1452834000    2
1452920400    4
1453006800    1
1453438800    2
Length: 1647, dtype: int64


In [9]:
# Start from an empty frame; it is filled with a datetime 'timestamp' column below
df_time = pd.DataFrame()

In [10]:
# Interpret the integer timestamps as seconds (unit='s') and store them as
# datetimes in a separate frame, leaving `df` untouched
df_time = df_time.assign(timestamp=pd.to_datetime(df['timestamp'], unit='s'))
df_time

Unnamed: 0,timestamp
0,2014-08-08 04:00:00
1,2013-08-15 04:00:00
2,2013-05-28 04:00:00
3,2012-10-12 04:00:00
4,2012-09-17 04:00:00
...,...
24181,2013-03-26 04:00:00
24182,2013-03-26 04:00:00
24183,2013-03-26 04:00:00
24184,2013-03-26 04:00:00


In [11]:
# Total time covered by the data: latest timestamp minus earliest timestamp
first_seen = df_time['timestamp'].min()
last_seen = df_time['timestamp'].max()
time_span = last_seen - first_seen
time_span

Timedelta('1901 days 00:00:00')

In [13]:
# Express the time span as a whole number of days (86400 seconds per day);
# floor division of the float second count keeps the result a float
total_days = time_span.total_seconds() // 86400
print("Total number of days:", total_days)

Total number of days: 1901.0


In [14]:
# Number of distinct timestamp values in the data
unique_timestamps = df_time['timestamp'].nunique()
print(f"Total different timestamps: {unique_timestamps}")

Total different timestamps: 1647


# Remove direction, sort by `t`, `u`, `v`, remove self-loops and duplicates, then save the network

## Remove direction and sort the link stream by `t`, `u`, `v`

In [15]:
# Drop edge direction: store each link with the smaller endpoint in 'u'
# and the larger endpoint in 'v'
endpoints = df[['u', 'v']]
df = df.assign(u=endpoints.min(axis=1), v=endpoints.max(axis=1))

# Order the link stream by timestamp, then by u, then by v
df = df.sort_values(by=['timestamp', 'u', 'v'])

# df is now sorted and every row satisfies 'u' <= 'v'
df

Unnamed: 0,timestamp,u,v
1276,1289192400,50,700
10469,1289192400,51,539
4005,1289192400,54,537
4004,1289192400,54,1502
922,1289365200,50,538
...,...,...,...
21812,1452920400,1748,1766
21814,1452920400,1748,1766
14393,1453006800,605,2727
5382,1453438800,47,1750


In [16]:
# Sorted reference copy; the name `df_sorted` stays defined as before
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# The checks below validate `df` itself, not the freshly sorted copy:
# a frame that was just sorted by timestamp is always monotonic, so
# checking the copy could never report a problem.

# Every link must be stored with u <= v (vectorized, no row-wise apply)
if not (df['u'] <= df['v']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Timestamps must be non-decreasing from the first row to the last
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Removing self-loops

In [20]:
# A self-loop is a row whose two endpoints are the same node
self_loops = df['u'].eq(df['v'])

if not self_loops.any():
    print("No self-loops found.")
else:
    # Show the offending rows
    print("Self-loops:")
    print(df[self_loops])

    # Keep only the rows that are not self-loops
    df_no_self_loops = df[~self_loops]

    # Show the cleaned frame and continue with it
    print("\nDataFrame without self-loops:")
    print(df_no_self_loops)
    df = df_no_self_loops

No self-loops found.


## Removing duplicate rows

In [21]:
# Flag every row that has at least one identical twin (keep=False marks all
# copies, not only the later ones) and count the flagged rows
duplicate_mask = df.duplicated(keep=False)
duplicate_count = duplicate_mask.sum()

# Always build the de-duplicated frame, so `df_no_duplicates` is defined for
# the following cell even when the data contains no duplicates
df_no_duplicates = df.drop_duplicates()

if duplicate_count > 0:
    # Print all duplicate rows
    print("Duplicate rows:")
    print(df[duplicate_mask])

    # Display the DataFrame without duplicates
    print("\nDataFrame without duplicates:")
    print(df_no_duplicates)
else:
    print("No duplicates found.")

Duplicate rows:
        timestamp     u     v
10468  1289365200   538   539
10502  1289365200   538   539
8848   1289624400    22    68
8886   1289624400    22    68
8835   1289710800    22    51
...           ...   ...   ...
22743  1451278800  1698  3558
5231   1451538000    47  1698
5492   1451538000    47  1698
21812  1452920400  1748  1766
21814  1452920400  1748  1766

[14646 rows x 3 columns]

DataFrame without duplicates:
        timestamp     u     v
1276   1289192400    50   700
10469  1289192400    51   539
4005   1289192400    54   537
4004   1289192400    54  1502
922    1289365200    50   538
...           ...   ...   ...
19115  1452920400  1612  1729
21812  1452920400  1748  1766
14393  1453006800   605  2727
5382   1453438800    47  1750
13594  1453438800  1179  1750

[16863 rows x 3 columns]


In [22]:
# Continue with the de-duplicated link stream
df = df_no_duplicates
df

Unnamed: 0,timestamp,u,v
1276,1289192400,50,700
10469,1289192400,51,539
4005,1289192400,54,537
4004,1289192400,54,1502
922,1289365200,50,538
...,...,...,...
19115,1452920400,1612,1729
21812,1452920400,1748,1766
14393,1453006800,605,2727
5382,1453438800,47,1750


In [23]:
# Sorted reference copy; the name `df_sorted` stays defined as before
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# The checks below validate `df` itself, not the freshly sorted copy:
# a frame that was just sorted by timestamp is always monotonic, so
# checking the copy could never report a problem.

# Every link must be stored with u <= v (vectorized, no row-wise apply)
if not (df['u'] <= df['v']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Timestamps must be non-decreasing from the first row to the last
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Saving the link stream

In [25]:
# Write the link stream as space-separated rows, without header line or index column
df.to_csv('bitcoinalpha.txt', sep=' ', header=False, index=False)  # Stop here