# Imports

In [1]:
import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt

# Loading the raw network

In [2]:
# Load the raw Bitcoin OTC edge list; it is reshaped to the (t, u, v) layout in the cells below
cols = ['u', 'v', 'weight', 'timestamp']
df = pd.read_csv(
    'out.soc-sign-bitcoinotc',
    sep="\t",
    names=cols,
    skiprows=1,  # skip the first line of the file
)
df

Unnamed: 0,u,v,weight,timestamp
0,1,2,4,1.289242e+09
1,1,3,2,1.289242e+09
2,4,5,1,1.289243e+09
3,6,7,7,1.289245e+09
4,8,9,8,1.289254e+09
...,...,...,...,...
35587,4379,1786,1,1.453612e+09
35588,2653,3799,5,1.453679e+09
35589,2653,4798,5,1.453679e+09
35590,8,1082,1,1.453680e+09


In [3]:
# Drop the edge weight column; only u, v and timestamp are used from here on.
# Rebinding `df` (rather than inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second run no longer raises KeyError for the missing column.
df = df.drop(columns=["weight"], errors="ignore")
df

Unnamed: 0,u,v,timestamp
0,1,2,1.289242e+09
1,1,3,1.289242e+09
2,4,5,1.289243e+09
3,6,7,1.289245e+09
4,8,9,1.289254e+09
...,...,...,...
35587,4379,1786,1.453612e+09
35588,2653,3799,1.453679e+09
35589,2653,4798,1.453679e+09
35590,8,1082,1.453680e+09


In [4]:
# Put the timestamp first so the columns read (timestamp, u, v)
df = df.loc[:, ['timestamp', 'u', 'v']]
df

Unnamed: 0,timestamp,u,v
0,1.289242e+09,1,2
1,1.289242e+09,1,3
2,1.289243e+09,4,5
3,1.289245e+09,6,7
4,1.289254e+09,8,9
...,...,...,...
35587,1.453612e+09,4379,1786
35588,1.453679e+09,2653,3799
35589,1.453679e+09,2653,4798
35590,1.453680e+09,8,1082


In [5]:
# Cast the timestamps to integer seconds (any fractional part is truncated).
# .assign() returns a new frame instead of writing into the column subset selected
# earlier, which avoids pandas' SettingWithCopyWarning, and the explicit 'int64'
# keeps the integer width independent of the platform default.
df = df.assign(timestamp=df['timestamp'].astype('int64'))
df

Unnamed: 0,timestamp,u,v
0,1289241911,1,2
1,1289241941,1,3
2,1289243140,4,5
3,1289245277,6,7
4,1289254254,8,9
...,...,...,...
35587,1453612481,4379,1786
35588,1453679428,2653,3799
35589,1453679434,2653,4798
35590,1453679632,8,1082


In [6]:
# Count how many rows (occurrences) share each timestamp.
# GroupBy.size() is the built-in equivalent of applying `x.shape[0]` to every group.
timestamps_grouped = df.groupby('timestamp').size()
print(timestamps_grouped)

timestamp
1289241911    1
1289241941    1
1289243140    1
1289245277    1
1289254254    1
             ..
1453612481    1
1453679428    1
1453679434    1
1453679632    1
1453684323    1
Length: 35427, dtype: int64


In [8]:
# Empty frame that will hold the datetime view of the timestamps
df_time = pd.DataFrame(data=None)

In [9]:
# Convert the integer (seconds since epoch) timestamps to datetime values.
# The frame is built in a single step so the result depends only on the current `df`:
# assigning a column into an already-populated `df_time` would align on its old index
# and leave stale or NaT rows if this cell is re-run after `df` has been filtered.
df_time = pd.DataFrame({'timestamp': pd.to_datetime(df['timestamp'], unit='s')})
df_time

Unnamed: 0,timestamp
0,2010-11-08 18:45:11
1,2010-11-08 18:45:41
2,2010-11-08 19:05:40
3,2010-11-08 19:41:17
4,2010-11-08 22:10:54
...,...
35587,2016-01-24 05:14:41
35588,2016-01-24 23:50:28
35589,2016-01-24 23:50:34
35590,2016-01-24 23:53:52


In [10]:
# Span between the earliest and the latest timestamp
time_span = df_time['timestamp'].pipe(lambda stamps: stamps.max() - stamps.min())
time_span

Timedelta('1903 days 06:26:52')

In [11]:
# Whole days contained in the span (86400 seconds per day); the result stays a float
total_days = time_span.total_seconds() // 86400
print("Total number of days:", total_days)

Total number of days: 1903.0


In [12]:
# Number of distinct timestamp values (missing values are not counted)
unique_timestamps = df_time['timestamp'].nunique(dropna=True)

print("Total different timestamps:", unique_timestamps)

Total different timestamps: 35427


# Remove direction, sort by `t`, `u`, `v`, remove self-loops and duplicates, then save the network

## Remove direction and sort the link stream by `t`, `u`, `v`

In [13]:
# Make every edge undirected by storing the smaller endpoint in 'u':
# rows where u > v get their two endpoint values exchanged.
mask = df['u'].gt(df['v'])
df.loc[mask, ['u', 'v']] = df.loc[mask, ['v', 'u']].to_numpy()

# Order the link stream by time, then by endpoints (done in place)
df.sort_values(['timestamp', 'u', 'v'], inplace=True)

# Show the result: rows satisfy u <= v and are ordered by (timestamp, u, v)
df

Unnamed: 0,timestamp,u,v
0,1289241911,1,2
1,1289241941,1,3
2,1289243140,4,5
3,1289245277,6,7
4,1289254254,8,9
...,...,...,...
35587,1453612481,1786,4379
35588,1453679428,2653,3799
35589,1453679434,2653,4798
35590,1453679632,8,1082


In [14]:
# Reference ordering of the link stream; kept so `df_sorted` remains available.
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# Validate `df` itself. Checking the freshly sorted copy instead would make the
# timestamp test pass unconditionally and could never reveal an unsorted `df`.

# Every edge must have its endpoints in canonical order (u <= v).
# The vectorised comparison replaces a row-wise apply and also works on an empty frame.
if not (df['u'] <= df['v']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Rows must appear in non-decreasing timestamp order
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Removing self-loops

In [18]:
# Flag edges whose two endpoints are the same node
self_loops = df['u'].eq(df['v'])

if not self_loops.any():
    print("No self-loops found.")
else:
    # Report the offending rows
    print("Self-loops:")
    print(df.loc[self_loops])

    # Keep only the rows that are not self-loops
    df_no_self_loops = df.loc[~self_loops]

    # Show the cleaned frame and continue working with it
    print("\nDataFrame without self-loops:")
    print(df_no_self_loops)
    df = df_no_self_loops

No self-loops found.


## Removing duplicate rows

In [21]:
# Flag every row that has at least one exact copy (all copies are marked), and count them
duplicate_mask = df.duplicated(keep=False)
duplicate_count = duplicate_mask.sum()

# Always build the de-duplicated frame so `df_no_duplicates` exists even when
# there is nothing to remove; otherwise the following cell fails with NameError.
df_no_duplicates = df.drop_duplicates()

if duplicate_count > 0:
    # Print all duplicate rows
    print("Duplicate rows:")
    print(df[duplicate_mask])

    # Display the DataFrame without duplicates
    print("\nDataFrame without duplicates:")
    print(df_no_duplicates)
else:
    print("No duplicates found.")

Duplicate rows:
        timestamp     u     v
370    1297908772   111   113
371    1297908772   111   113
445    1298654517    92   123
446    1298654517    92   123
1779   1304966388   103   428
1780   1304966388   103   428
3567   1307198593   637   793
3568   1307198593   637   793
4445   1307680900  1009  1010
4446   1307680900  1009  1010
4474   1307697178   859   990
4475   1307697178   859   990
5098   1308428289   978  1051
5099   1308428289   978  1051
9798   1335305634     8  1198
9799   1335305634     8  1198
10675  1339024627     6    15
10676  1339024627     6    15
11423  1341415816  2116  2185
11424  1341415816  2116  2185
11779  1342551805  1847  2011
11780  1342551805  1847  2011
15680  1352584828     8   745
15681  1352584828     8   745
16723  1355243119   968  1456
16724  1355243119   968  1456
20524  1364955435   173  2011
20525  1364955435   173  2011
21422  1365952075  3639  3877
21423  1365952075  3639  3877
21843  1366592251  3625  3639
21844  1366592251  3625 

In [22]:
# Continue with the de-duplicated link stream.
# De-duplicating here directly removes the dependency on `df_no_duplicates`, a name
# that the previous cell only creates when duplicates were found.
df = df.drop_duplicates()
df

Unnamed: 0,timestamp,u,v
0,1289241911,1,2
1,1289241941,1,3
2,1289243140,4,5
3,1289245277,6,7
4,1289254254,8,9
...,...,...,...
35587,1453612481,1786,4379
35588,1453679428,2653,3799
35589,1453679434,2653,4798
35590,1453679632,8,1082


In [23]:
# Reference ordering of the link stream; kept so `df_sorted` remains available.
df_sorted = df.sort_values(by=['timestamp', 'u', 'v'])

# Validate `df` itself, since it is the frame that gets saved. Checking the freshly
# sorted copy instead would make the timestamp test pass unconditionally.

# Every edge must have its endpoints in canonical order (u <= v).
# The vectorised comparison replaces a row-wise apply and also works on an empty frame.
if not (df['u'] <= df['v']).all():
    print("PROBLEM!!! Unsorted nodes in the DataFrame.")
else:
    print("DataFrame is sorted by 'u' and 'v'.")

# Rows must appear in non-decreasing timestamp order
if not df['timestamp'].is_monotonic_increasing:
    print("PROBLEM!!! DataFrame is not sorted by 'timestamp'.")
else:
    print("DataFrame is sorted by 'timestamp'.")


DataFrame is sorted by 'u' and 'v'.
DataFrame is sorted by 'timestamp'.


## Saving the link stream

In [24]:
# Write the link stream as space-separated "timestamp u v" lines, without header or index
df.to_csv(r'bitcoinotc.txt', sep=' ', header=False, index=False)  # Stop here