## Libraries and path variables

In [None]:
!pip install cdlib

In [None]:
!pip install karateclub

In [None]:
!pip install leidenalg

In [None]:
!pip install wurlitzer

In [None]:
!pip install ASLPAw

In [None]:
import pandas as pd
import numpy as np
import os 
import shutil

import networkx.algorithms.community as nx_comm
import itertools
from cdlib import algorithms
import networkx as nx

# Input workbooks: the returns table and the closing-price table (both are
# loaded with pd.read_excel in the driver cell).
returns_filepath = '/content/drive/Shareddrives/BTP/CombinedFiles/returns_after_interpolation.xlsx'
closing_price_filepath = '/content/drive/Shareddrives/BTP/CombinedFiles/closing_price_crypto_stocks_after_interpolation.xlsx'
# Output root for the generated networks. NOTE: the driver cell deletes and
# recreates this directory when it already exists.
folderpath = '/content/drive/Shareddrives/BTP/FinalNetworks/SlidingWindow/Network'

# Mount Google Drive at /content/drive so the paths above resolve
# (google.colab is only importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

## Helper Functions

In [None]:
# Get graph
def get_adjacency_matrix(corrMatrix, n, threshold=0.3):
  """Build an undirected weighted graph from a thresholded correlation matrix.

  Nodes are the integers 0..n-1. Nodes i != j are joined when
  |corrMatrix[i][j]| > threshold, and the signed correlation is stored as the
  edge's 'weight' attribute. The diagonal is always ignored, so the graph has
  no self-loops. NaN correlations never pass the comparison and therefore
  never create an edge.

  Args:
    corrMatrix: square array-like of pairwise correlations, at least n x n.
    n: number of leading rows/columns of corrMatrix to use.
    threshold: minimum absolute correlation (exclusive) for an edge.

  Returns:
    networkx.Graph with n nodes.

  Raises:
    IndexError: if corrMatrix is smaller than n x n.
  """
  # atleast_2d: np.corrcoef returns a 0-d value when given a single variable.
  corr = np.atleast_2d(np.asarray(corrMatrix, dtype=float))
  if corr.shape[0] < n or corr.shape[1] < n:
    raise IndexError(
        'corrMatrix has shape %s but n=%d was requested' % (corr.shape, n))
  corr = corr[:n, :n]

  strong = np.abs(corr) > threshold
  np.fill_diagonal(strong, False)  # i == j is never an edge
  adj_matrix = np.where(strong, corr, 0.0)

  # from_numpy_array replaces from_numpy_matrix, which was removed in
  # networkx 3.0; non-zero entries become edges carrying a 'weight'.
  return nx.from_numpy_array(adj_matrix)


# Timespan in num of days
def get_interval_data(start_idx, timespan, data):
  """Return the window of `timespan` rows starting at row `start_idx`.

  Only columns whose value in row `start_idx` is non-null are kept; later
  rows of the window are not checked. Rows are selected by position, and the
  result is an independent copy with a fresh 0..k-1 index.

  Args:
    start_idx: positional index of the first row of the window.
    timespan: number of rows in the window (fewer are returned if the window
      runs past the end of `data`).
    data: DataFrame with one row per time step and one column per series.

  Returns:
    DataFrame holding the selected rows and columns.
  """
  # Columns that already have a value on the first day of the window.
  keep = data.iloc[start_idx].notna().to_numpy()
  window = data.iloc[start_idx:start_idx + timespan, keep].copy()
  # drop=True discards the old index directly, so a named index or a data
  # column that happens to be called 'index' is handled correctly.
  return window.reset_index(drop=True)

# Generate metrics
def get_metrics(G, filename, avg_weight, timespan, pedge):
  """Compute node-level and network-level metrics for one graph.

  Node-level metrics are written to `filename + '.xlsx'` (one row per node).
  Network-level metrics are returned as a flat list.

  All networkx centrality/clustering/path functions are called with their
  library defaults (no weight argument is passed).

  Args:
    G: undirected networkx graph to analyse.
    filename: output path without extension; the text after the last '/' and
      last '_' is printed and stored as the row's file label.
    avg_weight: value stored as-is in the result (the caller's average edge
      weight).
    timespan: value stored as-is as the first entry of the result.
    pedge: number of positive-weight edges; `#edges - pedge` is also stored.

  Returns:
    list with 23 entries, in this order: timespan, file label, #nodes,
    #edges, average clustering, average degree, #connected components,
    average shortest path length, radius, diameter, periphery, center
    (the last five on the largest connected component when G is not
    connected), transitivity, density, degree assortativity, avg_weight,
    mean rich-club coefficient (rounded to 4 decimals), pedge,
    #edges - pedge, core-expansion communities, Leiden communities,
    modularity of the Leiden partition, dominating set.
  """
  # ---- Node-level metrics -> Excel sheet ----
  degree = G.degree()
  avg_neighbour_degree = nx.average_neighbor_degree(G)
  degree_centrality_nodes = nx.degree_centrality(G)
  eigen_vector_centrality_nodes = nx.eigenvector_centrality(G, max_iter=200)
  katz_centrality_nodes = nx.katz_centrality_numpy(G)
  closeness_centrality_nodes = nx.closeness_centrality(G)
  betweenness_centrality_nodes = nx.betweenness_centrality(G)
  clustering_array = nx.clustering(G)
  node_rows = [
      [node, degree[node], avg_neighbour_degree[node],
       degree_centrality_nodes[node], eigen_vector_centrality_nodes[node],
       katz_centrality_nodes[node], closeness_centrality_nodes[node],
       betweenness_centrality_nodes[node], clustering_array[node]]
      for node in G.nodes()
  ]
  pd.DataFrame(node_rows, columns = ['Coin','Degree','AvgNeighbourDegree','DegreeC','EigenVectorC','KatzC','ClosenessC','BetweennessC','ClusteringCoef']).to_excel(filename+'.xlsx',index=False)

  # ---- Network-level metrics ----
  file_label = filename.split('/')[-1].split('_')[-1]
  print(file_label)

  # Average degree from the degree histogram: sum(degree * #nodes) / #nodes.
  G_deg = nx.degree_histogram(G)
  G_deg_sum = [a * b for a, b in zip(G_deg, range(0, len(G_deg)))]

  network_metrics = [
      timespan,
      file_label,
      len(G.nodes()),
      len(G.edges()),
      nx.average_clustering(G),
      sum(G_deg_sum)/G.number_of_nodes(),
      nx.number_connected_components(G),
  ]

  # Distance metrics are undefined on a disconnected graph, so fall back to
  # the largest connected component (first one found on ties).
  if nx.is_connected(G):
    core = G
  else:
    core = G.subgraph(max(nx.connected_components(G), key=len))
  # Compute eccentricities once and share them between the four measures
  # instead of letting each call redo the all-pairs shortest paths.
  ecc = nx.eccentricity(core)
  network_metrics.append(nx.average_shortest_path_length(core))
  network_metrics.append(nx.radius(core, e=ecc))
  network_metrics.append(nx.diameter(core, e=ecc))
  network_metrics.append(nx.periphery(core, e=ecc))
  network_metrics.append(nx.center(core, e=ecc))

  network_metrics.append(nx.transitivity(G))
  network_metrics.append(nx.density(G))
  network_metrics.append(nx.degree_assortativity_coefficient(G))
  network_metrics.append(avg_weight)
  rc = nx.rich_club_coefficient(G, normalized=False, seed=42)
  network_metrics.append(np.round(np.mean(list(rc.values())),4))
  network_metrics.append(pedge)
  network_metrics.append(len(G.edges()) - pedge)

  # Core-expansion (Overlapping communities)
  network_metrics.append(algorithms.core_expansion(G).communities)

  # Leiden
  leiden_communities = algorithms.leiden(G).communities
  network_metrics.append(leiden_communities)
  # NOTE(review): leiden() is called without weights, while nx modularity
  # defaults to weight='weight' -- confirm that mixing the two is intended,
  # especially when edge weights can be negative.
  network_metrics.append(nx_comm.modularity(G,communities = leiden_communities))

  network_metrics.append(nx.dominating_set(G))

  return network_metrics

In [None]:
def get_all_metrics(timespan, increment, start_idx, end_idx, dates_arr, data, folderpath,threshold=0.3):
  """Build one correlation network per sliding window and collect metrics.

  For every window start i in range(start_idx, end_idx, increment) whose
  `timespan` rows fit inside `data`, this:
    1. correlates the columns of the window and thresholds the result into
       a graph (get_interval_data / get_adjacency_matrix),
    2. writes the edge list to <folderpath>/EdgeList/<dates_arr[i+1]>.xlsx,
    3. rebuilds the graph from that edge list (so nodes without any edge are
       not part of it) and calls get_metrics, which writes the node sheet to
       <folderpath>/NodeMetrics/<dates_arr[i+1]>.xlsx.
  Finally the per-window metrics are written to a MasterFile_*.xlsx workbook
  under `folderpath`.

  Args:
    timespan: number of rows per window.
    increment: step between consecutive window starts.
    start_idx: first window start (row position in `data`).
    end_idx: exclusive upper bound for window starts.
    dates_arr: sequence of date strings; entry i+1 names the outputs of the
      window starting at i.
    data: DataFrame of returns, one column per asset.
    folderpath: output directory (must already exist).
    threshold: absolute-correlation cut-off for an edge.

  Returns:
    DataFrame with one row of network metrics per processed window.
  """
  # Create each output directory on its own, so a missing NodeMetrics folder
  # is recreated even when EdgeList already exists.
  for subdir in ('/EdgeList', '/NodeMetrics'):
    os.makedirs(folderpath + subdir, exist_ok=True)

  net_metrics = []
  for i in range(start_idx,end_idx,increment):
    # change it to <=end_idx if no network required after end_idx
    if (i+timespan) > len(data):
      continue

    temp = get_interval_data(i, timespan, data)
    corr_matrix = np.corrcoef(temp.T)
    crypto_list = temp.columns
    G = get_adjacency_matrix(corr_matrix, len(crypto_list), threshold)

    # Read edges straight from the graph; graph nodes are column positions.
    node1 = []
    node2 = []
    weight = []
    count = 0  # number of positively correlated edges in this window
    for u, v, w in G.edges(data='weight'):
      node1.append(crypto_list[u])
      node2.append(crypto_list[v])
      weight.append(float(np.round(w, 2)))
      if w > 0:
        count += 1

    # Built column-wise so 'Weight' is stored as a number, not as text.
    data_temp = pd.DataFrame({'Node1': node1, 'Node2': node2, 'Weight': weight})
    data_temp.to_excel(folderpath + '/EdgeList/'+dates_arr[i+1]+'.xlsx', index=False)

    # Graph on asset names with the rounded weights from the edge list.
    graph = nx.Graph()
    graph.add_weighted_edges_from(zip(node1, node2, weight))
    net_metrics.append(get_metrics(graph,folderpath + '/NodeMetrics/'+ dates_arr[i+1], np.mean(weight), timespan,count))

  # One name per value returned by get_metrics (23 in total). The former
  # 'Walktrap'/'WalktrapM' columns had no matching values, which made the
  # DataFrame constructor fail with a column-count mismatch.
  columns = ['Time','File','#Nodes','#Edges','AvgClustering','AvgDegree','ConnectedComponents','CharacteristicPathLength','Radius','Diameter','Periphery','Centers','Transitivity','Density','Assortivity','AvgCorr','RCC','PEdges','Diff','core-expansion','Leiden','LeidenM','dominating_set']
  df_new = pd.DataFrame(net_metrics, columns=columns)
  df_new.to_excel(folderpath + '/MasterFile_'+str(int(threshold*10))+'_'+str(timespan)+'_' +str(increment) + '.xlsx',index=False)
  return df_new

## Driver steps

In [None]:
# returns file
data = pd.read_excel(returns_filepath)

# dates of closing price, reduced to the part before the first space;
# str() keeps this working when the column is parsed as timestamps
dates_arr = pd.read_excel(closing_price_filepath)['Date']
dates_arr = [str(date).split(' ')[0] for date in dates_arr]
start_date = '2017-08-08'

# segment size for correlation network
timespan = 14

# inc =  (timespan) for moving window
increment = 1

# outputs of the window starting at index i are named dates_arr[i+1], so to
# get files labelled 8th aug give the date of 7th aug here
start_index = dates_arr.index(start_date)

# exclusive upper bound for window starts: every index in
# [start_index, end_index) starts a window, and get_all_metrics skips the
# windows that do not fit entirely inside `data`
end_index = len(data)

# start from an empty output directory;
# comment out the rmtree branch to add files without deleting the existing subdirectories
if os.path.exists(folderpath):
  shutil.rmtree(folderpath)# Removes all the subdirectories
os.makedirs(folderpath, exist_ok=True)

df = get_all_metrics(timespan, increment, start_index, end_index, dates_arr, data, folderpath+'/')