In [None]:
#Reads in edge list and plots it

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl
from collections import OrderedDict
from matplotlib import style
import statistics
%matplotlib inline


In [None]:
# Override matplotlib's defaults: dark-grey ink for text/axes/ticks and an Arial sans-serif font.
_INK = "#222222"

mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': ['Arial'], 'size': 16})
mpl.rc('text', usetex=False, color=_INK)
mpl.rc('axes', linewidth=1, edgecolor=_INK, labelcolor=_INK)

# The x and y ticks get identical label, major-tick and minor-tick settings.
for _axis in ('xtick', 'ytick'):
    mpl.rc(_axis, labelsize=14, color=_INK)
    mpl.rc(_axis + '.major', size=6, width=1)
    mpl.rc(_axis + '.minor', size=3, width=1)

In [None]:
# Build the undirected graph G from the first two columns of the edge-list CSV.
import csv

G = nx.Graph()

with open('compiledEdgeList.csv') as f:
    reader = csv.reader(f, delimiter=',', quotechar='|')
    next(reader, None)  # the first row is the header row, not an edge
    # every remaining row contributes one edge: (column 0, column 1)
    edgelist = [[row[0], row[1]] for row in reader]

G.add_edges_from(edgelist)

In [None]:
# Summary statistics for the graph G built from the edge list:
# size, degree range, clustering, shortest paths, hubs and density.

# Nodes whose degree is at least this value are reported as hubs.
HUB_MIN_DEGREE = 280

#getting degree list
N = len(G)
L = G.size()
degrees = [G.degree(node) for node in G]
kmin = min(degrees)
kmax = max(degrees)

#calculating the average clustering coefficient
all_cluster = nx.clustering(G)
all_clustering = list(all_cluster.values())
avg_cc = sum(all_clustering)/len(all_clustering)
SD_cc = statistics.pstdev(all_clustering)
# nx.average_clustering(G) is this same mean over nx.clustering(G); reuse it instead
# of paying for a second full clustering pass over the graph.
avg_clusterCoeff = avg_cc

#calculating shortest path
#https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.average_shortest_path_length.html#networkx.algorithms.shortest_paths.generic.average_shortest_path_length
# The average shortest path length is computed separately for every connected component;
# avg_shortestpath is the plain (unweighted) mean of those per-component values.
shortestpath_list = [
    nx.average_shortest_path_length(G.subgraph(component).copy())
    for component in nx.connected_components(G)
]
avg_shortestpath = sum(shortestpath_list)/len(shortestpath_list)
SD_shortestpath = statistics.pstdev(shortestpath_list)

#calculating the hubs, nodes with highest degree
for node in G:
    current = G.degree(node)
    if current >= HUB_MIN_DEGREE:
        print("Node name: "+str(node)+" -> Degree: "+str(current))

density_spotify = nx.density(G)


#printing network details
print("CC: ", str(avg_cc))
print("SD CC:", str(SD_cc))
print()
print("Shortest Path: ", str(avg_shortestpath))
print("Standard Deviation Shortest Path: ", str(SD_shortestpath))
print()
# NOTE(review): the hub lines below are hard-coded (presumably copied from an earlier run of
# the loop above) and will not change if the edge list does.
print("281 - R3HAB - 6cEuCEZu7PAE9ZSzLLc2oQ")
print("318 - Armin Van Buuren - 0SfsnGyD8FpIN4U4WCkBZ5")
print("349 - Snoop Dogg - 7hJcb9fa4alzcOq3EaNPoG")
print("423 - Gucci Mane - 13y7CgLHjMVRMDqxdx0Xdo")
print("476 - Frederic Chopin - 7y97mc3bZRFXzT2szRM4L4")
print("693 - Wolfgang Amadeus Mozart - 4NJhFmfw43RLBLjQvxDuRS")
print()
print("Density: ", density_spotify)
print()
print("Clustering Coefficient: ", avg_clusterCoeff)
print()
print("Number of nodes: ", N)
print("Number of edges: ", L)
print()
print("Average degree: ", 2*L/N)
print("Average degree (alternate calculation)", np.mean(degrees))
print()
print("Minimum degree: ", kmin)
print("Maximum degree: ", kmax)

In [None]:
# PageRank score for every node of G (dict: node -> score).
# nx.pagerank_numpy was deprecated in NetworkX 2.6 and removed in 3.0. Keep using it where it
# still exists (so previously recorded values are reproduced) and fall back to nx.pagerank
# otherwise, so the cell does not raise AttributeError on current releases.
_pagerank_fn = getattr(nx, "pagerank_numpy", nx.pagerank)
pagerank = _pagerank_fn(G)

In [None]:
# Printing pagerank.items() dumps one entry per node onto a single unreadable line.
# Show only the highest-scoring nodes instead, best first, one per line.
PAGERANK_ROWS_SHOWN = 20
_by_score = sorted(pagerank.items(), key=lambda item: item[1], reverse=True)
for _node, _score in _by_score[:PAGERANK_ROWS_SHOWN]:
    print(_node, _score)

In [None]:
#This cell finds the PageRank of the hubs in the network and of the most popular artists.
TOP_N = 7                              # how many of the highest PageRank scores to report
DRAKE_ID = '3TVXtAsR1Inumwj472S9r4'    # node key looked up individually below

# Derive the cutoff from the data (the TOP_N-th largest score) instead of hard-coding one exact
# float, which silently stops matching as soon as the scores change in the last digit.
# Nodes are still printed in the dict's own iteration order; ties at the cutoff are all printed.
_scores_desc = sorted(pagerank.values(), reverse=True)
if _scores_desc:
    _cutoff = _scores_desc[min(TOP_N, len(_scores_desc)) - 1]
    for key, value in pagerank.items():
        if _cutoff <= value:
            print(key + " <--> "+ str(value))

#to get the value of one of the most popular artists, look its key up directly (here: Drake).
#(The original compared with `=` instead of `==`, which is a SyntaxError.)
if DRAKE_ID in pagerank:
    print(DRAKE_ID + " <--> "+ str(pagerank[DRAKE_ID]))

# NOTE(review): the values below are hard-coded, presumably recorded from earlier runs.
print("Drake: 3TVXtAsR1Inumwj472S9r4 <-->0.0006536280628680085")
print("Bad Bunny: 4q3ewBCX7sLwd24euuV69X <-->0.0005930524029684813")
print("Ed Sheeran: 6eUKZXaKkcviH0Ku9w2n3V <-->0.0007906624150807254")
print("The Weeknd: 1Xyo4u8uXC1ZmMpatF05PJ <-->0.0003952236923240803")
print("Taylor Swift: 06HL4z0CvFAxyc27GXpf02 <-->0.00027426425759206067")
print("Justin Beiber: 1uNFoZAHBGtllmzznpCI3s <-->0.0005666403149823618")
print("Ariana Grande: 66CXWjxzNUsdJxJ2JdwvnR <-->0.0004194871803994584")
print("/////////////////////////////////")
print("Gucci Mane:  13y7CgLHjMVRMDqxdx0Xdo <--> 0.0035658444144083044")
print("Snoop Dogg:  7hJcb9fa4alzcOq3EaNPoG <--> 0.0031572361155687368")
print("Tiësto: 2o5jDhtHVPhrJdv3cEQ99Z <--> 0.0026544201697824665")
print("Armin Van Buuren:  0SfsnGyD8FpIN4U4WCkBZ5 <--> 0.003352196827508962")
print("Wolfgang Amadeus Mozart:  4NJhFmfw43RLBLjQvxDuRS <--> 0.009102627300756309")
print("John Williams:  3dRfiJ2650SZu6GbydcHNb <--> 0.0027041842318097746")
print("Frédéric Chopin:  7y97mc3bZRFXzT2szRM4L4 <--> 0.007676891652695455")

In [None]:
#degree distribution for spotify network.

# Select the style before any figure or axes exists. Calling style.use() after the first
# subplot was created left the two panels styled differently on a fresh kernel.
# Note that style.use() changes matplotlib's global settings, so later cells inherit it.
style.use('fivethirtyeight')

# Two independent lists. The original `sorted_og = sorted_d` was only an alias, so the
# in-place reverse() flipped both.
sorted_og = sorted(degrees)                 # ascending
sorted_d = sorted(degrees, reverse=True)    # descending: index == degree rank

#plotting the degree distribution of the Spotify network
fig = plt.figure("Degree distribution of the Spotify network", figsize=(8, 8))
axgrid = fig.add_gridspec(5, 4)

# left panel: degree of each node against its rank
ax1 = fig.add_subplot(axgrid[3:, :2])
ax1.set_title("Rank Degree Distribution")
ax1.set_xlabel("Rank")
ax1.set_ylabel("Degree")
ax1.plot(sorted_d, "b-", marker="o")

# right panel: histogram of degrees (x = degree bins, y = node count on a log scale)
ax2 = fig.add_subplot(axgrid[3:, 2:])
ax2.set_title("Degree Histogram")
ax2.set_xlabel("Degree")
ax2.set_ylabel("# of Nodes")
ax2.hist(sorted_d, color='blue' ,edgecolor='purple',log=True)

fig.tight_layout()
plt.show()

In [None]:
#The next two cells compare our graph against two null models: the ER model (random network) and the BA model
#(preferential attachment). Each cell plots the model's degree distribution and calculates its average clustering coefficient.

In [None]:
#https://networkx.org/documentation/stable/auto_examples/drawing/plot_degree.html
#source of the diagrams drawn in this cell

# Erdos-Renyi G(n, p) null model with a fixed seed so the cell is reproducible.
G_er = nx.gnp_random_graph(20478, 0.02, seed=10374196)

# Per-node clustering coefficients and their mean. nx.average_clustering(G_er) is this same
# mean, so it is derived from the values already computed instead of running a second
# clustering pass over the graph.
all_cluster_random = nx.clustering(G_er)
_cluster_values_random = all_cluster_random.values()
avg_clusterCoeff_random = sum(_cluster_values_random)/len(_cluster_values_random)

# degrees in descending order: index == degree rank
degree_sequence = sorted((d for n, d in G_er.degree()), reverse=True)

fig = plt.figure("Degree of a random graph", figsize=(8, 8))
# Create a gridspec for adding subplots of different sizes
axgrid = fig.add_gridspec(5, 4)

# left panel: degree of each node against its rank
ax1 = fig.add_subplot(axgrid[3:, :2])
ax1.plot(degree_sequence, "b-", marker="o")
ax1.set_title("Degree Rank Plot")
ax1.set_ylabel("Degree")
ax1.set_xlabel("Rank")

# right panel: one bar per distinct degree value, bar height = number of nodes with that degree
ax2 = fig.add_subplot(axgrid[3:, 2:])
ax2.bar(*np.unique(degree_sequence, return_counts=True))
ax2.set_title("Degree histogram")
ax2.set_xlabel("Degree")
ax2.set_ylabel("# of Nodes")

fig.tight_layout()
plt.show()

In [None]:
#https://networkx.org/documentation/stable/auto_examples/drawing/plot_degree.html
#source of the diagrams drawn in this cell

# Barabasi-Albert (preferential attachment) null model with a fixed seed.
# NOTE(review): the seed was presumably drawn once with np.random.randint(10000000, 99999999)
# and then pinned here.
G_ba = nx.barabasi_albert_graph(20478, 4, seed=29546566)

# Per-node clustering coefficients and their mean. nx.average_clustering(G_ba) is this same
# mean, so it is derived from the values already computed instead of running a second
# clustering pass.
all_cluster_ba = nx.clustering(G_ba)
_cluster_values_ba = all_cluster_ba.values()
avg_clusterCoeff_ba = sum(_cluster_values_ba)/len(_cluster_values_ba)

# degrees in descending order: index == degree rank
degree_sequence = sorted((d for n, d in G_ba.degree()), reverse=True)

# BA-specific names: assigning to `degrees`/`kmin`/`kmax` here would overwrite the values
# computed for the Spotify graph and corrupt the earlier cells when they are re-run.
kmin_ba = min(degree_sequence)
kmax_ba = max(degree_sequence)

print(kmin_ba)
print(kmax_ba)

# distinct figure label so this figure cannot be mixed up with the ER model's figure
fig = plt.figure("Degree of a Barabasi-Albert graph", figsize=(8, 8))
# Create a gridspec for adding subplots of different sizes
axgrid = fig.add_gridspec(5, 4)

# left panel: degree of each node against its rank
ax1 = fig.add_subplot(axgrid[3:, :2])
ax1.plot(degree_sequence, "b-", marker="o")
ax1.set_title("Degree Rank Plot")
ax1.set_ylabel("Degree")
ax1.set_xlabel("Rank")

# right panel: histogram of degrees with a log-scaled count axis
ax2 = fig.add_subplot(axgrid[3:, 2:])
ax2.hist(degree_sequence, color='blue' ,edgecolor='purple',log=True)
ax2.set_title("Degree histogram")
ax2.set_xlabel("Degree")
ax2.set_ylabel("# of Nodes")

fig.tight_layout()
plt.show()

In [None]:
#Comparing the per-node clustering coefficients of the three graphs, each sorted in ascending order.
#NOTE(review): the original note reads the Spotify curve as highly clustered (many communities,
#possibly a rich club) -- that interpretation is unconfirmed.
fig, ax = plt.subplots()
# Same plotting order as before (BA, ER, Spotify) so the line colours do not change;
# the labels and legend make the three curves distinguishable.
ax.plot(sorted(all_cluster_ba.values()), label="BA model")
ax.plot(sorted(all_cluster_random.values()), label="ER model")
ax.plot(sorted(all_cluster.values()), label="Spotify network")
ax.set_title("Sorted clustering coefficients")
ax.set_xlabel("Node rank (ascending clustering coefficient)")
ax.set_ylabel("Clustering coefficient")
ax.legend()
plt.show()

In [None]:
#The remaining cells calculate the clustering coefficient and the shortest path of the null models in batches. For the
#graph sizes used here they take about 11 hours to complete on my hardware (Ryzen 5700X processor), so the list outputs
#were copied and pasted into a txt file to minimize how often these cells have to be run.

In [None]:
#DO NOT RUN!!!!, TAKES TOO LONG
#RESULT ----> 0.020000466431383424
#txt file is the repo if you want to take a look at all the data (Random_network_cluster_coefficient)

# Average clustering coefficient over a batch of independently seeded ER graphs.
N_ER_CLUSTER_GRAPHS = 100

sum_coeff = 0
list_coeff_er = []
for i in range(N_ER_CLUSTER_GRAPHS):
    seed = np.random.randint(10000000, 99999999)
    # Own variable name: assigning to `G` would replace the Spotify graph used by the cells above.
    G_er_sample = nx.gnp_random_graph(20478, 0.02, seed=seed)
    temp = nx.average_clustering(G_er_sample)
    list_coeff_er.append(temp)
    sum_coeff += temp
    # report this graph's coefficient (the original printed the running sum under this label)
    print("cluster coefficient: " + str(temp) + " index: " + str(i))

# number of samples actually collected (was initialised to 0 and never updated)
total_values = len(list_coeff_er)
avg = sum_coeff/total_values

print("sum_coeff: "+str(sum_coeff))
print("total_values "+str(total_values))
print("avg_coefficient: "+str(avg))

In [None]:
#DO NOT RUN!!!!, TAKES TOO LONG
# Average clustering coefficient over a batch of independently seeded BA graphs.
N_BA_CLUSTER_GRAPHS = 100

sum_coeff = 0
list_coeff_ba = []
for i in range(N_BA_CLUSTER_GRAPHS):
    seed = np.random.randint(10000000, 99999999)
    # Own variable name: assigning to `G` would replace the Spotify graph used by the cells above.
    G_ba_sample = nx.barabasi_albert_graph(20478, 4, seed=seed)
    temp = nx.average_clustering(G_ba_sample)
    list_coeff_ba.append(temp)
    sum_coeff += temp
    # report this graph's coefficient (the original printed the running sum under this label)
    print("cluster coefficient: " + str(temp) + " index: " + str(i))

# Divide by the number of samples collected. The original divided the sum of 100 samples
# by 10, which made the reported average ten times too large.
avg = sum_coeff/len(list_coeff_ba)

print("sum_coeff: "+str(sum_coeff))
print()
print("avg_coefficient: "+str(avg))

In [None]:
#DO NOT RUN!!!!, TAKES TOO LONG
# Average shortest path length over a batch of independently seeded ER graphs.
# For each graph the value is the mean of the per-component average shortest path lengths.
N_ER_PATH_GRAPHS = 10

avg_path = 0
list_path_er = []
for i in range(N_ER_PATH_GRAPHS):
    print("GRAPH: " + str(i) + "/////////////////////////////////")
    seed = np.random.randint(10000000, 99999999)
    print(seed)
    # Own variable name: assigning to `G` would replace the Spotify graph used by the cells above.
    G_er_sample = nx.gnp_random_graph(20478, 0.02, seed=seed)
    component_paths = []
    for C in (G_er_sample.subgraph(c).copy() for c in nx.connected_components(G_er_sample)):
        path = nx.average_shortest_path_length(C)
        list_path_er.append(path)
        print(path)
        component_paths.append(path)
    # Mean over the components actually seen. The original counter started at 1, so it divided
    # by (components + 1) and e.g. halved the value for a connected graph.
    graph_avg = sum(component_paths)/len(component_paths)
    print(graph_avg)
    avg_path += graph_avg
    print(avg_path)

avgpath = avg_path/N_ER_PATH_GRAPHS

print("total_values "+str(N_ER_PATH_GRAPHS))
# this is a path length, not a coefficient, so label it accordingly
print("avg_shortest_path: "+str(avgpath))

In [None]:
#DO NOT RUN!!!!, TAKES TOO LONG

# Average shortest path length over a batch of independently seeded BA graphs.
# For each graph the value is the mean of the per-component average shortest path lengths.
N_BA_PATH_GRAPHS = 10

avg_path = 0
list_path_ba = []
for i in range(N_BA_PATH_GRAPHS):
    print("GRAPH: " + str(i) + "/////////////////////////////////")
    seed = np.random.randint(10000000, 99999999)
    print(seed)
    # Own variable name: assigning to `G` would replace the Spotify graph used by the cells above.
    G_ba_sample = nx.barabasi_albert_graph(20478, 4, seed=seed)
    component_paths = []
    for C in (G_ba_sample.subgraph(c).copy() for c in nx.connected_components(G_ba_sample)):
        path = nx.average_shortest_path_length(C)
        list_path_ba.append(path)
        print(path)
        component_paths.append(path)
    # Mean over the components actually seen. The original counter started at 1, so it divided
    # by (components + 1) and e.g. halved the value for a connected graph.
    graph_avg = sum(component_paths)/len(component_paths)
    print(graph_avg)
    avg_path += graph_avg
    print(avg_path)

avgpath = avg_path/N_BA_PATH_GRAPHS

print("total_values "+str(N_BA_PATH_GRAPHS))
# this is a path length, not a coefficient, so label it accordingly
print("avg_shortest_path: "+str(avgpath))

In [None]:
#Mean and population standard deviation of the clustering-coefficient and shortest-path
#samples collected for each null model (ER and BA).
def _mean(samples):
    """Return the arithmetic mean of `samples`, computed as sum / count."""
    return sum(samples)/len(samples)


_sample_lists = (list_coeff_er, list_coeff_ba, list_path_er, list_path_ba)

res_coef_er, res_coef_ba, res_path_er, res_path_ba = [
    statistics.pstdev(samples) for samples in _sample_lists
]
avg_coef_er, avg_coef_ba, avg_path_er, avg_path_ba = [
    _mean(samples) for samples in _sample_lists
]

# size and total of the BA path samples, framed by blank lines
print()
print(len(list_path_ba))
print(sum(list_path_ba))
print()

# each statistic is printed as a label line followed by a value line
_report = (
    ("Standard dev Coeff er:", res_coef_er),
    ("Standard dev Coeff ba: ", res_coef_ba),
    ("Standard dev Path er", res_path_er),
    ("Standard dev Path ba", res_path_ba),
    ("Average Coeff er", avg_coef_er),
    ("Average Coeff ba", avg_coef_ba),
    ("Average Path er", avg_path_er),
    ("Average Path ba", avg_path_ba),
)
for _label, _value in _report:
    print(_label)
    print(_value)

In [None]:
# Dump the raw samples: one average clustering coefficient per generated ER graph.
print(list_coeff_er)

In [None]:
# Dump the raw samples: one average clustering coefficient per generated BA graph.
print(list_coeff_ba)

In [None]:
# Dump the raw samples: one average shortest path length per connected component of each generated ER graph.
print(list_path_er)

In [None]:
# Dump the raw samples: one average shortest path length per connected component of each generated BA graph.
print(list_path_ba)