### Comparison of real network and random network

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

# Load the artist table and the edge list from disk.
combined = pd.read_csv('/kaggle/input/music-data/combined_dataset.csv')
edges_df = pd.read_csv('/kaggle/working/filtered_edges.csv')

# Real graph: keep only the edges whose two endpoints both occur as a
# spotify_id in the artist table, then build an undirected graph from them.
real_ids = set(combined['spotify_id'])
source_known = edges_df['id_0'].isin(real_ids)
target_known = edges_df['id_1'].isin(real_ids)
filtered_edges = edges_df[source_known & target_known]
G_real = nx.from_pandas_edgelist(filtered_edges, source='id_0', target='id_1')

# Size of the real graph; the random graphs generated later reuse these.
n_nodes, n_edges = G_real.number_of_nodes(), G_real.number_of_edges()

# Degree distribution of the real graph: real_deg_count[k] is the number of
# nodes with degree k. Only degrees that actually occur are kept for plotting.
real_degrees = [degree for _, degree in G_real.degree()]
real_deg_count = np.bincount(real_degrees)
real_x = np.flatnonzero(real_deg_count)
real_y = real_deg_count[real_x]

In [None]:
# Random graphs: sample 100 G(n, m) graphs with the same number of nodes and
# edges as the real graph and record each sample's degree histogram.
degree_vals = defaultdict(list)

trial_counts = []
for _ in range(100):
    G_rand = nx.gnm_random_graph(n_nodes, n_edges)
    degs = [d for _, d in G_rand.degree()]
    trial_counts.append(np.bincount(degs))

# np.bincount returns an array whose length is that sample's maximum degree
# plus one, so the histograms differ in length between samples. Zero-pad them
# to a common length so that every degree receives exactly one value per
# sample; otherwise the statistics for high degrees would be computed only
# over the samples that happened to reach them, silently dropping the zero
# counts and inflating the mean.
n_bins = max(len(counts) for counts in trial_counts)
for counts in trial_counts:
    padded = np.pad(counts, (0, n_bins - len(counts)))
    for k, val in enumerate(padded):
        degree_vals[k].append(val)

# Average and std dev of the node count at each degree, across all samples.
avg_x = sorted(degree_vals)
avg_y = [np.mean(degree_vals[k]) for k in avg_x]
std_y = [np.std(degree_vals[k]) for k in avg_x]

### With 100 random networks

In [None]:
fig, ax = plt.subplots(figsize=(10, 7))

# Real graph: one point per degree that occurs in the graph.
ax.scatter(real_x, real_y, color='blue', alpha=0.8, label='Real Graph')

# Random graphs: mean node count per degree, with the standard deviation
# across the sampled graphs drawn as error bars.
avg_y, std_y = np.array(avg_y), np.array(std_y)
ax.errorbar(
    avg_x,
    avg_y,
    yerr=std_y,
    fmt='x',
    color='red',
    alpha=0.7,
    label='Random Graph (mean ± std)',
)

# Log-log axes.
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Degree (log scale)")
ax.set_ylabel("Frequency (log scale)")
ax.set_title("Degree Distribution: Real vs Avg Random Graphs (with Std Dev)")
ax.legend()
ax.grid(True, which="both", ls="--", linewidth=0.5)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

combined = pd.read_csv('/kaggle/input/music-data/combined_dataset.csv')
edges_df = pd.read_csv('/kaggle/working/filtered_edges.csv')

# Real graph: restrict the edge list to edges whose endpoints are both
# present as a spotify_id in the artist table.
real_ids = set(combined['spotify_id'])
both_known = edges_df['id_0'].isin(real_ids) & edges_df['id_1'].isin(real_ids)
filtered_edges = edges_df.loc[both_known]
G_real = nx.from_pandas_edgelist(filtered_edges, source='id_0', target='id_1')

# Random graph with the same number of nodes and edges as the real one.
n_nodes = G_real.number_of_nodes()
n_edges = G_real.number_of_edges()
G_random = nx.gnm_random_graph(n_nodes, n_edges)

# Give the random graph the real graph's node labels, pairing nodes in
# iteration order, so the two graphs share one label set for comparison.
mapping = dict(zip(G_random.nodes(), G_real.nodes()))
G_random = nx.relabel_nodes(G_random, mapping)

### With one random network

In [None]:

def plot_degree_distribution(G, label, color):
    """Scatter the degree histogram of ``G`` onto the current pyplot axes.

    Args:
        G: Graph-like object whose ``degree()`` yields ``(node, degree)``
            pairs.
        label: Legend label for the plotted points.
        color: Matplotlib color for the plotted points.

    Only degrees held by at least one node are drawn. The function does not
    create a figure or call ``plt.show()``; the caller does both.
    """
    node_degrees = [deg for _, deg in G.degree()]
    histogram = np.bincount(node_degrees)
    # Skip empty bins so that only degrees that actually occur are plotted.
    observed = np.flatnonzero(histogram)
    plt.scatter(
        observed,
        histogram[observed],
        color=color,
        marker='o',
        alpha=0.6,
        label=label,
    )

fig, ax = plt.subplots(figsize=(8, 6))

# Draw both degree distributions on the same axes, real graph first.
for graph, legend_label, point_color in (
    (G_real, 'Real Graph', 'blue'),
    (G_random, 'Random Graph', 'red'),
):
    plot_degree_distribution(graph, legend_label, point_color)

# Log-log axes.
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Degree (log)")
ax.set_ylabel("Frequency (log)")
ax.set_title("Degree Distribution: Real vs Random Graph")
ax.legend()
ax.grid(True, which="both", ls="--", linewidth=0.5)
plt.show()

### MAX, MIN and AVG degree in the graph

In [None]:
# Map every node of the real graph to its degree.
degrees = dict(G_real.degree())

# Pick the (node, degree) pairs with the largest and smallest degree. On
# ties, max()/min() return the first such node in the dict's iteration order.
max_degree_node, _ = max(degrees.items(), key=lambda pair: pair[1])
min_degree_node, _ = min(degrees.items(), key=lambda pair: pair[1])

print(f"Max degree: {degrees[max_degree_node]} (Node ID: {max_degree_node})")
print(f"Min degree: {degrees[min_degree_node]} (Node ID: {min_degree_node})")


In [None]:
# Average degree = sum of all node degrees / number of nodes.
# n_nodes comes from the cell that built G_real.
total_degree = sum(deg for _, deg in G_real.degree())
average_degree = total_degree / n_nodes

print(f"Average Degree: {average_degree:.2f}")

### Top 10 Artists by followers

In [None]:
import pandas as pd

combined = pd.read_csv('/kaggle/input/music-data/combined_dataset.csv')

# Order artists from most to fewest followers and keep the first ten rows.
by_followers = combined.sort_values('followers', ascending=False)
top_artists = by_followers.iloc[:10]

# Show only name and follower count, renumbered from 0.
print(top_artists.loc[:, ['name', 'followers']].reset_index(drop=True))


### Top 10 Artists by their popularity

In [None]:
# Order artists from highest to lowest popularity and keep the first ten rows.
by_popularity = combined.sort_values('popularity', ascending=False)
top_popular_artists = by_popularity.iloc[:10]

# Show only name and popularity, renumbered from 0.
popularity_table = top_popular_artists.loc[:, ['name', 'popularity']]
print(popularity_table.reset_index(drop=True))
