# Marvel Network

Now that we have a clean dataset, we can actually build the network.

In [None]:
import pandas as pd
import numpy as np

import networkx as nx

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from tqdm.notebook import tqdm

import ast

from fa2 import ForceAtlas2

In [None]:
# Adds `progress_apply` to pandas objects; later cells call it to show progress bars.
tqdm.pandas()

In [None]:
characters_df = pd.read_csv("../data/marvel_characters.csv")

# These columns hold Python literals serialised as text (presumably lists --
# later cells iterate over them and take their length); parse them back.
literal_columns = ("links", "leader", "member", "ally", "enemy")
for literal_column in literal_columns:
  characters_df[literal_column] = characters_df[literal_column].progress_apply(ast.literal_eval)

characters_df

In [None]:
def connect_node(row, g):
  """Register one character on the graph.

  The row's `name` becomes a node carrying the `leader`, `member`, `ally`
  and `enemy` values as node attributes, and one edge is added from that
  node to every entry of the row's `links`.

  Args:
    row: record indexable by key (e.g. a DataFrame row) with the fields above.
    g: graph object exposing `add_node` and `add_edge`; it is modified in place.
  """
  name = row["name"]
  attributes = {key: row[key] for key in ("leader", "member", "ally", "enemy")}
  g.add_node(name, **attributes)

  for target in row["links"]:
    g.add_edge(name, target)

def createGraph(df, directed=False):
  """Build a graph from a table of characters.

  Every row of `df` is handed to `connect_node`, which adds the character
  and its links to the graph.

  Args:
    df: DataFrame whose rows carry the fields `connect_node` reads.
    directed: when truthy an `nx.DiGraph` is built, otherwise an `nx.Graph`.

  Returns:
    The populated graph.
  """
  graph = nx.DiGraph() if directed else nx.Graph()

  # Called only for its side effect on `graph`; the per-row results are discarded.
  df.progress_apply(connect_node, g=graph, axis=1)

  return graph

In [None]:
# Undirected graph (createGraph's default) built from every row of characters_df.
g = createGraph(characters_df)

Now that we have a preliminary graph, we can look at its degree distribution and its number of nodes and edges to start understanding it.

In [None]:
# Materialise the degree view once; it is reused for the average and the maximum.
degree_by_node = dict(g.degree)

print("Graph basic stats:")
print(f"\tNumber of nodes: {len(g.nodes)}")
print(f"\tNumber of edges: {len(g.edges)}")
print(f"\tAverage degree: {sum(degree_by_node.values())/len(degree_by_node):.2f}")
print()

# On ties `max` keeps the first node in graph order.
top_node, top_degree = max(degree_by_node.items(), key=lambda item: item[1])
print(f"\tMost connected node: {top_node} \
with a degree of {top_degree}")

Another relevant piece of information that can be easily obtained is the top 5 characters with the largest number of links on their wiki page.

In [None]:
# Rows sorted by `number_links`, largest first; `head()` keeps only the first rows.
characters_df.sort_values(by=["number_links"], ascending=False).head()

We can display information about the teams too.

In [None]:
def _node_with_longest(graph, attribute):
  """Return the node of `graph` whose `attribute` value is the longest.

  The attribute mapping is fetched a single time. Calling
  `nx.get_node_attributes` inside the `max` key rebuilds the full mapping
  for every node, which makes the scan quadratic in the number of nodes.

  Args:
    graph: networkx graph whose nodes carry `attribute`.
    attribute: name of a node attribute holding a sized value.

  Returns:
    The first node (in graph order) with the longest attribute value.

  Raises:
    KeyError: if some node of `graph` does not carry `attribute`.
  """
  values_by_node = nx.get_node_attributes(graph, attribute)
  return max(graph.nodes, key=lambda node: len(values_by_node[node]))


print("The Leader:")
print(f"\tThe character that is the leader in most team is \
{_node_with_longest(g, 'leader')}")
print()

print("The Team player")
print(f"\tThe character that belong to more teams is \
{_node_with_longest(g, 'member')}")
print()

print("The Likeable")
print(f"\tThe character that is allied to more teams is \
{_node_with_longest(g, 'ally')}")
print()

print("The Confrontational")
print(f"\tThe character that is estranged to more teams is \
{_node_with_longest(g, 'enemy')}")



In [None]:
def _team_list_sizes(row):
  """Return the sizes of a row's four team lists plus their distinct total.

  The lists are read by column position (3 to 6) through `.iloc`. Those are
  the same positions the previous `row[3]` ... `row[6]` lookups resolved to,
  but integer keys on a string-labelled Series are a deprecated positional
  fallback in recent pandas, so the positional access is now explicit.

  NOTE(review): positions 3-6 are presumably the `leader`, `member`, `ally`
  and `enemy` columns -- confirm against the CSV column order.

  Args:
    row: one DataFrame row (a Series) whose positions 3-6 hold lists.

  Returns:
    A Series of five integers: the four list lengths followed by the number
    of distinct entries across all four lists.
  """
  first, second, third, fourth = (row.iloc[position] for position in range(3, 7))
  return pd.Series([
    len(first),
    len(second),
    len(third),
    len(fourth),
    len(set(first + second + third + fourth)),
  ])


characters_df[["len_leaders", "len_members",
               "len_ally", "len_enemy", "total_len"]] = characters_df.progress_apply(_team_list_sizes, axis=1)

characters_df.head()

In [None]:
# First rows when sorting by `len_leaders`, largest first.
characters_df.sort_values("len_leaders", ascending=False).head()

In [None]:
# First rows when sorting by `len_members`, largest first.
characters_df.sort_values("len_members", ascending=False).head()

In [None]:
# First rows when sorting by `len_ally`, largest first.
characters_df.sort_values("len_ally", ascending=False).head()

In [None]:
# First rows when sorting by `len_enemy`, largest first.
characters_df.sort_values("len_enemy", ascending=False).head()

In [None]:
# First rows when sorting by `total_len` (distinct entries across the four team lists), largest first.
characters_df.sort_values("total_len", ascending=False).head()

In [None]:
# Same `number_links` ranking as shown earlier, repeated alongside the team-based rankings.
characters_df.sort_values("number_links", ascending=False).head()

In [None]:
# Nodes of `g` ranked by degree (highest first), shown as a Name/degree table; only the first rows are displayed.
pd.DataFrame(sorted(g.degree, key=lambda x: x[1], reverse=True), columns=["Name", "degree"]).head()

## Colors

Just colors to paint the graph later

In [None]:
# RGB colour (0-255 per channel) for each team name. Later cells look teams up
# here to tint graph nodes, and `cmap` below reuses the same colours for the
# histogram bars.
colorPallette = {
  "Avengers": (235, 73, 49),
  "X-Men": (46, 93, 188),
  "Illuminati": (192, 88, 132),
  "Inhuman Royal Guard": (18, 115, 39),
  "Guardians of the Galaxy": (18, 91, 115),
  "Avengers (1,000,000 BC)": (239, 127, 72),
  "Sinister Six": (70, 89, 67),
  "Thunderbolts": (172, 169, 41),
  "Elders of the Universe": (60, 91, 168),
  "Young Avengers": (250, 172, 158),
  "Dark Avengers": (117, 31, 16),
  "Fantastic Four": (44, 159, 253),
  "Strategic Homeland Intervention, Enforcement and Logistics Division": (32, 57, 158),
  "Defenders": (251, 226, 40),
  "Hydra": (71, 137, 70),
  "Black Order": (48, 50, 61),
  "Cabal (Dark Illuminati)": (69, 33, 80),
  "Hand": (80, 77, 33),
  "Heralds of Galactus": (67, 155, 125),
  "Winter Guard": (203, 234, 223)
}

# The palette as a matplotlib colormap, channels rescaled from 0-255 to 0-1,
# in the dictionary's insertion order.
cmap = ListedColormap([(x[0]/255, x[1]/255, x[2]/255) for x in colorPallette.values()])

We can extract the Giant Connected Component (GCC) from the graph.

In [None]:
# Pick the connected component with the most nodes, then take the subgraph of `g` restricted to it.
largest_component = max(nx.connected_components(g), key=len)
gcc = g.subgraph(largest_component)

gcc_node_count, gcc_edge_count = len(gcc.nodes), len(gcc.edges)
print(f"Number of nodes in the GCC: {gcc_node_count}\nNumber of links in GCC: {gcc_edge_count}")

In [None]:
# Configure the ForceAtlas2 layout algorithm and compute a position for every
# node of the GCC; the result is passed to the drawing calls further down.
forceatlas2 = ForceAtlas2(
                          # Behavior alternatives
                          outboundAttractionDistribution=True,  # Dissuade hubs
                          linLogMode=False,  # NOT IMPLEMENTED
                          adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                          edgeWeightInfluence=1.0,

                          # Performance
                          jitterTolerance=1.0,  # Tolerance
                          barnesHutOptimize=True,
                          barnesHutTheta=1.2,
                          multiThreaded=False,  # NOT IMPLEMENTED

                          # Tuning
                          scalingRatio=2.0,
                          strongGravityMode=False,
                          gravity=1.0,

                          # Log
                          verbose=True)
# NOTE(review): with pos=None the layout presumably starts from random
# positions, so the picture may differ between runs -- confirm, and consider
# seeding the random generator if a reproducible figure is needed.
positions = forceatlas2.forceatlas2_networkx_layout(gcc, pos=None, iterations=2000)

In [None]:
# Per-node drawing attributes, filled in the iteration order of `gcc.nodes`.
colors, sizes, alphas = [], [], []

degree_of = dict(gcc.degree())
max_degree = max(degree_of.values())

for node in tqdm(gcc.nodes):
  node_degree = degree_of[node]
  # Marker area grows linearly with degree; opacity is the degree relative to
  # the busiest node, floored at 0.2 so low-degree nodes stay visible.
  size = node_degree * 7 + 3
  alpha = max([node_degree/max_degree, .2])

  # Node colour is the average of the colours of the teams the character is a
  # member of; a character with no team stays black.
  teams = gcc.nodes[node]["member"]
  blended = sum((np.array(colorPallette[team])/len(teams) for team in teams),
                np.array([0, 0, 0]))

  colors.append(blended/255)
  sizes.append(size)
  alphas.append(alpha)

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))

# Nodes: size, colour and opacity come from the per-node lists computed earlier.
node_style = dict(linewidths=1, node_size=sizes, node_color=colors, alpha=alphas)
nx.draw_networkx_nodes(gcc, positions, ax=ax, **node_style)

# Edges: thin, faint black lines so they do not hide the nodes.
edge_style = dict(edge_color="black", arrowstyle="-", alpha=0.1, width=.5)
nx.draw_networkx_edges(gcc, positions, ax=ax, **edge_style)

# Hide the axes frame and ticks; only the network itself is of interest.
ax.axis("off")

plt.show()

## Degree distribution

In [None]:
# Directed version of the network, built from the same table.
dg = createGraph(characters_df, directed=True)

# Keep the largest weakly connected component (edge direction is ignored when
# grouping nodes into components).
largest_weak_component = max(nx.weakly_connected_components(dg), key=len)
dgcc = dg.subgraph(largest_weak_component)

In [None]:
def _tint_bars(bin_edges, bar_patches, cycles):
  """Colour histogram bars by position along the x axis.

  Each bar's bin centre is rescaled to the range 0..`cycles` and wrapped
  modulo 1, so the colours of `cmap` repeat about `cycles` times from the
  first bar to the last.
  """
  centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
  offsets = centers - min(centers)
  fractions = (offsets / max(offsets) * cycles) % 1
  for fraction, patch in zip(fractions, bar_patches):
    plt.setp(patch, 'facecolor', cmap(fraction))


fig, ((in_ax, out_ax), (in_ax_bp, out_ax_bp)) = plt.subplots(2, 2, figsize=(20, 15))
fig.suptitle("Degree distribution")

in_degrees = dict(dgcc.in_degree()).values()
out_degrees = dict(dgcc.out_degree()).values()

# Top row: histograms -- (axes, values, number of bins, title, colour cycles).
histogram_specs = [
  (in_ax, in_degrees, 50, "In degree distribution", 5),
  (out_ax, out_degrees, 20, "Out degree distribution", 2),
]
for axis, values, bin_count, title, cycles in histogram_specs:
  counts, bins, bars = axis.hist(values, bins=bin_count)
  axis.set_title(title)
  axis.set_xlabel("Degree")
  axis.set_ylabel("Count")
  _tint_bars(bins, bars, cycles)

# Bottom row: horizontal box plots -- (axes, values, tick label, title).
boxplot_specs = [
  (in_ax_bp, in_degrees, "In degree", "Box plot of the In Degree"),
  (out_ax_bp, out_degrees, "Out degree", "Box plot of the Out Degree"),
]
for axis, values, label, title in boxplot_specs:
  axis.boxplot(values, vert=False, labels=[label])
  axis.set_title(title)
  axis.set_xlabel("Degree")

plt.show()

In [None]:
fig, (hist_ax, bp_ax) = plt.subplots(2, 1, figsize=(10, 15))

# Total degree (in + out) of every node in the directed component.
degrees = dict(dgcc.degree()).values()

counts, bins, bars = hist_ax.hist(degrees, bins=50)
hist_ax.set(title="Total Degree distribution", xlabel="Degree", ylabel="Count")

# Colour each bar by its position along the x axis: bin centres are rescaled
# to 0..5 and wrapped modulo 1, so the colours of `cmap` repeat about five times.
centers = (bins[:-1] + bins[1:]) * 0.5
shifted = centers - centers.min()
shades = (shifted / shifted.max() * 5) % 1
for bar, shade in zip(bars, shades):
  bar.set_facecolor(cmap(shade))

bp_ax.boxplot(degrees, vert=False, labels=["Total Degree"])
bp_ax.set(title="Box plot of the Total Degree", xlabel="Degree")

plt.show()

Some of the earlier statistics are repeated here, this time computed on the DGCC (Directed Giant Connected Component), i.e. the largest weakly connected component of the directed graph.

In [None]:
print(f"Number of nodes in network: \t {dgcc.number_of_nodes()}")
print(f"Number of links🔗: \t\t {dgcc.number_of_edges()}")

# Nodes ranked by in-, out- and total degree, highest first (ties keep graph order).
sorted_in  = sorted(dgcc.nodes, key=lambda n: dgcc.in_degree(n), reverse=True)
sorted_out = sorted(dgcc.nodes, key=lambda n: dgcc.out_degree(n), reverse=True)
sorted_tot = sorted(dgcc.nodes, key=lambda n: dgcc.degree(n), reverse=True)

# Leader of each ranking, reported with both its in- and out-degree.
top_in, top_out, top_tot = sorted_in[0], sorted_out[0], sorted_tot[0]

print(f"Most connected in:\t {top_in}\n\tin👈:\t{dgcc.in_degree(top_in)}\n\tout👉:\t{dgcc.out_degree(top_in)}")
print(f"Most connected out:\t {top_out}\n\tin👈:\t{dgcc.in_degree(top_out)}\n\tout👉:\t{dgcc.out_degree(top_out)}")
print(f"Most connected overall:\t {top_tot}\n\tin👈:\t{dgcc.in_degree(top_tot)}\n\tout👉:\t{dgcc.out_degree(top_tot)}")