# ADM - HW5 - The Marvel Universe!

## Libraries & Setup

In [2]:
import networkx as nx   # Network analysis

import pandas as pd   # Data manipulation and analysis
import numpy as np    # Scientific Computing

import matplotlib.pyplot as plt   # Visualization

import warnings
# NOTE(review): with no category/module arguments this filter silences every warning
# raised after this cell runs (deprecations and pandas warnings included).
# Consider narrowing it to the specific categories that are known to be noisy.
warnings.filterwarnings('ignore')

#import functions

# 1. Data

## Data Preprocessing

In [3]:
# Read the three source tables in one pass:
#   nodes.csv        -> nodes_df         (node name + type)
#   edges.csv        -> edges_df         (hero, comic)
#   hero-network.csv -> hero_network_df  (hero1, hero2)
nodes_df, edges_df, hero_network_df = (
    pd.read_csv(csv_path)
    for csv_path in ("nodes.csv", "edges.csv", "hero-network.csv")
)

In [4]:
# Preview the nodes table: one row per node, labelled "comic" or "hero" in the `type` column
nodes_df

Unnamed: 0,node,type
0,2001 10,comic
1,2001 8,comic
2,2001 9,comic
3,24-HOUR MAN/EMMANUEL,hero
4,3-D MAN/CHARLES CHAN,hero
...,...,...
19085,"ZOTA, CARLO",hero
19086,ZOTA,hero
19087,ZURAS,hero
19088,ZURI,hero


In [5]:
# Preview the edges table: one row per (hero, comic) appearance
edges_df

Unnamed: 0,hero,comic
0,24-HOUR MAN/EMMANUEL,AA2 35
1,3-D MAN/CHARLES CHAN,AVF 4
2,3-D MAN/CHARLES CHAN,AVF 5
3,3-D MAN/CHARLES CHAN,COC 1
4,3-D MAN/CHARLES CHAN,H2 251
...,...,...
96099,ZZZAX,H2 326
96100,ZZZAX,H2 327
96101,ZZZAX,M/CP 8/4
96102,ZZZAX,PM 47


In [6]:
# Preview the hero network table: one row per (hero1, hero2) pair; pairs can repeat
hero_network_df

Unnamed: 0,hero1,hero2
0,"LITTLE, ABNER",PRINCESS ZANDA
1,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
2,BLACK PANTHER/T'CHAL,PRINCESS ZANDA
3,"LITTLE, ABNER",PRINCESS ZANDA
4,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
...,...,...
574462,COLOSSUS II/PETER RA,CALLISTO
574463,CALLISTO,ROGUE /
574464,CALLISTO,CALIBAN/
574465,CALIBAN/,ROGUE /


For the data to be *consistent* across all the dataframes, **all hero names** appearing in the *hero_network* dataframe should **also be found** in the *edges* dataframe

In [7]:
# Retrieve heroes' names appearing in "hero_network" dataframe, but not in "edges" dataframe
def check_inconsistency(network_df=None, appearances_df=None):
    """Return the hero names found in the hero network but missing from the edges table.

    Parameters
    ----------
    network_df : DataFrame, optional
        Frame with ``hero1`` and ``hero2`` columns. Defaults to the notebook-level
        ``hero_network_df`` (looked up at call time, so later fixes are picked up).
    appearances_df : DataFrame, optional
        Frame with a ``hero`` column. Defaults to the notebook-level ``edges_df``.

    Returns
    -------
    set of str or str
        The set of names present in ``hero1``/``hero2`` but absent from ``hero``,
        or the message ``"No inconsistencies found!"`` when that set is empty.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if network_df is None:
        network_df = hero_network_df
    if appearances_df is None:
        appearances_df = edges_df

    network_heroes = set(network_df.hero1) | set(network_df.hero2)
    missing_names = network_heroes - set(appearances_df.hero)
    return missing_names if missing_names else "No inconsistencies found!"

In [8]:
# Show up to 10 of the inconsistent names.
# check_inconsistency() returns a set, which has no defined order, so the exact names
# listed here can differ from one kernel session to the next.
# NOTE(review): once no inconsistencies remain, the function returns a message string
# instead of a set, and list() would then split that string into single characters.
list(check_inconsistency())[:10]

['KLIGGER/SEN. EUGENE ',
 'WORTHINGTON, WARREN ',
 'LIVING LASER/ARTHUR ',
 'CHI DEMON/PROFESSOR ',
 'SILVERCLAW/LA GARRA ',
 'BRADDOCK, JAMES JR. ',
 'CEREBRO MARK XIII | ',
 'MR. SINISTER/NATHAN ',
 'SPAR, DR. KATHERINE ',
 'KILLER SHRIKE/SIMON ']

**Inconsistency reason 1:** some heroes' names in the *hero_network* dataframe have **extra spaces** at the end of their names

In [9]:
# Remove leading/trailing whitespace from both hero columns using the vectorized
# string accessor. Unlike a Python-level loop that calls x.strip() on every element,
# .str.strip() leaves missing values (NaN) untouched instead of raising AttributeError.
hero_network_df["hero1"] = hero_network_df["hero1"].str.strip()
hero_network_df["hero2"] = hero_network_df["hero2"].str.strip()

In [10]:
# Re-run the check now that surrounding whitespace has been stripped from hero_network names
check_inconsistency()

{'BLADE', 'SABRE', 'SPIDER-MAN/PETER PAR'}

**Inconsistency reason 2:** some heroes in the *hero_network* dataframe **are missing a "/"** at the end of their names. In fact, *BLADE* and *SABRE* are labelled as **comics** in the *nodes* dataframe, while *BLADE/* and *SABRE/* represent the actual **heroes**

In [11]:
# Restore the trailing "/" on the two hero names, in both hero columns
missing_slash_fixes = {"BLADE": "BLADE/", "SABRE": "SABRE/"}
for hero_column in ("hero1", "hero2"):
    for truncated_name, hero_name in missing_slash_fixes.items():
        needs_fix = hero_network_df[hero_column] == truncated_name
        hero_network_df.loc[needs_fix, hero_column] = hero_name

In [12]:
# Re-run the check after restoring the trailing "/" on BLADE and SABRE
check_inconsistency()

{'SPIDER-MAN/PETER PAR'}

**Inconsistency reason 3:** the hero name *"SPIDER-MAN/PETER PARKER"* in the *edges* dataframe has been truncated to *"SPIDER-MAN/PETER PAR"* in the *hero_network* dataframe

In [13]:
# Replace the shortened Spider-Man name with the full spelling, in both hero columns
for hero_column in ("hero1", "hero2"):
    is_truncated = hero_network_df[hero_column] == "SPIDER-MAN/PETER PAR"
    hero_network_df.loc[is_truncated, hero_column] = "SPIDER-MAN/PETER PARKER"

In [14]:
# Final check after the Spider-Man fix: reports which hero_network names, if any, are still absent from edges_df
check_inconsistency()

'No inconsistencies found!'

We also remove entries of the *hero_network* dataframe having the same hero in **both columns** (in order to avoid **self-loops** in the graph)

In [15]:
# Number of hero_network records before removing self-loops (rows where hero1 == hero2)
len(hero_network_df)

574467

In [16]:
# Flag the rows pairing a hero with itself, then drop them in place by index label
is_self_loop = hero_network_df["hero1"] == hero_network_df["hero2"]
hero_network_df.drop(index=hero_network_df.index[is_self_loop], inplace=True)

In [17]:
# Number of hero_network records left after removing self-loops
len(hero_network_df)

572235

Finally, we identify and correct the **spelling mistake** in the hero name *"SPIDER-MAN/PETER PARKER"* present in the *nodes* dataframe (listed as *"SPIDER-MAN/PETER PARKERKER"*)

In [18]:
# Correct the misspelled Spider-Man entry in the nodes table
is_misspelled = nodes_df["node"] == "SPIDER-MAN/PETER PARKERKER"
nodes_df.loc[is_misspelled, "node"] = "SPIDER-MAN/PETER PARKER"

## Graphs setup

### First graph: G1 - Heroes appearing in the same comic together

The graph **G1** should be **undirected**, **weighted** and should allow for **parallel edges**.

**Nodes**: *heroes*

**Edge** between node *Hero_A* and *Hero_B*: the heroes have appeared in the same comic together

**Number of edges** between node *Hero_A* and *Hero_B* = $num\_collaborations$ of the two heroes

**Edge weight**: the *reciprocal* of the *number of collaborations* of the two heroes

$$w((Hero\_A, Hero\_B)) = \frac{1}{num\_collaborations \; of \; Hero\_A \; and \; Hero\_B}$$

In [19]:
# NetworkX MultiGraph: undirected graph that can store several parallel edges
# between the same two nodes (here: one edge per collaboration of a hero pair)
G1 = nx.MultiGraph()

In [20]:
# The graph is undirected, so (A, B) and (B, A) are the same pair: order the two
# names alphabetically within every record so that both spellings land in one group.
pair_columns = ["hero1", "hero2"]
hero_network_df[pair_columns] = np.sort(hero_network_df[pair_columns].to_numpy(), axis=1)

# Count the records of each distinct pair -> list of [hero1, hero2, count]
pair_counts = hero_network_df.groupby(pair_columns).size()
heroes_pairs = pair_counts.reset_index().values.tolist()

In [21]:
# First 10 entries, each of the form [hero1, hero2, number of collaborations]
heroes_pairs[:10]

[['24-HOUR MAN/EMMANUEL', 'FROST, CARMILLA', 1],
 ['24-HOUR MAN/EMMANUEL', "G'RATH", 1],
 ['24-HOUR MAN/EMMANUEL', 'KILLRAVEN/JONATHAN R', 1],
 ['24-HOUR MAN/EMMANUEL', "M'SHULLA", 1],
 ['24-HOUR MAN/EMMANUEL', 'OLD SKULL', 1],
 ['3-D MAN/CHARLES CHAN', 'AJAK/TECUMOTZIN [ETE', 1],
 ['3-D MAN/CHARLES CHAN', 'ANGEL/WARREN KENNETH', 1],
 ['3-D MAN/CHARLES CHAN', 'ANT-MAN II/SCOTT HAR', 1],
 ['3-D MAN/CHARLES CHAN', 'ANT-MAN/DR. HENRY J.', 2],
 ['3-D MAN/CHARLES CHAN', 'ARABIAN KNIGHT/ABDUL', 1]]

In [22]:
# Build weighted edges: a pair with n collaborations yields n parallel edges,
# each carrying the weight 1/n (rounded to 4 decimals)
heroes_collaboration = [
    (hero_a, hero_b, round(1 / n_collaborations, 4))
    for hero_a, hero_b, n_collaborations in heroes_pairs
    for _ in range(n_collaborations)
]

In [23]:
# Add edges to graph.
# G1 is a MultiGraph, so calling add_weighted_edges_from() a second time would append
# another copy of every parallel edge. Emptying the graph first keeps this cell
# idempotent when it is re-run without recreating G1.
G1.clear()
G1.add_weighted_edges_from(heroes_collaboration)

### Second graph: G2 - Hero appearing in a specific comic

The graph **G2** should be **undirected** and **unweighted**.

**Nodes**: *heroes* and *comics*

**Edge** between node *Hero_A* and *Comic_A*: the hero has appeared in that specific comic

In [24]:
# NetworkX Graph: base undirected graph without parallel edges
# (at most one edge between a given hero and a given comic)
G2 = nx.Graph()

In [25]:
# Add every node to the graph, storing its second column as the "type" attribute
typed_nodes = [
    (row[0], {"type": row[1]})
    for row in nodes_df.itertuples(index=False, name=None)
]
G2.add_nodes_from(typed_nodes)

In [26]:
# Connect each hero to every comic it appears in (one edge per row of edges_df)
hero_comic_pairs = list(edges_df.itertuples(index=False, name=None))
G2.add_edges_from(hero_comic_pairs)

# 2. Backend Implementation

# 3. Frontend Implementation