In [None]:
import pandas as pd

In [None]:
# Read the processed Marvel movie table from disk into a DataFrame
file_path = 'marvel_movies_processed.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows; as the last expression of the cell this is
# what the notebook renders as the cell output
data.head(n=5)

In [None]:
# Print pandas' built-in summary of the frame loaded above
data.info()

# Render the leading rows as the cell output
data.head(n=5)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Build a weighted character co-appearance graph: two characters are linked
# when they are listed on the same row of `data`, and the edge weight counts
# how many rows list them together.
G = nx.Graph()

for index, row in data.iterrows():
    raw_characters = row['Characters']

    # A missing cast is read by pandas as NaN (a float), which has no
    # .split(); skip such rows instead of raising AttributeError.
    if not isinstance(raw_characters, str):
        continue

    # Split on the bare comma and strip each name, so "A,B" and "A, B" give
    # the same node names (same tokenisation as the later cells of this
    # notebook). dict.fromkeys removes repeated names while keeping order, so
    # a character listed twice neither links to itself nor double-counts.
    movie_characters = list(dict.fromkeys(
        name.strip() for name in raw_characters.split(',')
    ))

    # Visit every unordered pair exactly once
    for i, char1 in enumerate(movie_characters):
        for char2 in movie_characters[i + 1:]:
            if G.has_edge(char1, char2):
                G[char1][char2]['weight'] += 1
            else:
                G.add_edge(char1, char2, weight=1)

# Draw the network. The layout is seeded so that re-running the cell
# reproduces the same picture instead of a new random arrangement.
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, k=0.3, seed=42)
nx.draw(G, pos, with_labels=True, node_size=50, font_size=10)
plt.show()

In [None]:
# Calculate centrality measures for every node of G
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Top 10 characters by degree centrality.
# A notebook only echoes the LAST expression of a cell, so a bare
# `sorted(...)` in the middle of the cell is computed and then thrown away.
# Keep the ranking in a variable and print it explicitly.
top_degree_characters = sorted(
    degree_centrality.items(), key=lambda x: x[1], reverse=True
)[:10]
for character_name, centrality_score in top_degree_characters:
    print(f'{character_name}: {centrality_score:.4f}')

# Community detection using the Girvan-Newman method
from networkx.algorithms.community import girvan_newman

communities = girvan_newman(G)
# Take the first split produced by the generator
top_level_communities = next(communities)
# Last expression of the cell: rendered as the cell output
sorted(map(sorted, top_level_communities))

In [None]:
# One record per node of G: the node name, the three centrality scores
# computed earlier, and the node's raw degree
features = [
    [
        character,
        degree_centrality[character],
        betweenness_centrality[character],
        closeness_centrality[character],
        G.degree(character),
    ]
    for character in G.nodes()
]

# Tabulate the records under descriptive column names
feature_columns = [
    'Character',
    'DegreeCentrality',
    'BetweennessCentrality',
    'ClosenessCentrality',
    'Degree',
]
features_df = pd.DataFrame(features, columns=feature_columns)

# Show the leading rows of the feature table
print(features_df.head())

In [None]:
import matplotlib.pyplot as plt

# Draw one horizontal bar chart per centrality measure, with one bar per
# character. Each entry pairs a features_df column with its display label.
centrality_plots = (
    ('DegreeCentrality', 'Degree Centrality'),
    ('BetweennessCentrality', 'Betweenness Centrality'),
    ('ClosenessCentrality', 'Closeness Centrality'),
)

for column, measure_label in centrality_plots:
    plt.figure(figsize=(10, 6))
    plt.barh(features_df['Character'], features_df[column])
    plt.xlabel(measure_label)
    plt.ylabel('Character')
    plt.title(f'{measure_label} of Marvel Characters')
    plt.show()

In [None]:
from pyvis.network import Network

# Create the pyvis network object in notebook mode
net = Network(notebook=True)

# Mirror every node of G, using the node's own name as its label
for character in G.nodes:
    net.add_node(character, label=character)

# Mirror every edge of G; the edge's 'weight' attribute is passed as the
# pyvis edge `value`
for source, target, attrs in G.edges(data=True):
    net.add_edge(source, target, value=attrs['weight'])

# Write the network to an HTML file and show it
net.show('marvel_network.html')

In [3]:
import json
import networkx as nx
import pandas as pd

# Read the processed movie table
file_path = 'marvel_movies_processed.csv'
marvel_data = pd.read_csv(file_path)

# Preview call kept from the original cell (not the last expression, so the
# notebook does not render it)
marvel_data.head()

# Graph with two kinds of nodes, tagged through the 'type' attribute:
# movies and characters. Every character is linked to each movie that
# lists it.
G = nx.Graph()

for index, row in marvel_data.iterrows():
    movie, characters = row['Movie Name'], row['Characters']

    # The movie node is added even when the row has no usable cast
    G.add_node(movie, type='movie')

    # Non-string values (e.g. a missing cast) contribute no characters
    if not isinstance(characters, str):
        continue

    char_list = [name.strip() for name in characters.split(',')]
    G.add_nodes_from(char_list, type='character')
    G.add_edges_from((movie, name) for name in char_list)

# Size of the resulting graph, rendered as the cell output
num_nodes, num_edges = G.number_of_nodes(), G.number_of_edges()

num_nodes, num_edges

(150, 263)

In [6]:
import json

# Flatten G into the node / link lists for D3.js
nodes = []
for node_id, attrs in G.nodes(data=True):
    # Anything not tagged as a movie is exported in the 'character' group
    group = 'movie' if attrs['type'] == 'movie' else 'character'
    nodes.append({'id': node_id, 'group': group})

links = []
for source, target in G.edges():
    links.append({'source': source, 'target': target})

network_data = dict(nodes=nodes, links=links)

# Write the payload to a JSON file
with open('marvel_network.json', 'w') as f:
    json.dump(network_data, f)

# Echo the output file name as the cell result
'marvel_network.json'

'marvel_network.json'

In [7]:
import json
import networkx as nx
import pandas as pd

# Reload the processed movie table
file_path = 'marvel_movies_processed.csv'
marvel_data = pd.read_csv(file_path)

# Preview call kept from the original cell (mid-cell, so nothing is rendered)
marvel_data.head()

# Start from an empty undirected graph
G = nx.Graph()

# Each row contributes one movie node plus one node and one edge per
# character named in its comma-separated 'Characters' field
for index, row in marvel_data.iterrows():
    movie = row['Movie Name']
    characters = row['Characters']

    G.add_node(movie, type='movie')

    # Only string casts can be split; anything else leaves the movie isolated
    char_list = (
        list(map(str.strip, characters.split(',')))
        if isinstance(characters, str)
        else []
    )
    for char in char_list:
        G.add_node(char, type='character')
        G.add_edge(movie, char)

# Report how large the graph is
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

(num_nodes, num_edges)

(150, 263)

In [8]:
import json
from collections import defaultdict
from itertools import combinations

# Accumulators: every character name seen, and how often each unordered
# pair of distinct names appears together on a row
nodes = set()
edges = defaultdict(int)

for index, row in marvel_data.iterrows():
    characters = row['Characters']
    # Rows whose cast is not a string contribute nothing
    if not isinstance(characters, str):
        continue

    char_list = [name.strip() for name in characters.split(',')]
    nodes.update(char_list)

    for pair in combinations(char_list, 2):
        # A name repeated within the row would pair with itself; skip it
        if pair[0] == pair[1]:
            continue
        # frozenset makes (a, b) and (b, a) the same key
        edges[frozenset(pair)] += 1

# One node record per character
node_list = []
for char in nodes:
    node_list.append({'id': char, 'group': 'character'})

# One link record per pair; each key holds exactly two names
edge_list = []
for edge, count in edges.items():
    source, target = edge
    edge_list.append({'source': source, 'target': target, 'value': count})

network_data = dict(nodes=node_list, links=edge_list)

# Write the payload to a JSON file
with open('marvel_network_characters.json', 'w') as f:
    json.dump(network_data, f)

# Echo the output file name as the cell result
'marvel_network_characters.json'

'marvel_network_characters.json'

In [10]:
import pandas as pd
import networkx as nx
import json
from collections import defaultdict
from itertools import combinations

# Read the processed movie table
df = pd.read_csv('marvel_movies_processed.csv')

# Accumulators: every character name seen, and how often each unordered
# pair of distinct names appears together on a row
nodes = set()
edges = defaultdict(int)

for index, row in df.iterrows():
    characters = row['Characters']
    # Rows whose cast is not a string contribute nothing
    if not isinstance(characters, str):
        continue

    char_list = [name.strip() for name in characters.split(',')]
    nodes.update(char_list)

    for pair in combinations(char_list, 2):
        # Skip a name paired with its own repeat
        if pair[0] == pair[1]:
            continue
        edges[frozenset(pair)] += 1

# One node record per character
node_list = [{'id': char, 'group': 'character'} for char in nodes]

# Weighted character graph: nodes first, then one edge per counted pair
G = nx.Graph()
G.add_nodes_from(node['id'] for node in node_list)

for edge, count in edges.items():
    # Each key holds exactly two names; unpack them as the edge endpoints
    G.add_edge(*edge, weight=count)

# Centrality scores keyed by character name
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Attach the three scores to every node record
for node in node_list:
    character_id = node['id']
    node['degree_centrality'] = degree_centrality[character_id]
    node['betweenness_centrality'] = betweenness_centrality[character_id]
    node['closeness_centrality'] = closeness_centrality[character_id]

# One link record per pair
edge_list = []
for edge, count in edges.items():
    source, target = edge
    edge_list.append({'source': source, 'target': target, 'value': count})

network_data = dict(nodes=node_list, links=edge_list)

# Write the payload to a JSON file
with open('marvel_network_with_metrics.json', 'w') as f:
    json.dump(network_data, f)

In [12]:
import pandas as pd
import json

# Load the CSV file
movies_df = pd.read_csv('marvel_movies_processed.csv')

# Load the network JSON written by the metrics cell
with open('marvel_network_with_metrics.json', 'r') as f:
    network_data = json.load(f)

# Map each character name to the list of movies it appears in
character_movies = {}

for _, row in movies_df.iterrows():
    raw_characters = row['Characters']

    # A missing cast is read by pandas as NaN (a float), which has no
    # .split(); skip such rows instead of raising AttributeError.
    if not isinstance(raw_characters, str):
        continue

    movie_name = row['Movie Name']
    release_date = row['Release Date']

    # Tokenise exactly like the cell that produced the node ids
    # (split on ',' then strip). Splitting on ', ' leaves stray whitespace
    # or merged names whenever the separator is not exactly comma + space,
    # and those names would never match a node id below.
    # dict.fromkeys drops repeated names so a movie is listed once per
    # character.
    characters = dict.fromkeys(
        name.strip() for name in raw_characters.split(',')
    )

    for character in characters:
        character_movies.setdefault(character, []).append({
            'movie_name': movie_name,
            'release_date': release_date
        })

# Enrich the network data with movie appearances; characters without any
# recorded movie get an empty list
for node in network_data['nodes']:
    character = node['id']
    node['movies'] = character_movies.get(character, [])

# Save the enriched JSON data
with open('marvel_network_with_metrics_characters.json', 'w') as f:
    json.dump(network_data, f, indent=4)


In [14]:
import pandas as pd
import json

# Load the CSV file
movies_df = pd.read_csv('marvel_movies_processed.csv')

# Load the network JSON written by the metrics cell
with open('marvel_network_with_metrics.json', 'r') as f:
    network_data = json.load(f)

# character name -> list of movies it appears in
character_movies = {}
# every character name seen
characters_set = set()
# parsed cast of every usable row, reused for the co-occurrence pass so both
# passes see exactly the same names
movie_casts = []

for _, row in movies_df.iterrows():
    raw_characters = row['Characters']

    # A missing cast is read by pandas as NaN (a float), which has no
    # .split(); skip such rows instead of raising AttributeError.
    if not isinstance(raw_characters, str):
        continue

    movie_name = row['Movie Name']
    release_date = row['Release Date']

    # Tokenise exactly like the cell that produced the node ids
    # (split on ',' then strip) so names match the ids looked up below.
    # dict.fromkeys drops repeated names while keeping order, so a repeated
    # name is counted once per movie and never pairs with itself.
    characters = list(dict.fromkeys(
        name.strip() for name in raw_characters.split(',')
    ))

    movie_casts.append(characters)
    characters_set.update(characters)
    for character in characters:
        character_movies.setdefault(character, []).append({
            'movie_name': movie_name,
            'release_date': release_date
        })

# Square co-occurrence table. The names are sorted so that the row/column
# order, and therefore tie-breaking in nlargest below, is the same on every
# run (set iteration order is not).
characters_list = sorted(characters_set)
correlation_matrix = pd.DataFrame(0, index=characters_list, columns=characters_list)

# Count, symmetrically, how many casts contain each pair of characters
for characters in movie_casts:
    for i in range(len(characters)):
        for j in range(i + 1, len(characters)):
            correlation_matrix.at[characters[i], characters[j]] += 1
            correlation_matrix.at[characters[j], characters[i]] += 1

# Column-wise correlation of the co-occurrence profiles; undefined values
# (e.g. constant columns) become 0
correlation_matrix = correlation_matrix.corr().fillna(0)

# Enrich the network data with movie appearances and correlations
for node in network_data['nodes']:
    character = node['id']
    node['movies'] = character_movies.get(character, [])
    if character in correlation_matrix.index:
        # Remove the character itself explicitly before ranking. Taking
        # nlargest(4) and dropping the first entry assumes the character is
        # always ranked first, which fails on ties at 1.0 and when its own
        # self-correlation was filled with 0.
        top_correlations = (
            correlation_matrix[character]
            .drop(labels=character)
            .nlargest(3)
        )
        node['top_correlations'] = {
            other: float(score)
            for other, score in top_correlations.items()
        }

# Save the enriched JSON data
with open('marvel_network_with_metrics_correlation.json', 'w') as f:
    json.dump(network_data, f, indent=4)
