<a href="https://colab.research.google.com/github/Pavun-KumarCH/NLP---Network-Analytics---Witcher---Characters---relationships/blob/main/NPL_Network_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries


In [None]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
%matplotlib inline
import matplotlib.pyplot as plt
import os


In [None]:
# One-time setup: uncomment and run this line to download the "en_core_web_sm"
# spaCy model that is loaded further down in this notebook.
#!python3 -m spacy download en_core_web_sm

In [None]:
# spaCy's small English pipeline; the loaded pipeline object is kept in `NER`
# and is used for named-entity recognition on the book text.
model_dir = 'en_core_web_sm'
NER = spacy.load(model_dir)

In [None]:
# Shell call: run spaCy's `validate` command to check the installed pipeline packages.
!python -m spacy validate



In [None]:
# Show information about the current spaCy installation.
spacy.info()

## Load the Books

In [None]:
import os

# Project root that contains the `data/` folder and the characters CSV.
# The location can be overridden through the WITCHER_PROJECT_DIR environment
# variable, so the notebook is not tied to one user's home directory.
project_dir = os.environ.get('WITCHER_PROJECT_DIR',
                             '/Users/pavankumar/Projects/Witcher_project/')

if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    # Keep the current working directory instead of failing with
    # FileNotFoundError on machines where the default path does not exist.
    print(f"Project directory {project_dir!r} not found; staying in {os.getcwd()!r}")

# Last expression of the cell: show the directory the notebook now works in.
os.getcwd()

In [None]:
# Collect every plain-text book file in the `data` directory.
# - `endswith('.txt')` checks the extension only; a substring test would also
#   accept names such as `notes.txt.bak`.
# - `os.scandir` yields entries in arbitrary order, so the entries are sorted by
#   file name to make positional access such as `all_books[1]` reproducible.
# - The `with` block closes the directory iterator deterministically.
with os.scandir('data') as entries:
    all_books = sorted(
        (entry for entry in entries
         if entry.is_file() and entry.name.endswith('.txt')),
        key = lambda entry: entry.name,
    )

In [None]:
# Display the directory entries that were collected for the books.
all_books

In [None]:
# Pick the second entry of the book listing and run the spaCy pipeline over its text.
book = all_books[1]

# A context manager closes the file handle as soon as the text is read, and the
# encoding is stated explicitly instead of depending on the platform default.
# NOTE(review): assumes the book files are UTF-8 encoded - confirm.
with open(book, encoding = 'utf-8') as book_file:
    book_text = book_file.read()

book_doc = NER(book_text)

## Visualize Identified Entities

In [None]:
# Render the entities of the first 2000 tokens of the parsed book inline.
displacy.render(
    book_doc[:2000],
    style = 'ent',
    jupyter = True,
)

In [None]:
# Read the character table from its CSV file and display it.
characters = pd.read_csv('Characters data.csv')
characters

## Using a Regular Expression to remove the unwanted (-text-) in brackets


In [None]:
import re

# Drop any parenthesised annotation such as "(-text-)" from the character names.
# A raw string is used so that `\(` and `\)` reach the regex engine unchanged;
# in a normal string literal they are invalid escape sequences and trigger a
# warning on recent Python versions. The whitespace left behind by the removal
# is stripped so that names can be compared exactly later on.
characters['Character'] = characters['Character'].apply(
    lambda name : re.sub(r"[\(].*?[\)]", "", name).strip()
)

# The text before the first space is stored as the character's first name.
characters['First Name'] = characters['Character'].apply(
    lambda name : name.split(" ", 1)[0]
)

In [None]:
# Remove pandas' row-display limit so the whole characters table is shown below.
pd.set_option('display.max_rows', None)
characters

### Get the Named Entity list per sentence

In [None]:
# One record per sentence of the parsed book: the sentence span itself plus the
# text of every named entity found inside that sentence.
sentence_entity = [
    {'Sentence' : sentence, 'Entities' : [entity.text for entity in sentence.ents]}
    for sentence in book_doc.sents
]

sentence_entity_df = pd.DataFrame(sentence_entity)

In [None]:
# Display the per-sentence entity table.
sentence_entity_df

In [None]:
# Function to filter out non-character entities
def filter_entity(ent_list, character_df):
  """Keep only the entities that are known character names.

  Args:
    ent_list: iterable of entity values (typically strings) to check.
    character_df: DataFrame with a 'Character' column and a 'First Name' column.

  Returns:
    A list with the items of `ent_list`, in their original order, that are
    equal to a value in either of the two columns.
  """
  # Build the lookup once per call instead of converting both columns to lists
  # for every single entity; set membership keeps each check cheap.
  known_names = set(character_df['Character']) | set(character_df['First Name'])
  return [ent for ent in ent_list if ent in known_names]

### Example
Here the function only keeps the entries that are present in the characters data; everything else is removed.

In [None]:
# Example: entries that do not appear in the characters table are dropped from the result.
filter_entity(['Adela', 'the', 4], characters)

In [None]:
# For every sentence keep only the entities that are known characters,
# then preview the first 100 rows of the table.
sentence_entity_df['Character Entites'] = sentence_entity_df['Entities'].apply(
    filter_entity, character_df = characters
)
sentence_entity_df.head(100)

# Filter out the sentences that don't have any character entity

In [None]:
# Keep only the sentences that mention at least one known character.
# `.copy()` makes the result an independent DataFrame, so assigning a column on
# it afterwards does not trigger pandas' SettingWithCopyWarning.
has_character = sentence_entity_df['Character Entites'].map(len) > 0
sentence_entity_filtered_df = sentence_entity_df[has_character].copy()
sentence_entity_filtered_df

In [None]:
# Take only first names of characters.
# `assign` builds a new DataFrame instead of writing into what may be a slice of
# another frame, which avoids pandas' SettingWithCopyWarning.
sentence_entity_filtered_df = sentence_entity_filtered_df.assign(**{
    'Character Entites': sentence_entity_filtered_df['Character Entites'].apply(
        lambda names : [name.split()[0] for name in names]
    )
})

In [None]:
# Reset the pandas options matching '^display.' to their defaults, then show the frame.
pd.reset_option('^display.', silent = True)
sentence_entity_filtered_df

# Create Relationships

In [None]:
window_size = 5
relationships = []

# Nothing to relate when no sentence mentions a character; the guard also avoids
# an IndexError from `index[-1]` on an empty frame.
if not sentence_entity_filtered_df.empty:
  last_index = sentence_entity_filtered_df.index[-1]

  for i in range(last_index):
    # Label-based window over the sentence index: rows labelled i .. i + window_size
    # (inclusive), clipped to the last row of the filtered frame.
    end_i = min(i + window_size, last_index)
    window = sentence_entity_filtered_df.loc[i: end_i, 'Character Entites']

    # Flatten the per-sentence name lists of the window into one list.
    char_list = [name for names in window for name in names]

    # Remove duplicate characters next to each other
    char_unique = [name for pos, name in enumerate(char_list)
                   if pos == 0 or name != char_list[pos - 1]]

    # Each pair of neighbouring names in the window becomes one relationship record.
    for a, b in zip(char_unique, char_unique[1:]):
      relationships.append({'source' : a, 'target' : b})



In [None]:
# Turn the collected source/target records into a DataFrame.
relationships_df = pd.DataFrame(relationships)
# Lift the row-display limit so the frame below is shown without truncation.
pd.set_option('^display.max_rows',None)
relationships_df

# Let's sort the source and target entities properly

In [None]:
pd.set_option('^display.max_rows',None)
# Preview only (the result is not stored): np.sort orders the values along the
# last axis, i.e. within each row of the array.
np.sort(relationships_df.values)

In [None]:
# Sort the two names inside every row so that a -> b and b -> a are stored as
# the same (source, target) pair.
sorted_pairs = np.sort(relationships_df.values, axis = 1)
relationships_df = pd.DataFrame(sorted_pairs, columns = relationships_df.columns)
relationships_df

In [None]:
# Give every row a weight of 1, then add the weights up per (source, target)
# pair; `sort = False` keeps the pairs in order of first appearance.
relationships_df = (
    relationships_df
    .assign(value = 1)
    .groupby(['source', 'target'], sort = False, as_index = False)
    .sum()
)
relationships_df

# Graph analysis and Visualization


In [None]:
# Build an undirected graph from the edge list; each edge carries the
# aggregated count in its `value` attribute.
G = nx.from_pandas_edgelist(
    relationships_df,
    source = 'source',
    target = 'target',
    edge_attr = 'value',
    create_using = nx.Graph(),
)

#### Graph Visualization - NetworkX

In [None]:
# Draw the network on a 15 x 15 canvas using the Kamada-Kawai layout.
plt.figure(figsize = (15, 15))
pos = nx.kamada_kawai_layout(G)
nx.draw(
    G,
    pos = pos,
    with_labels = True,
    node_color = 'skyblue',
    edge_cmap = plt.cm.Blues,
)

In [None]:
from pyvis.network import Network
from IPython.display import HTML

# Interactive pyvis view of the character graph.
net = Network(notebook=True, width='1920px', height='1080px', bgcolor='#222222', font_color='orange', cdn_resources='remote')
# Map every node to its degree (number of neighbours in G).
node_degree = dict(G.degree)
# setting up node size attribute
# (must happen before `from_nx`, which reads the node attributes of G)
nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
# NOTE(review): presumably writes the visualisation to "witcher.html" - confirm against the pyvis docs.
net.show("witcher.html")

# Display the HTML content in the notebook
HTML(net.html)


## Most Important Characters in The Witcher

In [None]:
# Degree centrality of every node in G, as a dict keyed by node.
degree_dict = nx.degree_centrality(G)
degree_dict

In [None]:
# One row per character with its degree centrality.
degree_df = pd.DataFrame.from_dict(degree_dict, orient = 'index', columns = ['centrality'])
# plot for top 10 (`head(10)`; a `[0:9]` slice would stop after nine rows)
degree_df.sort_values('centrality', ascending = False).head(10).plot(kind = 'bar')

In [None]:
# Betweenness Centrality
betweeness_dict = nx.betweenness_centrality(G)
betweeness_df = pd.DataFrame.from_dict(betweeness_dict, orient = 'index', columns=['centrality'])
# plot for top 10 (`head(10)`; a `[0:9]` slice would stop after nine rows)
betweeness_df.sort_values('centrality', ascending = False).head(10).plot(kind = 'bar')

In [None]:
# Closeness Centrality
closeness_dict = nx.closeness_centrality(G)
closeness_df = pd.DataFrame.from_dict(closeness_dict, orient = 'index', columns = ['centrality'])
# plot for top 10 (`head(10)`; a `[0:9]` slice would stop after nine rows)
closeness_df.sort_values('centrality', ascending = False).head(10).plot(kind = 'bar')

In [None]:
# Store each centrality measure on the graph as a node attribute.
for attribute_name, centrality_by_node in (
    ('Degree Centrality', degree_dict),
    ('Betweeness Centrality', betweeness_dict),
    ('Closeness Centrality', closeness_dict),
):
  nx.set_node_attributes(G, centrality_by_node, attribute_name)

# Community Detection

In [None]:
import community as community_louvain

# Louvain community detection on the character graph.
# - `weight = 'value'`: the edges of G keep their aggregated count under
#   'value'; the default weight key ('weight') does not hold those counts, so
#   without this argument the counts would not influence the partition.
# - `random_state`: the algorithm is randomised, so a fixed seed is passed to
#   get the same communities on every run.
communities = community_louvain.best_partition(G, weight = 'value', random_state = 42)

# Attach the community id of every node as its 'group' attribute before the
# graph is handed to pyvis.
nx.set_node_attributes(G, communities, 'group')
com_net = Network(notebook = True, width = '1920px', height = '1080px', bgcolor = '#222222', font_color = 'white', cdn_resources = 'remote')
com_net.from_nx(G)
com_net.show("witcher_communities.html")

# Display the HTML content in the notebook
HTML(com_net.html)
