<a href="https://colab.research.google.com/github/Only-Mike/M2-NLP-Network-Analysis/blob/main/NLP_SC_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook for network analysis and natural language processing

---


This notebook was created by Kasper R. Haurum, Mike Christensen, Rayian Alam and Snorre K. Brouer.

# Import dataset as df

This NEEDS to be run first before going further into the assignment

In [None]:
import pandas as pd

In [None]:
# Importing dataset

In [None]:
!git clone https://github.com/Only-Mike/M2-NLP-Network-Analysis.git

In [None]:
data1 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2017 csv file.csv')
data2 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2018 csv file.csv')
data3 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2019 csv file.csv')
data4 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2020 csv file.csv')
data5 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2021 csv file.csv')


In [None]:
frames = [data1, data2, data3, data4, data5] #creating frame for all datasets

In [None]:
df = pd.concat(frames) #Concat all datasets to "df"

# Topic modelling

---


In [None]:
#Installing preprocessor to celan our text
!pip install tweet-preprocessor -q

# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis

# explainability (why did the model say it's related to this author)
!pip install eli5

In [None]:
import numpy as np
import tqdm #progress bar
import preprocessor as prepro # text prepro

import spacy #spacy for quick language prepro
nlp = spacy.load('en_core_web_sm') #instantiating English module

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model
from sklearn.metrics import classification_report #that's self explanatory
from sklearn.decomposition import TruncatedSVD #dimensionality reduction
from xgboost import XGBClassifier

import altair as alt #viz

#explainability
import eli5
from eli5.lime import TextExplainer

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

%matplotlib inline
pyLDAvis.enable_notebook()

In [None]:
# prepro settings
prepro.set_options(prepro.OPT.URL, prepro.OPT.NUMBER, prepro.OPT.RESERVED, prepro.OPT.MENTION, prepro.OPT.SMILEY)

In [None]:
df.columns

## Cleaning text and embedding the data

In [None]:
#Choosing the columns we need to work with and storing them in the value "data". This is for saving the "df" dataframe for the prediction model later on.
data = df[['Authors', 'Author(s) ID','Title', 'Abstract','Year', 'Source title']]

In [None]:
#Take a random sample of 1500 papers. This is for making the model run faster.
#A fixed random_state makes the same papers get drawn on every run, so the topics below can be reproduced.
data = data.sample(n=1500, random_state=42)

In [None]:
#Making a new column called "text" from the abstract column
data['text'] = data['Abstract']

In [None]:
#Cleaning the text
data['text_clean'] = data['text'].map(lambda t: prepro.clean(t))


In [None]:
# Lemmatise the pre-cleaned abstracts with spaCy (heavy pipeline parts disabled)
# while showing a progress bar.

clean_text = []

# tqdm wraps the document stream: it advances once per document and closes the
# bar when the stream is exhausted (the manually updated bar was never closed).
pbar = tqdm.tqdm(nlp.pipe(data['text_clean'], disable=["tagger", "parser", "ner"]),
                 total=len(data['text_clean']), position=0, leave=True)

for text in pbar:

  # keep lowercased lemmas of alphabetic tokens that are not stop words
  txt = [token.lemma_.lower() for token in text
         if token.is_alpha
         and not token.is_stop
         and not token.is_punct]

  clean_text.append(" ".join(txt))

In [None]:
# write everything into a single function for simplicity later on
def text_prepro(texts):
  """
  Clean and lemmatise a pandas Series of raw texts.

  Each text is passed through prepro.clean (with whatever options were set via
  prepro.set_options) and then through the spaCy pipeline with the tagger,
  parser and NER components disabled. Only alphabetic tokens that are neither
  stop words nor punctuation are kept; their lemmas are lowercased and joined
  with single spaces.

  texts: pandas Series of strings (1 column of a DF)
  returns: list with one cleaned string per input text
  """
  texts_clean = texts.map(lambda t: prepro.clean(t))

  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  clean_container = []

  # Let tqdm drive the iteration: it advances once per document and closes the
  # bar when the stream is exhausted (the manually updated bar was never closed).
  for doc in tqdm.tqdm(docs, total=len(texts_clean), position=0, leave=True):
    kept = [token.lemma_.lower() for token in doc
            if token.is_alpha
            and not token.is_stop
            and not token.is_punct]
    clean_container.append(" ".join(kept))

  return clean_container

In [None]:
data['text_clean'] = text_prepro(data['text'])

In [None]:
# Tokenise the cleaned abstracts: for every document keep the lowercased lemmas
# of nouns, proper nouns, adjectives and adverbs, skipping stop words and punctuation.
tokens = [
    [token.lemma_.lower() for token in summary
     if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV']
     and not token.is_stop
     and not token.is_punct]
    for summary in nlp.pipe(data['text_clean'], disable=["ner"])
]

In [None]:
data['tokens'] = tokens

In [None]:
# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(data['tokens'])

# filter out low-frequency / high-frequency stuff, also limit the vocabulary to max 600 words
dictionary.filter_extremes(no_below=4, no_above=0.4, keep_n=600)

# construct corpus using this dictionary
corpus = [dictionary.doc2bow(doc) for doc in data['tokens']]

## Visualization

In [None]:
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=13, workers = 4, passes=10)

In [None]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

In [None]:
pyLDAvis.display(lda_display)

## Model Metrics

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import LdaModel, CoherenceModel
from gensim import corpora

In [None]:
corpus = data['tokens']

In [None]:
corpus

In [None]:
# Dictionary and bag-of-words corpus built from the token lists
dirichlet_dict = corpora.Dictionary(corpus)
bow_corpus = [dirichlet_dict.doc2bow(text) for text in corpus]

# Candidate topic counts 1..15; the cells below evaluate num_topics[:-1],
# so the last model only serves as the comparison partner of the one before it
num_topics = list(range(1, 16))
num_keywords = 15

LDA_models = {}
LDA_topics = {}
for n_topics in num_topics:
    model = LdaModel(
        corpus=bow_corpus,
        id2word=dirichlet_dict,
        num_topics=n_topics,
        update_every=1,
        chunksize=len(bow_corpus),
        passes=20,
        alpha='auto',
        random_state=42,
    )
    LDA_models[n_topics] = model

    # formatted=False yields (topic_id, [(word, weight), ...]) pairs
    shown_topics = model.show_topics(num_topics=n_topics,
                                     num_words=num_keywords,
                                     formatted=False)
    # keep only the words of each topic
    LDA_topics[n_topics] = [[term for term, _ in terms] for _, terms in shown_topics]

In [None]:
def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = |A ∩ B| / |A ∪ B|
    - Goal is low Jaccard scores for coverage of the diverse elements

    topic_1, topic_2: iterables of hashable items (here: topic keywords);
        duplicates are ignored because both are converted to sets
    returns: float in [0, 1]; 0.0 when both topics are empty, since there is
        no shared keyword to measure (previously a ZeroDivisionError)
    """
    set_1, set_2 = set(topic_1), set(topic_2)
    union = set_1 | set_2

    # Guard the empty/empty case: the ratio is undefined, report "no overlap"
    if not union:
        return 0.0

    return float(len(set_1 & set_2)) / float(len(union))

In [None]:
# For each pair of consecutive topic counts (k, k+1) build the matrix of Jaccard
# similarities between every topic of model k and every topic of model k+1.
LDA_stability = {}
for current_k, next_k in zip(num_topics[:-1], num_topics[1:]):
    LDA_stability[current_k] = [
        [jaccard_similarity(topic1, topic2) for topic2 in LDA_topics[next_k]]
        for topic1 in LDA_topics[current_k]
    ]

# Average overlap per topic count (the last count has no successor to compare with)
mean_stabilities = [np.array(LDA_stability[k]).mean() for k in num_topics[:-1]]

In [None]:
# c_v coherence of every candidate model except the last one, in num_topics order
coherences = []
for n_topics in num_topics[:-1]:
    coherence_model = CoherenceModel(model=LDA_models[n_topics],
                                     texts=corpus,
                                     dictionary=dirichlet_dict,
                                     coherence='c_v')
    coherences.append(coherence_model.get_coherence())

In [None]:
# Score every candidate topic count as coherence minus average topic overlap.
# The two lists are paired directly: both are built over num_topics[:-1], so
# indexing them with range(num_keywords) only worked while num_keywords
# happened to equal len(num_topics).
coh_sta_diffs = [coh - sta for coh, sta in zip(coherences, mean_stabilities)]
coh_sta_max = max(coh_sta_diffs)
coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]
ideal_topic_num_index = coh_sta_max_idxs[0] # choose less topics in case there's more than one max
ideal_topic_num = num_topics[ideal_topic_num_index]

In [None]:
# Plot both metrics against the number of topics and mark the chosen topic count
fig, ax = plt.subplots(figsize=(20, 10))
topic_counts = num_topics[:-1]
sns.lineplot(x=topic_counts, y=mean_stabilities, label='Average Topic Overlap', ax=ax)
sns.lineplot(x=topic_counts, y=coherences, label='Topic Coherence', ax=ax)

# vertical line at the chosen topic count, grey band one topic to either side
ax.axvline(x=ideal_topic_num, label='Ideal Number of Topics', color='black')
ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

# leave 10% headroom above the highest metric value
metric_peak = max(max(mean_stabilities), max(coherences))
y_max = metric_peak + (0.10 * metric_peak)
ax.set_ylim([0, y_max])
ax.set_xlim([1, num_topics[-1]-1])

ax.set_title('Model Metrics per Number of Topics', fontsize=25)
ax.set_ylabel('Metric Level', fontsize=20)
ax.set_xlabel('Number of Topics', fontsize=20)
ax.legend(fontsize=20)
plt.show()

# Topics over time

---



## Topics 2017

In [None]:
#Choosing the columns we need to work with and storing them in the value "data". This is for saving the "df" dataframe for the prediction model later on.
data1 = data1[['Authors', 'Author(s) ID','Title', 'Abstract','Year', 'Source title']]

In [None]:
#Take a random sample of 500 papers. This is for making the model run faster.
#A fixed random_state makes the same papers get drawn on every run, so the 2017 topics can be reproduced.
data1 = data1.sample(n=500, random_state=42)

In [None]:
#Making a new column called "text" from the abstract column
data1['text'] = data1['Abstract']

In [None]:
#Cleaning the text
data1['text_clean'] = data1['text'].map(lambda t: prepro.clean(t))


In [None]:
# Lemmatise the pre-cleaned 2017 abstracts with spaCy (heavy pipeline parts
# disabled) while showing a progress bar.

clean_text = []

# tqdm wraps the document stream: it advances once per document and closes the
# bar when the stream is exhausted (the manually updated bar was never closed).
pbar = tqdm.tqdm(nlp.pipe(data1['text_clean'], disable=["tagger", "parser", "ner"]),
                 total=len(data1['text_clean']), position=0, leave=True)

for text in pbar:

  # keep lowercased lemmas of alphabetic tokens that are not stop words
  txt = [token.lemma_.lower() for token in text
         if token.is_alpha
         and not token.is_stop
         and not token.is_punct]

  clean_text.append(" ".join(txt))

In [None]:
# write everything into a single function for simplicity later on
def text_prepro(texts):
  """
  Clean and lemmatise a pandas Series of raw texts.

  Each text is passed through prepro.clean (with whatever options were set via
  prepro.set_options) and then through the spaCy pipeline with the tagger,
  parser and NER components disabled. Only alphabetic tokens that are neither
  stop words nor punctuation are kept; their lemmas are lowercased and joined
  with single spaces.

  Note: this repeats the text_prepro definition from the earlier "Cleaning
  text" section; running the cell simply rebinds the name.

  texts: pandas Series of strings (1 column of a DF)
  returns: list with one cleaned string per input text
  """
  texts_clean = texts.map(lambda t: prepro.clean(t))

  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  clean_container = []

  # Let tqdm drive the iteration: it advances once per document and closes the
  # bar when the stream is exhausted (the manually updated bar was never closed).
  for doc in tqdm.tqdm(docs, total=len(texts_clean), position=0, leave=True):
    kept = [token.lemma_.lower() for token in doc
            if token.is_alpha
            and not token.is_stop
            and not token.is_punct]
    clean_container.append(" ".join(kept))

  return clean_container

In [None]:
data1['text_clean'] = text_prepro(data1['text'])  # overwrite text_clean with the cleaned, lemmatised strings returned by text_prepro (one per abstract)

In [None]:
# Tokenise the cleaned 2017 abstracts (the Dictionary below needs token lists):
# for every document keep the lowercased lemmas of nouns, proper nouns,
# adjectives and adverbs, skipping stop words and punctuation.
tokens = [
    [token.lemma_.lower() for token in summary
     if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV']
     and not token.is_stop
     and not token.is_punct]
    for summary in nlp.pipe(data1['text_clean'], disable=["ner"])
]

In [None]:
data1['tokens'] = tokens

In [None]:

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(data1['tokens'])

# filter out low-frequency / high-frequency stuff, also limit the vocabulary to max 600 words
# (drop tokens found in fewer than 4 documents or in more than 40% of them)
dictionary.filter_extremes(no_below=4, no_above=0.4, keep_n=600)

# construct corpus using this dictionary (one bag-of-words per document)
corpus = [dictionary.doc2bow(doc) for doc in data1['tokens']]

### Visualization

In [None]:
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=13, workers = 4, passes=10)

In [None]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

In [None]:
pyLDAvis.display(lda_display)

## Topics 2018

In [None]:
#Choosing the columns we need to work with and storing them in the value "data". This is for saving the "df" dataframe for the prediction model later on.
data2 = data2[['Authors', 'Author(s) ID','Title', 'Abstract','Year', 'Source title']]

In [None]:
#Take a random sample of 500 papers. This is for making the model run faster.
#A fixed random_state makes the same papers get drawn on every run, so the 2018 topics can be reproduced.
data2 = data2.sample(n=500, random_state=42)

In [None]:
#Making a new column called "text" from the abstract column
data2['text'] = data2['Abstract']

In [None]:
#Cleaning the text
data2['text_clean'] = data2['text'].map(lambda t: prepro.clean(t))


In [None]:
# Lemmatise the pre-cleaned 2018 abstracts with spaCy (heavy pipeline parts
# disabled) while showing a progress bar.

clean_text = []

# tqdm wraps the document stream: it advances once per document and closes the
# bar when the stream is exhausted (the manually updated bar was never closed).
pbar = tqdm.tqdm(nlp.pipe(data2['text_clean'], disable=["tagger", "parser", "ner"]),
                 total=len(data2['text_clean']), position=0, leave=True)

for text in pbar:

  # keep lowercased lemmas of alphabetic tokens that are not stop words
  txt = [token.lemma_.lower() for token in text
         if token.is_alpha
         and not token.is_stop
         and not token.is_punct]

  clean_text.append(" ".join(txt))

In [None]:
# write everything into a single function for simplicity later on
def text_prepro(texts):
  """
  Clean and lemmatise a pandas Series of raw texts.

  Each text is passed through prepro.clean (with whatever options were set via
  prepro.set_options) and then through the spaCy pipeline with the tagger,
  parser and NER components disabled. Only alphabetic tokens that are neither
  stop words nor punctuation are kept; their lemmas are lowercased and joined
  with single spaces.

  Note: this repeats the text_prepro definition from the earlier sections;
  running the cell simply rebinds the name.

  texts: pandas Series of strings (1 column of a DF)
  returns: list with one cleaned string per input text
  """
  texts_clean = texts.map(lambda t: prepro.clean(t))

  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  clean_container = []

  # Let tqdm drive the iteration: it advances once per document and closes the
  # bar when the stream is exhausted (the manually updated bar was never closed).
  for doc in tqdm.tqdm(docs, total=len(texts_clean), position=0, leave=True):
    kept = [token.lemma_.lower() for token in doc
            if token.is_alpha
            and not token.is_stop
            and not token.is_punct]
    clean_container.append(" ".join(kept))

  return clean_container

In [None]:
data2['text_clean'] = text_prepro(data2['text'])

In [None]:
# preprocess texts
tokens = []

for summary in nlp.pipe(data2['text_clean'], disable=["ner"]):
  proj_tok = [token.lemma_.lower() for token in summary 
              if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV'] 
              and not token.is_stop
              and not token.is_punct] 
  tokens.append(proj_tok)

In [None]:
data2['tokens'] = tokens

In [None]:
# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(data2['tokens'])

# filter out low-frequency / high-frequency stuff, also limit the vocabulary to max 600 words
dictionary.filter_extremes(no_below=4, no_above=0.4, keep_n=600)

# construct corpus using this dictionary
corpus = [dictionary.doc2bow(doc) for doc in data2['tokens']]

### Visualization

In [None]:
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=13, workers = 4, passes=10)

In [None]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

In [None]:
pyLDAvis.display(lda_display)

## Topics 2019

In [None]:
#Choosing the columns we need to work with and storing them in the value "data". This is for saving the "df" dataframe for the prediction model later on.
data3 = data3[['Authors', 'Author(s) ID','Title', 'Abstract','Year', 'Source title']]

In [None]:
#Take a random sample of 500 papers. This is for making the model run faster.
#A fixed random_state makes the same papers get drawn on every run, so the 2019 topics can be reproduced.
data3 = data3.sample(n=500, random_state=42)

In [None]:
#Making a new column called "text" from the abstract column
data3['text'] = data3['Abstract']

In [None]:
#Cleaning the text
data3['text_clean'] = data3['text'].map(lambda t: prepro.clean(t))


In [None]:
# Lemmatise the pre-cleaned 2019 abstracts with spaCy (heavy pipeline parts
# disabled) while showing a progress bar.

clean_text = []

# tqdm wraps the document stream: it advances once per document and closes the
# bar when the stream is exhausted (the manually updated bar was never closed).
pbar = tqdm.tqdm(nlp.pipe(data3['text_clean'], disable=["tagger", "parser", "ner"]),
                 total=len(data3['text_clean']), position=0, leave=True)

for text in pbar:

  # keep lowercased lemmas of alphabetic tokens that are not stop words
  txt = [token.lemma_.lower() for token in text
         if token.is_alpha
         and not token.is_stop
         and not token.is_punct]

  clean_text.append(" ".join(txt))

In [None]:
# write everything into a single function for simplicity later on
def text_prepro(texts):
  """
  Clean and lemmatise a pandas Series of raw texts.

  Each text is passed through prepro.clean (with whatever options were set via
  prepro.set_options) and then through the spaCy pipeline with the tagger,
  parser and NER components disabled. Only alphabetic tokens that are neither
  stop words nor punctuation are kept; their lemmas are lowercased and joined
  with single spaces.

  Note: this repeats the text_prepro definition from the earlier sections;
  running the cell simply rebinds the name.

  texts: pandas Series of strings (1 column of a DF)
  returns: list with one cleaned string per input text
  """
  texts_clean = texts.map(lambda t: prepro.clean(t))

  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  clean_container = []

  # Let tqdm drive the iteration: it advances once per document and closes the
  # bar when the stream is exhausted (the manually updated bar was never closed).
  for doc in tqdm.tqdm(docs, total=len(texts_clean), position=0, leave=True):
    kept = [token.lemma_.lower() for token in doc
            if token.is_alpha
            and not token.is_stop
            and not token.is_punct]
    clean_container.append(" ".join(kept))

  return clean_container

In [None]:
data3['text_clean'] = text_prepro(data3['text'])

In [None]:
# preprocess texts
tokens = []

for summary in nlp.pipe(data3['text_clean'], disable=["ner"]):
  proj_tok = [token.lemma_.lower() for token in summary 
              if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV'] 
              and not token.is_stop
              and not token.is_punct] 
  tokens.append(proj_tok)

In [None]:
data3['tokens'] = tokens

In [None]:
# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(data3['tokens'])

# filter out low-frequency / high-frequency stuff, also limit the vocabulary to max 600 words
dictionary.filter_extremes(no_below=4, no_above=0.4, keep_n=600)

# construct corpus using this dictionary
corpus = [dictionary.doc2bow(doc) for doc in data3['tokens']]

### Visualization

In [None]:
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=13, workers = 4, passes=10)

In [None]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

In [None]:
pyLDAvis.display(lda_display)

## Topics 2020

In [None]:
#Choosing the columns we need to work with and storing them in the value "data". This is for saving the "df" dataframe for the prediction model later on.
data4 = data4[['Authors', 'Author(s) ID','Title', 'Abstract','Year', 'Source title']]

In [None]:
#Take a random sample of 500 papers. This is for making the model run faster.
#A fixed random_state makes the same papers get drawn on every run, so the 2020 topics can be reproduced.
data4 = data4.sample(n=500, random_state=42)

In [None]:
#Making a new column called "text" from the abstract column
data4['text'] = data4['Abstract']

In [None]:
#Cleaning the text
data4['text_clean'] = data4['text'].map(lambda t: prepro.clean(t))


In [None]:
# Lemmatise the pre-cleaned 2020 abstracts with spaCy (heavy pipeline parts
# disabled) while showing a progress bar.

clean_text = []

# tqdm wraps the document stream: it advances once per document and closes the
# bar when the stream is exhausted (the manually updated bar was never closed).
pbar = tqdm.tqdm(nlp.pipe(data4['text_clean'], disable=["tagger", "parser", "ner"]),
                 total=len(data4['text_clean']), position=0, leave=True)

for text in pbar:

  # keep lowercased lemmas of alphabetic tokens that are not stop words
  txt = [token.lemma_.lower() for token in text
         if token.is_alpha
         and not token.is_stop
         and not token.is_punct]

  clean_text.append(" ".join(txt))

In [None]:
# write everything into a single function for simplicity later on
def text_prepro(texts):
  """
  Clean and lemmatise a pandas Series of raw texts.

  Each text is passed through prepro.clean (with whatever options were set via
  prepro.set_options) and then through the spaCy pipeline with the tagger,
  parser and NER components disabled. Only alphabetic tokens that are neither
  stop words nor punctuation are kept; their lemmas are lowercased and joined
  with single spaces.

  Note: this repeats the text_prepro definition from the earlier sections;
  running the cell simply rebinds the name.

  texts: pandas Series of strings (1 column of a DF)
  returns: list with one cleaned string per input text
  """
  texts_clean = texts.map(lambda t: prepro.clean(t))

  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  clean_container = []

  # Let tqdm drive the iteration: it advances once per document and closes the
  # bar when the stream is exhausted (the manually updated bar was never closed).
  for doc in tqdm.tqdm(docs, total=len(texts_clean), position=0, leave=True):
    kept = [token.lemma_.lower() for token in doc
            if token.is_alpha
            and not token.is_stop
            and not token.is_punct]
    clean_container.append(" ".join(kept))

  return clean_container

In [None]:
data4['text_clean'] = text_prepro(data4['text'])

In [None]:
# preprocess texts
tokens = []

for summary in nlp.pipe(data4['text_clean'], disable=["ner"]):
  proj_tok = [token.lemma_.lower() for token in summary 
              if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV'] 
              and not token.is_stop
              and not token.is_punct] 
  tokens.append(proj_tok)

In [None]:
data4['tokens'] = tokens

In [None]:
# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(data4['tokens'])

# filter out low-frequency / high-frequency stuff, also limit the vocabulary to max 600 words
dictionary.filter_extremes(no_below=4, no_above=0.4, keep_n=600)

# construct corpus using this dictionary
corpus = [dictionary.doc2bow(doc) for doc in data4['tokens']]

### Visualization

In [None]:
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=13, workers = 4, passes=10)

In [None]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

In [None]:
pyLDAvis.display(lda_display)

## Topics 2021

In [None]:
#Choosing the columns we need to work with and storing them in the value "data". This is for saving the "df" dataframe for the prediction model later on.
data5 = data5[['Authors', 'Author(s) ID','Title', 'Abstract','Year', 'Source title']]

In [None]:
#Take a random sample of 500 papers. This is for making the model run faster.
#A fixed random_state makes the same papers get drawn on every run, so the 2021 topics can be reproduced.
data5 = data5.sample(n=500, random_state=42)

In [None]:
#Making a new column called "text" from the abstract column
data5['text'] = data5['Abstract']

In [None]:
#Cleaning the text
data5['text_clean'] = data5['text'].map(lambda t: prepro.clean(t))


In [None]:
# Lemmatise the pre-cleaned 2021 abstracts with spaCy (heavy pipeline parts
# disabled) while showing a progress bar.

clean_text = []

# tqdm wraps the document stream: it advances once per document and closes the
# bar when the stream is exhausted (the manually updated bar was never closed).
pbar = tqdm.tqdm(nlp.pipe(data5['text_clean'], disable=["tagger", "parser", "ner"]),
                 total=len(data5['text_clean']), position=0, leave=True)

for text in pbar:

  # keep lowercased lemmas of alphabetic tokens that are not stop words
  txt = [token.lemma_.lower() for token in text
         if token.is_alpha
         and not token.is_stop
         and not token.is_punct]

  clean_text.append(" ".join(txt))

In [None]:
# write everything into a single function for simplicity later on
def text_prepro(texts):
  """
  Clean and lemmatise a pandas Series of raw texts.

  Each text is passed through prepro.clean (with whatever options were set via
  prepro.set_options) and then through the spaCy pipeline with the tagger,
  parser and NER components disabled. Only alphabetic tokens that are neither
  stop words nor punctuation are kept; their lemmas are lowercased and joined
  with single spaces.

  Note: this repeats the text_prepro definition from the earlier sections;
  running the cell simply rebinds the name.

  texts: pandas Series of strings (1 column of a DF)
  returns: list with one cleaned string per input text
  """
  texts_clean = texts.map(lambda t: prepro.clean(t))

  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  clean_container = []

  # Let tqdm drive the iteration: it advances once per document and closes the
  # bar when the stream is exhausted (the manually updated bar was never closed).
  for doc in tqdm.tqdm(docs, total=len(texts_clean), position=0, leave=True):
    kept = [token.lemma_.lower() for token in doc
            if token.is_alpha
            and not token.is_stop
            and not token.is_punct]
    clean_container.append(" ".join(kept))

  return clean_container

In [None]:
data5['text_clean'] = text_prepro(data5['text'])

In [None]:
# preprocess texts
tokens = []

for summary in nlp.pipe(data5['text_clean'], disable=["ner"]):
  proj_tok = [token.lemma_.lower() for token in summary 
              if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV'] 
              and not token.is_stop
              and not token.is_punct] 
  tokens.append(proj_tok)

In [None]:
data5['tokens'] = tokens

In [None]:
# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(data5['tokens'])

# filter out low-frequency / high-frequency stuff, also limit the vocabulary to max 600 words
dictionary.filter_extremes(no_below=4, no_above=0.4, keep_n=600)

# construct corpus using this dictionary
corpus = [dictionary.doc2bow(doc) for doc in data5['tokens']]

### Visualization

In [None]:
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=13, workers = 4, passes=10)

In [None]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

In [None]:
pyLDAvis.display(lda_display)

# Label Prediction

---




In [None]:
#Installing preprocessor to celan our text
!pip install tweet-preprocessor -q

# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis

# explainability & UMap
!pip install eli5
!pip install umap-learn -q

In [None]:
#The foundational imports
import pandas as pd
import numpy as np
import tqdm #progress bar
import preprocessor as prepro # text prepro
import matplotlib.pyplot as plt #For plotting using matplot
import seaborn as sns #Seaborn which is for visuals, etc
sns.set()

#IO import - interface
import os
os.chdir('..')

import spacy #spacy for prepro
nlp = spacy.load('en_core_web_sm') #instantiating English module for the spacy

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model
from sklearn.metrics import classification_report #that's self explanatory
from sklearn.decomposition import TruncatedSVD #dimensionality reduction
from xgboost import XGBClassifier

import altair as alt #viz

#explainability
import eli5
from eli5.lime import TextExplainer

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

%matplotlib inline
pyLDAvis.enable_notebook()

In [None]:
# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

In [None]:
# prepro settings
prepro.set_options(prepro.OPT.URL, prepro.OPT.NUMBER, prepro.OPT.RESERVED, prepro.OPT.MENTION, prepro.OPT.SMILEY)

In [None]:
# Importing dataset

In [None]:
data1 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2017 csv file.csv')
data2 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2018 csv file.csv')
data3 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2019 csv file.csv')
data4 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2020 csv file.csv')
data5 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2021 csv file.csv')


In [None]:
frames = [data1, data2, data3, data4, data5] #creating frame for all datasets

In [None]:
df = pd.concat(frames) #Concat all datasets to "df"

## EDA section

In [None]:
df.shape #Checking the shape of our dataaset, it looks okey-dokey doctor jones

In [None]:
df.columns #Printing the variables in the datasheet

In [None]:
#Choosing the columns we need to work with and storing them in the value "data". This is for saving the "df" dataframe for the prediction model later on.
#We select Author(s) ID, Author Keywords and the Abstract variables as we want to work with them. Please note that the Author(s) ID is not used.
data = df[['Author(s) ID','Author Keywords', 'Abstract']]

In [None]:
data.shape #Checking the shape of our reassigned dataframe, it is correct as we got two variables.

In [None]:
data.head() #Doing a quick head command to check the values on a snippet of the overall set

In [None]:
#We are going to drop every row that has a missing value (N/A) in any of the selected columns.
#Re-binding the result instead of using inplace=True avoids mutating a column slice of df in place
#(which pandas can flag with SettingWithCopyWarning) and keeps this cell safe to re-run.
data = data.dropna()
data.info()

In [None]:
#For this assignment we are going to take a sample of our overall dataframe, and setting it as 2000.
#This also helps when running CPU/Ram intensive programs as the lower sample size runs faster.
#A fixed random_state makes the same 2000 papers get drawn on every run, so the label counts quoted further down refer to a reproducible sample.
sample_list = data.sample(n=2000, random_state=42)

In [None]:
#We are going to set the variable "Author Keywords" as a string, then we are going to use the prepro to clean the variable itself and removing any hashtags, etc.
sample_list['Author Keywords']= sample_list['Author Keywords'].astype(str)
sample_list['Author Keywords']= sample_list['Author Keywords'].map(lambda t: prepro.clean(t))
sample_list['Author Keywords']= sample_list['Author Keywords'].str.replace('#','')

In [None]:
# prepro settings
prepro.set_options(prepro.OPT.URL, prepro.OPT.NUMBER, prepro.OPT.RESERVED, prepro.OPT.MENTION, prepro.OPT.SMILEY)

In [None]:
#Printing the sample list to check out if everything works, and it does as you can see the keywords are cleaned up.
sample_list.head()

In [None]:
#We want to validate the keywords are correctly processed by the prepro lib, so we create a new addition to our sample_list
#The new variable, "keywords_processed" have some additional elements to be removed, it wil also be lowercased for all letters in the lines.

# Load the regular expression library
import re
# Remove punctuation (comma, full stop, exclamation and question mark).
# The pattern is a raw string: '\.' inside a plain string literal is an invalid
# escape sequence that newer Python versions warn about, and '.' needs no
# escaping inside a character class.
sample_list['keywords_processed'] = \
sample_list['Author Keywords'].map(lambda x: re.sub(r'[,.!?]', '', x))
# Convert the keywords to lowercase
sample_list['keywords_processed'] = \
sample_list['keywords_processed'].map(lambda x: x.lower())
# Show the first rows of the processed keywords
sample_list['keywords_processed'].head()

In [None]:
sample_list

## Modelling

In [None]:
# We will now assign tokens to the keywords we just ran in the preprosition lib, and these tokens are used when we are going to vectorize, and classify them in a dictionary.
#The follow commands will have some functions disabled to spare computer processing power, as seen with ner being disabled.
#The words will be categorized into nouns, pronouns, adjectives, and adverbiums.

tokens = []

for summary in nlp.pipe(sample_list['keywords_processed'], disable=["ner"]):
  proj_tok = [token.lemma_.lower() for token in summary 
              if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV'] 
              and not token.is_stop
              and not token.is_punct] 
  tokens.append(proj_tok)

In [None]:
#We now run the tokens command on the sample list.
sample_list['tokens'] = tokens

In [None]:
#Vectorizer is used to covert data over to a matrix, and as we want to look at labeling using author keywords we will assign the function to do likewise.

vectorizer = TfidfVectorizer()

vectors = vectorizer.fit_transform(sample_list['Author Keywords'])

In [None]:
# Create a Dictionary from the articles: dictionary
dictionary2 = Dictionary(sample_list['tokens'])
# filter out low-frequency / high-frequency stuff, also limit the vocabulary to max 1000 words
dictionary2.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)

In [None]:
# construct corpus using this dictionary
corpus_tfidf = [dictionary2.doc2bow(doc) for doc in sample_list['tokens']]

In [None]:
#The following function is used for topic modelling, whereas we will use it for modelling through the author keywords. 
#This function is known as latent dirichlet allocation, LDA, that is aprobabilistic model that assumes each topic is a mixture over an underlying set of words.
#These words are the ones we find in author keywords.
lda_model = LdaMulticore(corpus_tfidf, id2word=dictionary2, num_topics=3, workers = 4, passes=10)

In [None]:
#We now will display the generated model using LDA by selecting the corpus, dictionary, and the processed LDA model.
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus_tfidf, dictionary2)

In [None]:
#And voila. We selected 3 topics upon the recommendation of the professors for this project, and we can see the following clusters below.
pyLDAvis.display(lda_display)

In [None]:
#To ensure we got correctly assigned rokens we will now run a quick check.
TokenCheck = sample_list['tokens']

In [None]:
#We got a sample size of 2000, and it seems to match the author keywords that we have used. Check please.
TokenCheck

In [None]:
#Since we have the LDA create clusters/topics based on the keywords processed we will now assign it to all the abstracts based on the similarity to the keywords they contain
transf_corpus = lda_model.get_document_topics(corpus_tfidf)

In [None]:
#But we will first of all produce a list of the produced labels, so we are going to create the following command to apply it for all items.
l=[lda_model.get_document_topics(item) for item in corpus_tfidf]

In [None]:
#And this is how the labels look like.
l

In [None]:
#So we need to create a applier for the labels ointo the dataframe, this is done using lambda. 
#In short, it is used for functions that is without a name
sorted([('abc', 121),('abc', 231),('abc', 148), ('abc',221)],

       key=lambda x: x[1], reverse=True)

In [None]:
sorted(l[1], key=lambda x: x[1], reverse=True)[0][0]

In [None]:
#Build the label list: for every document, order its (topic id, probability) pairs
#by probability, highest first, and keep the topic id of the first pair.
labels = [
    sorted(doc_topics, key=lambda pair: pair[1], reverse=True)[0][0]
    for doc_topics in l
]

In [None]:
labels = pd.DataFrame(labels)

In [None]:
#The labels are now stored in a DataFrame. As a quick test that the labelling works, we look at the first rows and the label group assigned to each of them.
labels.head()

In [None]:
#We are adding the labels to the overall sample list as a new column.
#The values are assigned by position: assigning the labels DataFrame directly would align on the index
#and produce NaN for every row of sample_list whose index is not in 0..n-1.
#NOTE(review): this assumes the labels are in the same order as the rows of sample_list - confirm against the cell that builds the corpus.
sample_list['labels'] = np.asarray(labels).ravel()

In [None]:
#And this is how it looks, the labels are now at the far-right of the sample list, which has catagorized the different abstracts based on the author keywords.
sample_list

In [None]:
#Now we are going to see how many entries that are in the three different label groups, and it seems to be a somewhat even spread.
#764 papers belongs to label group 2, 668 to label group 1, and 568 to label group 0.
sample_list['labels'].value_counts()

## SML part

Now that we have established the labels for the different entries in our sample_list, we want to see whether supervised machine learning can produce a model that predicts these labels from the text in the dataset.

In [None]:
#We test the labelling by printing the abstract of the last paper in the sample together with the label attached to it.
#Both values are read from sample_list so that they refer to the same paper.
print(sample_list.iloc[-1]['Abstract'])
print(sample_list.iloc[-1]['labels'])

In [None]:
# We define a text_prepro function for the supervised machine learning section.
# It is a separate definition so that the preprocessing used for the ML models is explicit in this part of the notebook.
def text_prepro(texts):
  """
  Clean and lemmatize a pandas Series of texts.

  Every text is passed through `prepro.clean`, '#' characters are removed,
  and the result is processed with the spaCy pipeline `nlp`. Only tokens that
  are alphabetic and not stopwords are kept; their lemmas are lowercased and
  joined with single spaces.

  Parameters
  ----------
  texts : pandas.Series
      One text per row (1 column of a DF).

  Returns
  -------
  list of str
      One cleaned string per input text, in the order of `texts`.
  """
  # '#' is removed as a literal character (regex=False makes that explicit).
  texts_clean = texts.map(prepro.clean).str.replace('#', '', regex=False)

  # NOTE(review): the tagger is disabled here; depending on the spaCy version the
  # lemmatizer may need it to produce proper lemmas - confirm on the installed version.
  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  clean_container = []

  # Wrapping the iterator lets tqdm close the progress bar once the loop is done;
  # a manually updated bar that is never closed stays open after the function returns.
  for doc in tqdm.tqdm(docs, total=len(texts_clean), position=0, leave=True):
    lemmas = [token.lemma_.lower() for token in doc
              if token.is_alpha
              and not token.is_stop
              and not token.is_punct]
    clean_container.append(" ".join(lemmas))

  return clean_container

In [None]:
#We reset the index of value counts of the label to see if everything is the same as usual, and the label amounts are indeed the same as previously shown.
sample_list.labels.value_counts().reset_index()

In [None]:
# To avoid an imbalance between the label groups we run an undersampler on the dataset.
# RandomUnderSampler is fitted on the sample_list, using its 'labels' column as the target; the seed is fixed to 42.
rus = RandomUnderSampler(random_state=42)
data_df_res, y_res = rus.fit_resample(sample_list, sample_list['labels'])

In [None]:
# Splitting the balanced (undersampled) dataset into the Training set and Test set.
# The selected variables are the preprocessed keywords as features and the labels we constructed earlier as target.
# The split uses data_df_res / y_res from the undersampler; splitting sample_list again would ignore the balancing step.
X_train, X_test, y_train, y_test = train_test_split(data_df_res['keywords_processed'], y_res, test_size = 0.4, stratify=y_res, random_state = 42)

In [None]:
#instantiate models and "bundle up as pipeline"

tfidf = TfidfVectorizer()
cls = LogisticRegression()

pipe = make_pipeline(tfidf, cls)

In [None]:
pipe.fit(X_train,y_train) # fit model

In [None]:
# evaluate model performance on training set

y_eval = pipe.predict(X_train)
report = classification_report(y_train, y_eval)
print(report)

The report above evaluates the model on the same data it was trained on. On this training set it performs adequately, with scores of around 80-95%. Note that this only shows how well the model fits data it has already seen.

In [None]:
# overall weights for the label groups, positive and negative weights seen.
eli5.show_weights(pipe, top=20, target_names=[0, 1, 2])

In [None]:
y_pred = pipe.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

As we can see, the model is far less accurate on the test set than on the training set: it reaches only around 30% accuracy, which for three label groups of similar size is close to guessing.

## Matrix assignment and distance

We will now attempt to link the labels, which were created from the author keywords, to the abstracts of the papers. In other words, the abstracts will be categorized with these labels based on the similarity of the words inside the abstracts.

To do this we will first assign an ID to each abstract and to each set of processed author keywords, and add both IDs to the sample_list. This allows us to look up the similarity between the two by ID and to see which topics are adjacent based on distance.

In [None]:
 from sklearn.preprocessing import LabelEncoder
 import scipy.sparse as ss

In [None]:
#We are going to use labelencoder to assign a label to abstract and keywords for making IDs to catagorization purposes.
#The following two lines below this one that shows that.
le_abstract = LabelEncoder()
le_keywords = LabelEncoder()

In [None]:
sample_list['Abstract_id'] = le_abstract.fit_transform(sample_list['Abstract'])

In [None]:
sample_list['keywords_processed_id'] = le_keywords.fit_transform(sample_list['keywords_processed'])

In [None]:
#np.ones is a NumPy function that returns a new array of the given shape and type, filled with ones.
#We create one entry per row of sample_list; these are the values that go into the matrix.
ones = np.ones(len(sample_list), np.uint32)

In [None]:
#The sparse matrix has one row per abstract ID and one column per keywords ID.
#Every row of sample_list contributes a 1 at the position (Abstract_id, keywords_processed_id).
matrix = ss.coo_matrix((ones, (sample_list['Abstract_id'], sample_list['keywords_processed_id'])))

In [None]:
#And now we will dense it so we can display it.
matrix.todense()

In [None]:
#We are now printing the sample_list so we can see the addition of the assigned IDs, which can be seen at the far_right. 
sample_list

In [None]:
#To validate this, we are going to print out the matrix when the value of 1 is equals to 1. As seen when abstract ID is equal to 1, it returns the abstract ID of 231.
# Next line will show that in a more elaborate fashion.
np.where(matrix.todense()[1] == 1)

In [None]:
#As seen here, the row with abstract ID 1 has the keywords preprocessed ID 231.
#The 1 in the matrix therefore marks that this abstract and these keywords come from the same row of sample_list; it is not a similarity score.
sample_list[sample_list['Abstract_id'] == 1]

In [None]:
#Same is done in reversed order, and the result is the same.
sample_list[sample_list['keywords_processed_id'] == 231]

In [None]:
#We are now going to implemented Truncated, this is a dimensionality reduction method using truncated SVD (aka LSA).
from sklearn.decomposition import TruncatedSVD

In [None]:
#We are setting up a command function with the parameters of three components (labels), and the random state as always being 42.
svd = TruncatedSVD(n_components=3, n_iter=7, random_state=42)

In [None]:
#We now reduce the matrix to three components with the truncated SVD.
#NOTE(review): the rows of `matrix` are indexed by Abstract_id, so this call yields one 3-dimensional vector per abstract ID, even though the variable is called matrix_keywords.
matrix_keywords = svd.fit_transform(matrix)

In [None]:
#NOTE(review): the rows of `matrix.T` are indexed by keywords_processed_id, so this call yields one vector per keywords ID, even though the variable is called matrix_abstract.
#fit_transform also refits `svd` on the transposed matrix.
matrix_abstract = svd.fit_transform(matrix.T)

In [None]:
#To see if it works, we will now print matrix_keywords to check if it is working accordingly.
matrix_keywords

In [None]:
#As we got the sample size being 2000, it does indeed look like it functions as intended.
matrix

In [None]:
# Another way to also find the distances between the matrix variables is using cosine distances, we will also do the same.
from sklearn.metrics.pairwise import cosine_distances

In [None]:
cosine_distance_matrix_keywords = cosine_distances(matrix_keywords)

In [None]:
#Again, it seems to match accordingly to the sample size that we are running in the project.
cosine_distance_matrix_keywords.shape

In [None]:
#This function looks up the abstracts that are closest to a given abstract in the cosine distance matrix.
def similar_abstract(abstract, n):
  """
  Return the n abstracts with the smallest cosine distance to `abstract`.

  cosine_distance_matrix_keywords is computed from svd.fit_transform(matrix),
  and the rows of `matrix` are indexed by Abstract_id. Both the row lookup and
  the translation of the result back to text therefore go through le_abstract
  (looking the row up with le_keywords would select an unrelated row).

  Parameters
  ----------
  abstract : str
      An abstract that le_abstract was fitted on.
  n : int
      Number of abstracts to return. The query abstract has distance 0 to
      itself, so it is normally part of the result.

  Returns
  -------
  numpy.ndarray
      The n closest abstracts, nearest first.
  """
  ix = le_abstract.transform([abstract])[0]
  nearest_ids = np.argsort(cosine_distance_matrix_keywords[ix, :])[:n]
  return le_abstract.inverse_transform(nearest_ids)

In [None]:
#If we look at the keywords starting at 858, we can see the following similarties to other keywords being 388, 1941, 1384, and 1658.
np.argsort(cosine_distance_matrix_keywords[858,:])[:5]

In [None]:
#Now we are printing out 5 different abstracts that have this similarity to the keywords, it is abit heavy but nonetheless shows the following result.
le_abstract.inverse_transform(np.argsort(cosine_distance_matrix_keywords[858,:])[:5])

In [None]:
#The final test is checking what abstract ID is similar when we set the keywords ID in the matrix to 7, in this case it would be abstract ID 30.
sample_list[sample_list.keywords_processed_id == 7]

# Network analysis

---

In this section you'll see our network analysis. This section contains:

*   A bipartite author - paper network and the adjacency matrix derived from it.
*   A centrality and community analysis of the university - paper network, to illustrate which universities have the highest output of papers.
 




## Bipartite network ▶ Author - Paper

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt


In [None]:
!git clone https://github.com/Only-Mike/M2-NLP-Network-Analysis.git

In [None]:
# Importing datasets

In [None]:
data1 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2017 csv file.csv')
data2 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2018 csv file.csv')
data3 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2019 csv file.csv')
data4 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2020 csv file.csv')
data5 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2021 csv file.csv')


In [None]:
frames = [data1, data2, data3, data4, data5] #creating frame for all datasets

In [None]:
df = pd.concat(frames) #Concat all datasets to "df"

In [None]:
#Reduce the sample to 250 papers.
#random_state is fixed so that the same papers, and therefore the same network, are drawn on every run.
df = df.sample(n = 250, random_state = 42)

In [None]:
df

In [None]:
#Remove papers without an author name.
#df was concatenated from five files without resetting the index, so index labels are not unique;
#dropping by index label would also remove other papers that share a label, hence the boolean mask.
df = df[df['Authors'] != '[No author name available]']

In [None]:
df.shape

In [None]:
df.columns

In [None]:
#Make an edgelist: one (EID, author) tuple for every author of every paper.
#The 'Authors' field is split on commas. Each name is stripped of surrounding whitespace,
#otherwise "Name" and " Name" would end up as two different author nodes; empty names are skipped.
edgelist = []
for _, paper in df.iterrows():
  authors = [name.strip() for name in paper['Authors'].split(',')]
  edgelist.extend((paper['EID'], author) for author in authors if author)

In [None]:
edgelist[3]  #<---- Show small sample of the data

In [None]:
len(edgelist)

In [None]:
import networkx as nx

In [None]:
from networkx.algorithms import bipartite

In [None]:
# The two node sets of the bipartite graph: authors (second element of every edge) and papers (first element).
c0 = {author for _, author in edgelist}
c1 = {paper for paper, _ in edgelist}

In [None]:
B = nx.Graph()

In [None]:
# add nodes and edges in their modes
B.add_nodes_from(c0, bipartite=0)
B.add_nodes_from(c1, bipartite=1)
B.add_edges_from(edgelist)

In [None]:
top_nodes = [n for n in B.nodes if B.nodes[n]['bipartite'] == 0]


In [None]:
G = bipartite.collaboration_weighted_projected_graph(B, top_nodes)

In [None]:
#This code removes nodes with a degree of 1 or less (i.e. at most one edge) from the projected graph G.
to_be_removed = [x for  x in G.nodes() if G.degree(x) <= 1]
G.remove_nodes_from(to_be_removed)

In [None]:
edges_df = nx.to_pandas_edgelist(B)
edges_df

In [None]:
# For visualization
!pip install -U bokeh
!pip install -q holoviews

In [None]:
# Import the libraries and link to the bokeh backend
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show

# Setting the default figure size a bit larger
defaults = dict(width=750, height=750, padding=0.1,
                xaxis=None, yaxis=None)
hv.opts.defaults(
    opts.EdgePaths(**defaults), opts.Graph(**defaults), opts.Nodes(**defaults))

In [None]:
G_layout = nx.layout.kamada_kawai_layout(G)

In [None]:
g_plot = hv.Graph.from_networkx(G, G_layout).opts(tools=['hover'],
                                                                        directed=False,
                                                                        edge_alpha=0.4,
                                                                        node_size= 5,
                                                                        #node_color='seniority', cmap='Set1',
                                                                        legend_position='right')
show(hv.render(g_plot))

## Centrality and community network ▶ University - Paper

In [None]:
#Import the data again to run the full dataset

In [None]:
data1 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2017 csv file.csv')
data2 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2018 csv file.csv')
data3 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2019 csv file.csv')
data4 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2020 csv file.csv')
data5 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2021 csv file.csv')


In [None]:
frames = [data1, data2, data3, data4, data5] #creating frame for all datasets

In [None]:
df = pd.concat(frames) #Concat all datasets to "df"

In [None]:
#Remove papers without an author name.
#The concatenated df has repeated index labels (one range per yearly file), so rows are filtered
#with a boolean mask; dropping by index label would also remove unrelated papers sharing a label.
df = df[df['Authors'] != '[No author name available]']

In [None]:
df['Affiliations'].value_counts(ascending=False).nlargest(20)

In [None]:
#Changing the EID column from string to integer.
#The characters 's', '.' and '-' are removed as literal characters (regex=False), because as a regular
#expression '.' matches every character. The result of the cast is assigned back to the column;
#without the assignment the column stays a string. astype(str) first keeps the cell re-runnable.
df['EID'] = (
    df['EID'].astype(str)
    .str.replace('s', '', regex=False)
    .str.replace('.', '', regex=False)
    .str.replace('-', '', regex=False)
    .astype('int64')
)
df['EID']

In [None]:
df['EID']

### Edgelist construction

In [None]:
# Select Authors, EID and Affiliations from df (the columns have to be taken from the DataFrame;
# a plain list of column names cannot be merged).
# Rows without an affiliation are dropped: in the self-merge on 'Affiliations' missing keys would
# match each other and link all papers that lack an affiliation.
data_select = df[['Authors', 'EID', 'Affiliations']].dropna(subset=['Affiliations'])

In [None]:
# create edge Dataframe by merging it with itself.
edges = pd.merge(data_select, data_select, on='Affiliations')
edges.head()

In [None]:
# Filter out the self-edges
edges = edges[edges['EID_x'] != edges['EID_y']]

In [None]:
# grouping to aggregate multiple co-occurences and to generate a weight: 
edges = edges.groupby(['EID_x', 'EID_y']).size().reset_index()

In [None]:
edges.reset_index(drop=True, inplace=True)

In [None]:
# column "0" is now our weight
edges.head()

In [None]:
# Rename the "0" column to weight
edges = edges.rename(columns={0: 'weight'})

In [None]:
edges.head()

In [None]:
len(edges)

In [None]:
# Create network object from pandas edgelist
G = nx.from_pandas_edgelist(edges, source='EID_x', target='EID_y', edge_attr='weight', create_using=nx.Graph())

In [None]:
# We can create a node-attribute dictionary directly from the dataframe (using pandas to_dict).
# Duplicates are removed on EID before it becomes the index: to_dict('index') needs unique keys, and
# de-duplicating after set_index would compare only 'Affiliations' and keep a single paper per affiliation.
node_attributes = data_select[['EID','Affiliations']].drop_duplicates(subset='EID').set_index('EID').to_dict('index')

In [None]:
# We now can include the degree as node-attribute.
# set_node_attributes expects a {node: value} mapping plus the attribute name.
nx.set_node_attributes(G, dict(G.degree()), 'degree')

In [None]:
# and use the node_attribute object to include all that in the graph object
nx.set_node_attributes(G, node_attributes)

In [None]:
len(G.nodes())

In [None]:
len(G.edges())

In [None]:
# Subset the graph keeping only nodes with degree > 1
G = nx.subgraph(G, [n for n,d in G.degree() if d > 1])

In [None]:
# Here we can calculate different centrality indicators as well as partition (community detection)
centrality_dgr = nx.degree_centrality(G)
centrality_eig = nx.eigenvector_centrality_numpy(G, weight = 'weight')

In [None]:
from community import community_louvain

In [None]:
# Community detection with the Louvain algorithm. The algorithm is randomized,
# so the seed is fixed to get the same communities on every run.
partition = community_louvain.best_partition(G, random_state=42)

In [None]:
# All these indicators can now be set as attribute of the Graph
nx.set_node_attributes(G, centrality_dgr, 'dgr')
nx.set_node_attributes(G, centrality_eig, 'eig')
nx.set_node_attributes(G, partition, 'partition')

In [None]:
#Quick plot of the network: nodes are coloured by community and sized by degree.
#Colours and sizes are built by iterating over the nodes of G, so they follow the node order used for drawing
#instead of relying on the ordering of the partition dictionary.
nx.draw_kamada_kawai(G, node_color=[partition[node] for node in G.nodes()], node_size=[5 * degree for _, degree in G.degree()])

In [None]:
# Turn the Graph object (NetworkX) into a Dataframe
nodes_df = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index')

In [None]:
nodes_df.head()

In [None]:
# Sort dataframe by eigenvector.
nodes_df.sort_values('eig', ascending=False)[:10]

In [None]:
# How many communities are there.
nodes_df.partition.nunique()

In [None]:
#define top10_com as partition value counts
top10_com = nodes_df.partition.value_counts()[:10].index

In [None]:
top10_com_nodes = nodes_df[nodes_df.partition.isin(top10_com)].index

# Make a subgraph
g_sub = nx.subgraph(G, top10_com_nodes)


In [None]:
# Now we will limit the resulting dataframe to the top10 communities
nodes_df_top10 = nodes_df[nodes_df.partition.isin(top10_com)]

In [None]:
nodes_df_top10

In [None]:
# with the highest eigenvector centrality
top_affiliations = nodes_df_top10.groupby('partition')['eig'].nlargest(5).reset_index()

In [None]:
top_affiliations

In [None]:
# After that we need to bring back ID's (rename) and Names (merge)
top_affiliations.rename({'level_1':'EID'}, axis=1, inplace=True)
top_affiliations = pd.merge(top_affiliations, data_select[['Affiliations','EID']].drop_duplicates(), on='EID', how='inner')

In [None]:
top_affiliations

### Visualisations



In [None]:
!pip install -qq holoviews
!pip install -qq -U bokeh
!pip install -qq datashader

In [None]:
# Import the libraries and link to the bokeh backend
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show
kwargs = dict(width=800, height=800, xaxis=None, yaxis=None)
opts.defaults(opts.Nodes(**kwargs), opts.Graph(**kwargs))

In [None]:
# keeping only top nodes (extreme subsetting)
top_central_nodes = nodes_df[nodes_df.eig > nodes_df.eig.quantile(0.99)].index

In [None]:
# Create subset graph
g_sub = nx.subgraph(G, top_central_nodes)

In [None]:
#Calculate the centrality degree
cent_degree = dict(nx.degree(G))

In [None]:
#Calculate the eigen degree
cent_eigen = dict(nx.eigenvector_centrality(G))

In [None]:
#Plot centrality degree graph
nx.set_node_attributes(G, cent_degree, 'cent_degree')

# G_layout was last computed for the author graph of the bipartite section; G now holds the
# paper network, so the node positions have to be recomputed for the current graph.
G_layout = nx.layout.kamada_kawai_layout(G)

g_plot = hv.Graph.from_networkx(G, G_layout).opts(tools=['hover'],
                                                  node_size='cent_degree')

show(hv.render(g_plot))

In [None]:
#Plot centrality eigenvalue graph
nx.set_node_attributes(G, cent_eigen, 'cent_eigen')

g_plot = hv.Graph.from_networkx(G, G_layout).opts(tools=['hover'],
                                                  node_size='cent_eigen')

show(hv.render(g_plot))

In [None]:
# Find the optimal partition with the Louvain algorithm.
# The seed is fixed because the algorithm is randomized and would otherwise return different communities per run.
com = community_louvain.best_partition(G, random_state=42)

In [None]:
# The number of communities detected.
# The distinct community ids are counted: ids start at 0, so the largest id is one less than the count.
len(set(com.values()))

In [None]:
#Plot community network
nx.set_node_attributes(G, com, 'community')

g_plot = hv.Graph.from_networkx(G, G_layout).opts(tools=['hover'],
                                                  node_size='cent_degree', 
                                                  node_color='community', cmap=plt.cm.Set1,
                                                  legend_position='right',
                                                  edge_alpha=0.25)

show(hv.render(g_plot))