<a href="https://colab.research.google.com/github/Only-Mike/M2-NLP-Network-Analysis/blob/main/NLP_SC_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1 --- Topic modelling

---


In [1]:
#Installing preprocessor to celan our text
!pip install tweet-preprocessor -q

# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis

# explainability (why did the model say it's related to this author)
!pip install eli5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import tqdm #progress bar
import preprocessor as prepro # text prepro

import spacy #spacy for quick language prepro
nlp = spacy.load('en_core_web_sm') #instantiating English module

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model
from sklearn.metrics import classification_report #that's self explanatory
from sklearn.decomposition import TruncatedSVD #dimensionality reduction
from xgboost import XGBClassifier

import altair as alt #viz

#explainability
import eli5
from eli5.lime import TextExplainer

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

%matplotlib inline
pyLDAvis.enable_notebook()

  from collections import Iterable
  from collections import Mapping


In [3]:
# prepro settings
prepro.set_options(prepro.OPT.URL, prepro.OPT.NUMBER, prepro.OPT.RESERVED, prepro.OPT.MENTION, prepro.OPT.SMILEY)

In [4]:
!git clone https://github.com/Only-Mike/M2-NLP-Network-Analysis.git

fatal: destination path 'M2-NLP-Network-Analysis' already exists and is not an empty directory.


In [5]:
# Importing dataset

In [6]:
data1 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2017 csv file.csv')
data2 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2018 csv file.csv')
data3 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2019 csv file.csv')
data4 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2020 csv file.csv')
data5 = pd.read_csv('/content/M2-NLP-Network-Analysis/Supply chain network - 2021 csv file.csv')


In [7]:
frames = [data1, data2, data3, data4, data5] #creating frame for all datasets

In [8]:
df = pd.concat(frames) #Concat all datasets to "df"

In [9]:
df.shape

(8573, 22)

In [10]:
df.columns

Index(['Authors', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Cited by',
       'DOI', 'Link', 'Affiliations', 'Authors with affiliations', 'Abstract',
       'Author Keywords', 'Index Keywords', 'Funding Details',
       'Correspondence Address', 'Publisher', 'ISSN', 'ISBN', 'CODEN',
       'Document Type', 'Source', 'EID'],
      dtype='object')

## Cleaning text and embedding the data

In [11]:
#Choosing the columns we need to work with and storing them in the value "data". This is for saving the "df" dataframe for the prediction model later on.
data = df[['Authors', 'Author(s) ID','Title', 'Abstract','Year', 'Source title']]

In [12]:
#Take a random sample of 1500 papers. This i for making the model run faster.
data.sample(n=1500)

Unnamed: 0,Authors,Author(s) ID,Title,Abstract,Year,Source title
207,"Abdelsamad C., Samir T., Aziz S., Jamila E.",57211289567;57212747985;42162322200;55987066300;,Artificial neural network based meta-heuristic...,"Nowadays, reducing total costs while enhancing...",2021,Indonesian Journal of Electrical Engineering a...
1420,"Almanaseer M., Zhang G.",57407895700;56961166100;,The Preference of VMI Contract on Traditional ...,This research studies the performance and outc...,2021,Uncertainty and Operations Research
1282,"Mubarik M., Zuraidah R., Rasi B.R.M.",57214289371;57213609318;57213603173;,"Triad of big data supply chain analytics, supp...",The objective of the paper is to examine the i...,2019,Humanities and Social Sciences Letters
362,"Hammadi L., de Cursi E.S., Barbu V.S., Ouahman...",57189521310;6602568777;11240949600;6505784001;...,SCOR model for customs supply chain process de...,Integrated supply chain management has gained ...,2018,World Customs Journal
343,"Moreno-Camacho C.A., Montoya-Torres J.R., Jaeg...",56941843800;57219048886;36630589900;56613860900;,Sustainability metrics for real case applicati...,Increasing pressure from governments and stake...,2019,Journal of Cleaner Production
...,...,...,...,...,...,...
1599,"Brahmana S., Hendar, Mu'minah I., Razimi M.S.B.A.",57208509006;57214840591;57189521427;56050769100;,"Supply Chain governance, corporate governance ...",Supply Chain governance is an emerging phenome...,2019,International Journal of Supply Chain Management
280,"Ye N., Kueh T.-B., Hou L., Liu Y., Yu H.",56400335200;57218139918;57218142147;5587099720...,A bibliometric analysis of corporate social re...,The involvement of corporate social responsibi...,2020,Journal of Cleaner Production
1623,"Yang Y., Meng L., Zhang B., Yu Y.",36095455600;55982146100;57202734053;57200121013;,Supply chain network production and outsourcin...,To study the production and outsourcing decisi...,2019,Jisuanji Jicheng Zhizao Xitong/Computer Integr...
15,"Mudgal S., Gupta P.K., Yadav A.K., Mahajan V.",57211998745;57199836986;57211997819;57196673706;,Artificial neural network for reliability eval...,This paper presents the modelling of power sys...,2020,"2020 21st National Power Systems Conference, N..."


In [13]:
#Making a new column called "text" from the abstract column
data['text'] = data['Abstract']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
#Cleaning the text
data['text_clean'] = data['text'].map(lambda t: prepro.clean(t))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
# run progress bar and clean up using spacy but without some heavy parts of the pipeline

clean_text = []

pbar = tqdm.tqdm(total=len(data['text_clean']),position=0, leave=True)

for text in nlp.pipe(data['text_clean'], disable=["tagger", "parser", "ner"]):

  txt = [token.lemma_.lower() for token in text 
         if token.is_alpha 
         and not token.is_stop 
         and not token.is_punct]

  clean_text.append(" ".join(txt))

  pbar.update(1)

100%|█████████▉| 8567/8573 [02:02<00:00, 131.06it/s]

In [16]:
# write everything into a single function for simplicity later on
def text_prepro(texts):
  """
  takes in a pandas series (1 column of a DF)
  lowercases, normalizes text
  """
  texts_clean = texts.map(lambda t: prepro.clean(t))

  clean_container = []

  pbar = tqdm.tqdm(total=len(texts_clean),position=0, leave=True)

  for text in nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"]):

    txt = [token.lemma_.lower() for token in text 
          if token.is_alpha 
          and not token.is_stop 
          and not token.is_punct]

    clean_container.append(" ".join(txt))
    pbar.update(1)
  
  return clean_container

In [17]:
data['text_clean'] = text_prepro(data['text'])  ##<---- HVAD GØR DEN HER?<-------##

100%|██████████| 8573/8573 [01:55<00:00, 74.55it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
# preprocess texts (we need tokens)  <---- Hvad gør den her? <------
tokens = []

for summary in nlp.pipe(data['text_clean'], disable=["ner"]):
  proj_tok = [token.lemma_.lower() for token in summary 
              if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV'] 
              and not token.is_stop
              and not token.is_punct] 
  tokens.append(proj_tok)

In [19]:
data['tokens'] = tokens

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
## SPØRGSMÅL OMKRING FILTER 

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(data['tokens'])

# filter out low-frequency / high-frequency stuff, also limit the vocabulary to max XXXXXXX words
dictionary.filter_extremes(no_below=4, no_above=0.4, keep_n=600)

# construct corpus using this dictionary
corpus = [dictionary.doc2bow(doc) for doc in data['tokens']]

##Visualization

In [21]:
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=13, workers = 4, passes=10)

In [22]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [23]:
pyLDAvis.display(lda_display)

##Model Metrics

In [24]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import LdaModel, CoherenceModel
from gensim import corpora

In [25]:
corpus = data['tokens']

In [26]:
corpus

0       [urban, population, growth, extremely, rapid, ...
1       [general, supply, chain, environmentally, sens...
2       [proceeding, paper, topic, light, weight, incr...
3       [merchant, operation, approach, commodity, ene...
4       [consumer, institutional, trader, interest, lo...
                              ...                        
1995    [agile, supply, chain, speed, responsiveness, ...
1996    [challenge, market, dynamic, configuration, in...
1997    [purpose, work, review, present, article, datu...
1998    [study, stage, mathematical, model, facilitate...
1999    [pointer, network, pn, breakthrough, recent, y...
Name: tokens, Length: 8573, dtype: object

In [27]:
dirichlet_dict = corpora.Dictionary(corpus)
bow_corpus = [dirichlet_dict.doc2bow(text) for text in corpus]

# Considering 1-15 topics, as the last is cut off
num_topics = list(range(16)[1:])
num_keywords = 15

LDA_models = {}
LDA_topics = {}
for i in num_topics:
    LDA_models[i] = LdaModel(corpus=bow_corpus,
                             id2word=dirichlet_dict,
                             num_topics=i,
                             update_every=1,
                             chunksize=len(bow_corpus),
                             passes=20,
                             alpha='auto',
                             random_state=42)

    shown_topics = LDA_models[i].show_topics(num_topics=i, 
                                             num_words=num_keywords,
                                             formatted=False)
    LDA_topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]

KeyboardInterrupt: ignored

In [None]:
def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = (A ∩ B)/(A ∪ B)
    - Goal is low Jaccard scores for coverage of the diverse elements
    """
    intersection = set(topic_1).intersection(set(topic_2))
    union = set(topic_1).union(set(topic_2))
                    
    return float(len(intersection))/float(len(union))

In [None]:
LDA_stability = {}
for i in range(0, len(num_topics)-1):
    jaccard_sims = []
    for t1, topic1 in enumerate(LDA_topics[num_topics[i]]): # pylint: disable=unused-variable
        sims = []
        for t2, topic2 in enumerate(LDA_topics[num_topics[i+1]]): # pylint: disable=unused-variable
            sims.append(jaccard_similarity(topic1, topic2))    
        
        jaccard_sims.append(sims)    
    
    LDA_stability[num_topics[i]] = jaccard_sims
                
mean_stabilities = [np.array(LDA_stability[i]).mean() for i in num_topics[:-1]]

In [None]:
coherences = [CoherenceModel(model=LDA_models[i], texts=corpus, dictionary=dirichlet_dict, coherence='c_v').get_coherence() for i in num_topics[:-1]]

In [None]:
coh_sta_diffs = [coherences[i] - mean_stabilities[i] for i in range(num_keywords)[:-1]] # limit topic numbers to the number of keywords
coh_sta_max = max(coh_sta_diffs)
coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]
ideal_topic_num_index = coh_sta_max_idxs[0] # choose less topics in case there's more than one max
ideal_topic_num = num_topics[ideal_topic_num_index]

In [None]:
plt.figure(figsize=(20,10))
ax = sns.lineplot(x=num_topics[:-1], y=mean_stabilities, label='Average Topic Overlap')
ax = sns.lineplot(x=num_topics[:-1], y=coherences, label='Topic Coherence')

ax.axvline(x=ideal_topic_num, label='Ideal Number of Topics', color='black')
ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

y_max = max(max(mean_stabilities), max(coherences)) + (0.10 * max(max(mean_stabilities), max(coherences)))
ax.set_ylim([0, y_max])
ax.set_xlim([1, num_topics[-1]-1])
                
ax.axes.set_title('Model Metrics per Number of Topics', fontsize=25)
ax.set_ylabel('Metric Level', fontsize=20)
ax.set_xlabel('Number of Topics', fontsize=20)
plt.legend(fontsize=20)
plt.show()  

# Part 2 --- Label Prediction

---




## Data Cleaning

In [None]:
df.info()

In [None]:
time_data = df['Document Type']
time_data.index = df['Year']

df.head()

In [None]:
yearly_counts = time_data.resample('A').count()

fig, ax = plt.subplots(3, figsize=(18,16))
ax[2].plot(yearly_counts);
ax[2].set_title('Yearly Counts');
plt.show()