In [2]:
# OCTIS needs Python Version 3.10!!
# See requirements.txt for package installation.

from src.data_cleaning import load_data, clean_duplicates
from src.data_preprocessing import preprocess, split_in_time_points
from src.data_transformation import add_bigrams, filter_extreme_values, transform_in_octis_format, save_doc_ids_dates
from src.lda_octis import find_best_model, load_octis_data, update_model, calculate_coherence
from src.topic_analysis import find_emerging_topics, find_trending_topics

import pandas as pd
import numpy as np

# Download nltk resources for stop words, tokenization and PoS tagging
#nltk.download('popular')
#nltk.download('stopwords')

!pip3 install spacy
!python3 -m spacy download en_core_web_sm

# Data

### 1. Data Import

In [3]:
# Import Scopus Data (82.598 papers) using the project's load_data() helper
df = load_data()
# Bare last expression: show the loaded frame (title, description, coverDate) as the cell output
df

Unnamed: 0,title,description,coverDate
0,Food systems are responsible for a third of gl...,We have developed a new global food emissions ...,2021-03-01
1,Digital transformation: A multidisciplinary re...,Digital transformation and resultant business ...,2021-01-01
2,Predicted growth in plastic waste exceeds effo...,"Plastic pollution is a planetary threat, affec...",2020-09-01
3,DeepAR: Probabilistic forecasting with autoreg...,"Probabilistic forecasting, i.e., estimating a ...",2020-07-01
4,Research opportunities for a more resilient po...,Purpose: The COVID-19 crisis has caused major ...,2020-06-19
...,...,...,...
82593,Resilient metallurgical supplier management - ...,The resilient supplier management is a crucial...,2020-01-01
82594,Optimization of photochemical degradation of d...,The objective of this research work is to appl...,2020-01-01
82595,Goal Geometric Programming,,2019-01-01
82596,Eating less meat ‘to save the planet’: Studyin...,Recently published healthy eating guidelines i...,2019-01-01


# Data Cleaning

In [4]:
# Remove duplicates and None values.
# clean_duplicates() prints a short summary of what was removed (see the cell output).
corpus = clean_duplicates(df)
# Bare last expression: show the cleaned frame as the cell output
corpus

None values removed.
Number of (title + desc) duplicates: 118. Duplicates Removed.
Dataframe has now {len(clean_df)} entries.


Unnamed: 0,title,description,coverDate
0,Food systems are responsible for a third of gl...,We have developed a new global food emissions ...,2021-03-01
1,Digital transformation: A multidisciplinary re...,Digital transformation and resultant business ...,2021-01-01
2,Predicted growth in plastic waste exceeds effo...,"Plastic pollution is a planetary threat, affec...",2020-09-01
3,DeepAR: Probabilistic forecasting with autoreg...,"Probabilistic forecasting, i.e., estimating a ...",2020-07-01
4,Research opportunities for a more resilient po...,Purpose: The COVID-19 crisis has caused major ...,2020-06-19
...,...,...,...
82592,A multi-period stochastic casualty evacuation ...,"In this paper, we propose a new optimization a...",2020-01-01
82593,Resilient metallurgical supplier management - ...,The resilient supplier management is a crucial...,2020-01-01
82594,Optimization of photochemical degradation of d...,The objective of this research work is to appl...,2020-01-01
82596,Eating less meat ‘to save the planet’: Studyin...,Recently published healthy eating guidelines i...,2019-01-01


# Data Preprocessing

### 1. Remove all digit-only chars, special chars, diacritics
### 2. Filter terms with 2-30 chars
### 3. Strip multiple whitespaces, transform to lowercase

### 4. Tokenization, PoS Tagging

### 5. Lemmatization

### 6. Stop word filtering

In [None]:
# Preprocess the cleaned corpus (see the steps listed above). Takes ~4 minutes!
preprocessed_corpus = preprocess(corpus)
# Save the corpus to a file to avoid the long run time on later runs. Pickle is used because it
# preserves the data structure more accurately, i.e. it can handle complex cell values (e.g., lists).
preprocessed_corpus.to_pickle('Data/corpus.pkl')
# Bare last expression: show the preprocessed frame (now including the token columns)
preprocessed_corpus

Unnamed: 0,title,description,coverDate,title_tokens,desc_tokens,tokens
0,food systems are responsible for third of glob...,we have developed new global food emissions da...,2021-03-01,"[(food, NN), (system, NNS), (responsible, JJ),...","[(develop, VBN), (new, JJ), (global, JJ), (foo...","[food, system, responsible, third, global, ant..."
1,digital multidisciplinary reflection and resea...,digital transformation and resultant business ...,2021-01-01,"[(digital, JJ), (multidisciplinary, JJ), (refl...","[(digital, JJ), (transformation, NN), (resulta...","[digital, multidisciplinary, reflection, resea..."
2,predicted growth in plastic waste exceeds effo...,plastic pollution is planetary affecting nearl...,2020-09-01,"[(predicted, JJ), (growth, NN), (plastic, JJ),...","[(plastic, JJ), (pollution, NN), (planetary, J...","[predicted, growth, plastic, waste, exceeds, e..."
3,probabilistic forecasting with autoregressive ...,probabilistic estimating time future probabili...,2020-07-01,"[(probabilistic, JJ), (forecasting, NN), (auto...","[(probabilistic, JJ), (estimating, NN), (time,...","[probabilistic, forecasting, autoregressive, r..."
4,research opportunities for more resilient supp...,the crisis has caused major supply chain and t...,2020-06-19,"[(research, NN), (opportunity, NNS), (resilien...","[(crisis, NN), (cause, VBN), (major, JJ), (sup...","[research, opportunity, resilient, supply, cha..."
...,...,...,...,...,...,...
82592,stochastic casualty evacuation network design ...,in this we propose new optimization approach f...,2020-01-01,"[(stochastic, JJ), (casualty, NN), (evacuation...","[(propose, VBP), (new, JJ), (optimization, NN)...","[stochastic, casualty, evacuation, network, de..."
82593,resilient metallurgical supplier management re...,the resilient supplier management is crucial p...,2020-01-01,"[(resilient, JJ), (metallurgical, JJ), (suppli...","[(resilient, NN), (supplier, NN), (management,...","[resilient, metallurgical, supplier, managemen..."
82594,optimization of photochemical degradation of d...,the objective of this research work is to appl...,2020-01-01,"[(optimization, NN), (photochemical, JJ), (deg...","[(objective, NN), (research, NN), (work, NN), ...","[optimization, photochemical, degradation, dai..."
82596,eating less meat save the studying the develop...,recently published healthy eating guidelines i...,2019-01-01,"[(eat, VBG), (less, JJR), (meat, NN), (save, V...","[(publish, VBN), (healthy, JJ), (eat, VBG), (g...","[eat, less, meat, save, study, development, su..."


### 7. Transform into Bigrams and DTM

### 8. Extreme Value Filtering -> Before DTM so Kernel doesn't crash!
Plus divide data into time intervals

In [6]:
# Reload the preprocessed corpus from the pickle file written by the preprocessing cell, so that
# the slow preprocessing step does not have to be re-run.
# NOTE: this rebinds `corpus`, which previously held the cleaned but unprocessed frame.
# NOTE: only unpickle files you created yourself; unpickling untrusted files can execute code.
corpus = pd.read_pickle('Data/corpus.pkl')
corpus

Unnamed: 0,title,description,coverDate,title_tokens,desc_tokens,tokens
0,food systems are responsible for third of glob...,we have developed new global food emissions da...,2021-03-01,"[(food, NN), (system, NNS), (responsible, JJ),...","[(develop, VBN), (new, JJ), (global, JJ), (foo...","[food, system, responsible, third, global, ant..."
1,digital multidisciplinary reflection and resea...,digital transformation and resultant business ...,2021-01-01,"[(digital, JJ), (multidisciplinary, JJ), (refl...","[(digital, JJ), (transformation, NN), (resulta...","[digital, multidisciplinary, reflection, resea..."
2,predicted growth in plastic waste exceeds effo...,plastic pollution is planetary affecting nearl...,2020-09-01,"[(predicted, JJ), (growth, NN), (plastic, JJ),...","[(plastic, JJ), (pollution, NN), (planetary, J...","[predicted, growth, plastic, waste, exceeds, e..."
3,probabilistic forecasting with autoregressive ...,probabilistic estimating time future probabili...,2020-07-01,"[(probabilistic, JJ), (forecasting, NN), (auto...","[(probabilistic, JJ), (estimating, NN), (time,...","[probabilistic, forecasting, autoregressive, r..."
4,research opportunities for more resilient supp...,the crisis has caused major supply chain and t...,2020-06-19,"[(research, NN), (opportunity, NNS), (resilien...","[(crisis, NN), (cause, VBN), (major, JJ), (sup...","[research, opportunity, resilient, supply, cha..."
...,...,...,...,...,...,...
82592,stochastic casualty evacuation network design ...,in this we propose new optimization approach f...,2020-01-01,"[(stochastic, JJ), (casualty, NN), (evacuation...","[(propose, VBP), (new, JJ), (optimization, NN)...","[stochastic, casualty, evacuation, network, de..."
82593,resilient metallurgical supplier management re...,the resilient supplier management is crucial p...,2020-01-01,"[(resilient, JJ), (metallurgical, JJ), (suppli...","[(resilient, NN), (supplier, NN), (management,...","[resilient, metallurgical, supplier, managemen..."
82594,optimization of photochemical degradation of d...,the objective of this research work is to appl...,2020-01-01,"[(optimization, NN), (photochemical, JJ), (deg...","[(objective, NN), (research, NN), (work, NN), ...","[optimization, photochemical, degradation, dai..."
82596,eating less meat save the studying the develop...,recently published healthy eating guidelines i...,2019-01-01,"[(eat, VBG), (less, JJR), (meat, NN), (save, V...","[(publish, VBN), (healthy, JJ), (eat, VBG), (g...","[eat, less, meat, save, study, development, su..."


In [7]:
# Split the preprocessed corpus into one data frame per year (= one per time point t)
dfs_by_year = split_in_time_points(corpus)
# Number of time points; used below as the upper bound when iterating over t (7 in the output shown)
years = len(dfs_by_year)

Result: number of dataframes/ calculated years/ t: 7


In [8]:
# Filter data at each time point and write into folders for different t.
# Time points are numbered 1..years. The built-in range() is used so that `year` is a plain
# Python int (np.arange would yield numpy integer scalars) when it serves as the lookup key
# and is passed on to the helper functions.
for year in range(1, years + 1):
    batch_corpus = dfs_by_year[year]

    # Save doc id plus date in file
    doc_id_dates_df = save_doc_ids_dates(base_folder_path = 'Data/', batch_corpus = batch_corpus, year = year, save_to_folder = True)

    # Add bigrams
    batch_corpus = add_bigrams(batch_corpus)

    # Filter extreme values
    document_list = filter_extreme_values(batch_corpus)

    # Transform in octis format
    documents_df, vocabulary = transform_in_octis_format(base_folder_path = 'Data/', document_list = document_list, t = year, save_to_folder = True)

# Topic Modeling

### 1. Model Building and Selection (choose k)

In [9]:
# Have a look at corpus size for the different t.
# One OCTIS dataset is loaded per time point (1..years) and kept in a dict keyed by t, so the
# cell follows the number of time points computed above instead of a hard-coded list of seven
# variables (`datasets[1]` takes the place of the former `dataset1`, and so on).
datasets = {t: load_octis_data(t) for t in range(1, years + 1)}
for t, dataset in datasets.items():
    print(f'Length of dataset{t}: {len(dataset.get_corpus())}')

Length of dataset1: 9815
Length of dataset2: 11081
Length of dataset3: 12780
Length of dataset4: 13812
Length of dataset5: 15092
Length of dataset6: 16240
Length of dataset7: 1255


In [10]:
# Try a model for t = 1 with Mühlroth values (the search over k is logged in the cell output).
# If it throws an error before calculating the first model, re-run the cell a few times.
# The call yields the selected model, its output dict and the selected number of topics k.
best_model_1, best_model_output_1, best_k_1 = find_best_model(t = 1)

Coherence Score for Model with k = 69: 0.4499297055236276
Coherence Score for Model with k = 70: 0.45129722050639715
Coherence Score for Model with k = 71: 0.4733637079426238
Coherence Score for Model with k = 72: 0.44556061847538686
Coherence Score for Model with k = 73: 0.45685490300073367
Coherence Score for Model with k = 74: 0.4640863621871422
Coherence Score for Model with k = 75: 0.4669738106663015
Coherence Score for Model with k = 76: 0.45333456979392994
Coherence Score for Model with k = 77: 0.4619502980706343
Coherence Score for Model with k = 78: 0.4447683211568757
Coherence Score for Model with k = 79: 0.4617600683340465
Coherence Score for Model with k = 80: 0.4365703519939886
Coherence Score for Model with k = 81: 0.4567291447253628
Coherence Score for Model with k = 82: 0.4643782904406693
Coherence Score for Model with k = 83: 0.45013276542928465
Coherence Score for Model with k = 84: 0.42166109336049246
Coherence Score for Model with k = 85: 0.4578744804467408
Coherenc

In [11]:
# View Topics: print every topic of the selected t = 1 model as one line of space-separated words
for topic in best_model_output_1['topics']:
    print(' '.join(topic))
# Compute the coherence of the selected t = 1 model and report it together with its k
coherence = calculate_coherence(model_output = best_model_output_1, t = 1)
print(f'Coherence Score for Model with k = {best_k_1}: {coherence}')

service quality logistics model game production cost party customer logistics_service
management sustainability social green environmental research supply industry sustainable paper
model business use change sustainable sustainability development approach study develop
supply chain supply_chain information management competitive strategy disruption advantage global
supply chain supply_chain study manufacturer performance result firm decision show
online study offline return mass retailing use customization shopping online_retailer
demand contract retailer share optimal policy revenue profit manufacturer capacity
use study practice research paper data case management base organization
waste management solid waste_management solid_waste use municipal energy system municipal_solid
remanufactured increase product use remanufactured_product return vehicle study result impact
inventory model cost demand time problem part distribution spare propose
product quality customer coal market cost pr

# Topic Analysis

### 1. Update Model with new data

In [12]:
# Reevaluate model with new data at t = 2 with test range of k = [5, 10, 15, 20, 25, 30]
# (test_range = True selects this reduced range; the score for each k is logged in the cell output)
best_model_2, best_model_output_2, best_k_2= find_best_model(t = 2, test_range = True)

Coherence Score for Model with k = 5: 0.4842783276421262
Coherence Score for Model with k = 10: 0.5362210068433599
Coherence Score for Model with k = 15: 0.49447651766606343
Coherence Score for Model with k = 20: 0.533871814171566
Coherence Score for Model with k = 25: 0.5503197725453807
Coherence Score for Model with k = 30: 0.5025987111781155
Coherence Score for the best model with k = 25: 0.5503197725453807


In [13]:
# Coherence of the t = 2 model: the baseline that the t = 3 model is compared against
coherence_2 = calculate_coherence(model_output = best_model_output_2, t = 2)
# Update Model at time point t = 3.
# As the printed log shows, update_model first scores the old k on the new data and only
# searches for a new k (within the test range) if that coherence is worse than the old one.
best_model_3, best_model_output_3, best_k_3 = update_model(t_now = 3, old_coherence = coherence_2, old_k = best_k_2, test_range = True)

Coherence Score for Model using old k = 25: 0.5076807151990521
This coherence of 0.5076807151990521 is worse than the old coherence of 0.5503197725453807.
Finding new model...

Coherence Score for Model with k = 5: 0.5169903179764579
Coherence Score for Model with k = 10: 0.5239696855742813
Coherence Score for Model with k = 15: 0.5306139024299402
Coherence Score for Model with k = 20: 0.5471145448274553
Coherence Score for Model with k = 25: 0.5026885130686735
Coherence Score for Model with k = 30: 0.5294691916960421
Coherence Score for the best model with k = 20: 0.5471145448274553
Coherence Score for Model using a new best k = 20: 0.5471145448274553
This coherence for k = 20 of 0.5471145448274553 is still worse than the old coherence of 0.5503197725453807.
Continue to use old k of 25.


In [14]:
# Coherence of the t = 3 model: the baseline that the t = 4 model is compared against
coherence_3 = calculate_coherence(model_output = best_model_output_3, t = 3)
# Update Model at time point t = 4, starting from the k that was kept at t = 3
best_model_4, best_model_output_4, best_k_4 = update_model(t_now = 4, old_coherence = coherence_3, old_k = best_k_3, test_range = True)

Coherence Score for Model using old k = 25: 0.5312710714747276
This coherence of 0.5312710714747276 is better than the old coherence of 0.5076807151990521.
Continue to use old k of 25.


### 2. Emergence Detection

In [None]:
# Detect emerging topics at t = 3 by comparing the t = 3 model with the t = 2 model.
# Mühlroth & Grottke suggested pi = 0.3625 but sometimes a less conservative value (larger pi) has to be used to find emerging topics!
topics_emerging_popular = find_emerging_topics(
    model_new = best_model_3,
    model_output_new = best_model_output_3,
    model_old = best_model_2,
    model_output_old = best_model_output_2,
    pi = 0.7,
)
print('Topic emergences: ', topics_emerging_popular)

# Print the word list of every topic that was labelled 'emerging'
print('These are the emerging topics:\n')
for k, label in topics_emerging_popular.items():
    if label == 'emerging':
        print(best_model_output_3['topics'][k])

Topic emergences:  {0: 'emerging', 1: 'emerging', 2: 'popular', 3: 'popular', 4: 'popular', 5: 'emerging', 6: 'emerging', 7: 'emerging', 8: 'popular', 9: 'popular', 10: 'emerging', 11: 'popular', 12: 'emerging', 13: 'emerging', 14: 'emerging', 15: 'emerging', 16: 'popular', 17: 'emerging', 18: 'popular', 19: 'emerging', 20: 'popular', 21: 'emerging', 22: 'popular', 23: 'emerging', 24: 'emerging'}
These are the emerging topics:

['supplier', 'performance', 'company', 'selection', 'sustainability', 'sustainable', 'information', 'use', 'criterion', 'study']
['material', 'organic', 'use', 'clothing', 'mineral', 'system', 'size', 'pattern', 'import', 'raw']
['design', 'learn', 'product', 'learning', 'quality', 'student', 'paper', 'improve', 'safety', 'use']
['method', 'use', 'forecast', 'green', 'time', 'result', 'data', 'base', 'study', 'propose']
['production', 'use', 'environmental', 'increase', 'metal', 'system', 'textile', 'economic', 'impact', 'study']
['carbon', 'emission', 'carbon_e

### 3. Trend Detection

In [16]:
# Label every topic of the t = 3 model as trending or declining
topics_trending = find_trending_topics(model_output=best_model_output_3, t=3)
print('Trending vs. declining topics: ', topics_trending)

# Topics that are both emerging and trending => Establish strategy!
print('These are the emerging + trending topics:\n')
for k, growth in topics_trending.items():
    # Skip non-trending topics first; the emergence label is only looked up for trending ones
    if growth != 'trending':
        continue
    if topics_emerging_popular[k] == 'emerging':
        print(best_model_output_3['topics'][k])

Trending vs. declining topics:  {0: 'declining', 1: 'declining', 2: 'trending', 3: 'declining', 4: 'trending', 5: 'declining', 6: 'trending', 7: 'declining', 8: 'trending', 9: 'trending', 10: 'declining', 11: 'declining', 12: 'trending', 13: 'declining', 14: 'declining', 15: 'declining', 16: 'trending', 17: 'declining', 18: 'trending', 19: 'declining', 20: 'trending', 21: 'trending', 22: 'trending', 23: 'declining', 24: 'trending'}
These are the emerging + trending topics:

['method', 'use', 'forecast', 'green', 'time', 'result', 'data', 'base', 'study', 'propose']
['model', 'use', 'removal', 'treatment', 'process', 'time', 'electrocoagulation', 'water', 'result', 'wastewater']
['meat', 'diet', 'animal', 'food', 'public', 'consumption', 'procurement', 'dietary', 'vegetarian', 'sustainable']
['service', 'use', 'model', 'human', 'base', 'cold_chain', 'quality', 'cold', 'body', 'measurement']


### 4. Visualization

In [17]:
# Combine both per-topic labels into one frame: one row per topic id, one column per label kind
topics_df = pd.DataFrame({'Emergence': topics_emerging_popular, 'Growth': topics_trending})

# Count the topics per (Emergence, Growth) combination and spread Growth over the columns,
# filling combinations that do not occur with 0
pivot_table = (
    topics_df
    .groupby(['Emergence', 'Growth'])
    .size()
    .unstack(fill_value=0)
)
print(pivot_table)

Growth     declining  trending
Emergence                     
emerging          11         4
popular            2         8
