# Topic Modeling using Latent Dirichlet Allocation (LDA) and Combined Topic Models (CTM)
## 1. Setup
### 1.1 Dependencies
Installing all dependencies needed to run the simulations

In [None]:
!pip install contextualized-topic-models==2.2.0

### 1.2 Imports

In [18]:
import re
import random
import os
import urllib
import urllib.request
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

### 1.3 Data load 

In [19]:
# Path to the data files
path_before_1990 = 'titles_before_1990.txt'
path_from_1990_to_2009 = 'titles_from_1990_to_2009.txt'
path_from_2010 = 'titles_from_2010.txt'

In [20]:

# to download the data manually or get more information, go to: https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
num_titles = 500000  # the (max)number of titles to load 


def load_gzip_file(url):
    """Download Gzip-file."""
    response = urllib.request.urlopen(url)
    compressed_file = io.BytesIO(response.read())
    decompressed_file = gzip.GzipFile(fileobj=compressed_file)
    return decompressed_file

def extract_titles(input_file, max_num=40000):
    """Extract title and publication year of dblp papers, given as input file.
    
    Divide the papers into 3 time periods. 
    
    Collect max max_num papers per time period.
    """
    pairs_before_1990 = []
    count_before_1990 = 0
    pairs_from_1990_to_2009 = []
    count_from_1990_to_2009 = 0
    pairs_from_2010 = []
    count_from_2010 = 0
    got_title = False
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')
        if got_title: 
            # we have a title and check for the corresponding year
            year_result = re.search(r'<year>(.*)</year>', line_str)
            if year_result:
                # we also have the year and thus save the title-year pair
                year = int(year_result.group(1))
                if year < 1990:
                    pairs_before_1990.append((title, year))
                    count_before_1990 += 1
                elif year < 2010:
                    pairs_from_1990_to_2009.append((title, year))
                    count_from_1990_to_2009 += 1
                else:
                    pairs_from_2010.append((title, year))
                    count_from_2010 += 1
                got_title = False
        else:
            # we have no title and search for title
            result = re.search(r'<title>(.*)</title>', line_str)
            if result:
                title = result.group(1)
                if len(title.split(' ')) < 3:  
                    # only include titles with at least four words
                    continue
                got_title = True
        
        if count_before_1990 >= max_num and count_from_1990_to_2009 >= max_num and count_from_2010 >= max_num:
            return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010
    
    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    with open(file_path, 'w') as fout:
        writer = csv.writer(fout)
        for pair in pairs:
            writer.writerow(pair)

in_file = load_gzip_file(url)
pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
save_data(pairs_before_1990, path_before_1990)
save_data(pairs_from_1990_to_2009, path_from_1990_to_2009)
save_data(pairs_from_2010, path_from_2010)

14627025it [00:50, 291463.37it/s]


In [21]:
def load_titles(path):
  with open(path) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]
  return titles

## 1.4 Data preparation 

In [22]:
NUM_LDA_TOPICS = 8 # The number of different topics to identify
NUM_FEATURES = 10000
MAX_DF=0.5
MIN_DF=0.01

In [23]:
# Simple text preprocessing by removing all letters which are not in roman alphabet
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    #text = re.sub(r'\b\w{1,3}\b', ' ', text)
    #text = re.sub(' +', ' ', text)
    text = text.lower()
    return text

In [24]:
# Now we turn the documents (or titles in this case) into a matrix feature representation.
def vectorize_data(titles, max_df=MAX_DF, min_df=MIN_DF, max_features=NUM_FEATURES):
  tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=max_features, stop_words='english')
  tf = tf_vectorizer.fit_transform(titles)
  tf_feature_names = tf_vectorizer.get_feature_names_out()
  return tf, tf_feature_names

## 2. Topic Modeling using LDA

In [25]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

num_lda_topics = 5

### 2.1 before the 1990s



In [26]:
# Load the titles
titles_before_1990 = load_titles(path_before_1990)
print("{} titles before 1990".format(len(titles_before_1990)))

40000 titles before 1990


In [27]:
# Show some random samples
random.sample(titles_before_1990, 10)

['On the Paging Performance of Array Algorithms.',
 'Intransitive Indifference in Preference Theory: A Survey.',
 'The System LD.',
 'R68-39 Simulation of the Transfer Function of a Crustacean Muscle Bundle.',
 'Levelling Terrain Trees: A Transshipment Problem.',
 'Investigations in protothetic.',
 'Statistical modeling and feature selection for seismic pattern recognition.',
 'Automation of chemical plant modelling.',
 'Arithmetic codes resembling neural encoding.',
 'A radical proposal for computer algebra in education.']

In [28]:
preprocessed_titles_before_1990 = [preprocess_text(title) for title in titles_before_1990]

In [29]:
# Show some preprocessed samples
random.sample(preprocessed_titles_before_1990, 10)

['the inconsistency of certain formal logic',
 'history of mechanical computing machinery',
 'an approach to organizing microinstructions which minimizes the width of control store words',
 'theories of causal ordering reply to de kleer and brown',
 'nearly  memories of the ',
 'algorithm  lopsi a simultaneous iteration method for real matrices f',
 'extendible hashing with overflow',
 'parallel processing in ada',
 'feature analysis of turbo prolog',
 'on choosing identifiers']

In [30]:
tf_01, tf_feature_names_01 = vectorize_data(preprocessed_titles_before_1990, max_df=0.95, min_df=0.01)

In [31]:
lda_01 = LatentDirichletAllocation(n_components=8, max_iter=10, learning_method='online', random_state=42).fit(tf_01)

In [32]:
for topic_idx, topic in enumerate(lda_01.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names_01[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: theory problems algorithms simulation decision parallel application solution applications optimal digital control
Topic 1: computer logic model programs performance digital design networks using simulation applications systems
Topic 2: problem programming optimal language digital software processing research solution parallel linear control
Topic 3: data method network models application languages solution processing problem programming using analysis
Topic 4: note information linear functions applications finite technical programming problem systems time optimal
Topic 5: algorithm design analysis approach sets performance new using linear implementation parallel optimal
Topic 6: systems using parallel performance implementation decision distributed linear control digital processing design
Topic 7: control networks new recognition distributed time pattern optimal systems approach digital linear


### 2.2 from 1990 to 2009:

In [33]:
titles_from_1990_to_2009 = load_titles(path_from_1990_to_2009)
print("{} titles from 1990 to 2009".format(len(titles_from_1990_to_2009)))

330200 titles from 1990 to 2009


In [34]:
preprocessed_titles_from_1990_to_2009 = [preprocess_text(title) for title in titles_from_1990_to_2009]

In [35]:
tf_02, tf_feature_names_02 = vectorize_data(titles_from_1990_to_2009, max_df=0.95, min_df=0.01)

In [36]:
lda_02 = LatentDirichletAllocation(n_components=8, max_iter=10, learning_method='online', random_state=42).fit(tf_02)

In [37]:
for topic_idx, topic in enumerate(lda_02.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names_02[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: using control data methods robust nonlinear optimal structure high multiple linear models
Topic 1: adaptive estimation image graphs software computer power development digital scheme robust high
Topic 2: time algorithm linear new network nonlinear models algorithms optimal efficient high robust
Topic 3: method problems management optimization case space equations finite nonlinear new order linear
Topic 4: problem study learning detection distributed web case algorithms algorithm scheduling network multiple
Topic 5: systems based dynamic evaluation knowledge programming nonlinear linear robust time control structure
Topic 6: model design approach information applications fuzzy modeling non computing robust new theory
Topic 7: analysis networks performance application multi neural order wireless mobile recognition network high


### 2.3 - from 2010 onwards

In [38]:
# Load the titles
titles_from_2010 = load_titles(path_from_2010)
print("{} titles from from 2010".format(len(titles_from_2010)))

802625 titles from from 2010


In [39]:
# Preprocess the titles by removing certain characters
preprocessed_titles_from_2010 = [preprocess_text(title) for title in titles_from_2010]

In [40]:
tf_03, tf_feature_names_03 = vectorize_data(preprocessed_titles_from_2010, max_df=0.95, min_df=0.01)

In [41]:
lda_03 = LatentDirichletAllocation(n_components=8, max_iter=10, learning_method='online', random_state=42).fit(tf_03)

In [42]:
for topic_idx, topic in enumerate(lda_03.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names_03[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: networks model detection adaptive neural wireless sensor network nonlinear power selection using
Topic 1: design information linear mobile novel recognition social computing images research cloud network
Topic 2: systems approach algorithm online tracking improved smart cloud nonlinear linear control optimal
Topic 3: method estimation performance efficient robust distributed problem problems evaluation stochastic sensing nonlinear
Topic 4: learning optimization models deep scheme energy hybrid equations prediction machine nonlinear algorithms
Topic 5: based analysis data dynamic communication feature selection network image classification multiple fuzzy
Topic 6: using network image nonlinear study new power classification optimal multiple fuzzy selection
Topic 7: control application applications framework modeling time sensor methods management scheduling nonlinear optimal


## 3. Topic Modeling using CTM

New method developed by [Bianchi et al. 2021](https://aclanthology.org/2021.acl-short.96/) and here is the quick tutorial guide : 



In [43]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

num_ctm_topics = 8 

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [44]:
#  remove stop words, and return the processed, unprocessed, and vocabulary data 

def load_text(text):
  sp = WhiteSpacePreprocessing(text, stopwords_language='english')
  preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()
  return preprocessed_documents, unpreprocessed_corpus, vocab

In [45]:
all_titles = titles_before_1990 + titles_from_1990_to_2009 + titles_from_2010

### before 1990

In [46]:

preprocessed_documents_before_1990, unpreprocessed_corpus_before_1990, vocab_before_1990 = load_text(preprocessed_titles_before_1990)



In [47]:
contextualized_txt_01 = random.sample(all_titles,len(unpreprocessed_corpus_before_1990))

In [48]:
# here we use the same model, paraphrase-distilroberta-base-v1, as in the tutorial
tp = TopicModelDataPreparation("multi-qa-mpnet-base-cos-v1")
training_dataset = tp.fit(text_for_contextual=contextualized_txt_01, text_for_bow=preprocessed_documents_before_1990)

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/197 [00:00<?, ?it/s]

In [49]:
ctm1 = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=20)
ctm1.fit(training_dataset) # run the model

Epoch: [20/20]	 Seen Samples: [787680/787680]	Train Loss: 32.92283893672766	Time: 0:00:29.809987: : 20it [09:13, 27.70s/it]


In [50]:
for i in range(0,8):
    print(f'Topic {i}:', end=' ')
    print(' '.join(ctm1.get_topic_lists(10)[i]))

Topic 0: computer networks software network performance architecture design simulation introduction graphics
Topic 1: recognition pattern de von der machines und image sequential automatic
Topic 2: system information data language management database research review natural science
Topic 3: systems control model theory optimal distributed models decision linear adaptive
Topic 4: arithmetic algebras letter editor properties grammars form groups characterization codes
Topic 5: note problem algorithms technical parallel algorithm time automata complexity sets
Topic 6: completeness modal calculus logics symbolic predicate meeting association semantics behavior
Topic 7: analysis logic programming functions using linear approach method algorithm new


We can see that in this period, some topics generated by the CTM models are still not coherent enough or too ambiguous to show what is the topic. For example, in topic 0, there are keywords "network", "architecture","operations"(more bottom layer) but there are also "software". However, generally for other topics, it is clear to tell what the keywords about. For example, topic 2, there are "data", "system", "database", "management", so we can tell that this topic is very likely to be "Database System". Other topics like "parallel computing","graph theory" are also very clear. The last topic is interesting. It has keywords like "languages","logic","semantics","symbolic","grammar" so we called it "Computational Linguistics".

In [51]:
assigned_topic1=['Computer Network','Programming','Database System','Parallel Computing','Graph Theory','Control System','Multiprogramming','Computational Linguistics']
for i in range(0,8):
    print(str(i+1)+' '+assigned_topic1[i], end=': \n')
    print(' '.join(ctm1.get_topic_lists(5)[i]))
    print('-----------------------------------------------')

1 Computer Network: 
computer networks software network performance
-----------------------------------------------
2 Programming: 
recognition pattern de von der
-----------------------------------------------
3 Database System: 
system information data language management
-----------------------------------------------
4 Parallel Computing: 
systems control model theory optimal
-----------------------------------------------
5 Graph Theory: 
arithmetic algebras letter editor properties
-----------------------------------------------
6 Control System: 
note problem algorithms technical parallel
-----------------------------------------------
7 Multiprogramming: 
completeness modal calculus logics symbolic
-----------------------------------------------
8 Computational Linguistics: 
analysis logic programming functions using
-----------------------------------------------


### From 1990 to 2009

In [52]:
preprocessed_documents_from_1990_to_2009, unpreprocessed_corpus_from_1990_to_2009, vocab_from_1990_to_2009 = load_text(preprocessed_titles_from_1990_to_2009)

In [53]:
contextualized_txt_02 = random.sample(all_titles,len(unpreprocessed_corpus_from_1990_to_2009))

In [None]:
tp = TopicModelDataPreparation("multi-qa-mpnet-base-cos-v1")
training_dataset = tp.fit(text_for_contextual=contextualized_txt_02, text_for_bow=preprocessed_documents_from_1990_to_2009)

Batches:   0%|          | 0/1627 [00:00<?, ?it/s]

In [None]:
ctm2 = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=20)
ctm2.fit(training_dataset) # run the model

In [None]:
for i in range(0,8):
    print(f'Topic {i}:', end=' ')
    print(' '.join(ctm2.get_topic_lists(10)[i]))

Except topic 0 and topic 5, other topics are easy to decide. For example topic 7 includes many keywords related to computer vision, such as "image","detection","segmentation". Topic 3 includes many keywords related to Computer Networks, such as "networks","protocal","traffic".

In [None]:
assigned_topic1=['Programming','Memory Management','Internet technology','Computer Networks','Control System','Digitalization','Neural Networks','Computer Vision']

for i in range(0,8):
    print(str(i+1)+' '+assigned_topic1[i], end=': \n')
    print(' '.join(ctm2.get_topic_lists(5)[i]))
    print('-----------------------------------------------')

### From 2010 onwards

In [None]:
preprocessed_documents_from_2010, unpreprocessed_corpus_from_2010, vocab_from_2010 = load_text(preprocessed_titles_from_2010)

In [None]:
contextualized_txt_03 = random.sample(all_titles,len(unpreprocessed_corpus_from_2010))

In [None]:
tp = TopicModelDataPreparation("multi-qa-mpnet-base-cos-v1")
training_dataset = tp.fit(text_for_contextual=contextualized_txt_03, text_for_bow=preprocessed_documents_from_2010)

In [None]:
ctm3 = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=20)
ctm3.fit(training_dataset) # run the model

In [None]:
for i in range(0,8):
  
    print(f'Topic {i}:', end=' ')
    print(' '.join(ctm3.get_topic_lists(10)[i]))

Except topic 0, topic 2 and topic 7, other topics are easy to decide. For example topic 4 includes many keywords related to deep learning, such as "deep learning","classification","convolutional". Topic 6 includes many keywords related to cloud computing, such as "cloud","iot","security".

In [None]:
assigned_topic1=['','Big Data','Optimization','Deep Learning','Networks Science','Control System','Cloud Computing','Knowledge Discovery']

for i in range(0,8):
    print(str(i+1)+' '+assigned_topic1[i], end=': \n')
    print(' '.join(ctm3.get_topic_lists(5)[i]))
    print('-----------------------------------------------')

## 4. Comparison between LDA and CTM

Now, for the three periods, we have the topics generated by both the LDA and the Contextualized Topics Model(CTM). We are going to decide whether the CTM set of topics really has higher level of coherence than the topics set generated by LDA. 

  Coherence can be measured in numerous ways, like [Lau et al., 2014; Roder et al. ¨ , 2015](https://dl.acm.org/doi/10.1145/2684822.2685324).

 In this assignment, we can tell that the topics generated by CTM are more coherent by simply human observation. 