
using Latent Dirichlet Allocation (LDA)  and Combined Topic Models (CTM).  
## 1. Setup
### 1.1 Dependencies
Installing all dependencies needed to run the simulations

In [None]:
!pip install contextualized-topic-models==2.2.0

### 1.2 Imports

In [None]:
import re
import random
import os
import urllib
import urllib.request
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

### 1.3 Google Drive
We connect Google Drive in order to access stored data.

In [None]:
# Enable access to files stored in Google Drive
from google.colab import drive
# Leave this like it is
mountpoint = '/content/drive/' 
drive.mount(mountpoint)

Mounted at /content/drive/


In [None]:
# Adapt this path to the folder where your data is stored in google drive
base_path = 'My Drive/UZH_ML4NLP/Projects/Project-06/data' 
data_path = os.path.join(mountpoint, base_path)
# Cd into the directory with the git repo
% cd $data_path

[Errno 2] No such file or directory: '/content/drive/My Drive/UZH_ML4NLP/Projects/Project-06/data'
/content


### 1.4 Constants


In [None]:
# Shared defaults for the LDA experiments.
NUM_LDA_TOPICS = 8 # The number of different topics to identify
NUM_FEATURES = 10000  # default `max_features` of vectorize_data()
MAX_DF=0.5  # default `max_df` of vectorize_data()
MIN_DF=0.01  # default `min_df` of vectorize_data()

In [None]:
# Path to the data files
path_before_1990 = 'titles_before_1990.txt'
path_from_1990_to_2009 = 'titles_from_1990_to_2009.txt'
path_from_2010 = 'titles_from_2010.txt'

### 1.5 Data Acquisition

In [None]:
# Execute the following cell only once to download the data and write it as a file to your Google Drive. Afterwards, skip this cell or comment it out.

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# To download the data manually or get more information, go to: https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# NOTE(review): num_titles is not referenced anywhere in the visible code;
# extract_titles() is called below with its default max_num instead.
num_titles = 500000  # the (max) number of titles to load


def load_gzip_file(url):
    """Download a gzip archive and return a file object that decompresses it.

    The complete response body is read into memory before this function
    returns; the returned ``gzip.GzipFile`` then decompresses lazily from
    that in-memory buffer.

    Args:
        url: Address of the gzip file, passed to ``urllib.request.urlopen``.

    Returns:
        A ``gzip.GzipFile`` yielding the decompressed bytes.
    """
    # Use the response as a context manager so the connection is closed as
    # soon as the payload has been read instead of whenever it gets
    # garbage-collected.
    with urllib.request.urlopen(url) as response:
        compressed_file = io.BytesIO(response.read())
    return gzip.GzipFile(fileobj=compressed_file)

def extract_titles(input_file, max_num=40000):
    """Extract title and publication year of dblp papers, given as input file.

    ``input_file`` is iterated line by line and every line is decoded from
    UTF-8 bytes. A ``<title>`` line whose title has at least three
    space-separated words is remembered and paired with the next ``<year>``
    line that follows it; title lines seen while a title is pending are
    ignored.

    The pairs are divided into 3 time periods: before 1990, from 1990 to 2009,
    and from 2010 onwards. At most ``max_num`` pairs are collected per time
    period, and scanning stops as soon as every period holds ``max_num``
    pairs.

    Args:
        input_file: Iterable of ``bytes`` lines (e.g. a ``gzip.GzipFile``).
        max_num: Maximum number of (title, year) pairs to keep per period.

    Returns:
        A 3-tuple ``(pairs_before_1990, pairs_from_1990_to_2009,
        pairs_from_2010)``; each element is a list of ``(title, year)``
        tuples with ``year`` as ``int``.

    Raises:
        ValueError: If the content of a ``<year>`` element is not an integer.
    """
    # Compile once; the dump has millions of lines.
    title_pattern = re.compile(r'<title>(.*)</title>')
    year_pattern = re.compile(r'<year>(.*)</year>')

    pairs_before_1990 = []
    pairs_from_1990_to_2009 = []
    pairs_from_2010 = []
    all_periods = (pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010)

    title = None  # title that is still waiting for its <year> line
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')

        if title is None:
            # we have no title and search for one
            result = title_pattern.search(line_str)
            # only include titles with at least three words
            if result and len(result.group(1).split(' ')) >= 3:
                title = result.group(1)
            continue

        # we have a title and check for the corresponding year
        year_result = year_pattern.search(line_str)
        if not year_result:
            continue

        year = int(year_result.group(1))
        if year < 1990:
            period = pairs_before_1990
        elif year < 2010:
            period = pairs_from_1990_to_2009
        else:
            period = pairs_from_2010

        # Enforce the documented per-period limit. Previously a period kept
        # growing past max_num while the slowest period was still filling up.
        if len(period) < max_num:
            period.append((title, year))
        title = None

        if all(len(pairs) >= max_num for pairs in all_periods):
            break

    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    """Write ``pairs`` to ``file_path`` as CSV, one pair per row.

    An existing file at ``file_path`` is overwritten.

    Args:
        pairs: Iterable of rows, e.g. ``(title, year)`` tuples.
        file_path: Destination path of the CSV file.
    """
    # The csv module documentation requires newline='' so that the writer
    # controls the line endings itself; without it, platforms that translate
    # '\n' on write end up with '\r\r\n' row terminators.
    with open(file_path, 'w', newline='') as fout:
        csv.writer(fout).writerows(pairs)

# Download the dump, split the (title, year) pairs into the three periods
# (extract_titles() runs with its default max_num here) and write one CSV
# file per period.
in_file = load_gzip_file(url)
pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
save_data(pairs_before_1990, path_before_1990)
save_data(pairs_from_1990_to_2009, path_from_1990_to_2009)
save_data(pairs_from_2010, path_from_2010)


Mounted at /content/drive


13531631it [00:37, 356913.18it/s]


## 2. Topic Modeling
### 2.1 Using Latent Dirichlet Allocation (LDA)

In [None]:
def load_titles(path):
    """Return the titles stored in the CSV file at ``path``.

    The title is taken from the first column of every row; further columns
    (the publication year written by ``save_data``) are ignored.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A list with one title string per non-empty row.
    """
    # newline='' is what the csv module documentation asks for when reading.
    with open(path, newline='') as fin:
        # Blank lines are yielded as empty rows by csv.reader; skip them
        # instead of failing with an IndexError on row[0].
        return [row[0] for row in csv.reader(fin) if row]

In [None]:
# Simple text preprocessing by removing 
# all letters which are not in roman alphabet
def preprocess_text(text):
    """Delete every character that is not an ASCII letter or a space, then lowercase.

    Characters are removed, not replaced, so words joined by punctuation are
    merged (e.g. 'Fixed-Point' becomes 'fixedpoint').
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z ]', '', text)
    return letters_and_spaces.lower()

In [None]:
# Now we turn the documents (or titles in this case) into a matrix feature representation.
def vectorize_data(titles, max_df=MAX_DF, min_df=MIN_DF, max_features=NUM_FEATURES):
    """Fit a ``CountVectorizer`` (English stop words) on ``titles``.

    Returns a pair: the matrix produced by ``fit_transform(titles)`` and the
    feature names reported by ``get_feature_names_out()``.
    """
    vectorizer = CountVectorizer(
        stop_words='english',
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
    )
    term_counts = vectorizer.fit_transform(titles)
    return term_counts, vectorizer.get_feature_names_out()

#### 2.1.1 - Before the 1990s:

In [None]:
# Read the pre-1990 titles from disk and report how many were loaded
titles_before_1990 = load_titles(path_before_1990)
count_before_1990 = len(titles_before_1990)
print("{} titles before 1990".format(count_before_1990))

40000 titles before 1990


In [None]:
# Show some random samples
random.sample(titles_before_1990, 10)

['Notes from the Vice Chairperson.',
 'Implications of holography for information systems.',
 'Designing Networks with Compact Routing Tables.',
 'Clausal Intuitionistic Logic I - Fixed-Point Semantics.',
 'Balanced extensions of graphs and hypergraphs.',
 'Extraction of chemical reaction information from primary journal text using computational linguistics techniques. 1. Lexical and syntactic phases.',
 'A Message in Cipher Written by General Cornwallis during the Revolutionary War.',
 'Discriminant analysis with a stochastic supervisor.',
 'R68-42 On Designing Generalized File Records for Management Information Systems.',
 'Teachware development for education in CAD.']

In [None]:
# Clean every pre-1990 title with preprocess_text()
preprocessed_titles_before_1990 = list(map(preprocess_text, titles_before_1990))

In [None]:
# Show some preprocessed samples
random.sample(preprocessed_titles_before_1990, 10)

['electronic scanners with speech output  a communication system for the physically handicapped and mentally retarded',
 'programmable industrial automation',
 'precise scientific computation with a microprocessor',
 'application of phasecontrast metallography in a production laboratory',
 'd reconstruction of the blood vessels of the brain from a stereoscopic pair of subtraction angiograms',
 'reviews and things cryptologic',
 'note on the iei',
 'goumldels proof and the liar paradox',
 'determination of transient response of a drift transistor using the diffusion equation',
 'mosaic models for imagesii geometric properties of components in coverage mosaics']

In [None]:
# Vectorize the preprocessed pre-1990 titles; max_df=0.95 overrides the
# MAX_DF default of vectorize_data().
tf_01, tf_feature_names_01 = vectorize_data(preprocessed_titles_before_1990, max_df=0.95, min_df=0.01)

In [None]:
# Fit an online LDA model on the pre-1990 term counts. The topic count comes
# from the NUM_LDA_TOPICS constant rather than a hard-coded 8.
lda_01 = LatentDirichletAllocation(
    n_components=NUM_LDA_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=42,
).fit(tf_01)

In [None]:
# Print, per topic, the 12 feature names with the largest values in that
# topic's row of lda_01.components_ (largest first)
for topic_idx, weights in enumerate(lda_01.components_):
    top_terms = [tf_feature_names_01[j] for j in weights.argsort()[::-1][:12]]
    print(f'Topic {topic_idx}:', ' '.join(top_terms))

Topic 0: theory problems algorithms simulation decision parallel application solution applications optimal digital control
Topic 1: computer logic model programs digital performance design networks using applications systems simulation
Topic 2: problem programming optimal language digital processing software research solution parallel linear control
Topic 3: data method network models application languages solution processing problem using analysis programming
Topic 4: note information linear functions applications finite technical programming systems time problem decision
Topic 5: algorithm design analysis approach sets performance new using implementation parallel linear digital
Topic 6: systems using parallel performance implementation decision distributed linear control digital design processing
Topic 7: control networks new recognition distributed time pattern optimal systems approach digital linear


Topics:
0. Graph/networks algorithms (seems to be mostly about algorithms that (maybe) operate on graphs/networks)
1. pattern recognition (and maybe robotics)
2. ...

#### 2.1.2 - From 1990 to 2009:

In [None]:
# Read the 1990-2009 titles from disk and report how many were loaded
titles_from_1990_to_2009 = load_titles(path_from_1990_to_2009)
count_from_1990_to_2009 = len(titles_from_1990_to_2009)
print("{} titles from 1990 to 2009".format(count_from_1990_to_2009))

327307 titles from 1990 to 2009


In [None]:
random.sample(titles_from_1990_to_2009, 10)

['Reinventing Academic Publishing Online. Part I: Rigor, Relevance and Practice.',
 'A fuzzy-logic architecture for autonomous multisensor data fusion.',
 '3-D finite element analysis of induction logging in a dipping formation mark.',
 'SVM-based feature selection of latent semantic features.',
 'An extensive comparison of recent classification tools applied to microarray data.',
 'Throughput Analysis of TCP-Friendly Rate Control in Mobile Hotspots.',
 'Unequivocal majority and Maskin-monotonicity.',
 'A new approach to the verification of chinese signatures with variant orientations and scales using relaxation and state-space search methods.',
 'Algebraic Matching Theory.',
 'Information systems evaluation and the information systems development process.']

In [None]:
# Clean every 1990-2009 title with preprocess_text()
preprocessed_titles_from_1990_to_2009 = list(map(preprocess_text, titles_from_1990_to_2009))

In [None]:
random.sample(preprocessed_titles_from_1990_to_2009, 10)

['cue validity modulates the neural correlates of covert endogenous orienting of attention in parietal and frontal cortex',
 'abstracting soft constraints framework properties examples',
 'a comment on the severity of the effects of nonwhite noise in fmri timeseries',
 'agent hell a scenario of worst practices',
 'the evolution of sdh a view from telecom new zealand',
 'accurate d image colour histogram transformation',
 'solution of a semicoercive contact problem in a nonlinear thermoelastic rheology',
 'an operadic approach to internal structures',
 'layoff costs and underutilization of labour in fisheries',
 'spectrum sensing architecture and use case study distributed sensing over rayleigh fading channels']

In [None]:
# Vectorize the *preprocessed* titles, as is done for the other two periods
# (the raw titles were passed here before).
tf_02, tf_feature_names_02 = vectorize_data(preprocessed_titles_from_1990_to_2009, max_df=0.95, min_df=0.01)

In [None]:
# Fit an online LDA model on the 1990-2009 term counts. The topic count comes
# from the NUM_LDA_TOPICS constant rather than a hard-coded 8.
lda_02 = LatentDirichletAllocation(
    n_components=NUM_LDA_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=42,
).fit(tf_02)

In [None]:
# Print, per topic, the 12 feature names with the largest values in that
# topic's row of lda_02.components_ (largest first)
for topic_idx, weights in enumerate(lda_02.components_):
    top_terms = [tf_feature_names_02[j] for j in weights.argsort()[::-1][:12]]
    print(f'Topic {topic_idx}:', ' '.join(top_terms))

Topic 0: time algorithm linear new network models algorithms efficient high robust equations management
Topic 1: method study problems evaluation case space programming equations finite linear new performance
Topic 2: design approach nonlinear optimal fuzzy modeling computing robust control equations new time
Topic 3: based control model methods computer robust time linear simulation network detection dynamic
Topic 4: using analysis networks performance problem multi dynamic neural wireless mobile recognition network
Topic 5: systems data information multiple digital linear robust management time control nonlinear analysis
Topic 6: adaptive application structure non theory knowledge scheme management robust linear control finite
Topic 7: learning estimation applications order image distributed graphs web software power real development


#### 2.1.3 - From 2010 onwards:

In [None]:
# Load the titles
titles_from_2010 = load_titles(path_from_2010)
# Message no longer contains the duplicated word ("from from")
print("{} titles from 2010".format(len(titles_from_2010)))

720322 titles from from 2010


In [None]:
# Show some random samples
random.sample(titles_from_2010, 10)

['Power Sensitivity Analysis of Multi-Frequency, Multi-Polarized, Multi-Temporal SAR Data for Soil-Vegetation System Variables Characterization.',
 'Schneier on Security: Privacy and Control.',
 'Stability and implementation of a cycle-based max pressure controller for signalized traffic networks.',
 'Modeling progressive mesh streaming: Does data dependency matter?',
 'High order weighted essentially non-oscillatory WENO-Z schemes for hyperbolic conservation laws.',
 'Wireless Information and Power Transfer in Multiway Massive MIMO Relay Networks.',
 'Investigating Statistical Privacy Frameworks from the Perspective of Hypothesis Testing.',
 'A complete 3D simulation of a crystallization process induced by supercritical CO<sub>2</sub> to predict particle size.',
 'Test Architecture for Systolic Array of Edge-Based AI Accelerator.',
 'Processing Speech and Thoughts during Silent Reading: Direct Reference Effects for Speech by Fictional Characters in Voice-Selective Auditory Cortex and 

In [None]:
# Clean every title from 2010 onwards with preprocess_text()
preprocessed_titles_from_2010 = list(map(preprocess_text, titles_from_2010))

In [None]:
# Vectorize the preprocessed titles from 2010 onwards; max_df=0.95 overrides
# the MAX_DF default of vectorize_data().
tf_03, tf_feature_names_03 = vectorize_data(preprocessed_titles_from_2010, max_df=0.95, min_df=0.01)

In [None]:
# Fit an online LDA model on the term counts from 2010 onwards. The topic
# count comes from the NUM_LDA_TOPICS constant rather than a hard-coded 8.
lda_03 = LatentDirichletAllocation(
    n_components=NUM_LDA_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=42,
).fit(tf_03)

In [None]:
# Print, per topic, the 12 feature names with the largest values in that
# topic's row of lda_03.components_ (largest first)
for topic_idx, weights in enumerate(lda_03.components_):
    top_terms = [tf_feature_names_03[j] for j in weights.argsort()[::-1][:12]]
    print(f'Topic {topic_idx}:', ' '.join(top_terms))

Topic 0: systems networks wireless performance novel evaluation sensing sensor communication linear distributed power
Topic 1: based detection design linear mobile recognition prediction computing images cloud feature method
Topic 2: data method network optimization application applications modeling equations methods hybrid cloud problems
Topic 3: adaptive nonlinear information models framework energy deep management social scheduling systems tracking
Topic 4: control model estimation robust sensor selection tracking systems linear nonlinear stochastic distributed
Topic 5: learning neural efficient optimal distributed scheme problem multiple problems stochastic machine deep
Topic 6: approach algorithm study new time algorithms online tracking case improved research optimization
Topic 7: using analysis image dynamic power classification fuzzy communication machine feature learning selection


# Combined Topic Models

New method developed by [Bianchi et al. 2021](https://aclanthology.org/2021.acl-short.96/). 

[A 6min presentation of the paper by one of the authors.](https://underline.io/lecture/25716-pre-training-is-a-hot-topic-contextualized-document-embeddings-improve-topic-coherence)

Code: [https://github.com/MilaNLProc/contextualized-topic-models](https://github.com/MilaNLProc/contextualized-topic-models)

Tutorial: [https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing](https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing)

Again, perform topic modelling for the three time periods - this time using the combined topic models (CTMs). 

You can use and adapt the code from the tutorial linked above.

Use the available GPU for faster running times.

In [None]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing

num_ctm_topics = 8  # you can also choose a higher number of topics

In [None]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# def lemmatization(sentence):
#   words = sentence.split()
#   lem_words = [WordNetLemmatizer().lemmatize(i) for i in words]
#   new_sentence = ' '.join(lem_words)
#   return new_sentence

def load_text(text):
    """Run ``WhiteSpacePreprocessing`` (English stop-word language) over ``text``.

    Returns the three values produced by ``preprocess()``, in order: the
    preprocessed documents, the unpreprocessed corpus and the vocabulary.
    No lemmatization is applied.
    """
    preprocessor = WhiteSpacePreprocessing(text, stopwords_language='english')
    bow_documents, raw_corpus, vocabulary = preprocessor.preprocess()
    return bow_documents, raw_corpus, vocabulary

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
all_titles = titles_before_1990 + titles_from_1990_to_2009 + titles_from_2010

In [None]:
print("the length of the list all_titles is {}".format(len(all_titles)))

the length of the list all_titles is 1087629


### Before the 1990s:

In [None]:
#check the processed data from part 1
preprocessed_titles_before_1990[0:5]

['object model capabilities for distributed object management',
 'distributed object management technology',
 'muffin a distributed database machine',
 'algebraical optimization of ftaexpressions',
 'wissensrepraumlsentation und maschinelles lernen']

In [None]:
# Run load_text() (WhiteSpacePreprocessing with English stop words; no
# lemmatization is applied, that helper is commented out) and keep the
# preprocessed documents, the unpreprocessed corpus and the vocabulary.
# Note that the input is already the output of preprocess_text().
preprocessed_documents_before_1990, unpreprocessed_corpus_before_1990, vocab_before_1990 = load_text(preprocessed_titles_before_1990)



In [None]:
# The contextual (embedding) input passed to tp.fit() has to be the same
# documents, in the same order, as the bag-of-words input. Use the corpus
# returned by load_text() rather than an unrelated random sample of all titles.
contextualized_txt_01 = unpreprocessed_corpus_before_1990

In [None]:
# Build the CTM training set: contextualized_txt_01 is the text for the
# contextual embeddings, the preprocessed documents are the bag-of-words text.
# NOTE: the encoder used here is "multi-qa-mpnet-base-cos-v1", not
# paraphrase-distilroberta-base-v1.
tp = TopicModelDataPreparation("multi-qa-mpnet-base-cos-v1")
training_dataset = tp.fit(text_for_contextual=contextualized_txt_01, text_for_bow=preprocessed_documents_before_1990)

Batches:   0%|          | 0/197 [00:00<?, ?it/s]



In [None]:
# Train a CombinedTM with num_ctm_topics topics for 20 epochs on the pre-1990 data.
# NOTE(review): contextual_size=768 presumably has to equal the embedding size
# of the encoder chosen above; confirm if the encoder is changed.
ctm1 = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=20)
ctm1.fit(training_dataset) # run the model

Epoch: [20/20]	 Seen Samples: [787640/787640]	Train Loss: 32.876268093633556	Time: 0:00:05.907914: : 20it [01:57,  5.89s/it]


In [None]:
# Fetch the top-10 word lists once instead of once per loop iteration, and
# iterate over however many topics the model has rather than a hard-coded 8.
for i, topic_words in enumerate(ctm1.get_topic_lists(10)):
    print(f'Topic {i}:', end=' ')
    print(' '.join(topic_words))

Topic 0: computer review introduction research software network new simulation graphics computers
Topic 1: note problem problems technical programming solution editor letter optimal scheduling
Topic 2: system data design information distributed processing language database management expert
Topic 3: algorithm parallel algorithms sequential binary fast machines using circuits matrix
Topic 4: sets der graphs und von automata finite zur properties ein
Topic 5: control analysis systems recognition pattern adaptive using model linear optimal
Topic 6: timing vehicle augmented biological multiprogramming combined priorities texts references usage
Topic 7: logic theory de languages theorem modal propositional semantics calculus set


### From 1990 to 2009

In [None]:
#check the processed data from part 1
preprocessed_titles_from_1990_to_2009[0:5]

['an evaluation of objectoriented dbms developments  edition',
 'darwin on the incremental migration of legacy information systems',
 'integrating heterogeneous autonomous distributed applications using the dom prototype',
 'integrating objectoriented applications and middleware with relational databases',
 'towards a transaction management system for dom']

In [None]:
# Run load_text() (WhiteSpacePreprocessing with English stop words; no
# lemmatization is applied, that helper is commented out) and keep the
# preprocessed documents, the unpreprocessed corpus and the vocabulary.
# Note that the input is already the output of preprocess_text().
preprocessed_documents_from_1990_to_2009, unpreprocessed_corpus_from_1990_to_2009, vocab_from_1990_to_2009 = load_text(preprocessed_titles_from_1990_to_2009)



In [None]:
# The contextual (embedding) input passed to tp.fit() has to be the same
# documents, in the same order, as the bag-of-words input. Use the corpus
# returned by load_text() rather than an unrelated random sample of all titles.
contextualized_txt_02 = unpreprocessed_corpus_from_1990_to_2009

In [None]:
# Build the CTM training set for 1990-2009 with the
# "multi-qa-mpnet-base-cos-v1" encoder: contextualized_txt_02 is the text for
# the contextual embeddings, the preprocessed documents are the bag-of-words
# text. `tp` and `training_dataset` are rebound for this period.
tp = TopicModelDataPreparation("multi-qa-mpnet-base-cos-v1")
training_dataset = tp.fit(text_for_contextual=contextualized_txt_02, text_for_bow=preprocessed_documents_from_1990_to_2009)

Batches:   0%|          | 0/1613 [00:00<?, ?it/s]

0it [2:00:50, ?it/s]


In [None]:
# Train a CombinedTM with num_ctm_topics topics for 20 epochs on the 1990-2009 data.
# NOTE(review): contextual_size=768 presumably has to equal the embedding size
# of the encoder chosen above; confirm if the encoder is changed.
ctm2 = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=20)
ctm2.fit(training_dataset) # run the model

Epoch: [20/20]	 Seen Samples: [6451020/6451020]	Train Loss: 37.61339285155257	Time: 0:00:45.557992: : 20it [15:10, 45.52s/it]


In [None]:
# Fetch the top-10 word lists once instead of once per loop iteration, and
# iterate over however many topics the model has rather than a hard-coded 8.
for i, topic_words in enumerate(ctm2.get_topic_lists(10)):
    print(f'Topic {i}:', end=' ')
    print(' '.join(topic_words))

Topic 0: problems problem equations solution finite solutions numerical equation methods method
Topic 1: theoretic rates spatially serial arrival modes simplified utilizing membership window
Topic 2: information development case web knowledge study technology management research electronic
Topic 3: networks wireless performance sensor mobile routing scheduling protocol network traffic
Topic 4: systems control robust stability linear design adaptive uncertain nonlinear output
Topic 5: special issue introduction computer editorial de logic intelligence book language
Topic 6: analysis data model fuzzy neural models molecular approach classification prediction
Topic 7: image images detection recognition segmentation using compression estimation brain speech


### From 2010 onwards

In [None]:
#check the processed data from part 1
preprocessed_titles_from_2010[0:5]

['spectre attacks exploiting speculative execution',
 'computer science curricula ',
 'differences in productivity and impact across the different computer science subareas',
 'klaus tschira stiftung gemeinnuumltzige gmbh kts',
 'catchment classification by runoff behaviour with selforganizing maps som']

In [None]:
# Run load_text() (WhiteSpacePreprocessing with English stop words; no
# lemmatization is applied, that helper is commented out) and keep the
# preprocessed documents, the unpreprocessed corpus and the vocabulary.
# Note that the input is already the output of preprocess_text().
preprocessed_documents_from_2010, unpreprocessed_corpus_from_2010, vocab_from_2010 = load_text(preprocessed_titles_from_2010)



In [None]:
# The contextual (embedding) input passed to tp.fit() has to be the same
# documents, in the same order, as the bag-of-words input. Use the corpus
# returned by load_text() rather than an unrelated random sample of all titles.
contextualized_txt_03 = unpreprocessed_corpus_from_2010

In [None]:
# Build the CTM training set for 2010 onwards with the
# "multi-qa-mpnet-base-cos-v1" encoder: contextualized_txt_03 is the text for
# the contextual embeddings, the preprocessed documents are the bag-of-words
# text. `tp` and `training_dataset` are rebound for this period.
tp = TopicModelDataPreparation("multi-qa-mpnet-base-cos-v1")
training_dataset = tp.fit(text_for_contextual=contextualized_txt_03, text_for_bow=preprocessed_documents_from_2010)

Batches:   0%|          | 0/3567 [00:00<?, ?it/s]



In [None]:
# Train a CombinedTM with num_ctm_topics topics for 20 epochs on the data from 2010 onwards.
# NOTE(review): contextual_size=768 presumably has to equal the embedding size
# of the encoder chosen above; confirm if the encoder is changed.
ctm3 = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_ctm_topics, num_epochs=20)
ctm3.fit(training_dataset) # run the model

Epoch: [20/20]	 Seen Samples: [14267720/14267720]	Train Loss: 44.330207667517456	Time: 0:01:39.785597: : 20it [33:26, 100.34s/it]


In [None]:
# Fetch the top-10 word lists once instead of once per loop iteration, and
# iterate over however many topics the model has rather than a hard-coded 8.
for i, topic_words in enumerate(ctm3.get_topic_lists(10)):
    print(f'Topic {i}:', end=' ')
    print(' '.join(topic_words))

Topic 0: adjustment selforganizing window nested simplified weighting train progressive optimum cross
Topic 1: analysis surface data imaging land models temperature using water estimation
Topic 2: optimization fuzzy algorithm multiobjective problem decision swarm approach genetic evolutionary
Topic 3: image deep learning detection recognition classification neural feature segmentation images
Topic 4: networks wireless sensor power energy cognitive cellular allocation interference spectrum
Topic 5: nonlinear systems control linear equations class boundary equation stability fractional
Topic 6: computing cloud smart applications internet special things issue autonomous security
Topic 7: social information online knowledge media case role technology factors review


### Summary 

Now, for the three periods, we have the topics generated by both LDA and the Contextualized Topic Model (CTM). We now assess whether the CTM topics really are more coherent than the topics generated by LDA. How to quantify coherence is a long-standing open question; it can be measured in numerous ways, see e.g. [Lau et al., 2014; Röder et al., 2015](https://dl.acm.org/doi/10.1145/2684822.2685324). In this assignment, simple human inspection is enough to tell that the topics generated by CTM are more coherent.

Assign a name to each topic based on the topic's top words:

topics before 1990:

In [None]:
# Fetch the top-20 word lists once instead of once per loop iteration, and
# iterate over however many topics the model has rather than a hard-coded 8.
for i, topic_words in enumerate(ctm1.get_topic_lists(20)):
    print(f'Topic {i}:', end=' ')
    print(' '.join(topic_words))

Topic 0: computer review introduction research software network new simulation graphics computers technology artificial book science report intelligence operations architecture communication local
Topic 1: note problem problems technical programming solution editor letter optimal scheduling networks solving times dynamic queues linear queue decision allocation integer
Topic 2: system data design information distributed processing language database management expert structures chemical retrieval development interactive base systems online knowledge structure
Topic 3: algorithm parallel algorithms sequential binary fast machines using circuits matrix detection method trees fault switching tree search efficient computing transform
Topic 4: sets der graphs und von automata finite zur properties ein fuumlr boolean recursive uumlber die degrees automaten arithmetic eine relations
Topic 5: control analysis systems recognition pattern adaptive using model linear optimal approach application es

We can see that in this period, some topics generated by the CTM are still not coherent enough, or too ambiguous, to tell what the topic is. For example, topic 0 contains the keywords "network", "architecture" and "operations" (lower-level concepts), but also "software". For most other topics, however, it is clear what the keywords are about. For example, topic 2 contains "data", "system", "database" and "management", so this topic is very likely "Database System". Other topics, such as "parallel computing" and "graph theory", are also very clear. The last topic is interesting: it has keywords like "languages", "logic", "semantics", "symbolic" and "grammar", so we call it "Computational Linguistics".

In [None]:
# Hand-assigned names for the pre-1990 CTM topics, in topic order
assigned_topic1=['Computer Network','Programming','Database System','Parallel Computing','Graph Theory','Control System','Multiprogramming','Computational Linguistics']
# Pair every name with the topic's top-5 words. The word lists are fetched
# once instead of once per iteration, and no topic count is hard-coded.
for number, (label, top_words) in enumerate(zip(assigned_topic1, ctm1.get_topic_lists(5)), start=1):
    print(str(number)+' '+label, end=': \n')
    print(' '.join(top_words))
    print('-----------------------------------------------')

1 Computer Network: 
computer review introduction research software
-----------------------------------------------
2 Programming: 
note problem problems technical programming
-----------------------------------------------
3 Database System: 
system data design information distributed
-----------------------------------------------
4 Parallel Computing: 
algorithm parallel algorithms sequential binary
-----------------------------------------------
5 Graph Theory: 
sets der graphs und von
-----------------------------------------------
6 Control System: 
control analysis systems recognition pattern
-----------------------------------------------
7 Multiprogramming: 
timing vehicle augmented biological multiprogramming
-----------------------------------------------
8 Computational Linguistics: 
logic theory de languages theorem
-----------------------------------------------


topics from 1990 to 2009

In [None]:
# Fetch the top-20 word lists once instead of once per loop iteration, and
# iterate over however many topics the model has rather than a hard-coded 8.
for i, topic_words in enumerate(ctm2.get_topic_lists(20)):
    print(f'Topic {i}:', end=' ')
    print(' '.join(topic_words))

Topic 0: problems problem equations solution finite solutions numerical equation methods method solving order differential approximation generalized boundary functions convergence difference value
Topic 1: theoretic rates spatially serial arrival modes simplified utilizing membership window composite cross underwater stage equivalent absolute replacement variations various transient
Topic 2: information development case web knowledge study technology management research electronic system support collaborative paper software health business framework process online
Topic 3: networks wireless performance sensor mobile routing scheduling protocol network traffic scheme access power distributed qos allocation atm dynamic communication packet
Topic 4: systems control robust stability linear design adaptive uncertain nonlinear output feedback controller stabilization optimal discretetime state timevarying controllers approach delays
Topic 5: special issue introduction computer editorial de l

Except for topic 0 and topic 5, the topics are easy to name. For example, topic 7 includes many keywords related to computer vision, such as "image", "detection" and "segmentation". Topic 3 includes many keywords related to computer networks, such as "networks", "protocol" and "traffic".

In [None]:
# Hand-assigned names for the 1990-2009 CTM topics, in topic order
assigned_topic1=['Programming','Memory Management','Internet technology','Computer Networks','Control System','Digitalization','Neural Networks','Computer Vision']
# Pair every name with the topic's top-5 words. The word lists are fetched
# once instead of once per iteration, and no topic count is hard-coded.
for number, (label, top_words) in enumerate(zip(assigned_topic1, ctm2.get_topic_lists(5)), start=1):
    print(str(number)+' '+label, end=': \n')
    print(' '.join(top_words))
    print('-----------------------------------------------')

1 Programming: 
problems problem equations solution finite
-----------------------------------------------
2 Memory Management: 
theoretic rates spatially serial arrival
-----------------------------------------------
3 Internet technology: 
information development case web knowledge
-----------------------------------------------
4 Computer Networks: 
networks wireless performance sensor mobile
-----------------------------------------------
5 Control System: 
systems control robust stability linear
-----------------------------------------------
6 Digitalization: 
special issue introduction computer editorial
-----------------------------------------------
7 Neural Networks: 
analysis data model fuzzy neural
-----------------------------------------------
8 Computer Vision: 
image images detection recognition segmentation
-----------------------------------------------


topics from 2010 onwards

In [None]:
# Fetch the top-20 word lists once instead of once per loop iteration, and
# iterate over however many topics the model has rather than a hard-coded 8.
for i, topic_words in enumerate(ctm3.get_topic_lists(20)):
    print(f'Topic {i}:', end=' ')
    print(' '.join(topic_words))

Topic 0: adjustment selforganizing window nested simplified weighting train progressive optimum cross partition overlapping fractal impulse employing pairwise multi redundancy polar transformer
Topic 1: analysis surface data imaging land models temperature using water estimation series forest satellite mapping modeling soil radar comparison field cover
Topic 2: optimization fuzzy algorithm multiobjective problem decision swarm approach genetic evolutionary model hybrid particle new problems selection search programming set algorithms
Topic 3: image deep learning detection recognition classification neural feature segmentation images representation network face convolutional object features based sparse machine fusion
Topic 4: networks wireless sensor power energy cognitive cellular allocation interference spectrum channels channel radio protocol relay massive mimo ad cooperative communication
Topic 5: nonlinear systems control linear equations class boundary equation stability fraction

Except for topic 0, topic 2 and topic 7, the topics are easy to name. For example, topic 3 includes many keywords related to deep learning, such as "deep", "learning", "classification" and "convolutional". Topic 6 includes many keywords related to cloud computing, such as "cloud", "iot" and "security".

In [None]:
# Hand-assigned names for the CTM topics from 2010 onwards, in topic order
# (the first entry is an empty string, i.e. topic 0 has no name).
assigned_topic1=['','Big Data','Optimization','Deep Learning','Networks Science','Control System','Cloud Computing','Knowledge Discovery']
# Pair every name with the topic's top-5 words. The word lists are fetched
# once instead of once per iteration, and no topic count is hard-coded.
for number, (label, top_words) in enumerate(zip(assigned_topic1, ctm3.get_topic_lists(5)), start=1):
    print(str(number)+' '+label, end=': \n')
    print(' '.join(top_words))
    print('-----------------------------------------------')

1 : 
adjustment selforganizing window nested simplified
-----------------------------------------------
2 Big Data: 
analysis surface data imaging land
-----------------------------------------------
3 Optimization: 
optimization fuzzy algorithm multiobjective problem
-----------------------------------------------
4 Deep Learning: 
image deep learning detection recognition
-----------------------------------------------
5 Networks Science: 
networks wireless sensor power energy
-----------------------------------------------
6 Control System: 
nonlinear systems control linear equations
-----------------------------------------------
7 Cloud Computing: 
computing cloud smart applications internet
-----------------------------------------------
8 Knowledge Discovery: 
social information online knowledge media
-----------------------------------------------
