In [1]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
from nltk.corpus import stopwords as stop_words

# nltk.download('stopwords')

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark import SparkContext, SparkConf
import pyLDAvis as vis
import numpy as np
from random import sample
import pandas as pd


from contextualized_tm.subset import get_data_sample

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
DATA_PATH = '../../../notebooks/MADE/data.parquet'  # Input path to dataset in parquet format

In [4]:
spark = SparkSession.builder.appName("CombinedTM").getOrCreate()

In [6]:
ids, abstracts = get_data_sample(spark=spark, data_path=DATA_PATH, sample_frac=0.01)

In [7]:
documents = [line.strip() for line in abstracts]

stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

In [8]:
len(preprocessed_documents)

29396

In [9]:
tp = TopicModelDataPreparation("allenai/scibert_scivocab_uncased")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Some weights of the model checkpoint at /home/ippk93/.cache/torch/sentence_transformers/allenai_scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/147 [00:00<?, ?it/s]

In [10]:
tp.vocab[:10]

['ability',
 'able',
 'abstract',
 'abstraction',
 'academic',
 'access',
 'according',
 'account',
 'accuracy',
 'accurate']

In [11]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=40, num_epochs=10)
ctm.fit(training_dataset) # run the model

Epoch: [10/10]	 Seen Samples: [293960/293960]	Train Loss: 460.74953352829147	Time: 0:00:04.170146: : 10it [00:38,  3.87s/it]
Sampling: [20/20]: : 20it [00:44,  2.25s/it]


In [12]:
# Uncomment if you want to save the model
# ctm.save(models_dir="./")

In [13]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=10)

Sampling: [10/10]: : 10it [00:22,  2.25s/it]


In [14]:
ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

In [15]:
topics_predictions = ctm.get_thetas(training_dataset, n_samples=5) # get all the topic predictions

Sampling: [5/5]: : 5it [00:10,  2.14s/it]


In [16]:
topics_list = []
for i in range(len(topics_predictions)):
    topic_number = np.argmax(topics_predictions[i])
    topics = sample(ctm.get_topic_lists()[topic_number], 5)
    topics_list.append(topics)
# topic_number = np.argmax(topics_predictions[0]) # get the topic id of the first document

In [17]:
id_tags = [{"old_id": ids[i], "tags": topics_list[i]} for i in range(len(topics_list))]

In [18]:
df = pd.DataFrame(id_tags)
df.head()

Unnamed: 0,old_id,tags
0,556faa4a2401b4b38c2376e1,"[element, subset, edge, prove, spaces]"
1,556faa942401b4b38c2376f5,"[systems, nonlinear, proposed, control, contro..."
2,556fb9392401b4b38c237b0c,"[surface, segmentation, using, object, images]"
3,556fb9842401b4b38c237b20,"[method, design, proposed, nonlinear, systems]"
4,556fbb492401b4b38c237ba7,"[visual, based, face, detection, using]"


In [19]:
ctm.get_topic_lists()[0]

['software',
 'development',
 'systems',
 'design',
 'engineering',
 'process',
 'approach',
 'paper',
 'system',
 'language']

In [22]:
ctm.get_topic_lists(5)[5] 

['technologies', 'community', 'project', 'personal', 'internet']

In [24]:
print(unpreprocessed_corpus[5])

Due to economic and ecological reasons the energy consumption of consumer electronics products like TVs and set-top boxes (STBs) is of increasing importance. As nowadays also Personal Video Recorders (PVRs) are widely used, the question is raised, how time-shifted TV affects the energy use of such devices. In this paper a framework for analyzing the impact of recording and later playback of TV content on the energy consumption of a PVR is presented. The underlying method allows taking the viewing habits of users into account. The framework is used to analyze the energy consumption of a recommender-based PVR that, based on personalized recommendations, automatically records TV content for the user. Further, a study of an approach for an energy efficient PVR design, which allows deactivating peripheral PVR components when not needed, is analyzed w/r/t potential energy savings.


### Save

In [164]:
# Uncomment if you want to save the data

# import fastparquet
# fastparquet.write('id_tags.parquet', df, file_scheme='hive', write_index=False)

In [165]:
# pf = fastparquet.ParquetFile('id_tags.parquet', )
# pf.to_pandas(index=False)

Unnamed: 0,old_id,tags
0,556f9f9e2401b4b38c2373f0,"[video, bit, compression, coding, proposed]"
1,556fa3b02401b4b38c237522,"[mutual, confidence, redundant, phases, window]"
2,556fa54a2401b4b38c23758c,"[robots, robot, human, force, trajectory]"
3,556fa13b2401b4b38c23746d,"[camera, robots, motion, position, vehicle]"
4,556fa5c72401b4b38c2375b7,"[facial, deep, features, trained, convolutional]"
...,...,...
437994,6051d8e69e795eb49a3cb838,"[real, model, series, time, algorithm]"
437995,6054328e9e795e40330e1fa2,"[texture, object, image, images, scene]"
437996,60545df19e795e4033115a30,"[smart, systems, sensor, iot, monitoring]"
437997,6058404b9e795e4ac8d1766d,"[graphs, connected, every, edge, cycles]"


### Evaluation

In [39]:
from contextualized_topic_models.evaluation.measures import CoherenceUMASS, CoherenceNPMI

texts = [doc.split() for doc in unpreprocessed_corpus]

umass = CoherenceUMASS(texts=texts, topics=ctm.get_topic_lists(10))
npmi = CoherenceNPMI(texts=texts, topics=ctm.get_topic_lists(10))
umass.score(), npmi.score()

(-3.196950714625764, 0.019660956053620143)