In [1]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

from mycolorpy import colorlist as mcp

import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

import pyspark as ps

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

import string

In [2]:
FIXED_JSON_PATH = 'D:\MADE\dblpv13_fixed.json'
N_TOPICS = 20

In [3]:
spark = ps.sql.SparkSession.builder.appName("PySpark for dblpv13").config(key="spark.executor.memory", value="8g").getOrCreate()
df = spark.read.json(FIXED_JSON_PATH)

In [4]:
df_pd = df.to_pandas_on_spark()



In [5]:
abstracts = df_pd[df_pd['abstract'].str.len() > 0]['abstract']

In [6]:
abstracts.head()

1    Drought is the first place in all the natural ...
3    As process variations become a significant pro...
4    360° represents the concerns that are addresse...
8    "2BTextures", a two-movement audio/visual expe...
9    Constructing a system that can cope with a dyn...
Name: abstract, dtype: object

In [7]:
abstracts_np = abstracts.sample(frac=0.05, random_state=47).to_numpy()



In [8]:
stopwords_list = [
    *stopwords.words('english'),
    *stopwords.words('german'),
    'propos', 'u', 'allow',
    'also', 'approach', 'ha',
    'one', 'two', 'three',
    'different', 'upper', 'bound',
    'show', 'based', 'propose',
    'describe', 'present', 'paper',
    'demonstrate', 'result', 'ad',
    'hoc', 'better', 'proposed',
    'commonly', 'used', 'et',
    'al', 'different', "'s",
    'address', 'effectiveness', 'recent',
    'user', "'", 'taking',
    'well', 'known', 'take',
    'showed', 'using', 'high',
    'la', 'http', 'xlink', 'www', 
    'org','w3', 'xmlns', 'mml', 'de',
    'td']

In [9]:
def custom_tokenizer(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    tokens_words = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in tokens_words]
    return lemmatized_words

In [10]:
vectorizer = TfidfVectorizer(stop_words=stopwords_list, max_features=int(0.02 * len(abstracts_np)), 
                             tokenizer=custom_tokenizer)
document_term_matrix = vectorizer.fit_transform(abstracts_np)



In [11]:
lsa_model = TruncatedSVD(n_components=N_TOPICS)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)

In [12]:
def get_top_k_words(k):
    top_words = []
    vocabulary = vectorizer.get_feature_names()
    for i, component in enumerate(lsa_model.components_):
        sorted_words = sorted(zip(vocabulary, component), key= lambda x: x[1], reverse=True)[:k]
        top_words.append([w[0] for w in sorted_words])
    return top_words

In [13]:
top_words = get_top_k_words(10)



In [14]:
for i, words in enumerate(top_words):
    print(f"Topic {i + 1}:", ' '.join(words))

Topic 1: system model method algorithm data network problem image performance information
Topic 2: image algorithm method problem feature graph function set classification segmentation
Topic 3: network algorithm problem node scheme graph channel power wireless control
Topic 4: network image node data sensor feature wireless protocol neural scheme
Topic 5: system control image power channel signal controller method scheme simulation
Topic 6: model network learning neural method parameter prediction fuzzy control training
Topic 7: graph image network g system vertex n edge control model
Topic 8: data graph channel g n scheme signal code vertex power
Topic 9: system data control graph robot controller sensor fuzzy network algorithm
Topic 10: model service image algorithm data system cloud web resource query
Topic 11: method service equation solution problem function web data numerical security
Topic 12: robot control sensor graph motion mobile energy wa device service
Topic 13: service ch

In [15]:
model = TSNE(n_components=2, n_iter=500, verbose=1, random_state=47, n_jobs=12)
result_vectors = model.fit_transform(lsa_topic_matrix)



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 193504 samples in 0.005s...
[t-SNE] Computed neighbors for 193504 samples in 781.536s...
[t-SNE] Computed conditional probabilities for sample 1000 / 193504
[t-SNE] Computed conditional probabilities for sample 2000 / 193504
[t-SNE] Computed conditional probabilities for sample 3000 / 193504
[t-SNE] Computed conditional probabilities for sample 4000 / 193504
[t-SNE] Computed conditional probabilities for sample 5000 / 193504
[t-SNE] Computed conditional probabilities for sample 6000 / 193504
[t-SNE] Computed conditional probabilities for sample 7000 / 193504
[t-SNE] Computed conditional probabilities for sample 8000 / 193504
[t-SNE] Computed conditional probabilities for sample 9000 / 193504
[t-SNE] Computed conditional probabilities for sample 10000 / 193504
[t-SNE] Computed conditional probabilities for sample 11000 / 193504
[t-SNE] Computed conditional probabilities for sample 12000 / 193504
[t-SNE] Computed conditional proba

[t-SNE] Computed conditional probabilities for sample 125000 / 193504
[t-SNE] Computed conditional probabilities for sample 126000 / 193504
[t-SNE] Computed conditional probabilities for sample 127000 / 193504
[t-SNE] Computed conditional probabilities for sample 128000 / 193504
[t-SNE] Computed conditional probabilities for sample 129000 / 193504
[t-SNE] Computed conditional probabilities for sample 130000 / 193504
[t-SNE] Computed conditional probabilities for sample 131000 / 193504
[t-SNE] Computed conditional probabilities for sample 132000 / 193504
[t-SNE] Computed conditional probabilities for sample 133000 / 193504
[t-SNE] Computed conditional probabilities for sample 134000 / 193504
[t-SNE] Computed conditional probabilities for sample 135000 / 193504
[t-SNE] Computed conditional probabilities for sample 136000 / 193504
[t-SNE] Computed conditional probabilities for sample 137000 / 193504
[t-SNE] Computed conditional probabilities for sample 138000 / 193504
[t-SNE] Computed con

In [16]:
cmap = np.array(mcp.gen_color(cmap="hsv",n=N_TOPICS))

In [17]:
lsa_keys = lsa_topic_matrix.argmax(axis=1).tolist()

In [18]:
def get_topic_mean_pos():
    mean_pos = []
    for t in range(N_TOPICS):
        articles = []
        for i, key in enumerate(lsa_keys):
            if key == t:
                articles.append(result_vectors[i])    
        mean_pos.append(np.mean(np.vstack(articles), axis=0))
    return mean_pos

In [19]:
output_notebook()

In [20]:
mean_pos = get_topic_mean_pos()
top_3_words = [' '.join(w) for w in get_top_k_words(3)]
plot = figure(title="t-SNE clustering", plot_width=800, plot_height=800)
plot.scatter(x=result_vectors[:, 0], y=result_vectors[:, 1], color=cmap[lsa_keys])

for t in range(N_TOPICS):
    label = Label(x=mean_pos[t][0], y=mean_pos[t][1], 
                  text=top_3_words[t], text_color=cmap[t])
    plot.add_layout(label)

show(plot)

