In [1]:
import pandas as pd

In [2]:
# read preprocessed data
df = pd.read_csv('preprocessed_data.csv')

In [3]:
# totoal number of rows
print('Total number of rows:', len(df))

Total number of rows: 19631


In [4]:
df.columns

Index(['Article Title', 'Source Title', 'Language',
       'Times Cited, All Databases', 'Highly Cited Status', 'Hot Paper Status',
       'Publication Year', 'Decade', 'Group', 'WoS Categories new',
       'Research Areas new', 'Keywords Plus lemmatized',
       'Author Keywords lemmatized', 'All Keywords', 'Addresses new',
       'Affiliations new', 'Abstract lemmatized'],
      dtype='object')

In [5]:
# any empty list in 'Abstract lemmatized'
print('Number of empty list in Abstract lemmatized:', len(df[df['Abstract lemmatized'] == '[]']))

Number of empty list in Abstract lemmatized: 116


In [6]:
# drop the row with empty list in 'Abstract lemmatized'
df = df[df['Abstract lemmatized'] != '[]']
print('Total number of rows after dropping empty list in Abstract lemmatized:', len(df))

Total number of rows after dropping empty list in Abstract lemmatized: 19515


In [7]:
df['Abstract lemmatized'] = df['Abstract lemmatized'].apply(eval)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Abstract lemmatized'].apply(lambda x: ' '.join(x)))
feature_names = tfidf.get_feature_names_out()

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [9]:
# set the threshold, filter out the words with TF-IDF score less than the threshold
threshold = 0.3
# mean or max
important_words = tfidf_df.max()[tfidf_df.max() > threshold].sort_values(ascending=False)
print(f"Number of important words: {len(important_words)}")

# get the list of important words without value
important_words_list = important_words.index.tolist()
print(important_words_list)
print(len(important_words_list))

Number of important words: 10499
10499


In [10]:
df['Abstract lemmatized filtered'] = df['Abstract lemmatized'].apply(lambda x: [i for i in x if i in important_words])

In [11]:
df[['Abstract lemmatized filtered', 'Abstract lemmatized']].head(1)

Unnamed: 0,Abstract lemmatized filtered,Abstract lemmatized
0,"[online, banking, fraud, criminal, account, tr...","[online, banking, fraud, occurs, whenever, cri..."


In [12]:
len(df['Abstract lemmatized filtered'][0])

77

In [13]:
len(df['Abstract lemmatized'][0])

126

In [14]:
# top 100 frequency words
from collections import Counter

word_freq = Counter([word for abstract in df['Abstract lemmatized filtered'] for word in abstract])
top_100_words = word_freq.most_common(100)
top_100_words

[('learning', 36711),
 ('model', 27580),
 ('data', 26001),
 ('method', 15373),
 ('approach', 11248),
 ('analysis', 10420),
 ('algorithm', 10299),
 ('research', 10257),
 ('artificial', 9251),
 ('system', 8449),
 ('intelligence', 8338),
 ('performance', 8134),
 ('use', 7982),
 ('network', 7894),
 ('feature', 7703),
 ('student', 7227),
 ('accuracy', 6801),
 ('prediction', 6777),
 ('technique', 6695),
 ('different', 6621),
 ('information', 6614),
 ('time', 6561),
 ('classification', 6055),
 ('support', 5671),
 ('show', 5606),
 ('two', 5592),
 ('decision', 5397),
 ('social', 5310),
 ('process', 5207),
 ('risk', 5097),
 ('area', 5078),
 ('neural', 4931),
 ('technology', 4873),
 ('factor', 4797),
 ('human', 4738),
 ('problem', 4684),
 ('application', 4620),
 ('forest', 4592),
 ('language', 4537),
 ('set', 4268),
 ('work', 4210),
 ('random', 4199),
 ('variable', 4189),
 ('well', 4105),
 ('task', 4048),
 ('effect', 3954),
 ('development', 3948),
 ('tool', 3831),
 ('value', 3824),
 ('term', 3712

In [15]:
# PAM topic model in python with TOMOTOPY]
# https://bab2min.github.io/tomotopy/v0.4.1/en/#tomotopy.PAModel
import tomotopy as tp

# create a PAM model
# hpam_model = tp.HPAModel(k1=1, k2=1, rm_top=10)
# pam_model = tp.PAModel(k1=10, k2=n, rm_top=10)
# lda_model = tp.LDAModel(k=n, rm_top=10)

# add documents to the model
# for doc in df['Abstract lemmatized filtered']:
#     pam_model.add_doc(words=doc)

# pam_model.train(100)
# print(f"Perplexity: {pam_model.perplexity}")
# print(f"Coherence Score: {tp.coherence.Coherence(pam_model, coherence='c_v').get_score()}")

In [45]:
# https://bab2min.github.io/tomotopy/v0.10.0/en/coherence.html

score = {}
for k1 in [3, 5, 10]:
    for k2 in [10, 20, 30, 40, 60, 80]:
        pam_model = tp.PAModel(k1=k1, k2=k2, rm_top=10)
        for doc in df['Abstract lemmatized filtered']:
            pam_model.add_doc(words=doc)
        pam_model.train(100)
        temp = tp.coherence.Coherence(pam_model, coherence='c_v').get_score()
        if k1 not in score:
            score[k1] = []       
        score[k1].append(temp)
        print(f"k1: {k1}, k2: {k2}, Coherence Score: {temp}")

k1: 3, k2: 10, Coherence Score: 0.6419883360465367
k1: 3, k2: 20, Coherence Score: 0.5619897534449896
k1: 3, k2: 30, Coherence Score: 0.5693170969684919
k1: 3, k2: 40, Coherence Score: 0.6549325895806154
k1: 3, k2: 60, Coherence Score: 0.4994863589604696
k1: 3, k2: 80, Coherence Score: 0.6880812595287958
k1: 5, k2: 10, Coherence Score: 0.6348259355127812
k1: 5, k2: 20, Coherence Score: 0.7229863405926154
k1: 5, k2: 30, Coherence Score: 0.684477332830429
k1: 5, k2: 40, Coherence Score: 0.6127033445611596
k1: 5, k2: 60, Coherence Score: 0.6168673080205918
k1: 5, k2: 80, Coherence Score: 0.6384602048993109
k1: 10, k2: 10, Coherence Score: 0.5980878348834813
k1: 10, k2: 20, Coherence Score: 0.6201522500813006
k1: 10, k2: 30, Coherence Score: 0.6369456820189953
k1: 10, k2: 40, Coherence Score: 0.6555472469329835
k1: 10, k2: 60, Coherence Score: 0.6399682956933975
k1: 10, k2: 80, Coherence Score: 0.6668743439018726


In [46]:
score

{3: [0.6419883360465367,
  0.5619897534449896,
  0.5693170969684919,
  0.6549325895806154,
  0.4994863589604696,
  0.6880812595287958],
 5: [0.6348259355127812,
  0.7229863405926154,
  0.684477332830429,
  0.6127033445611596,
  0.6168673080205918,
  0.6384602048993109],
 10: [0.5980878348834813,
  0.6201522500813006,
  0.6369456820189953,
  0.6555472469329835,
  0.6399682956933975,
  0.6668743439018726]}

In [47]:
# round
score[3] = [round(i, 2) for i in score[3]]
score[5] = [round(i, 2) for i in score[5]]
score[10] = [round(i, 2) for i in score[10]]

In [48]:
# draw a line plot to show the relationship between the number of topics and the coherence score
from pyecharts import options as opts
from pyecharts.charts import Line

x = [10, 20, 30, 40, 60, 80]
x = [str(i) for i in x]

(
    Line(init_opts=opts.InitOpts(width='1200px', height='600px'))
    .set_global_opts(xaxis_opts=opts.AxisOpts(name="Number of Topics"), 
                     yaxis_opts=opts.AxisOpts(name="Coherence Score", min_=0.50, max_=0.80))
    .add_xaxis(x)
    .add_yaxis("k1=3", score[3], markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max")]), is_smooth=True, is_symbol_show=False)
    .add_yaxis("k1=5", score[5], markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max")]), is_smooth=True, is_symbol_show=False)
    .add_yaxis("k1=10", score[10], markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max")]), is_smooth=True, is_symbol_show=False)
    .render('visualize/coherence_score.html')
)

'/Users/ZOU/Desktop/code/visualize/coherence_score.html'

In [77]:
pam_model = tp.PAModel(k1=5, k2=20, rm_top=10)
for doc in df['Abstract lemmatized filtered']:
    pam_model.add_doc(words=doc)
pam_model.train(1000)   
print(f"Coherence Score: {tp.coherence.Coherence(pam_model, coherence='c_v').get_score()}")

Coherence Score: 0.726948709487915


In [78]:
pam_model.save('output/Topic/pam_model.bin', full=False)

In [79]:
print(f"Perplexity: {pam_model.perplexity}")

Perplexity: 16898.636837366503


In [89]:
with open('output/Topic/pam_model.txt', 'w') as f:
    for k2 in range(pam_model.k2):
        # print(f"Sub Topic #{k2}")
        words = pam_model.get_topic_words(k2, top_n=10)
        words = [i[0] for i in words]
        print(words)
        f.write(f"Sub Topic #{k2} : {words}\n")

['intelligence', 'technology', 'review', 'literature', 'digital', 'management', 'industry', 'field', 'business', 'future']
['health', 'patient', 'treatment', 'clinical', 'child', 'risk', 'disorder', 'outcome', 'intervention', 'medical']
['network', 'neural', 'deep', 'feature', 'classification', 'image', 'convolutional', 'training', 'performance', 'term']
['customer', 'review', 'consumer', 'product', 'service', 'online', 'marketing', 'user', 'tourism', 'demand']
['area', 'forest', 'image', 'land', 'spatial', 'resolution', 'map', 'remote', 'satellite', 'sensing']
['emotion', 'speech', 'feature', 'human', 'video', 'emotional', 'recognition', 'facial', 'expression', 'visual']
['brain', 'cognitive', 'eeg', 'activity', 'task', 'control', 'pattern', 'feature', 'functional', 'participant']
['urban', 'spatial', 'city', 'area', 'use', 'land', 'environment', 'location', 'building', 'pattern']
['user', 'process', 'design', 'application', 'use', 'tool', 'information', 'knowledge', 'human', 'time']


In [81]:
pam_model.summary()

<Basic Info>
| PAModel (current version: 0.12.7)
| 19515 docs, 1478621 words
| Total Vocabs: 10499, Used Vocabs: 10489
| Entropy of words: 7.52903
| Entropy of term-weighted words: 7.52903
| Removed Vocabs: learning model data method approach analysis algorithm research artificial system
|
<Training Info>
| Iterations: 1000, Burn-in steps: 0
| Optimization Interval: 1
| Log-likelihood per word: -9.73499
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 10 (the number of top words to be removed)
| k1: 5 (the number of super topics between 1 ~ 32767)
| k2: 20 (the number of sub topics between 1 ~ 32767)
| alpha: [0.1] (initial hyperparameter of Dirichlet distribution for document-super topic, given as a single `float` in case of symmetric prior and as a list with length `k1` of `float` in case of asymmetric prior.)
| subalpha: [0.1] (initial hyperparameter of Dirichlet distribution f

In [103]:
for k1 in range(pam_model.k1):
    print(pam_model.get_sub_topics(k1, top_n=5))

[(19, 0.07883527874946594), (0, 0.07428934425115585), (14, 0.07196716219186783), (12, 0.07024933397769928), (8, 0.06904072314500809)]
[(0, 0.08187738060951233), (8, 0.0737392008304596), (19, 0.06926894187927246), (12, 0.06720220297574997), (14, 0.06703048944473267)]
[(12, 0.08173529803752899), (4, 0.07667384296655655), (18, 0.0706726610660553), (19, 0.07062199711799622), (0, 0.06738975644111633)]
[(14, 0.08528045564889908), (8, 0.07650860399007797), (0, 0.07582851499319077), (2, 0.06666853278875351), (12, 0.06577897071838379)]
[(19, 0.07868848741054535), (8, 0.07016510516405106), (12, 0.06981845200061798), (14, 0.0690605491399765), (18, 0.06587956100702286)]


In [82]:
# get the most likely topic for each document
df['super topic'] = [max(doc.get_topics(), key=lambda x: x[1])[0] for doc in pam_model.docs]
df['sub topic'] = [max(doc.get_sub_topics(), key=lambda x: x[1])[0] for doc in pam_model.docs]


In [83]:
df['super topic'].value_counts()

1    4009
0    3948
2    3885
3    3860
4    3813
Name: super topic, dtype: int64

In [84]:
df['sub topic'].value_counts()

14    1678
18    1569
0     1478
19    1412
15    1402
8     1376
10    1350
4     1129
2      987
12     968
16     872
1      791
11     770
7      751
3      680
6      598
17     575
5      430
9      388
13     311
Name: sub topic, dtype: int64

In [85]:
# get the value of sub topic group by super topic
df.groupby('super topic')['sub topic'].value_counts()

super topic  sub topic
0            14           343
             18           325
             0            324
             19           308
             15           307
                         ... 
4            6            126
             17           109
             5             78
             9             76
             13            66
Name: sub topic, Length: 100, dtype: int64

In [86]:
# draw a pie chart to show the distribution of sub topics with pyecharts
from pyecharts.charts import Pie

dist_sub_topic = df['sub topic'].value_counts().reset_index()
dist_sub_topic.columns = ['sub topic', 'count']

c = (
    Pie()
    .add("", [list(z) for z in zip(dist_sub_topic['sub topic'], dist_sub_topic['count'])])
#     .set_colors([
#     '#FF9999',  # Light Red
#     '#FFCC99',  # Light Orange
#     '#FFFF99',  # Light Yellow
#     '#CCFF99',  # Light Green
#     '#99FF99',  # Light Mint
#     '#99FFCC',  # Light Cyan
#     '#99FFFF',  # Light Sky Blue
#     '#99CCFF',  # Light Blue
#     '#9999FF',  # Light Purple
#     '#CC99FF'   # Light Lavender
# ])
    .set_global_opts(title_opts=opts.TitleOpts(title="Sub Topic Distribution"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="Sub Topic {b}: {c}"))
    .render("visualize/Sub_Topic_Distribution.html")
)

In [100]:
import networkx as nx
import nx2vos

G = nx.Graph()
for i in range(pam_model.k1):
    for j in range(pam_model.k2):
        G.add_node(f"super {i}")
        # value is the total number of documents when super topic = i and sub topic = j 
        val = df[(df['super topic'] == i) & (df['sub topic'] == j)].shape[0]
        G.add_edge(f"super {i}", f"sub {j}", weight=val**3)

    # Save network files to json for VosViewer
    nx2vos.write_vos_json(G, f'output/Topic/topics.json')

In [91]:
# conduct Mann Kendall test
# topic_prevalence is the number of documents in each topic divided by the total number of documents every year

topic_prevalence = df.groupby(['Group', 'sub topic']).size().unstack().fillna(0).apply(lambda x: x/x.sum(), axis=1)

In [92]:
topic_prevalence

sub topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.030249,0.012456,0.039146,0.017794,0.02847,0.035587,0.010676,0.001779,0.188612,0.003559,0.024911,0.0,0.096085,0.010676,0.282918,0.049822,0.003559,0.007117,0.023132,0.133452
2,0.014599,0.017518,0.046715,0.026277,0.091971,0.029197,0.055474,0.020438,0.094891,0.017518,0.023358,0.005839,0.056934,0.007299,0.294891,0.059854,0.021898,0.005839,0.021898,0.087591
3,0.055693,0.035291,0.053488,0.02702,0.061208,0.029225,0.039702,0.032809,0.106148,0.01544,0.058726,0.022057,0.046319,0.01737,0.124897,0.090157,0.052661,0.019024,0.046595,0.06617
4,0.072854,0.036195,0.067285,0.039907,0.0529,0.018561,0.02645,0.036195,0.085847,0.02181,0.085847,0.029698,0.047332,0.013457,0.091879,0.070534,0.046404,0.022274,0.061717,0.072854
5,0.080172,0.037223,0.061203,0.037938,0.051539,0.022548,0.030064,0.034717,0.06514,0.018611,0.073014,0.041875,0.043665,0.018253,0.07015,0.078024,0.041875,0.027201,0.076235,0.090551
6,0.080477,0.046498,0.053055,0.032787,0.057824,0.020268,0.028912,0.037556,0.059016,0.020268,0.080775,0.047392,0.053055,0.018182,0.065872,0.065574,0.042921,0.037258,0.087332,0.064978
7,0.086921,0.048199,0.043596,0.039534,0.057677,0.017872,0.026537,0.046575,0.044138,0.02437,0.079069,0.051449,0.050636,0.015705,0.044679,0.063634,0.052803,0.037097,0.104522,0.064988
8,0.104766,0.048411,0.03177,0.040091,0.061649,0.017776,0.027988,0.054463,0.034796,0.023071,0.058623,0.059002,0.044629,0.014372,0.03177,0.068457,0.040847,0.04236,0.131241,0.063918


In [93]:
# calculate the total value for each row
topic_prevalence['total'] = topic_prevalence.sum(axis=1)
topic_prevalence

sub topic,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,total
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.030249,0.012456,0.039146,0.017794,0.02847,0.035587,0.010676,0.001779,0.188612,0.003559,...,0.0,0.096085,0.010676,0.282918,0.049822,0.003559,0.007117,0.023132,0.133452,1.0
2,0.014599,0.017518,0.046715,0.026277,0.091971,0.029197,0.055474,0.020438,0.094891,0.017518,...,0.005839,0.056934,0.007299,0.294891,0.059854,0.021898,0.005839,0.021898,0.087591,1.0
3,0.055693,0.035291,0.053488,0.02702,0.061208,0.029225,0.039702,0.032809,0.106148,0.01544,...,0.022057,0.046319,0.01737,0.124897,0.090157,0.052661,0.019024,0.046595,0.06617,1.0
4,0.072854,0.036195,0.067285,0.039907,0.0529,0.018561,0.02645,0.036195,0.085847,0.02181,...,0.029698,0.047332,0.013457,0.091879,0.070534,0.046404,0.022274,0.061717,0.072854,1.0
5,0.080172,0.037223,0.061203,0.037938,0.051539,0.022548,0.030064,0.034717,0.06514,0.018611,...,0.041875,0.043665,0.018253,0.07015,0.078024,0.041875,0.027201,0.076235,0.090551,1.0
6,0.080477,0.046498,0.053055,0.032787,0.057824,0.020268,0.028912,0.037556,0.059016,0.020268,...,0.047392,0.053055,0.018182,0.065872,0.065574,0.042921,0.037258,0.087332,0.064978,1.0
7,0.086921,0.048199,0.043596,0.039534,0.057677,0.017872,0.026537,0.046575,0.044138,0.02437,...,0.051449,0.050636,0.015705,0.044679,0.063634,0.052803,0.037097,0.104522,0.064988,1.0
8,0.104766,0.048411,0.03177,0.040091,0.061649,0.017776,0.027988,0.054463,0.034796,0.023071,...,0.059002,0.044629,0.014372,0.03177,0.068457,0.040847,0.04236,0.131241,0.063918,1.0


In [94]:
import pymannkendall as mk

monotonic_trend = []
for i in range(pam_model.k2):
    result = mk.original_test(topic_prevalence[i].values)
    # print(f"Sub topic {i} p-value: {result.p}")
    if result.p < 0.05:
        print(f"Sub topic {i} has a monotonic trend,", result.trend)
        monotonic_trend.append(i)

Sub topic 0 has a monotonic trend, increasing
Sub topic 1 has a monotonic trend, increasing
Sub topic 3 has a monotonic trend, increasing
Sub topic 5 has a monotonic trend, decreasing
Sub topic 7 has a monotonic trend, increasing
Sub topic 8 has a monotonic trend, decreasing
Sub topic 9 has a monotonic trend, increasing
Sub topic 11 has a monotonic trend, increasing
Sub topic 14 has a monotonic trend, decreasing
Sub topic 17 has a monotonic trend, increasing
Sub topic 18 has a monotonic trend, increasing
Sub topic 19 has a monotonic trend, decreasing


In [95]:
# draw a line plot for each topic prevalence with pyecharts
line = Line(init_opts=opts.InitOpts(width='1500px', height='800px'))

for i in range(pam_model.k2):
    line.add_xaxis(topic_prevalence.index)
    line.add_yaxis(f"Sub topic {i}", topic_prevalence[i], is_smooth=True, label_opts=opts.LabelOpts(is_show=False))

line.set_global_opts(title_opts=opts.TitleOpts(title="Sub topic prevalence"), legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%", orient="vertical"))
line.render('visualize/Topic_Trend_All.html')

'/Users/ZOU/Desktop/code/visualize/Topic_Trend_All.html'

In [96]:
# draw a line plot for topic has monotonic trend with pyecharts
line = Line(init_opts=opts.InitOpts(width='1500px', height='800px'))

for i, topic in enumerate(monotonic_trend):
    line.add_xaxis(topic_prevalence.index)
    line.add_yaxis(f"Sub topic {topic}", topic_prevalence[topic], is_smooth=True, label_opts=opts.LabelOpts(is_show=False))

line.set_global_opts(title_opts=opts.TitleOpts(title="Sub topic prevalence with monotonic trend"), legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%", orient="vertical"))
line.render('visualize/Topic_Trend_Monotonic.html')

'/Users/ZOU/Desktop/code/visualize/Topic_Trend_Monotonic.html'