In [2]:
import ast
import os
from itertools import combinations

import pandas as pd

In [3]:
# Load the preprocessed dataset into `df`; the relative path is resolved
# against the notebook's working directory.
df = pd.read_csv('preprocessed_data.csv')

In [4]:
# Total number of rows in the loaded dataset.
row_count = df.shape[0]
print('Total number of rows:', row_count)

Total number of rows: 19631


In [5]:
# Display the column labels of the loaded dataset.
df.columns

Index(['Article Title', 'Source Title', 'Language',
       'Times Cited, All Databases', 'Highly Cited Status', 'Hot Paper Status',
       'Publication Year', 'Decade', 'Group', 'WoS Categories new',
       'Research Areas new', 'Keywords Plus lemmatized',
       'Author Keywords lemmatized', 'All Keywords', 'Addresses new',
       'Affiliations new', 'Abstract lemmatized'],
      dtype='object')

In [6]:
# df.head()

In [7]:
# Peek at the raw 'All Keywords' value of the row labelled 0: at this point it
# is still the string form of a list, not a list.
df.loc[0, 'All Keywords']

"['economic optimization machine learning output', 'payment fraud risk management', 'integration of machine learning and statistical risk modelling', 'banking fraud', 'ensemble model', 'anomaly detection', 'system']"

In [8]:
# The CSV stores each keyword list as a string holding a Python list literal;
# turn it back into a real list.
# SECURITY: this used to call eval() on file contents, which executes any code
# embedded in the data. ast.literal_eval only accepts literals.
# Values that are already lists are kept as-is, so re-running this cell no
# longer fails on the already-parsed column.
df['All Keywords'] = df['All Keywords'].apply(
    lambda cell: cell if isinstance(cell, list) else ast.literal_eval(cell)
)

In [9]:
# Flatten every row's keyword list into one long list of keyword occurrences.
keyword_occurrences = []
for row_keywords in df['All Keywords']:
    keyword_occurrences.extend(row_keywords)

# Count the occurrences of each keyword, then order the (keyword, count)
# pairs by descending count.
keyword_counts = pd.Series(keyword_occurrences).value_counts()
word_freq = sorted(keyword_counts.items(), key=lambda pair: pair[1], reverse=True)

# Number of distinct keywords (displayed as the cell output).
len(word_freq)

46038

In [10]:
# Display the first 30 (keyword, count) pairs, i.e. the most frequent keywords.
word_freq[:30]

[('machine learning', 8757),
 ('model', 1830),
 ('artificial intelligence', 1705),
 ('classification', 1310),
 ('neural network', 930),
 ('performance', 853),
 ('impact', 849),
 ('prediction', 844),
 ('random forest', 815),
 ('big data', 809),
 ('deep learning', 758),
 ('support vector machine', 614),
 ('algorithm', 611),
 ('social medium', 580),
 ('natural language processing', 576),
 ('regression', 563),
 ('system', 561),
 ('risk', 550),
 ('selection', 529),
 ('information', 516),
 ('behavior', 434),
 ('management', 426),
 ('network', 416),
 ('technology', 375),
 ('sentiment analysis', 364),
 ('artificial neural network', 346),
 ('remote sensing', 339),
 ('covid 19', 336),
 ('data mining', 323),
 ('framework', 315)]

In [13]:
# Draw a word cloud of the most frequent entries of 'All Keywords'.

import pyecharts.options as opts
from pyecharts.charts import WordCloud

# Make sure the output directory exists before rendering; a fresh checkout
# may not have it yet.
os.makedirs("visualize", exist_ok=True)

# Plot the 600 most frequent keywords; font sizes are scaled into [6, 66].
keyword_cloud = WordCloud()
keyword_cloud.add(series_name="", data_pair=word_freq[:600], word_size_range=[6, 66])
keyword_cloud.set_global_opts(tooltip_opts=opts.TooltipOpts(is_show=True))

# render() writes the HTML file; its return value is shown as the cell output.
keyword_cloud.render("visualize/All_Keywords_Cloud.html")

'/Users/ZOU/Desktop/code/visualize/All_Keywords_Cloud.html'

In [10]:
# Count the rows whose 'All Keywords' list has no entries.
empty_all_keywords = df['All Keywords'].map(len).eq(0).sum()
print('Number of empty All Keywords:', empty_all_keywords)

Number of empty All Keywords: 1115


In [11]:
# Build a keyword co-occurrence network from 'All Keywords' for each group and
# export it twice: as JSON for VOSviewer and as a tab-separated list of links.
import networkx as nx
import nx2vos

# Make sure the export directory exists; a fresh checkout may not have it yet.
os.makedirs('output/ALL', exist_ok=True)

for group in range(1, 9):  # group ids 1 through 8
    G = nx.Graph()
    # Each row of 'All Keywords' is a list of keywords. Every pair of keywords
    # from the same row adds 1 to the weight of the edge between them.
    for keywords in df.loc[df['Group'] == group, 'All Keywords']:
        # combinations() yields the same (i < j) pairs, in the same order, as
        # a nested index loop over the list.
        for first, second in combinations(keywords, 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Save the network as JSON for VOSviewer.
    nx2vos.write_vos_json(G, f'output/ALL/All_Keywords_G{group}.json')

    # Save the links as "<keyword>\t<keyword>\t<weight>", heaviest first.
    sorted_edges = sorted(G.edges(data=True), key=lambda edge: edge[2]['weight'], reverse=True)
    # Explicit UTF-8 so the file does not depend on the platform's default
    # encoding when a keyword contains non-ASCII characters.
    with open(f'output/ALL/All_Keywords_G{group}_links.txt', 'w', encoding='utf-8') as f:
        for source, target, attrs in sorted_edges:
            f.write(f"{source}\t{target}\t{attrs['weight']}\n")

In [12]:
# DISABLED: everything in this cell is commented out and does not run.
# Intent: read the per-group link files back and compute the set of links
# (keyword pairs) that all groups have in common.
# common_links = []
# for group in range(1, 9):
#     with open(f'output/ALL/All_Keywords_G{group}_links.txt', 'r') as f:
#         links = set()
#         for line in f:
#             links.add(line.split('\t')[0] + '\t' + line.split('\t')[1])
#         if group == 1:
#             common_links = links
#         else:
#             common_links = common_links.intersection(links)
# common_links
# len(common_links)

In [13]:
# DISABLED: everything in this cell is commented out and does not run.
# Intent: re-build the network for each group and remove the common links.
# It depends on `common_links`, which is only assigned in commented-out code.
# NOTE(review): 'All Keywords' is already parsed into lists by an earlier
# cell, so the eval(keywords) call below would raise TypeError if this cell
# were re-enabled as-is.
# for group in range(1, 9):
#     # build a co-occurrence network of All Keywords for each group
#     # each row of the column 'All Keywords' is a list of keywords
#     G = nx.Graph()
#     for keywords in df[df['Group'] == group]['All Keywords']:
#         keywords = eval(keywords)
#         for i in range(len(keywords)):
#             for j in range(i+1, len(keywords)):
#                 if G.has_edge(keywords[i], keywords[j]):
#                     G[keywords[i]][keywords[j]]['weight'] += 1
#                 else:
#                     G.add_edge(keywords[i], keywords[j], weight=1)

#     # remove the common links
#     for link in common_links:
#         G.remove_edge(link.split('\t')[0], link.split('\t')[1])

#     # Save network files to json for VosViewer
#     nx2vos.write_vos_json(G, f'output/ALL/All_Keywords_G{group}_no_common.json')

#     # save link in txt in weight descending order
#     sorted_edges = sorted(G.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)
#     with open(f'output/ALL/All_Keywords_G{group}_no_common_links.txt', 'w') as f:
#         for edge in sorted_edges:
#             f.write(f"{edge[0]}\t{edge[1]}\t{edge[2]['weight']}\n")

In [14]:
# Parse the string-encoded 'Keywords Plus lemmatized' values into Python objects.
# SECURITY: this used to call eval() on file contents, which executes any code
# embedded in the data. ast.literal_eval only accepts literals.
# Values that are already lists are kept as-is, so re-running this cell no
# longer fails on the already-parsed column.
df['Keywords Plus lemmatized'] = df['Keywords Plus lemmatized'].apply(
    lambda cell: cell if isinstance(cell, list) else ast.literal_eval(cell)
)

In [15]:
# Count the rows whose 'Keywords Plus lemmatized' list has no entries.
empty_keywords_plus = df['Keywords Plus lemmatized'].map(len).eq(0).sum()
print('Number of empty Keywords Plus lemmatized:', empty_keywords_plus)

Number of empty Keywords Plus lemmatized: 5619


In [16]:
# Build a keyword co-occurrence network from 'Keywords Plus lemmatized' for
# each group and export it twice: as JSON for VOSviewer and as a tab-separated
# list of links.

# Make sure the export directory exists; a fresh checkout may not have it yet.
os.makedirs('output/KP', exist_ok=True)

for group in range(1, 9):  # group ids 1 through 8
    G = nx.Graph()
    # Each row of 'Keywords Plus lemmatized' is a list of keywords. Every pair
    # of keywords from the same row adds 1 to the weight of the edge between them.
    for keywords in df.loc[df['Group'] == group, 'Keywords Plus lemmatized']:
        # combinations() yields the same (i < j) pairs, in the same order, as
        # a nested index loop over the list.
        for first, second in combinations(keywords, 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Save the network as JSON for VOSviewer.
    nx2vos.write_vos_json(G, f'output/KP/Keywords_Plus_G{group}.json')

    # Save the links as "<keyword>\t<keyword>\t<weight>", heaviest first.
    sorted_edges = sorted(G.edges(data=True), key=lambda edge: edge[2]['weight'], reverse=True)
    # Explicit UTF-8 so the file does not depend on the platform's default
    # encoding when a keyword contains non-ASCII characters.
    with open(f'output/KP/Keywords_Plus_G{group}_links.txt', 'w', encoding='utf-8') as f:
        for source, target, attrs in sorted_edges:
            f.write(f"{source}\t{target}\t{attrs['weight']}\n")

In [17]:
# Parse the string-encoded 'Author Keywords lemmatized' values into Python objects.
# SECURITY: this used to call eval() on file contents, which executes any code
# embedded in the data. ast.literal_eval only accepts literals.
# Values that are already lists are kept as-is, so re-running this cell no
# longer fails on the already-parsed column.
df['Author Keywords lemmatized'] = df['Author Keywords lemmatized'].apply(
    lambda cell: cell if isinstance(cell, list) else ast.literal_eval(cell)
)

In [18]:
# Count the rows whose 'Author Keywords lemmatized' list has no entries.
empty_author_keywords = df['Author Keywords lemmatized'].map(len).eq(0).sum()
print('Number of empty Author Keywords lemmatized:', empty_author_keywords)

Number of empty Author Keywords lemmatized: 2327


In [19]:
# Build a keyword co-occurrence network from 'Author Keywords lemmatized' for
# each group and export it twice: as JSON for VOSviewer and as a tab-separated
# list of links.

# Make sure the export directory exists; a fresh checkout may not have it yet.
os.makedirs('output/AK', exist_ok=True)

for group in range(1, 9):  # group ids 1 through 8
    G = nx.Graph()
    # Each row of 'Author Keywords lemmatized' is a list of keywords. Every pair
    # of keywords from the same row adds 1 to the weight of the edge between them.
    for keywords in df.loc[df['Group'] == group, 'Author Keywords lemmatized']:
        # combinations() yields the same (i < j) pairs, in the same order, as
        # a nested index loop over the list.
        for first, second in combinations(keywords, 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Save the network as JSON for VOSviewer.
    nx2vos.write_vos_json(G, f'output/AK/Author_Keywords_G{group}.json')

    # Save the links as "<keyword>\t<keyword>\t<weight>", heaviest first.
    sorted_edges = sorted(G.edges(data=True), key=lambda edge: edge[2]['weight'], reverse=True)
    # Explicit UTF-8 so the file does not depend on the platform's default
    # encoding when a keyword contains non-ASCII characters.
    with open(f'output/AK/Author_Keywords_G{group}_links.txt', 'w', encoding='utf-8') as f:
        for source, target, attrs in sorted_edges:
            f.write(f"{source}\t{target}\t{attrs['weight']}\n")