In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast

import gensim
from gensim import corpora
from gensim.models import LdaModel

from dotenv import load_dotenv
import os
import requests
import csv
import time
import json

[nltk_data] Downloading package punkt to /Users/besto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/besto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/besto/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/besto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load environment variables with python-dotenv before the key is read.
load_dotenv()

# API key for the Elsevier Developer Portal; None when the variable is not set.
ELS_API_KEY = os.environ.get("ELS_API_KEY")

# Crawling Journal

In this section, we create a dataset collected through the Elsevier API.

The `search_results` directory holds the search results for 1000 articles from the Future Generation Computer Systems journal, stored as JSON files. Each search result includes a `Publisher Item Identifier (PII)`, which is used to look up the article's `title`, `abstract`, and `keywords`.

Steps:
1. Read each file in the `search_results` directory.
2. Extract the `PII` from each result.
3. Retrieve the `title`, `abstract`, and `keywords` for each `PII` using the Article Retrieval API.
4. Create the dataset.

In [3]:
def get_article_data(pii: str) -> list[str, str | list[str]]:
    url = f"https://api.elsevier.com/content/article/pii/{pii}"
    headers = {
        "Accept": "application/json",
        "X-ELS-APIKey": ELS_API_KEY
    }

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"{response.status_code}: {response.text}")

    resp_json = response.json()
    coredata = resp_json["full-text-retrieval-response"]["coredata"]
    title, abstract = coredata["dc:title"], coredata["dc:description"]
    if "dcterms:subject" in coredata:
        keywords = [ sub["$"] for sub in coredata["dcterms:subject"] ]
    else:
        keywords = []
    
    return [title, abstract, keywords]

def create_dataset_file(filename: str, headers: list[str], data_list: list[list[str | list[str]]]) -> None:
    with open(filename, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        writer.writerows(data_list)


In [4]:
SEARCH_RESULTS_DIR = "./search_results"
DATASET_PATH = "dataset.csv"
RATE_LIMIT_PAUSE_SECONDS = 10  # pause after each search-result file to prevent a rate limit error

try:
    data_list = []
    for root, dirs, files in os.walk(SEARCH_RESULTS_DIR):
        # Sort in place so directories and files are visited in a reproducible
        # order, which keeps the row order of the dataset stable between runs.
        dirs.sort()
        for file in sorted(files):
            # Hidden files (e.g. OS metadata such as .DS_Store) are not search
            # results; parsing them as JSON would abort the whole crawl.
            if file.startswith("."):
                continue
            filepath = os.path.join(root, file)

            with open(filepath, encoding="utf-8") as f:
                search_results = json.load(f)

            # Every search result carries the PII needed to fetch the article.
            for result in search_results["results"]:
                pii = result["pii"]
                data = get_article_data(pii)
                data_list.append(data)

            time.sleep(RATE_LIMIT_PAUSE_SECONDS)

    headers = ["title", "abstract", "keywords"]
    create_dataset_file(filename=DATASET_PATH, headers=headers, data_list=data_list)
except Exception as e:
    # Best-effort cell: report the failure (with its type, since e.g. a
    # KeyError message is only the key) and let the notebook continue.
    print(f"Crawling failed ({type(e).__name__}): {e}")

# Load Dataset

In [5]:
# Read the crawled articles back from disk.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
# Report (rows, columns), then preview the first five rows.
print(df.shape)
df.head(n=5)

(1000, 3)


Unnamed: 0,title,abstract,keywords
0,On the improvement of wireless mesh sensor net...,Wireless Mesh Sensor Networks (WMSNs) have rec...,"['Wireless Mesh Sensor Networks', 'Hidden term..."
1,Software Tools and Techniques for Big Data Com...,,[]
2,Editorial Board,,[]
3,Contents,,[]
4,Dynamic counter-measures for risk-based access...,\n Risk-based access control ...,"['ISO 27001', 'ISMS', 'Risk management', 'Acce..."


In [6]:
# Count the missing values in each column.
df.isnull().sum()

title         0
abstract    136
keywords      0
dtype: int64

In [7]:
# Discard every row that has at least one missing value, then confirm that
# no column contains nulls any more.
df = df.dropna(axis=0, how="any")
df.isnull().sum()

title       0
abstract    0
keywords    0
dtype: int64

In [8]:
# Dataset size (rows, columns) after the rows with missing values were dropped.
df.shape

(864, 3)

# Preprocessing

In [9]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Every other tag falls back
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def text_preprocessing(text: str) -> list[str]:
    """Reduce raw text to a list of lowercased, lemmatized noun tokens.

    Pipeline: tokenize -> lowercase and drop English stop words -> POS-tag and
    lemmatize each token with its tag -> POS-tag the lemmas again and keep only
    those tagged as nouns (NN, NNS, NNP, NNPS).

    Args:
        text: The text to process.

    Returns:
        The noun lemmas in their order of appearance.
    """
    raw_tokens = word_tokenize(text)

    # Lowercase every token and discard the English stop words.
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in raw_tokens:
        lowered = token.lower()
        if lowered not in english_stop_words:
            kept_tokens.append(lowered)

    # NOTE(review): tagging happens on lowercased, stop-word-stripped tokens,
    # which may affect tag quality compared with tagging the original sentence.
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in pos_tag(kept_tokens):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(treebank_tag)))

    # Second tagging pass: keep only the lemmas that are tagged as nouns.
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return [lemma for lemma, tag in pos_tag(lemmas) if tag in noun_tags]

# Keyword Dictionary

In [10]:
def get_keyword_dicts(s: pd.Series, case_sensitive: bool = True) -> dict[str, int]:
    """Count keyword occurrences and return them sorted by descending frequency.

    Args:
        s: Iterable of strings, each the Python-literal form of a list of
            keywords (e.g. ``"['Cloud computing', 'Big data']"``).
        case_sensitive: When True (default), spellings that differ only in
            case are counted separately. When False, they are merged under the
            spelling that was encountered first.

    Returns:
        A dict mapping keyword to its count, ordered from most to least
        frequent. Keywords with equal counts keep their first-seen order.
    """
    from collections import Counter

    counts = Counter()
    first_spelling = {}  # counting key -> spelling used in the result
    for keywords_str in s:
        for keyword in ast.literal_eval(keywords_str):
            key = keyword if case_sensitive else keyword.casefold()
            first_spelling.setdefault(key, keyword)
            counts[key] += 1

    # most_common() sorts by count (descending) and keeps first-seen order for ties.
    return {first_spelling[key]: count for key, count in counts.most_common()}

In [11]:
# Count how often each keyword occurs across the remaining articles.
keyword_dict = get_keyword_dicts(df["keywords"])

In [12]:
# Show the ten most frequent keywords together with their counts.
print("-"*30, "top frequency 1-10", "-"*30, sep="\n")
top10_keyword_frequency = list(keyword_dict)[:10]
for key in top10_keyword_frequency:
    print(key, keyword_dict[key])

------------------------------
top frequency 1-10
------------------------------
Cloud computing 156
Big data 33
Internet of Things 31
Security 26
Scheduling 25
Scientific workflows 21
MapReduce 19
Resource management 19
Energy efficiency 18
Virtualization 16


In [13]:
# Show the twenty most frequent keywords together with their counts.
print("-"*30, "top frequency 1-20", "-"*30, sep="\n")
top20_keyword_frequency = list(keyword_dict)[:20]
for key in top20_keyword_frequency:
    print(key, keyword_dict[key])

------------------------------
top frequency 1-20
------------------------------
Cloud computing 156
Big data 33
Internet of Things 31
Security 26
Scheduling 25
Scientific workflows 21
MapReduce 19
Resource management 19
Energy efficiency 18
Virtualization 16
Cloud 16
Privacy 15
Cloud storage 14
Hadoop 13
Performance evaluation 13
Provenance 13
Clustering 13
Task scheduling 12
Resource allocation 12
Quality of service 12


In [14]:
# Show the thirty most frequent keywords together with their counts.
print("-"*30, "top frequency 1-30", "-"*30, sep="\n")
top30_keyword_frequency = list(keyword_dict)[:30]
for key in top30_keyword_frequency:
    print(key, keyword_dict[key])

------------------------------
top frequency 1-30
------------------------------
Cloud computing 156
Big data 33
Internet of Things 31
Security 26
Scheduling 25
Scientific workflows 21
MapReduce 19
Resource management 19
Energy efficiency 18
Virtualization 16
Cloud 16
Privacy 15
Cloud storage 14
Hadoop 13
Performance evaluation 13
Provenance 13
Clustering 13
Task scheduling 12
Resource allocation 12
Quality of service 12
Data mining 12
QoS 11
Wireless sensor networks 11
Big Data 11
Semantic web 11
IoT 11
Load balancing 10
Simulation 10
Game theory 10
Distributed computing 10


# Topic Modeling

In [15]:
NUM_TOPICS = 10
LDA_PASSES = 10
LDA_RANDOM_STATE = 42  # fixed seed so the fitted topics are reproducible across runs

# Turn every abstract into its list of noun lemmas, then build the
# bag-of-words corpus that the LDA model is trained on.
texts = [text_preprocessing(abstract) for abstract in df["abstract"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_model = LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=LDA_PASSES,
    random_state=LDA_RANDOM_STATE,
)

# Print each topic as (topic id, weighted top words).
topics = lda_model.print_topics()
for topic in topics:
    print(topic)

(0, '0.024*"service" + 0.023*"cloud" + 0.023*"application" + 0.016*"resource" + 0.015*"system" + 0.015*"network" + 0.012*"use" + 0.012*"paper" + 0.010*"model" + 0.009*"management"')
(1, '0.018*"data" + 0.015*"system" + 0.013*"iot" + 0.012*"query" + 0.010*"event" + 0.009*"entity" + 0.009*"network" + 0.009*"use" + 0.008*"paper" + 0.008*"method"')
(2, '0.042*"scheme" + 0.021*"data" + 0.019*"security" + 0.017*"paper" + 0.014*"encryption" + 0.012*"attack" + 0.011*"use" + 0.011*"user" + 0.011*"network" + 0.010*"secure"')
(3, '0.065*"data" + 0.019*"system" + 0.015*"process" + 0.013*"city" + 0.011*"paper" + 0.011*"application" + 0.011*"service" + 0.010*"cloud" + 0.009*"use" + 0.009*"approach"')
(4, '0.043*"data" + 0.024*"model" + 0.023*"resource" + 0.012*"use" + 0.009*"paper" + 0.008*"result" + 0.008*"machine" + 0.008*"problem" + 0.008*"network" + 0.008*"analysis"')
(5, '0.017*"service" + 0.016*"system" + 0.016*"approach" + 0.014*"model" + 0.014*"result" + 0.013*"application" + 0.013*"data" + 

# TF-IDF

# K-means clustering