### 1. Install the required packages

In [2]:
pip install pybliometrics

Collecting pybliometrics
  Downloading pybliometrics-4.1-py2.py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 920 kB/s eta 0:00:011
Collecting tqdm
  Downloading tqdm-4.66.6-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 1.4 MB/s  eta 0:00:01
Installing collected packages: tqdm, pybliometrics
Successfully installed pybliometrics-4.1 tqdm-4.66.6
Note: you may need to restart the kernel to use updated packages.


In [10]:
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval

In [14]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [8]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[K     |████████████████████████████████| 26.6 MB 19.2 MB/s eta 0:00:01
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-7.0.5-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 2.2 MB/s  eta 0:00:01
[?25hCollecting scipy<1.14.0,>=1.7.0
  Downloading scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[K     |████████████████████████████████| 38.6 MB 17.8 MB/s eta 0:00:01   |█████▏                          | 6.3 MB 8.9 MB/s eta 0:00:04
[?25hCollecting numpy<2.0,>=1.18.5
  Downloading numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[K     |████████████████████████████████| 18.2 MB 17.8 MB/s eta 0:00:01   |█████████                       | 5.1 MB 6.8 MB/s eta 0:00:02
[?25hCollecting wrapt
  Downloading wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x

# 2. Sort the vocabulary file (final_combined_dict.txt)

In [5]:
def sort_final_combined_dict(input_filename):
    """Read "<key> <integer>" lines from a UTF-8 text file and rank them by value.

    A line is used only when it splits on whitespace into exactly two fields
    and the second field parses as an int; every other line is ignored.
    When a key occurs more than once, the value from its last valid line wins.

    Args:
        input_filename: Path of the text file to read.

    Returns:
        A list of (key, value) tuples ordered by value, largest first.
    """
    counts = {}
    with open(input_filename, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) == 2:
                word, count_text = fields
                try:
                    counts[word] = int(count_text)
                except ValueError:
                    # Second field is not an integer: ignore this line.
                    pass

    # Sort by value in descending order.
    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

if __name__ == '__main__':
    # Source dictionary file and the destination for its value-ranked copy.
    input_filename = "final_combined_dict.txt"
    output_filename = "sorted_final_combined_dict.txt"

    sorted_dict = sort_final_combined_dict(input_filename)

    # Write one "<key> <value>" line per entry, in the ranked order.
    with open(output_filename, 'w', encoding='utf-8') as out:
        out.writelines(f"{key} {value}\n" for key, value in sorted_dict)
            


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


success


### Training Word2Vec Model

In [2]:
#  Import the gensim library
import gensim
from gensim.models import Word2Vec
import logging

# Read the sorted dictionary file; every line becomes one whitespace-split token list.
# NOTE(review): this file is written as "<key> <value>" lines, so each token list
# pairs a word with its count string rather than holding a natural-language
# sentence -- confirm this is the intended training corpus.
with open('sorted_final_combined_dict.txt', 'r', encoding='utf-8') as f:
    sentences = list(map(str.split, f))

# Train Word2Vec: 100-dimensional vectors, context window of 5, keep every
# token (min_count=1), 4 worker threads.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Save the trained vectors in word2vec text format.
model.wv.save_word2vec_format('sorted_final_combined_dict.emb', binary=False)


In [4]:
# Load the word vectors from the text-format .emb file; undecodable bytes
# raise an error because unicode_errors is 'strict'.
model = gensim.models.KeyedVectors.load_word2vec_format(
    fname='sorted_final_combined_dict.emb',
    unicode_errors='strict',
)

In [9]:
# Example query: the four entries whose vectors are closest to "mental".
model.most_similar(positive=["mental"], topn=4)

[('20th', 0.3473949432373047),
 ('apply', 0.3407224416732788),
 ('odds', 0.3307487964630127),
 ('reinvention', 0.32761314511299133)]

### Generate Bookmark.json

In [8]:
!pip install scikit-learn
!pip install umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
Downloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
Installing collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [11]:
import gensim
import json
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import re 

# Hyper-parameters shared by the projection code and the bookmark config, so the
# values recorded in the JSON always match what was actually run.
TSNE_ITERATIONS = 5000
TSNE_PERPLEXITY = 5
TSNE_LEARNING_RATE = 1
UMAP_NEIGHBORS = 15

# 1. Load the word vectors (word2vec text format).
embedding_file = "sorted_final_combined_dict.emb"
model = gensim.models.KeyedVectors.load_word2vec_format(embedding_file, binary=False)

# 2. Load the vocabulary: one entry per non-blank line, lower-cased, with the
#    trailing numeric column stripped.
# NOTE(review): lookups below use the lower-cased form; if the .emb file holds
# mixed-case tokens those entries end up in not_found_words -- confirm.
with open("sorted_final_combined_dict.txt", "r", encoding="utf-8") as f:
    vocabulary = [re.sub(r'\s+\d+$', '', line.strip().lower()) for line in f if line.strip()]

# 3. Collect the embedding of every vocabulary entry the model contains.
#    found_words[k] is the label of embedding row k; keeping the two lists in
#    lock-step is what lets the projection rows be matched to words later.
found_words = []
embedding_rows = []
not_found_words = []
for word in vocabulary:
    if word in model:
        found_words.append(word)
        embedding_rows.append(model[word])
    else:
        not_found_words.append(word)

if not found_words:
    raise ValueError(
        "None of the vocabulary entries has a vector in "
        f"{embedding_file!r}; nothing to project."
    )

# Convert to a numpy array and check its shape.
embeddings = np.array(embedding_rows)
print(f"Embeddings shape: {embeddings.shape}")
if not_found_words:
    print(f"Skipped {len(not_found_words)} vocabulary entries without an embedding")

# 4. Dimensionality reduction to 2D using PCA, t-SNE, and UMAP.
pca = PCA(n_components=2).fit_transform(embeddings)

# The t-SNE iteration keyword is `max_iter` in newer scikit-learn releases and
# `n_iter` in older ones; an unknown keyword raises TypeError at construction.
try:
    tsne_model = TSNE(
        n_components=2,
        perplexity=TSNE_PERPLEXITY,
        learning_rate=TSNE_LEARNING_RATE,
        max_iter=TSNE_ITERATIONS,
    )
except TypeError:
    tsne_model = TSNE(
        n_components=2,
        perplexity=TSNE_PERPLEXITY,
        learning_rate=TSNE_LEARNING_RATE,
        n_iter=TSNE_ITERATIONS,
    )
tsne = tsne_model.fit_transform(embeddings)

umap_result = umap.UMAP(n_neighbors=UMAP_NEIGHBORS, n_components=2).fit_transform(embeddings)

# 5. Build the projections list, converting every coordinate to a plain float
#    so it is JSON-serialisable. Iterating over found_words (not vocabulary)
#    keeps index i aligned with row i of each projection array.
projections = [
    {
        "word": word,
        "pca-0": float(pca[i][0]),
        "pca-1": float(pca[i][1]),
        "tsne-0": float(tsne[i][0]),
        "tsne-1": float(tsne[i][1]),
        "umap-0": float(umap_result[i][0]),
        "umap-1": float(umap_result[i][1]),
    }
    for i, word in enumerate(found_words)
]

# 6. Define the bookmark.json configuration.
bookmark_config = {
    "label": "State 0",
    "isSelected": True,
    "tSNEIteration": TSNE_ITERATIONS,
    "tSNEPerplexity": TSNE_PERPLEXITY,
    "tSNELearningRate": TSNE_LEARNING_RATE,
    "tSNEis3d": False,  # 2D projection
    "umapIs3d": False,
    "umapNeighbors": UMAP_NEIGHBORS,
    "projections": projections,
    "selectedProjection": "umap",
    # Number of projected points x embedding dimensionality.
    "dataSetDimensions": [int(embeddings.shape[0]), int(embeddings.shape[1])],
    "cameraDef": {
        "orthographic": True,
        "position": [0, 0, 10],  # initial camera position for a 2D view
        "target": [0, 0, 0],
        "zoom": 1.2  # zoom level for a 2D view
    },
    "selectedColorOptionName": "category",
    "selectedLabelOption": "word"
}

# 7. Write the bookmark configuration as JSON.
with open("sorted_final_combined_dict.json", "w", encoding="utf-8") as json_file:
    json.dump(bookmark_config, json_file, indent=4)








Embeddings shape: (3638, 100)


