<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#キーワード辞書の取得" data-toc-modified-id="キーワード辞書の取得-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>キーワード辞書の取得</a></span></li><li><span><a href="#キーワード辞書の中身の確認" data-toc-modified-id="キーワード辞書の中身の確認-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>キーワード辞書の中身の確認</a></span><ul class="toc-item"><li><span><a href="#分野" data-toc-modified-id="分野-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>分野</a></span></li><li><span><a href="#トピック" data-toc-modified-id="トピック-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>トピック</a></span></li></ul></li><li><span><a href="#4分野への圧縮" data-toc-modified-id="4分野への圧縮-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>4分野への圧縮</a></span></li></ul></div>

In [1]:
# import argparse
import collections
from collections import Counter, defaultdict
import configparser
import os
from matplotlib import pyplot as plt
import scienceplots
plt.rcParams['figure.dpi'] = 200
plt.style.use(['science','no-latex'])
import numpy as np
import pandas as pd
import pickle

import polars as pl

from textblob import TextBlob

import time

# Load the directory settings used by every cell below.
CONFIG_PATH = './config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. Fail here with a clear message instead of letting the
# first config.get() raise an unrelated-looking NoSectionError.
if not config.read(CONFIG_PATH, encoding='utf-8'):
    raise FileNotFoundError(f'Could not read configuration file: {CONFIG_PATH}')
# NOTE(review): these values are joined to relative file names with `+` in the
# cells below, so they presumably end with a path separator -- confirm in config.ini.
SAVE_DIR = config.get('settings','SAVE_DIR')
WORK_DIR = config.get('settings','WORK_DIR')
SHARE_DIR = config.get('settings','SHARE_DIR')

# Create keywords dic

In [8]:
def flatten(l):
    """
    Flattens an arbitrarily nested iterable, depth first and in order.

    `str` and `bytes` are iterable but are treated as atoms, so they are
    yielded whole instead of being split into characters/bytes.

    The traversal keeps an explicit stack of iterators instead of recursing,
    so the nesting depth is not limited by Python's recursion limit.

    Args:
        l: An iterable type

    Returns:
        A generator of the flattened elements

    """
    # Imported here so the function does not depend on `collections.abc`
    # having been loaded as a side effect of `import collections`.
    from collections.abc import Iterable

    exhausted = object()  # sentinel: distinguishes "iterator finished" from a None element
    pending = [iter(l)]
    while pending:
        el = next(pending[-1], exhausted)
        if el is exhausted:
            # Innermost iterable is done; resume its parent.
            pending.pop()
        elif isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            # Descend into the nested iterable before continuing with the parent.
            pending.append(iter(el))
        else:
            yield el

def get_tficf_dic(df,col_lower_cluster,col_nouns):
    """Returns the TF-ICF score of every word in every cluster.

    TF is the share of a word among all nouns of a cluster; ICF is
    log(number of clusters / number of clusters containing the word).

    Args:
        df: A polars DataFrame (the body uses polars-only calls such as
            `pl.count()` and `iter_rows()`) where each row holds one paper,
            its partition id, and a list of nouns
        col_lower_cluster: The column name of the partition to be used
        col_nouns: The column name containing the nouns for each document

    Returns:
        A nested defaultdict `{cluster_id: {word: tficf}}`. Because both levels
        are defaultdicts, looking up a missing key inserts it; the outer level
        is built with a lambda and therefore cannot be pickled directly.
    """

    def get_icf(df, col_cluster, col_nouns):
        """Creates an ICF dictionary.

        Args:
            df: A polars DataFrame where each row holds one paper, its partition id, and a list of nouns
            col_cluster: The column name of the partition to be used
            col_nouns: The column name containing the nouns for each document

        Returns:
            dict: `{word: ICF score}`; empty when `df` contains no cluster at all

        """
        # One row per distinct (word, cluster) pair, then count the rows per
        # word: the number of clusters each word occurs in.
        word_ncluster = dict(df[[col_cluster,col_nouns]]
            .explode(col_nouns)
            .unique(subset=[col_nouns,col_cluster])
            .groupby(col_nouns)
            .agg(pl.count())
            .iter_rows()
        )
        n_ucluster = df[col_cluster].n_unique()
        if n_ucluster == 0:
            # No clusters (empty frame): nothing to score.
            return {}
        return {word: np.log(n_ucluster/n_cluster) for word, n_cluster in word_ncluster.items()} # {word: score}

    def get_tf(df, col_cluster, col_nouns):
        """Creates a TF dictionary: the share of each word among all nouns of its cluster.

        Args:
            df: A polars DataFrame where each row holds one paper, its partition id, and a list of nouns
            col_cluster: The column name of the partition to be used
            col_nouns: The column name containing the nouns for each document

        Returns:
            dict: `{(cluster_id, word): TF score}`

        """
        # A single grouped pass collects every noun of a cluster; `flatten`
        # undoes the nesting of the value handed to the callback.
        n_word_incs = {k: Counter(v) for k,v in (df.groupby(by=col_cluster)
                           .agg([pl.apply(exprs=[col_nouns], f=lambda l: list(flatten(l)))])
                           [[col_cluster,col_nouns]]
                           .iter_rows()
                          )}

        tf = {}
        for cluster_id, counts in n_word_incs.items():
            # The cluster size is the total of the per-word counts, so no
            # second grouped pass is needed just to measure it.
            c_size = sum(counts.values())
            for word, n_word_inc in counts.items():
                tf[cluster_id, word] = n_word_inc/c_size
        return tf # {(cid, word): score}

    # Progress/timing output: each print reports the seconds spent in the
    # stage that just finished.
    t = time.time()
    print('start',time.time()-t)

    ICF = get_icf(df=df, col_cluster=col_lower_cluster, col_nouns=col_nouns)
    print('ICF_create',time.time()-t)
    t = time.time()

    TF = get_tf(df=df, col_cluster=col_lower_cluster, col_nouns=col_nouns)
    print('TF_create',time.time()-t)
    t = time.time()

    tficf_dic = defaultdict(lambda: defaultdict(float))
    for (cluster_id, word), tf_score in TF.items():
        tficf_dic[cluster_id][word] = tf_score * ICF[word]
    print('tficf_dic',time.time()-t)


    return tficf_dic


def get_relative_keywords(df,col_lower_cluster,col_higher_cluster,col_nouns,m=25,n=5):
    """
    Returns, for every lower-level cluster, the words that are characteristic of it.

    Each word is scored as `tficf / (n_vt + m)`, where `tficf` is the word's
    TF-ICF in the lower-level cluster and `n_vt` is its raw frequency in the
    parent (higher-level) cluster. TF-ICF is used instead of raw frequency for
    ease of comparison. Without a higher-level column, `n_vt` is 0 and the
    ranking reduces to plain TF-ICF.

    Args:
        df: A polars DataFrame with one paper per row (the same frame is passed
            to `get_tficf_dic`, which uses polars-only calls)
        col_lower_cluster: The column name of the lower-level cluster
        col_higher_cluster: The column name of the higher-level cluster, or a falsy value (e.g. None) to rank by TF-ICF alone
        col_nouns: The column name containing the noun keywords
        m: Hyperparameter for relative term extraction; higher values emphasize presence in lower-level clusters over absence in higher-level clusters, and lower values do the opposite. Must be non-zero when no higher-level column is given, otherwise the score divides by zero
        n: Number of keywords to extract

    Returns:
        dict: A dictionary with cluster_id as the key and a list of at most `n` keywords (best first) as the value
    """

    def score(tficf_ut, n_vt, m=25):
        return tficf_ut / (n_vt+m)

    u_tficf_dic = get_tficf_dic(df, col_lower_cluster, col_nouns) #{c: {w:tficf, ...},...}

    # Word frequencies per higher-level cluster, and each lower-level cluster's
    # parent. Collected in one pass with the same polars calls used elsewhere
    # in this notebook (`df[[...]]`, `iter_rows()`); the previous pandas-only
    # calls could not run on the polars frame required by `get_tficf_dic`.
    n_vt_dic = defaultdict(Counter)
    lower_to_higher = {}
    if col_higher_cluster:
        for c_lower, c_higher, nouns in df[[col_lower_cluster, col_higher_cluster, col_nouns]].iter_rows():
            # The first row seen for a lower-level cluster decides its parent.
            lower_to_higher.setdefault(c_lower, c_higher)
            if nouns is not None:
                n_vt_dic[c_higher].update(nouns)

    no_parent_counts = Counter()  # reports 0 for every word, i.e. n_vt = 0

    keywords_dic = {}
    for c, word_tficf in u_tficf_dic.items():
        parent_counts = n_vt_dic[lower_to_higher[c]] if col_higher_cluster else no_parent_counts
        scores = [(w, score(tficf_ut=tficf, n_vt=parent_counts[w], m=m)) for w, tficf in word_tficf.items()]
        keywords_dic[c] = [i[0] for i in sorted(scores, key=lambda x: x[1], reverse=True)[:n]]

    return keywords_dic

In [4]:
for RESOLUTION, NMIN in zip([1e-06, 5e-04],[100000, 1000]):
    partition_col = f'partition_{RESOLUTION}'

    # Put the keyword lists and the partition ids side by side, then drop the
    # 'NO KEYWORDS' placeholder and lower-case the remaining keywords.
    raw_keywords = pd.read_pickle(SHARE_DIR+'paper_detail/keywords.pickle')
    partition_table = pd.read_pickle(SAVE_DIR+f'paper_detail/partition_in_2021_{RESOLUTION}_{NMIN}_waltman.pickle')
    df = pd.concat([raw_keywords, partition_table], axis=1)
    df = df.assign(keywords=df['keywords'].map(lambda l: [x.lower() for x in l if x != 'NO KEYWORDS']))
    print(df.shape)

    # Keep only papers that still have a keyword and were assigned to a partition.
    has_keywords = df['keywords'].map(len) != 0
    df_valid = df[has_keywords].dropna(subset=[partition_col]).astype({partition_col: int})
    print(df_valid.shape)

    # Convert to polars in two slices (split after the first 20,000,000 rows)
    # and stitch the slices back together.
    split_at = 20000000
    df_polars1, df_polars2 = (
        pl.from_pandas(part) for part in (df_valid.iloc[:split_at], df_valid.iloc[split_at:])
    )
    df_polars = pl.concat([df_polars1, df_polars2])

    display(df_polars.head())

    # No higher-level cluster is given, so the ranking is plain TF-ICF;
    # the top 30 keywords are kept per cluster.
    keyword_dic = get_relative_keywords(
        df_polars,
        col_lower_cluster=partition_col,
        col_higher_cluster=None,
        col_nouns='keywords',
        m=1,
        n=30,
    )

    keyword_path = SAVE_DIR+f'paper_detail/keywords_in_2021_{RESOLUTION}_{NMIN}_tficf_waltman.pickle'
    with open(keyword_path,'wb') as f:
        pickle.dump(dict(keyword_dic), f)

In [9]:
# Number of clusters that received a keyword list, per resolution.
# The counts are read back from the saved pickles because the in-memory
# `keyword_dic` only holds the result of the last loop iteration, so printing
# it twice cannot show both values.
# Values noted from an earlier run: 1e-06 -> 67, 5e-04 -> 15521
for resolution, nmin in zip([1e-06, 5e-04],[100000, 1000]):
    with open(SAVE_DIR+f'paper_detail/keywords_in_2021_{resolution}_{nmin}_tficf_waltman.pickle','rb') as f:
        print(resolution, len(pickle.load(f)))

67


In [25]:
# Number of keyword columns written per cluster (the extraction cell above asks for n=30).
N_WORDS = 30

for RESOLUTION, NMIN in zip([1e-06, 5e-04],[100000, 1000]):
    partition_col = f'partition_{RESOLUTION}'
    # Read the partition table once; both merges below use it without modifying it.
    partition_table = pd.read_pickle(SAVE_DIR+f'paper_detail/partition_in_2021_{RESOLUTION}_{NMIN}_waltman.pickle')

    # Papers per partition over the whole period.
    # Right join: keep exactly the papers present in the 2021 partition table.
    papers = pd.merge(
        pd.read_pickle(SHARE_DIR+'paper_detail/eid.pickle'),
        partition_table,
        left_index=True,right_index=True,how='right'
    )[partition_col].value_counts().to_dict()

    # Papers per partition among the 2016 papers (matched on their 'eid' column).
    papers_2016 = pd.merge(
        pd.read_pickle(SAVE_DIR+'paper_detail_2016/eid_2016.pickle'),
        partition_table,
        left_on='eid',right_index=True,how='left'
    )[partition_col].value_counts().to_dict()

    # This pickle is written by the keyword extraction cell of this notebook.
    with open(SAVE_DIR+f'paper_detail/keywords_in_2021_{RESOLUTION}_{NMIN}_tficf_waltman.pickle','rb') as f:
        keyword_dic = pickle.load(f)

    # Every row gets exactly N_WORDS keyword cells (padded with None), so a
    # cluster with fewer keywords cannot break the fixed column layout.
    # Partitions missing from a count table are reported as 0.
    rows = [
        [partition, papers.get(partition, 0), papers_2016.get(partition, 0)]
        + (list(keywords) + [None]*N_WORDS)[:N_WORDS]
        for partition,keywords in sorted(keyword_dic.items(),key=lambda x: x[0])
    ]

    keyword_table = pd.DataFrame(rows, columns=['partition','n_1970-2021','n_2016']+[f'word{i}' for i in range(N_WORDS)])

    display(keyword_table.head())

    keyword_table.to_excel(SAVE_DIR+f'paper_detail/keywords_in_2021_{RESOLUTION}_{NMIN}_tficf_waltman.xlsx')

Unnamed: 0,partition,n_1970-2021,n_2016,word0,word1,word2,word3,word4,word5,word6,...,word20,word21,word22,word23,word24,word25,word26,word27,word28,word29
0,-1,193660,5739,new species,poetry,cultivar,history,education,psychoanalysis,identity,...,intertextuality,transylvania,politics,russia,design,art,body,language,taxonomy,philosophy
1,0,168317,3193,density functional theory,dft,method mndo,acid power,ab initio calculations,method ab initio,acid force,...,relativistic effects,nmr,quantum chemical calculations,quantum-chemical calculation,molecular dynamics,hydrogen bond,hydrogen bonding,cyclic voltammetry,spectroscopic constants,quantum-chemical calculations
2,1,165883,8086,technology acceptance model,e-commerce,trust,tam,e-learning,technology acceptance,electronic commerce,...,continuance intention,consumer behaviour,social media,mobile banking,purchase intention,perceived ease of use,satisfaction,higher education,social commerce,security
3,2,138371,6558,adsorption,biosorption,activated carbon,isotherm,heavy metals,methylene blue,isotherms,...,dye removal,desorption,removal,adsorption isotherms,lead,dye,kinetic,congo red,cr(vi),heavy metal ions
4,3,129994,6921,innovation,open innovation,absorptive capacity,competitiveness,innovation performance,entrepreneurship,smes,...,knowledge transfer,globalization,collaboration,networks,alliances,industrial policy,clusters,regional innovation system,human capital,competitive advantage


# キーワード辞書の中身の確認

## 分野

In [2]:
RESOLUTION = 1e-06
NMIN = 100000

# Field-level table from the 'names' sheet: paper counts, the GPT-4 name and
# the first five keywords of every partition.
field_sheet_columns = ['partition','n_1970-2021','n_2016','nameByGPT-4'] + [f'word{i}' for i in range(5)]
field_sheet = pd.read_excel(
    SAVE_DIR+f'paper_detail/keywords_in_2021_{RESOLUTION}_{NMIN}_tficf_waltman_name.xlsx',
    sheet_name='names',
)
_ = field_sheet[field_sheet_columns]

# partition id (cast to int) -> name given by GPT-4
names_by_partition = _.set_index('partition')['nameByGPT-4'].to_dict()
keywords = {int(pid): name for pid, name in names_by_partition.items()}

In [4]:
# Display the partition id -> GPT-4 name mapping.
keywords

{-1: nan,
 0: 'Molecular & Cellular Biology',
 1: 'Neuropsychiatric Disorders',
 2: 'Organic & Inorganic Chemistry',
 3: 'Social & Political Sciences',
 4: 'Ecology & Evolution',
 5: 'Nutrition & Microbiome',
 6: 'Nanomaterials & Energy Storage',
 7: 'Computer Vision & Security',
 8: 'Cardiovascular Medicine',
 9: 'Wireless Networks & Security',
 10: 'Biomaterials & Orthopedics',
 11: 'Fluid Mechanics & Heat Transfer',
 12: 'Astronomy & Astrophysics',
 13: 'Environmental Science & Technology',
 14: 'Information & Computer Science',
 15: 'Materials Science & Engineering',
 16: 'Plant Science & Stress Responses',
 17: 'Quantum Physics & Superconductivity',
 18: 'Geology & Tectonics',
 19: 'Climate Science & Remote Sensing',
 20: 'Gastroenterology & Surgery',
 21: 'Respiratory & Critical Care Medicine',
 22: 'Power Systems & Smart Grids',
 23: 'Structural Engineering & Materials',
 24: 'Metamaterials & Antennas',
 25: 'Vibration & Material Mechanics',
 26: 'Control Systems & Robustness',


In [43]:
# Display the table currently bound to `_` (assigned explicitly by a read_excel cell).
_

Unnamed: 0,partition,n_1970-2021,n_2016,nameByGPT-4,word0,word1,word2,word3,word4
0,-1.0,185330.0,5500.0,,cultivar,new species,exile,intertextuality,onomastics
1,0.0,8166139.0,307781.0,Molecular & Cellular Biology,oxidative stress,inflammation,autophagy,apoptosis,immunohistochemistry
2,1.0,5492822.0,228993.0,Neuropsychiatric Disorders,schizophrenia,anxiety,depression,nucleus accumbens,prefrontal cortex
3,2.0,4389226.0,136514.0,Organic & Inorganic Chemistry,asymmetric catalysis,organocatalysis,cycloaddition,asymmetric synthesis,homogeneous catalysis
4,3.0,3918221.0,213049.0,Social & Political Sciences,corporate governance,firm performance,corporate social responsibility,monetary policy,foreign direct investment
...,...,...,...,...,...,...,...,...,...
62,61.0,125607.0,3052.0,Animal Breeding & Reproduction,superovulation,genetic parameters,beef cattle,semen,progesterone
63,62.0,121078.0,5314.0,Scientometrics & Research Ethics,altmetrics,plagiarism,scholarly communication,academic dishonesty,academic integrity
64,63.0,113219.0,3040.0,Chemical Engineering & Phase Equilibria,reactive distillation,equation of state,vapor-liquid equilibria,excess molar volume,co capture
65,64.0,111779.0,2380.0,Polymer Science & Engineering,compatibilization,blends,isotactic polypropylene,polymer blends,injection molding


In [14]:
# 分野レベルでノイズになった数
# Share of papers that ended up in the noise cluster (partition == -1) at the field level.
# The noise count is looked up in the table instead of being typed in by hand,
# so the ratio stays correct when the clustering is re-run.
_.loc[_['partition'] == -1, 'n_1970-2021'].sum() / _['n_1970-2021'].sum()

0.0027714802731408734

## トピック

In [5]:
# Parameters for the topic section; the cells below interpolate both values
# into the file names they read.
RESOLUTION = 5e-04
NMIN = 1000

In [13]:
# Table from the 'names' sheet for the current RESOLUTION/NMIN: paper counts and
# the first five keywords of every partition. The 'nameByGPT-4' column is not
# selected here, so the `keywords` name mapping is left untouched.
topic_sheet_columns = ['partition','n_1970-2021','n_2016'] + [f'word{i}' for i in range(5)]
topic_sheet = pd.read_excel(
    SAVE_DIR+f'paper_detail/keywords_in_2021_{RESOLUTION}_{NMIN}_tficf_waltman_name.xlsx',
    sheet_name='names',
)
_ = topic_sheet[topic_sheet_columns]

In [7]:
# トピックレベルでノイズになった数
# Share of papers that ended up in the noise cluster (partition == -1) at the topic level.
# The noise count is looked up in the table instead of being typed in by hand,
# so the ratio stays correct when the clustering is re-run.
_.loc[_['partition'] == -1, 'n_1970-2021'].sum() / _['n_1970-2021'].sum()

0.0028960495855849646

In [6]:
# Display the table currently bound to `_` (assigned explicitly by a read_excel cell).
_

Unnamed: 0,partition,n_1970-2021,n_2016,word0,word1,word2,word3,word4
0,-1.0,193660.0,5739.0,new species,poetry,cultivar,history,education
1,0.0,168317.0,3193.0,density functional theory,dft,method mndo,acid power,ab initio calculations
2,1.0,165883.0,8086.0,technology acceptance model,e-commerce,trust,tam,e-learning
3,2.0,138371.0,6558.0,adsorption,biosorption,activated carbon,isotherm,heavy metals
4,3.0,129994.0,6921.0,innovation,open innovation,absorptive capacity,competitiveness,innovation performance
...,...,...,...,...,...,...,...,...
15516,15515.0,1000.0,77.0,privacy protection,privacy,video surveillance,face de-identification,de-identification
15517,15516.0,1000.0,58.0,random walk in random environment,random environment,random walk,stochastic homogenization,random conductance model
15518,15517.0,1000.0,50.0,disability,poverty,persons with disabilities,people with disabilities,washington group
15519,15518.0,1000.0,72.0,pim-1,pim1,pim kinase,pim kinases,pim-2


# 4分野への圧縮

- 論文誌に付与されたASJCコードを、先頭2桁(接頭辞)に基づいて4分野に集約し、各クラスタを特化係数(クラスタ内の分野比率 ÷ 全体の分野比率)が最も大きい分野に割り振る

## field

In [27]:
RESOLUTION = 1e-06
NMIN = 100000

# RESOLUTION = 5e-04
# NMIN = 1000

# Field-level table from the 'names' sheet and the partition id -> GPT-4 name mapping.
_ = pd.read_excel(
     SAVE_DIR+f'paper_detail/keywords_in_2021_{RESOLUTION}_{NMIN}_tficf_waltman_name.xlsx',
     sheet_name='names'
 )[['partition','n_1970-2021','n_2016','nameByGPT-4'] + [f'word{i}' for i in range(5)]]
keywords = {int(k):v for k,v in _.set_index('partition')['nameByGPT-4'].to_dict().items()}

# One row per (paper, ASJC code): the ASJC lists are joined to the partition
# ids and then exploded. The partition file name is built from RESOLUTION/NMIN
# (it used to be hard-coded) so it always matches the Excel file read above.
papers = pd.concat([
    pd.read_pickle(SHARE_DIR+'paper_detail/asjcs.pickle'),
    pd.read_pickle(SAVE_DIR+f'paper_detail/partition_in_2021_{RESOLUTION}_{NMIN}_waltman.pickle'),
],axis=1).explode('asjcs')

In [43]:
# ASJC classification references:
# https://service.elsevier.com/app/answers/detail/a_id/15181/supporthub/scopus/
# https://service.elsevier.com/app/answers/detail/a_id/16205/supporthub/scopus/

# Two-character ASJC prefixes of each of the four large fields.
prefix_health = ['27','29','34','35','36']
prefix_life = ['11','13','24','28','30']
prefix_phys = ['15','16','17','19','21','22','23','25','26','31']
prefix_social = ['12','14','18','20','32','33']

# prefix -> large-field id (0: health, 1: life, 2: physical, 3: social)
prefixs = {
    prefix: field_id
    for field_id, prefix_group in enumerate([prefix_health, prefix_life, prefix_phys, prefix_social])
    for prefix in prefix_group
}


def _to_large_asjc(code):
    """Maps one ASJC value to its large-field id via its first two characters; -1 when no prefix matches."""
    return prefixs.get(str(code)[:2], -1)


# Assign every (paper, ASJC) row to one of the four fields.
papers['large_asjc'] = papers['asjcs'].map(_to_large_asjc)

# Overall share of each large field; the baseline for the per-cluster ratios.
asjc_rate = papers.value_counts('large_asjc', normalize=True)
print(asjc_rate)

large_asjc
 2    0.507282
 0    0.197190
 1    0.174926
 3    0.113334
-1    0.007268
dtype: float64


In [48]:
# partition id -> large field with the highest specialization coefficient
pid_to_largefield = {}

for pid, cluster_papers in papers.groupby('partition_1e-06'):
    # Share of each large field inside this cluster ...
    asjc_rate_cluster = cluster_papers.value_counts('large_asjc', normalize=True)
    # ... relative to its overall share (the specialization coefficient).
    specialization = asjc_rate_cluster / asjc_rate
    # NOTE(review): the unmapped bucket (-1) takes part in the ranking and can
    # win (the recorded output shows -1 for partition 12.0) -- confirm this is intended.
    ranked = specialization.sort_values(ascending=False)
    most_freq_asjc = ranked.index[0]
    pid_to_largefield[pid] = most_freq_asjc
    print(pid, keywords[pid], most_freq_asjc)

-1.0 nan 3
0.0 Molecular & Cellular Biology 1
1.0 Neuropsychiatric Disorders 3
2.0 Organic & Inorganic Chemistry 2
3.0 Social & Political Sciences 3
4.0 Ecology & Evolution 1
5.0 Nutrition & Microbiome 1
6.0 Nanomaterials & Energy Storage 2
7.0 Computer Vision & Security 2
8.0 Cardiovascular Medicine 0
9.0 Wireless Networks & Security 2
10.0 Biomaterials & Orthopedics 0
11.0 Fluid Mechanics & Heat Transfer 2
12.0 Astronomy & Astrophysics -1
13.0 Environmental Science & Technology 2
14.0 Information & Computer Science 3
15.0 Materials Science & Engineering 2
16.0 Plant Science & Stress Responses 1
17.0 Quantum Physics & Superconductivity 2
18.0 Geology & Tectonics 2
19.0 Climate Science & Remote Sensing 2
20.0 Gastroenterology & Surgery 0
21.0 Respiratory & Critical Care Medicine 0
22.0 Power Systems & Smart Grids 2
23.0 Structural Engineering & Materials 2
24.0 Metamaterials & Antennas 2
25.0 Vibration & Material Mechanics 2
26.0 Control Systems & Robustness 2
27.0 Optics & Photonics 2

In [49]:
# Number of partitions assigned to each large field, for partition ids 0..47.
# NOTE(review): 48 is hard-coded although larger partition ids exist (56, 57 and
# 62 are corrected by hand later in this notebook) -- confirm the intended range.
Counter(pid_to_largefield[k] for k in range(48))

Counter({1: 4, 3: 6, 2: 23, 0: 14, -1: 1})

In [50]:
# おかしいものは手元で微修正
pid_to_largefield[1] = 0
pid_to_largefield[12] = 2
pid_to_largefield[14] = 2
pid_to_largefield[56] = 2
pid_to_largefield[57] = 2
pid_to_largefield[62] = 3

In [51]:
# Persist the partition id -> large-field mapping for the current RESOLUTION/NMIN.
asjc_map_path = SAVE_DIR+f'paper_detail/partition_to_ASJC_in_2021_{RESOLUTION}_{NMIN}.pickle'
with open(asjc_map_path,'wb') as f:
    pickle.dump(pid_to_largefield, f)

## topic

In [2]:
RESOLUTION = 5e-04
NMIN = 1000

# One row per (paper, ASJC code): the ASJC lists are joined to the partition
# ids and then exploded. The partition file name is built from RESOLUTION/NMIN
# (it used to be hard-coded), matching how the other cells build their paths;
# with the values above it resolves to the same file as before.
papers = pd.concat([
    pd.read_pickle(SHARE_DIR+'paper_detail/asjcs.pickle'),
    pd.read_pickle(SAVE_DIR+f'paper_detail/partition_in_2021_{RESOLUTION}_{NMIN}_waltman.pickle'),
],axis=1).explode('asjcs')

In [3]:
# ASJC classification references:
# https://service.elsevier.com/app/answers/detail/a_id/15181/supporthub/scopus/
# https://service.elsevier.com/app/answers/detail/a_id/16205/supporthub/scopus/

# Two-character ASJC prefixes of each of the four large fields.
prefix_health = ['27','29','34','35','36']
prefix_life = ['11','13','24','28','30']
prefix_phys = ['15','16','17','19','21','22','23','25','26','31']
prefix_social = ['12','14','18','20','32','33']

# prefix -> large-field id (0: health, 1: life, 2: physical, 3: social)
prefixs = {}
for field_id, prefix_group in enumerate((prefix_health, prefix_life, prefix_phys, prefix_social)):
    prefixs.update(dict.fromkeys(prefix_group, field_id))

# Assign every (paper, ASJC) row to one of the four fields via the first two
# characters of its ASJC value; -1 when no prefix matches.
first_two_chars = papers['asjcs'].map(lambda code: str(code)[:2])
papers['large_asjc'] = first_two_chars.map(lambda prefix: prefixs.get(prefix, -1))

# Overall share of each large field; the baseline for the per-cluster ratios.
asjc_rate = papers.value_counts('large_asjc', normalize=True)
print(asjc_rate)

large_asjc
 2    0.507282
 0    0.197190
 1    0.174926
 3    0.113334
-1    0.007268
dtype: float64


In [6]:
# partition id -> large field with the highest specialization coefficient
pid_to_largefield = {}

for pid, cluster_papers in papers.groupby('partition_0.0005'):
    # Share of each large field inside this cluster, relative to its overall
    # share (the specialization coefficient); the top-ranked field is kept.
    asjc_rate_cluster = cluster_papers.value_counts('large_asjc', normalize=True)
    specialization = asjc_rate_cluster / asjc_rate
    most_freq_asjc = specialization.sort_values(ascending=False).index[0]
    pid_to_largefield[pid] = most_freq_asjc
    # (per-cluster printing is disabled for this resolution)

In [7]:
# Persist the partition id -> large-field mapping for the current RESOLUTION/NMIN.
asjc_map_path = SAVE_DIR+f'paper_detail/partition_to_ASJC_in_2021_{RESOLUTION}_{NMIN}.pickle'
with open(asjc_map_path,'wb') as f:
    pickle.dump(pid_to_largefield, f)