In [1]:
import configparser
import math
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pickle
import os

from tqdm import tqdm

# Machine-specific directory layout is kept in config.ini.
config = configparser.ConfigParser()
config.read('./config.ini', encoding='utf-8')
SAVE_DIR, WORK_DIR, SHARE_DIR = (
    config.get('settings', key) for key in ('SAVE_DIR', 'WORK_DIR', 'SHARE_DIR')
)

# Experiment parameters (year range, clustering resolution, minimum cluster size)
# are kept in settings.ini.
settings = configparser.ConfigParser()
settings.read('./settings.ini', encoding='utf-8')
MIN_YEAR = settings.getint('experiment', 'MIN_YEAR')
MAX_YEAR = settings.getint('experiment', 'MAX_YEAR')
RESOLUTION = settings.getfloat('experiment', 'RESOLUTION')
NMIN = settings.getint('experiment', 'NMIN')

In [2]:
# Load the per-partition name sheet and keep only the columns of interest.
_ = pd.read_excel(
    SAVE_DIR + f'paper_detail/keywords_in_2021_{RESOLUTION}_{NMIN}_tficf_waltman_name.xlsx',
    sheet_name='names',
)
_ = _[['partition', 'n_1970-2021', 'n_2016', 'nameByGPT-4'] + [f'word{i}' for i in range(5)]]

# partition id (as int) -> name taken from the 'nameByGPT-4' column
keywords = {
    int(partition_id): name
    for partition_id, name in _.set_index('partition')['nameByGPT-4'].to_dict().items()
}
keywords

{-1: nan,
 0: 'Molecular & Cellular Biology',
 1: 'Neuropsychiatric Disorders',
 2: 'Organic & Inorganic Chemistry',
 3: 'Social & Political Sciences',
 4: 'Ecology & Evolution',
 5: 'Nutrition & Microbiome',
 6: 'Nanomaterials & Energy Storage',
 7: 'Computer Vision & Security',
 8: 'Cardiovascular Medicine',
 9: 'Wireless Networks & Security',
 10: 'Biomaterials & Orthopedics',
 11: 'Fluid Mechanics & Heat Transfer',
 12: 'Astronomy & Astrophysics',
 13: 'Environmental Science & Technology',
 14: 'Information & Computer Science',
 15: 'Materials Science & Engineering',
 16: 'Plant Science & Stress Responses',
 17: 'Quantum Physics & Superconductivity',
 18: 'Geology & Tectonics',
 19: 'Climate Science & Remote Sensing',
 20: 'Gastroenterology & Surgery',
 21: 'Respiratory & Critical Care Medicine',
 22: 'Power Systems & Smart Grids',
 23: 'Structural Engineering & Materials',
 24: 'Metamaterials & Antennas',
 25: 'Vibration & Material Mechanics',
 26: 'Control Systems & Robustness',


# Create data

In [3]:
# Per-paper detail pickles for 2016; they are concatenated column-wise in this order.
detail_files_2016 = [
    'paper_detail_2016/eid_2016.pickle',
    'paper_detail_2016/authids_2016.pickle',
    'paper_detail_2016/subjs_2016.pickle',
    'paper_detail_2016/doi_2016.pickle',
    'paper_detail_2016/doctype_2016.pickle',
    'paper_detail_2016/journal_2016.pickle',
    'paper_detail_2016/c_history_2016.pickle',
    'paper_detail_2016/c_history_sum_2016.pickle',
    f'paper_detail_2016/c_normalized_in_2021_{RESOLUTION}_{NMIN}_2016_waltman.pickle',
    'paper_detail_2016/CD_2016.pickle',
]
details_2016 = pd.concat(
    [pd.read_pickle(SAVE_DIR + relative_path) for relative_path in detail_files_2016],
    axis=1,
)

# Attach the partition of each paper (left join on eid against the partition index).
partition_2021 = pd.read_pickle(
    SAVE_DIR + f'paper_detail/partition_in_2021_{RESOLUTION}_{NMIN}_waltman.pickle'
)
papers_2016 = details_2016.merge(partition_2021, left_on='eid', right_index=True, how='left')

# Sum of the first three entries of the citation history (stored as "..._2018").
papers_2016['c_history_sum_2018'] = papers_2016['c_history'].map(lambda history: sum(history[:3]))
print('2016年の全データ', papers_2016.shape)

# Exclude those that are books (ch, bk), those that are short (sh, er), and those that are
# comments (no, cr (to be checked later), dp, tb, ip, ab, bz, rp).
kept_doctypes = {'ar', 'cp', 're', 'le', 'ed'}
papers_2016 = papers_2016.loc[papers_2016['doctype'].isin(kept_doctypes)].copy()
print('ar,cp,re,le,edのいずれかのデータ', len(papers_2016))

2016年の全データ (3053383, 12)
ar,cp,re,le,edのいずれかのデータ 2747241


In [4]:
# Number of 2016 papers per partition, ordered by partition id.
partition_sizes = papers_2016[f'partition_{RESOLUTION}'].value_counts().sort_index()
for partition_id, n_papers in partition_sizes.items():
    print(f'{partition_id!s:<8}', n_papers)

-1.0     4780
0.0      284408
1.0      203464
2.0      131490
3.0      164481
4.0      127465
5.0      113198
6.0      128505
7.0      107891
8.0      61198
9.0      76124
10.0     62373
11.0     59738
12.0     44191
13.0     51060
14.0     61579
15.0     45363
16.0     43567
17.0     31773
18.0     40224
19.0     41908
20.0     31171
21.0     30812
22.0     50331
23.0     37908
24.0     35337
25.0     29167
26.0     27556
27.0     23369
28.0     20512
29.0     22988
30.0     17536
31.0     28186
32.0     20979
33.0     26151
34.0     22504
35.0     16786
36.0     17697
37.0     17005
38.0     19444
39.0     15116
40.0     13919
41.0     12097
42.0     11213
43.0     10774
44.0     15736
45.0     12037
46.0     10377
47.0     11623
48.0     7346
49.0     8491
50.0     7880
51.0     8055
52.0     6877
53.0     8046
54.0     8319
55.0     6940
56.0     9166
57.0     5675
58.0     5463
59.0     3778
60.0     3904
61.0     2916
62.0     4528
63.0     2837
64.0     2208
65.0     3476


# Add columns

## jif

In [5]:
def get_jif():
    """Compute a journal-impact-factor-like score for every journal and year.

    For journal ``j`` and year ``y`` the score is

        (sum of c_history[1] over papers of j published in y-1
         + sum of c_history[2] over papers of j published in y-2)
        / (number of papers of j published in y-1 or y-2)

    Years for which the journal has no papers in ``y-1``/``y-2`` are omitted.
    NOTE(review): this presumes ``c_history[i]`` is the number of citations
    received ``i`` years after publication - confirm against the data source.

    Returns:
        dict: ``{journal_id: {year: score}}`` for years 1970..2022.

    The original run took about 40 minutes; the dictionary lookups below
    replace repeated MultiIndex ``Series.get`` calls to reduce that cost.
    """
    # NOTE(review): hard-coded path that shadows the SHARE_DIR read from config.ini.
    SHARE_DIR = '/disks/qnap2/shared/scopus_2022/'
    _papers = pd.concat([
        pd.read_pickle(SHARE_DIR+'paper_detail/eid.pickle'),
        pd.read_pickle(SHARE_DIR+'paper_detail/year.pickle'),
        pd.read_pickle(SHARE_DIR+'paper_detail/journal.pickle'),
        pd.read_pickle('/disks/qnap2/data/t-miura/2022_fieldmerge/SIGMET/paper_detail/c_history.pickle')
    ], axis=1)

    # c1 / c2: second / third entry of the citation history (0 when the history is too short).
    journal_papers = (
        _papers[['eid','year','journal','c_history']].set_index('eid')
        .assign(c1=lambda df: df['c_history'].map(lambda c_list: c_list[1] if len(c_list) > 1 else 0))
        .assign(c2=lambda df: df['c_history'].map(lambda c_list: c_list[2] if len(c_list) > 2 else 0))
        [['year','journal','c1','c2']]
    )

    # Plain dicts keyed by (journal, year) give O(1) lookups in the loop below.
    journal_counts = journal_papers[['journal','year']].value_counts().to_dict()
    journal_citations = journal_papers.groupby(['journal','year'])[['c1','c2']].sum()
    journal_citations_c1 = journal_citations['c1'].to_dict()
    journal_citations_c2 = journal_citations['c2'].to_dict()

    journal_IFs = {}
    for journal_id in tqdm(list(journal_papers['journal'].unique())):
        scores_by_year = {}
        for year in range(1970, 2023):
            n_papers = (journal_counts.get((journal_id, year-2), 0)
                        + journal_counts.get((journal_id, year-1), 0))
            if n_papers == 0:
                continue
            n_citations = (journal_citations_c1.get((journal_id, year-1), 0)
                           + journal_citations_c2.get((journal_id, year-2), 0))
            scores_by_year[year] = n_citations / n_papers
        journal_IFs[journal_id] = scores_by_year

    # assign_jif() reads the result from this location; uncomment to regenerate the file.
    # with open('/disks/qnap2/data/t-miura/2023_readership/papers/jif.pickle', 'wb') as f:
    #     pickle.dump(journal_IFs, f)

    # Previously the result was computed and then discarded; return it to the caller.
    return journal_IFs

In [6]:
def assign_jif(papers, year):
    """Add a ``jif`` column holding each paper's journal score for ``year``.

    The scores are read from the pickled ``{journal: {year: score}}`` dictionary.
    A journal that is missing from the dictionary (including a NaN journal) or
    that has no entry for ``year`` gets 0.0 instead of raising ``KeyError``.

    Args:
        papers: DataFrame with a ``journal`` column. It is modified in place.
        year: Year whose score is looked up.

    Returns:
        The same ``papers`` DataFrame, with the ``jif`` column added.
    """
    jif_dic = pd.read_pickle('/disks/qnap2/data/t-miura/2023_readership/papers/jif.pickle')
    # Column-wise map instead of row-wise apply: also works on an empty frame.
    papers['jif'] = papers['journal'].map(lambda journal: jif_dic.get(journal, {}).get(year, .0))
    return papers

## partition

In [7]:
def assign_partition(papers):
    """Attach the fine-grained partition and the ASJC label of the coarse partition.

    Args:
        papers: DataFrame with an ``eid`` column and the coarse partition column
            ``partition_{RESOLUTION}``.

    Returns:
        A new DataFrame with the columns of the 0.0005/1000 partition pickle
        left-joined on ``eid``, plus ``partition_ASJC`` mapped from the coarse
        partition id.
    """
    papers = pd.merge(
        papers,
        pd.read_pickle(SAVE_DIR+'paper_detail/partition_in_2021_0.0005_1000_waltman.pickle'),
        left_on='eid', right_index=True, how='left'
    )
    # NOTE(review): pickle.load executes arbitrary code; only use on trusted project files.
    with open(SAVE_DIR+f'paper_detail/partition_to_ASJC_in_2021_{RESOLUTION}_{NMIN}.pickle','rb') as f:
        pid_to_largefield = pickle.load(f)
    # The mapping file is selected by RESOLUTION, so the partition column it is
    # applied to is derived from RESOLUTION as well (was hard-coded to 'partition_1e-06').
    papers['partition_ASJC'] = papers[f'partition_{RESOLUTION}'].map(pid_to_largefield)

    return papers

## readership

In [8]:
# readership
def get_df(lines):
    """Parse a tab-separated readership dump into a DataFrame.

    The first line is the header; its first field is replaced by ``eid``.
    Every column from the third one onwards is cast to int. Only rows whose
    raw eid string is exactly 18 characters long are kept, and the eid is
    converted to an int after dropping its first 7 characters.
    """
    header = lines[0].strip('\n').split('\t')
    columns = ['eid'] + header[1:]
    records = [line.strip('\n').split('\t') for line in lines[1:]]
    table = pd.DataFrame(records, columns=columns)

    typed = table.astype({column: int for column in columns[2:]})
    has_full_eid = table['eid'].map(len) == 18
    kept = typed[has_full_eid]
    return kept.assign(eid=lambda df: df['eid'].map(lambda raw_eid: int(raw_eid[7:])))

def merge_df(papers, readership):
    """Inner-join ``papers`` with ``readership`` on ``eid``, without the readership ``doi`` column."""
    readership_without_doi = readership.drop(columns=['doi'])
    return papers.merge(readership_without_doi, on='eid')

In [9]:
def assign_readership(papers):
    """Left-join the readership tables (highly / mid / lowly cited dumps) onto ``papers``.

    The three text dumps are parsed with ``get_df``, stacked in the order
    highly, mid, lowly, and merged on ``eid`` after dropping their ``doi`` column.
    """
    readership_files = (
        '/disks/qnap2/data/t-miura/2023_readership/papers/old_waltman_1e-06/DOI_LIST/readership_data/highly_cited_readership.txt',
        '/disks/qnap2/data/t-miura/2023_readership/papers/old_waltman_1e-06/DOI_LIST/readership_data/mid_cited_readership.txt',
        '/disks/qnap2/data/t-miura/2023_readership/papers/old_waltman_1e-06/DOI_LIST/readership_data/lowly_cited_readership.txt',
    )

    tables = []
    for path in readership_files:
        with open(path, 'r') as f:
            tables.append(get_df(f.readlines()))

    readership = pd.concat(tables)
    return papers.merge(readership.drop(columns=['doi']), on='eid', how='left')

## read_per_cite

In [10]:
def calc_read_per_cite_df(df):
    """Add ``read_percite_{year}`` columns for 2016..2022 to ``df`` (in place) and return it.

    Each column is (cumulative readers + 1) / (citations accumulated over the
    first ``year - 2015`` entries of ``c_history`` + 1).
    """
    for year in range(2016, 2023):
        n_entries = year - 2015
        cumulative_citations = df['c_history'].map(lambda history: sum(history[:n_entries]))
        readers_plus_one = df[f'cumulative_reader_{year}'] + 1
        df[f'read_percite_{year}'] = readers_plus_one / (cumulative_citations + 1)
    return df

def assign_read_percite(papers):
    """Pipeline-style wrapper: add the ``read_percite_{year}`` columns via ``calc_read_per_cite_df``."""
    papers_with_ratio = calc_read_per_cite_df(papers)
    return papers_with_ratio

## disruption

In [11]:
def get_disruption():
    """Compute a disruption-style score for the 2016 papers and pickle the results.

    Two files are written:
      * ``citations_2016_disruptiveness_2022_dropNaN.pickle``: one row per
        (source, target) pair with an ``is_develop`` flag.
      * ``D_nok_2016_2022_dropNaN.pickle``: Series ``D_nok`` indexed by the
        2016 paper id, read later by ``assign_disruption``.

    NOTE(review): presumably ``source`` lists the papers citing the row's paper
    and ``target`` lists the papers it references - confirm against how
    ``citations_gb.pickle`` is built.
    """
    eids_2016 = pd.read_pickle('/disks/qnap2/data/t-miura/2023_readership/paper_detail_2016/eid_2016.pickle')
    citations_gb = pd.read_pickle('/disks/qnap2/shared/scopus_2022/citations_gb.pickle')
    # Restrict to the 2016 papers and drop those without a 'target' entry.
    citations_gb_2016 = citations_gb.reindex(eids_2016.values)
    citations_gb_2016 = citations_gb_2016.dropna(subset=['target'])

    idxs = citations_gb_2016.index.to_list()
    # Non-list cells (e.g. NaN) are treated as empty sets.
    sources = citations_gb_2016['source'].map(lambda x: set(x) if type(x) == list else set()).to_list()
    targets = citations_gb_2016['target'].map(lambda x: set(x) if type(x) == list else set()).to_list()
    # 'target' sets for every paper in the full table, keyed by its index.
    targets_all_dic = citations_gb['target'].map(lambda x: set(x) if type(x) == list else set()).to_dict()

    # For each 2016 paper (idx) and each of its sources (s): is_develop = 1 when the
    # 'target' set of s shares at least one element with the 'target' set of idx, else 0.
    citations_2016 = pd.DataFrame(list(set(
        [
            (s,idx,1) if targets_all_dic.get(s,set()) & t_set else (s,idx,0) 
            for idx,s_set,t_set in zip(idxs,sources,targets) 
            for s in s_set
        ]))
    ,columns=['source','target','is_develop'])
    citations_2016.to_pickle('/disks/qnap2/data/t-miura/2023_readership/paper_detail_2016/citations_2016_disruptiveness_2022_dropNaN.pickle')

    def calc_D_nok(n_disrupt,n_develop):
        """Return (n_disrupt - n_develop) / (n_disrupt + n_develop), or NaN when both are 0."""
        if n_disrupt+n_develop == 0:
            return np.nan
        else:
            return (n_disrupt-n_develop) / (n_disrupt+n_develop)


    D = {}
    # One score per 2016 paper: rows with is_develop == 1 count as "develop",
    # all remaining rows for that paper count as "disrupt".
    for eid_2016, df in tqdm(citations_2016.groupby('target')):
        n_develop = df['is_develop'].sum()
        n_disrupt = len(df) - n_develop
        D[eid_2016] = calc_D_nok(n_disrupt, n_develop)
    pd.Series(D, name='D_nok').to_pickle('/disks/qnap2/data/t-miura/2023_readership/paper_detail_2016/D_nok_2016_2022_dropNaN.pickle')
    
def assign_disruption(papers):
    """Left-join the pickled ``D_nok`` Series (indexed by paper id) onto ``papers`` via ``eid``."""
    d_nok = pd.read_pickle('/disks/qnap2/data/t-miura/2023_readership/paper_detail_2016/D_nok_2016_2022_dropNaN.pickle')
    return papers.merge(d_nok, left_on='eid', right_index=True, how='left')

## capacity

In [12]:
def assign_capacity(papers):
    """Add a ``capacity`` column computed from the citation histories of each paper's references.

    For a paper with references ``r``:

        capacity = 1 - G(citations of r before the paper's year) / G(all citations of r)

    where ``G`` is the geometric mean of ``count + 1``. The "before" count sums the
    first ``year(paper) - year(r)`` entries of the reference's citation history.

    Papers whose reference entry is not a list, or is an empty list, get NaN.
    A reference missing from the citation-history or year tables still raises
    ``KeyError``, as before.

    Args:
        papers: DataFrame with an ``eid`` column.

    Returns:
        A new DataFrame with the ``capacity`` column added.
    """
    c_histories = pd.read_pickle('/disks/qnap_m/rawdata/scopus/2022/paper_detail/c_history.pickle').to_dict()
    years = pd.read_pickle('/disks/qnap2/shared/scopus_2022/paper_detail/year.pickle').to_dict()
    refs = pd.read_pickle('/disks/qnap2/shared/scopus_2022/citations_gb.pickle')['target'].to_dict()

    def calc_capacity(eid):
        _refs = refs.get(eid, [])
        # No usable reference list: return NaN explicitly instead of relying on
        # np.mean([]) (which emits RuntimeWarnings before producing NaN).
        if not isinstance(_refs, list) or len(_refs) == 0:
            return np.nan

        log_c_star = []  # citations accumulated before the citing paper's year
        log_c_inf = []   # citations over the whole recorded history
        for _ref_eid in _refs:
            history = c_histories[_ref_eid]
            # A reference dated later than the citing paper would give a negative
            # slice bound (history[:-k] drops the last k entries rather than
            # yielding nothing), so clamp the number of elapsed years at 0.
            n_years = max(0, years[eid] - years[_ref_eid])
            log_c_star.append(np.log(sum(history[:n_years]) + 1))
            log_c_inf.append(np.log(sum(history) + 1))

        c_ref_star_mean = np.exp(np.mean(log_c_star))
        c_ref_inf_mean = np.exp(np.mean(log_c_inf))
        return 1 - (c_ref_star_mean / c_ref_inf_mean)

    papers = papers.assign(capacity=lambda df: df['eid'].map(calc_capacity))

    return papers

## 2018, 2021 year, HCP, MCP, LCP

In [13]:
def label_hcp_mcp_lcp(c, c_top10p, c_bottom50p):
    """Classify a citation count against two thresholds.

    Returns 'top' when ``c`` is strictly above ``c_top10p``, 'bottom' when it is
    strictly below ``c_bottom50p``, 'middle' otherwise, and a dedicated label
    when ``c`` is NaN.
    """
    if np.isnan(c):
        return 'invalid citaiton'
    if c > c_top10p:
        return 'top'
    if c < c_bottom50p:
        return 'bottom'
    return 'middle'

def get_hcp_mcp_lcp(papers, year):
    """Label every paper as top / middle / bottom within its ``partition_1e-06`` cluster.

    Thresholds are the 90th and 50th percentiles of the cluster's citation
    column (``c_history_sum_2018`` for 2018, ``c_history_sum`` for 2021),
    computed over non-NaN values. Papers in cluster -1 are labelled
    'invalid cluster'. The thresholds of each cluster are printed.

    Args:
        papers: DataFrame with ``partition_1e-06`` and the citation column.
        year: 2018 or 2021.

    Returns:
        A new DataFrame, in the row order of ``papers``, with the column
        ``label_hml_{year}`` added.

    Raises:
        ValueError: if ``year`` is neither 2018 nor 2021.

    NOTE(review): rows whose partition is NaN are skipped by the cluster loop and
    come back from ``reindex`` with every column set to NaN. Kept as is because
    later cells may depend on it - confirm whether this is intended.
    """
    if year == 2018:
        citation_col = 'c_history_sum_2018'
    elif year == 2021:
        citation_col = 'c_history_sum'
    else:
        # `raise('...')` raised a str, which itself fails with an unrelated TypeError.
        raise ValueError('year is not 2018 or 2021')

    label_col = f'label_hml_{year}'
    dfs = []
    for cluster in sorted(papers['partition_1e-06'].dropna().unique()):
        papers_cluster = papers[papers['partition_1e-06']==cluster].copy()

        if cluster == -1:
            papers_cluster[label_col] = ['invalid cluster' for i in range(len(papers_cluster))]
        else:
            valid_citations = papers_cluster[citation_col].dropna()
            c_normalized_top10p = np.percentile(valid_citations, 90)
            c_normalized_bottom50p = np.percentile(valid_citations, 50)
            print(cluster, c_normalized_top10p, c_normalized_bottom50p)
            papers_cluster[label_col] = papers_cluster[citation_col].map(
                lambda x: label_hcp_mcp_lcp(x, c_normalized_top10p, c_normalized_bottom50p)
            )

        dfs.append(papers_cluster)
    # Restore the original row order after the per-cluster processing.
    papers = pd.concat(dfs).reindex(papers.index)
    return papers
    
def assign_hcp_mcp_lcp(papers):
    """Add the ``label_hml_2018`` and then the ``label_hml_2021`` columns via ``get_hcp_mcp_lcp``."""
    for label_year in (2018, 2021):
        papers = get_hcp_mcp_lcp(papers, year=label_year)
    return papers

## auth_prestige

In [14]:
def assign_auth_prestige(papers):
    """Add ``auth_prestige``: the largest 2015 score among a paper's authors (at least 0).

    Scores come from ``authid_medcite[2015]``; authors without an entry count
    as 0 and NaN scores are ignored by ``np.nanmax``.
    NOTE(review): presumably the score is a per-author median citation count - confirm.
    """
    authid_medcite = pd.read_pickle('/disks/qnap2/data/t-miura/2022_storyteller/project1/IC2s2_CD15/regression/authid_medcite.pickle')

    def best_author_score(authids):
        # The leading 0 is the floor and also covers papers with no authors.
        scores = [0]
        scores.extend(authid_medcite[2015].get(authid, 0) for authid in authids)
        return np.nanmax(scores)

    return papers.assign(auth_prestige=papers['authids'].map(best_author_score))

## Execute

In [15]:
# Columns available before the derived features below are attached.
papers_2016.columns

Index(['eid', 'authids', 'subjs', 'doi', 'doctype', 'journal', 'c_history',
       'c_history_sum', 'c_normalized_1e-06', 'CD', 'partition_1e-06',
       'c_history_sum_2018'],
      dtype='object')

In [None]:
# Journal score looked up for year 2015 (0.0 when the journal has no 2015 entry).
papers_2016 = assign_jif(papers_2016, year=2015)

In [None]:
# Adds the 0.0005-resolution partition columns and 'partition_ASJC'.
papers_2016 = assign_partition(papers_2016)

In [None]:
# Left join: papers without readership data keep NaN in the readership columns.
papers_2016 = assign_readership(papers_2016)

In [None]:
# Needs the 'cumulative_reader_{year}' columns added by assign_readership.
papers_2016 = assign_read_percite(papers_2016)

In [None]:
# Adds 'auth_prestige' from the 'authids' column.
papers_2016 = assign_auth_prestige(papers_2016)

In [None]:
# get_disruption() rebuilds and overwrites the D_nok pickle that assign_disruption() reads;
# it must run at least once before the assignment below.
get_disruption()
papers_2016 = assign_disruption(papers_2016)

In [None]:
# Adds 'label_hml_2018' and 'label_hml_2021' (top / middle / bottom within each cluster).
papers_2016 = assign_hcp_mcp_lcp(papers_2016)

In [None]:
# Number of papers for each combination of the 2018 and 2021 labels.
papers_2016.value_counts(['label_hml_2018','label_hml_2021'])

In [None]:
# Adds 'capacity', computed from the citation histories of each paper's references.
papers_2016 = assign_capacity(papers_2016)

In [None]:
# Sanity check of the final shape and column set before saving.
print(papers_2016.shape)
papers_2016.columns

In [None]:
# Preview the first rows of the assembled table.
papers_2016.head()

In [None]:
# Persist the full feature table; the cells below reload it from this path.
papers_2016.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_2016_all.pickle')

In [None]:
# Readership coverage check on the saved table: total rows, rows with a 2018 reader
# count, and rows that additionally have a 2021 reader count.
import pandas as pd
papers = pd.read_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_2016_all.pickle')
print(papers.shape)
_ = papers[papers['cumulative_reader_2018'].notna()]
print(len(_))
_ = _[_['cumulative_reader_2021'].notna()]
print(len(_)) # 85.4% of the papers have readership data

# Narrow down the data to be used in the experiment.

## readership valid(exist + 2018<=2021)

In [None]:
# Reload the saved table so this section can be run on its own.
papers_2016 = pd.read_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_2016_all.pickle')

# Number of papers whose 2018 reader count exceeds the 2021 one.
print('2018年のreadershipが2021年より多い場合: ', (papers_2016['cumulative_reader_2018']>papers_2016['cumulative_reader_2021']).sum())
# Comparisons involving NaN are False, so papers without readership data are excluded here too.
valid_eids = papers_2016[papers_2016['cumulative_reader_2018']<=papers_2016['cumulative_reader_2021']]['eid'] # eids with valid readership

papers_2016_validread = papers_2016[papers_2016['eid'].isin(valid_eids)]

In [None]:
# Size of the full table, for comparison with the filtered one below.
papers_2016.shape

In [None]:
# Size of the valid-readership subset, then persist it for the matching section.
print(papers_2016_validread.shape)
papers_2016_validread.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_2016_all_validread.pickle')

In [None]:
# Same coverage check as above, now on the valid-readership subset.
papers_2016_validread = pd.read_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_2016_all_validread.pickle')

print(papers_2016_validread.shape)
_ = papers_2016_validread[papers_2016_validread['cumulative_reader_2018'].notna()]
print(len(_))
_ = _[_['cumulative_reader_2021'].notna()]
print(len(_)) # 85.4% of the papers have readership data

# Distribution of the 2021 labels within the subset.
papers_2016_validread.value_counts(['label_hml_2021'])

In [None]:
# Inspect the papers whose 2018 cumulative reader count is zero.
papers_2016_validread[papers_2016_validread['cumulative_reader_2018']==0]

# Matching

In [None]:
# Reload the valid-readership subset so the matching section can be run on its own.
papers_2016_validread = pd.read_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_2016_all_validread.pickle')

## citation + field

In [None]:
# Groups, defined by the 2018 / 2021 labels:
#   slow: middle or bottom in 2018, top in 2021
#   fast: top in 2018 and top in 2021
#   low : middle or bottom in both years
# slow vs. low are matched on the 2018 citation count,
# slow vs. fast are matched on the 2021 citation count, always within one partition.
papers_slow_lowmatched2018 = []
papers_low_slowmatched2018 = []
papers_slow_fastmatched2021 = []
papers_fast_slowmatched2021 = []

# The label masks do not depend on the partition, so build them once.
not_top_2018 = papers_2016_validread['label_hml_2018'].isin(['middle','bottom'])
top_2018 = papers_2016_validread['label_hml_2018'].isin(['top'])
not_top_2021 = papers_2016_validread['label_hml_2021'].isin(['middle','bottom'])
top_2021 = papers_2016_validread['label_hml_2021'].isin(['top'])
slow_all = papers_2016_validread[not_top_2018 & top_2021]
fast_all = papers_2016_validread[top_2018 & top_2021]
low_all = papers_2016_validread[not_top_2018 & not_top_2021]

for partition in tqdm(sorted(papers_2016_validread['partition_1e-06'].unique())):
    partition_filter = f'`partition_1e-06` == {partition}'
    df_slow = slow_all.query(partition_filter)
    df_fast = fast_all.query(partition_filter)
    df_low = low_all.query(partition_filter)

    print(partition, len(df_slow), len(df_fast), len(df_low))

    # slow vs. low: for every 2018 citation count seen among the slow papers,
    # draw the same number of papers from both groups.
    for c2018 in df_slow['c_history_sum_2018'].value_counts().index:
        slow_same_count = df_slow[df_slow['c_history_sum_2018'] == c2018]
        low_same_count = df_low[df_low['c_history_sum_2018'] == c2018]
        n_sample = min(len(slow_same_count), len(low_same_count))
        papers_slow_lowmatched2018.append(slow_same_count.sample(n_sample, random_state=42))
        papers_low_slowmatched2018.append(low_same_count.sample(n_sample, random_state=42))

    # slow vs. fast: same procedure on the 2021 citation count.
    for c2021 in df_slow['c_history_sum'].value_counts().index:
        slow_same_count = df_slow[df_slow['c_history_sum'] == c2021]
        fast_same_count = df_fast[df_fast['c_history_sum'] == c2021]
        n_sample = min(len(slow_same_count), len(fast_same_count))
        papers_slow_fastmatched2021.append(slow_same_count.sample(n_sample, random_state=42))
        papers_fast_slowmatched2021.append(fast_same_count.sample(n_sample, random_state=42))

papers_slow_lowmatched2018 = pd.concat(papers_slow_lowmatched2018)
papers_low_slowmatched2018 = pd.concat(papers_low_slowmatched2018)
papers_slow_fastmatched2021 = pd.concat(papers_slow_fastmatched2021)
papers_fast_slowmatched2021 = pd.concat(papers_fast_slowmatched2021)

for matched in (
    papers_slow_lowmatched2018,
    papers_low_slowmatched2018,
    papers_slow_fastmatched2021,
    papers_fast_slowmatched2021,
):
    print(matched.shape)

papers_slow_lowmatched2018.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_slow_1e-06_lowmatched2018.pickle')
papers_low_slowmatched2018.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_low_1e-06_slowmatched2018.pickle')
papers_slow_fastmatched2021.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_slow_1e-06_fastmatched2021.pickle')
papers_fast_slowmatched2021.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_fast_1e-06_slowmatched2021.pickle')

## citation + field + journal

In [None]:
# One-to-one matching within a coarse partition ('partition_1e-06'), additionally
# requiring the same journal. Each slow paper that has at least one candidate is
# paired with one sampled control; a control may be drawn for several slow papers.
papers_slow_lowmatched2018_journal = []
papers_low_slowmatched2018_journal = []
papers_slow_fastmatched2021_journal = []
papers_fast_slowmatched2021_journal = []

# eids of the controls that were drawn at least once (duplicates collapse in the set).
matched_eid_low_slowmatched = set()
matched_eid_fast_slowmatched = set()

for partition in tqdm(sorted(papers_2016_validread['partition_1e-06'].unique())):
    # slow: middle/bottom in 2018 and top in 2021; fast: top in both; low: middle/bottom in both.
    df_slow = papers_2016_validread[(papers_2016_validread['label_hml_2018'].isin(['middle','bottom']))&(papers_2016_validread['label_hml_2021'].isin(['top']))].query(f'`partition_1e-06` == {partition}')
    df_fast = papers_2016_validread[(papers_2016_validread['label_hml_2018'].isin(['top']))&(papers_2016_validread['label_hml_2021'].isin(['top']))].query(f'`partition_1e-06` == {partition}')
    df_low = papers_2016_validread[(papers_2016_validread['label_hml_2018'].isin(['middle','bottom']))&(papers_2016_validread['label_hml_2021'].isin(['middle','bottom']))].query(f'`partition_1e-06` == {partition}')
            
# slow vs. low, matched on the 2018 citation count and the journal
    for eid, c2018, journal in df_slow[['eid', 'c_history_sum_2018','journal']].values:
        df_low_target = df_low.query('c_history_sum_2018 == @c2018').query('journal == @journal')
        if len(df_low_target) != 0:
            papers_slow_lowmatched2018_journal.append(df_slow.query('eid == @eid'))
            # Fixed seed: the same candidate set always yields the same control.
            row = df_low_target.sample(1, random_state=42)
            papers_low_slowmatched2018_journal.append(row)
            matched_eid_low_slowmatched.add(row['eid'].values[0])        
        
    # slow vs. fast, matched on the 2021 citation count and the journal
    for eid, c2021, journal in df_slow[['eid', 'c_history_sum','journal']].values:
        df_fast_target = df_fast.query('c_history_sum == @c2021').query('journal == @journal')
        if len(df_fast_target) != 0:
            papers_slow_fastmatched2021_journal.append(df_slow.query('eid == @eid'))
            row = df_fast_target.sample(1, random_state=42)
            papers_fast_slowmatched2021_journal.append(row)
            matched_eid_fast_slowmatched.add(row['eid'].values[0])

papers_slow_lowmatched2018_journal = pd.concat(papers_slow_lowmatched2018_journal)
papers_low_slowmatched2018_journal = pd.concat(papers_low_slowmatched2018_journal)
papers_slow_fastmatched2021_journal = pd.concat(papers_slow_fastmatched2021_journal)
papers_fast_slowmatched2021_journal = pd.concat(papers_fast_slowmatched2021_journal)

print(papers_slow_lowmatched2018_journal.shape)
print(papers_low_slowmatched2018_journal.shape)
print(papers_slow_fastmatched2021_journal.shape)
print(papers_fast_slowmatched2021_journal.shape)

papers_slow_lowmatched2018_journal.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_slow_1e-06_lowmatched2018_journal.pickle')
papers_low_slowmatched2018_journal.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_low_1e-06_slowmatched2018_journal.pickle')
papers_slow_fastmatched2021_journal.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_slow_1e-06_fastmatched2021_journal.pickle')
papers_fast_slowmatched2021_journal.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_fast_1e-06_slowmatched2021_journal.pickle')

## citation + topic + journal

In [None]:
# Same journal-level one-to-one matching as the citation + field + journal cell, but
# grouped by the finer 'partition_0.0005' column.
# NOTE: this cell reuses (and overwrites) the variable names of that cell; the
# output files are distinguished by the '_journal_topic' suffix and still carry
# '1e-06' in their names.
papers_slow_lowmatched2018_journal = []
papers_low_slowmatched2018_journal = []
papers_slow_fastmatched2021_journal = []
papers_fast_slowmatched2021_journal = []

# eids of the controls that were drawn at least once (duplicates collapse in the set).
matched_eid_low_slowmatched = set()
matched_eid_fast_slowmatched = set()

for partition in sorted(papers_2016_validread['partition_0.0005'].unique()):
    # slow: middle/bottom in 2018 and top in 2021; fast: top in both; low: middle/bottom in both.
    df_slow = papers_2016_validread[(papers_2016_validread['label_hml_2018'].isin(['middle','bottom']))&(papers_2016_validread['label_hml_2021'].isin(['top']))].query(f'`partition_0.0005` == {partition}')
    df_fast = papers_2016_validread[(papers_2016_validread['label_hml_2018'].isin(['top']))&(papers_2016_validread['label_hml_2021'].isin(['top']))].query(f'`partition_0.0005` == {partition}')
    df_low = papers_2016_validread[(papers_2016_validread['label_hml_2018'].isin(['middle','bottom']))&(papers_2016_validread['label_hml_2021'].isin(['middle','bottom']))].query(f'`partition_0.0005` == {partition}')
            
# slow vs. low, matched on the 2018 citation count and the journal
    for eid, c2018, journal in df_slow[['eid', 'c_history_sum_2018','journal']].values:
        df_low_target = df_low.query('c_history_sum_2018 == @c2018').query('journal == @journal')
        if len(df_low_target) != 0:
            papers_slow_lowmatched2018_journal.append(df_slow.query('eid == @eid'))
            # Fixed seed: the same candidate set always yields the same control.
            row = df_low_target.sample(1, random_state=42)
            papers_low_slowmatched2018_journal.append(row)
            matched_eid_low_slowmatched.add(row['eid'].values[0])        
        
    # slow vs. fast, matched on the 2021 citation count and the journal
    for eid, c2021, journal in df_slow[['eid', 'c_history_sum','journal']].values:
        df_fast_target = df_fast.query('c_history_sum == @c2021').query('journal == @journal')
        if len(df_fast_target) != 0:
            papers_slow_fastmatched2021_journal.append(df_slow.query('eid == @eid'))
            row = df_fast_target.sample(1, random_state=42)
            papers_fast_slowmatched2021_journal.append(row)
            matched_eid_fast_slowmatched.add(row['eid'].values[0])
    # Progress indicator: print every partition id that is a multiple of 1000.
    if partition % 1000 == 0:
        print(partition)

papers_slow_lowmatched2018_journal = pd.concat(papers_slow_lowmatched2018_journal)
papers_low_slowmatched2018_journal = pd.concat(papers_low_slowmatched2018_journal)
papers_slow_fastmatched2021_journal = pd.concat(papers_slow_fastmatched2021_journal)
papers_fast_slowmatched2021_journal = pd.concat(papers_fast_slowmatched2021_journal)

print(papers_slow_lowmatched2018_journal.shape)
print(papers_low_slowmatched2018_journal.shape)
print(papers_slow_fastmatched2021_journal.shape)
print(papers_fast_slowmatched2021_journal.shape)

papers_slow_lowmatched2018_journal.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_slow_1e-06_lowmatched2018_journal_topic.pickle')
papers_low_slowmatched2018_journal.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_low_1e-06_slowmatched2018_journal_topic.pickle')
papers_slow_fastmatched2021_journal.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_slow_1e-06_fastmatched2021_journal_topic.pickle')
papers_fast_slowmatched2021_journal.to_pickle('/disks/qnap2/data/t-miura/2023_readership/final_paper/papers_fast_1e-06_slowmatched2021_journal_topic.pickle')