# Preprocessing of raw citation files to network format

## 1. Create basic database files for citations.

## 2. Getting citation networks

In [1]:
import sys
import os
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import json as js

# Project root; every data directory below is resolved relative to it.
project_path_str='/scratch/psychiatry30/'
project_path= Path(project_path_str)
# Make the project's local packages importable. The membership check keeps
# this cell idempotent: re-running it no longer appends the same entry to
# sys.path again and again.
if str(project_path) not in sys.path:
    sys.path.append(str(project_path))
from utils_.data_basing import parse_abs_file, parse_citation_file

# Input directory that the databasing cell lists year by year.
data_path = project_path / 'data' / 'scopus'
# NOTE(review): not referenced by any cell visible in this notebook.
currated_data_path = project_path / 'data' / 'currated_deng'
# Output directory for the per-year CSV tables and the timing log.
proc_data_path = project_path / 'data' / 'preprocessed'

In [2]:
## databasing
# Maps a short key to the file-name suffix of one kind of JSON file.
data_types = {
    "abs": "-ABSTRACT_RETRIEVAL-0.json",
    "cite_sum": "-CITATIONS_OVERVIEW-0.json",
    "cite": "-SCOPUS_SEARCH-1.json",
    "cite_other": "-SCOPUS_SEARCH-2.json",
}


def _empty_table(column_names):
    """Return an empty DataFrame whose columns are exactly `column_names`."""
    return pd.DataFrame(columns=column_names)


# --- paper table -----------------------------------------------------------
# The last column, author_country_affi_list, is meant to hold a list of
# per-author dicts of the form
#   {"seq": ..., "author_id": ..., "author_name": ...,
#    "affi_id_list": [], "affi_city_list": [], "affi_country_list": [], ...}
paper_db_columns = [
    'paper_id', 'year', 'date', 'type', 'journal_id', 'open_access',
    'total_citatedby', 'DOI', 'title', 'area_list',
    'first_author_id', 'first_author_city', 'first_author_country', 'first_author_affi_id',
    'corr_author_id', 'corr_author_city', 'corr_author_country', 'corr_author_affi_id',
    'affi_id', 'affi_name', 'affi_city', 'affi_country',
    'author_country_affi_list',
]
paper_df = _empty_table(paper_db_columns)

# --- subject-area table ----------------------------------------------------
area_db_columns = ["area_id", "area_name", "area_abbrev", "paper_id", "year"]
area_df = _empty_table(area_db_columns)

# --- paper content table (title / abstract text) ---------------------------
paper_cont_db_columns = ["paper_id", "year", "month", "type", "area_list", "title", "abstract"]
paper_cont_df = _empty_table(paper_cont_db_columns)

# --- author table ----------------------------------------------------------
# One row per affiliation, so the same author can appear more than once.
author_db_columns = [
    "author_id", "author_name", "year",
    "affi_id_list", "city_list", "country_list", "dept_id_list",
]
author_df = _empty_table(author_db_columns)

# --- affiliation table -----------------------------------------------------
# One row per affiliation; duplicates are expected here as well.
affi_db_columns = ["affi_id", "affi_name", "dept_id", "dept_name", "country", "city", "year"]
affi_df = _empty_table(affi_db_columns)

# --- journal table ---------------------------------------------------------
journal_db_columns = [
    "journal_id", "journal_name", "publisher", "country",
    "IF_year_list", "Quantile_year_list",
]
journal_df = _empty_table(journal_db_columns)

# --- citation table --------------------------------------------------------
# affi_list is meant to hold a list of dicts of the form
#   {"affi_id": ..., "affi_name": ..., "affi_city": ..., "affi_country": ...}
cite_db_columns = [
    "paper_id", "cite_paper_id", "title", "year", "date", "type",
    "cited_by", "affi_list",
]
citation_df = _empty_table(cite_db_columns)

In [3]:
from multiprocessing import cpu_count, Process, Pool
from math import floor
from joblib import Parallel, delayed

# Fraction of the machine's cores this notebook is allowed to use.
cpu_r=0.8
num_cores = cpu_count()
# floor(num_cores*cpu_r) is 0 on a single-core machine; always keep at least
# one worker so the value is usable as a job count.
cpu_availible = max(1, floor(num_cores*cpu_r))
print('Using ', cpu_availible, " cores!")

# No multiprocessing.Pool is created here: nothing in the notebook submits work
# to one, and an unclosed pool keeps idle worker processes alive for the
# lifetime of the kernel.

# Smoke test of the joblib setup, run with the number of workers announced above.
results = Parallel(n_jobs=cpu_availible)(delayed(lambda x:x*2)(i) for i in range(100000))

Using  16  cores!


In [3]:
# databasing
# Walks data_path/<year>/<paper>/, feeds each paper's abstract file and
# citation files to the project parsers, and writes the accumulated tables to
# proc_data_path after every year.
import time

save_flag = 1
year_dir_list = os.listdir(data_path)
year_publication_dict = {}
COUNTRY_DICT = {}
# Defined up front so the save step cannot raise NameError when no processed
# paper had any citation file.
# NOTE(review): parse_citation_file's return value overwrites this on every
# call, so only the most recent call's list reaches the CSV - confirm whether
# it should be accumulated instead.
no_aff_paper_list = []
# monitoring
year_proc_time_dict = {}
# The CSV writers below do not create missing directories.
proc_data_path.mkdir(parents=True, exist_ok=True)

# NOTE(review): os.listdir returns entries in arbitrary order, so skipping
# "the first entry" with [1:] is fragile - confirm what is meant to be skipped.
# NOTE(review): the tables are never reset between years, so each year's CSV
# also contains all rows from previously processed years - confirm intended.
for _year in year_dir_list[1:]:
    print("processing papers in year ", _year)
    _year_path = data_path / _year
    # Keep only paper folders (resolved against the year directory, not the
    # current working directory) and order them by their numeric id.
    _paper_list = sorted(
        (_x for _x in os.listdir(_year_path) if (_year_path / _x).is_dir()),
        key=int,
    )
    year_publication_dict[_year] = len(_paper_list)
    # 1-based running number -> paper folder name, used for progress output.
    _paper_list_dict = dict(enumerate(_paper_list, start=1))
    print(str(year_publication_dict[_year]), " papers detected for year: ", _year)
    # get the start time
    start_time = time.time()
    for _idx, _paper in _paper_list_dict.items():
        print(_year, " paper No.", str(_idx), "/", str(year_publication_dict[_year]), ": ", _paper)
        _paper_path = _year_path / _paper
        # Reset per paper so a missing file is detected instead of silently
        # reusing the previous paper's file name.
        _abs_file = None
        _sum_citation_file = None  # recorded but not used further in this cell
        _citation_file_list = []
        for _file in os.listdir(_paper_path):
            if "ABSTRACT_RETRIEVAL" in _file:
                _abs_file = _file
            elif "CITATIONS_OVERVIEW" in _file:
                _sum_citation_file = _file
            elif "SCOPUS_SEARCH" in _file:
                _citation_file_list.append(_file)
            else:
                print("Error! New file types detected: ", _file)
        if _abs_file is None:
            # Nothing to parse for this paper; report it and move on.
            print("Error! No abstract file for paper: ", _paper)
            continue
        # processing abstract file
        _abs_path = _paper_path / _abs_file
        paper_df, author_df, affi_df, area_df, journal_df, paper_cont_df, COUNTRY_DICT = parse_abs_file(_abs_path, COUNTRY_DICT, paper_df, author_df, affi_df, area_df, journal_df, paper_cont_df)

        # processing citation file
        # Order by the trailing index in "<id>-SCOPUS_SEARCH-<n>.json"; Path.stem
        # removes the real suffix, unlike str.strip which strips a character set.
        # NOTE(review): the trailing [:-1] drops the highest-numbered file, so a
        # paper with a single SCOPUS_SEARCH file yields no citations - confirm.
        _citation_file_list = sorted(
            _citation_file_list,
            key=lambda _name: int(Path(_name).stem.split("-")[-1]),
        )[:-1]
        if len(_citation_file_list) > 0:
            _citation_path_list = [(_paper_path / _x) for _x in _citation_file_list]
            citation_df, no_aff_paper_list = parse_citation_file(_citation_path_list, citation_df)
        else:
            print("No citation file for paper: ", _paper)
    # save results
    # NOTE(review): area_df and paper_cont_df are updated above but never saved.
    if save_flag:
        paper_df.to_csv((proc_data_path / (_year+"_paper_df.csv")), index=False)
        author_df.to_csv((proc_data_path / (_year+"_author_df.csv")), index=False)
        affi_df.to_csv((proc_data_path / (_year+"_affi_df.csv")), index=False)
        journal_df.to_csv((proc_data_path / (_year+"_journal_df.csv")), index=False)
        citation_df.to_csv((proc_data_path / (_year+"_citation_df.csv")), index=False)
        # Own file name: writing this list to "<year>_journal_df.csv" would
        # overwrite the journal table saved two lines above.
        pd.DataFrame({"citation_no_affi": no_aff_paper_list}).to_csv((proc_data_path / (_year+"_citation_no_affi_df.csv")), index=False)
        pd.DataFrame({"abbr": list(COUNTRY_DICT.keys()), "full_name": list(COUNTRY_DICT.values())}).to_csv((proc_data_path / (_year+"_country_abbr_df.csv")), index=False)
    year_proc_time_dict[_year] = time.time()-start_time
    print("Total time used for curating data for year", _year, ":", year_proc_time_dict[_year])

# Per-year wall-clock log; written regardless of save_flag.
pd.DataFrame({"year": list(year_proc_time_dict.keys()), "proc_time": list(year_proc_time_dict.values())}).to_csv((proc_data_path / ("proc_time_log.csv")), index=False)



processing papers in year  1991
17852  papers detected for year:  1991
1991  paper No. 1 / 17852 :  0000004833
processing abstract file:  0000004833-ABSTRACT_RETRIEVAL-0.json
processing citation file:  0000004833-SCOPUS_SEARCH-1.json
2  cited papers has no affliliations...
1991  paper No. 2 / 17852 :  0000012739
processing abstract file:  0000012739-ABSTRACT_RETRIEVAL-0.json
processing citation file:  0000012739-SCOPUS_SEARCH-1.json
1  cited papers has no affliliations...
1991  paper No. 3 / 17852 :  0000015946
processing abstract file:  0000015946-ABSTRACT_RETRIEVAL-0.json
processing citation file:  0000015946-SCOPUS_SEARCH-1.json
1  cited papers has no affliliations...
1991  paper No. 4 / 17852 :  0000020880
processing abstract file:  0000020880-ABSTRACT_RETRIEVAL-0.json
processing citation file:  0000020880-SCOPUS_SEARCH-1.json
1  cited papers has no affliliations...
1991  paper No. 5 / 17852 :  0000029725
processing abstract file:  0000029725-ABSTRACT_RETRIEVAL-0.json
processing ci

In [28]:
# Manual re-save of the tables in their current in-memory state, using the
# last value of _year left behind by the databasing loop.
# Inspection helpers kept for debugging:
#for x in paper_df.author_country_affi_list:
#    print(x)
#display(paper_df.T)
#display(area_df)
#display(author_df)
#print(affi_df)
#print(journal_df)
#print(paper_cont_df)
#cite_df["country"]=[x[0]["affi_country"] for x in cite_df["affi_list"]]
#display(cite_df)
paper_df.to_csv((proc_data_path / (_year+"_paper_df.csv")), index=False)
author_df.to_csv((proc_data_path / (_year+"_author_df.csv")), index=False)
affi_df.to_csv((proc_data_path / (_year+"_affi_df.csv")), index=False)
journal_df.to_csv((proc_data_path / (_year+"_journal_df.csv")), index=False)
citation_df.to_csv((proc_data_path / (_year+"_citation_df.csv")), index=False)
# Own file name: writing this list to "<year>_journal_df.csv" would overwrite
# the journal table saved two lines above.
pd.DataFrame({"citation_no_affi": no_aff_paper_list}).to_csv((proc_data_path / (_year+"_citation_no_affi_df.csv")), index=False)
pd.DataFrame({"abbr": list(COUNTRY_DICT.keys()), "full_name": list(COUNTRY_DICT.values())}).to_csv((proc_data_path / (_year+"_country_abbr_df.csv")), index=False)

In [5]:
# Show the entries the databasing loop iterates over (first listing entry skipped).
year_dir_list[1:]

['1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022']

In [None]:
# Debug helper: pretty-print every citation JSON file of the paper that the
# databasing loop processed last (relies on _year, _paper and
# _citation_file_list left in the kernel by that loop).
for _citation_file in _citation_file_list:
    print(_citation_file)
    _citation_path = data_path / _year / _paper/ _citation_file
    # The json module is imported as `js` at the top of the notebook; the bare
    # name `json` is not defined. JSON text is UTF-8, so state the encoding
    # explicitly instead of relying on the platform default.
    with open(_citation_path, 'r', encoding='utf-8') as _cite_data:
        citation_dict = js.load(_cite_data)
    print(js.dumps(citation_dict, indent=4))