In [2]:
import pandas as pd
import numpy as np
import json
import os

In [85]:
def json_to_series(file_path, func, ex_authors=False):
    """
    Load one JSON file and hand the parsed document to a converter.

    file_path: path of the JSON file to read
    func: callable invoked as func(data, include_external_authors=ex_authors),
          where data is the parsed JSON document
    ex_authors: forwarded to func as include_external_authors
    returns: whatever func returns
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # platform locale, so non-ASCII text (author names, Greek letters in
    # titles) could fail to decode or be garbled depending on the machine.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The document is fully parsed, so convert after the file handle is closed.
    return func(data, include_external_authors=ex_authors)

In [86]:
def read_folder(folder_path, conv_func) -> pd.DataFrame:
    """
    Convert every *.json file directly inside a folder into one DataFrame row.

    folder_path: directory to scan (not recursive)
    conv_func: converter passed on to json_to_series for each file; it is
               expected to return a pd.Series
    returns: DataFrame with one row per JSON file, rows ordered by file name
    raises: ValueError when the folder contains no *.json files
    """
    # os.listdir gives names in arbitrary order; sorting makes the row order
    # (and therefore any CSV written from the result) reproducible.
    file_names = sorted(os.listdir(folder_path))
    json_files = [os.path.join(folder_path, name) for name in file_names if name.endswith('.json')]
    if not json_files:
        # pd.concat would raise a generic ValueError on an empty list; raise the
        # same exception type with a message that names the folder instead.
        raise ValueError("No .json files found in folder: {}".format(folder_path))
    series_list = [json_to_series(path, conv_func) for path in json_files]
    # Each Series becomes a column under axis=1; transpose so it becomes a row.
    df = pd.concat(series_list, axis=1).T
    return df

In [None]:
def convert_to_row(article: dict, include_external_authors = False) -> pd.Series:
    """
    Flatten one publication record into a labelled pd.Series.

    Labels of the result and where each value is read from:
    title         <- article["title"]
    doi           <- article["published"]["doi"]
    year          <- article["published"]["year"]
    abstract      <- article["abstract"]
    keywords      <- article["keywords"]
    refs          <- "bibliographic_info" of every entry in article["references"]
    citations_num <- article["metrics"]["Scopus"]["citations"]
    authors       <- "name" of every entry in article["authors"]; when
                     include_external_authors is true, the names from
                     article["authors_external"] are appended after them
    """
    heading = article["title"]
    pub_info = article["published"]
    doi = pub_info["doi"]
    pub_year = pub_info["year"]
    summary = article["abstract"]
    tags = article["keywords"]
    ref_texts = [ref["bibliographic_info"] for ref in article["references"]]
    scopus_citations = article["metrics"]["Scopus"]["citations"]
    # TODO: only names are kept here; the author uid will probably be needed
    # as well to link publications to author records later.
    author_names = [person["name"] for person in article["authors"]]
    if include_external_authors:
        author_names += [person["name"] for person in article["authors_external"]]

    labelled = [
        ("title", heading),
        ("doi", doi),
        ("year", pub_year),
        ("abstract", summary),
        ("keywords", tags),
        ("refs", ref_texts),
        ("citations_num", scopus_citations),
        ("authors", author_names),
    ]
    labels, values = zip(*labelled)
    return pd.Series(list(values), index=list(labels))

In [96]:
# Build the publications table: read_folder converts each .json file in the
# folder with convert_to_row and stacks the results as rows.
folder_path = "Diploma/data/publications"
df = read_folder(folder_path, convert_to_row)

In [99]:
# Preview the first rows of the publications table.
df.head()

Unnamed: 0,title,doi,year,abstract,keywords,refs,citations_num,authors
0,Studies of Zγ production in association with a...,10.1007/JHEP07(2017)107,2017,The production of a Z boson and a photon in as...,"[Electroweak interaction, Hadron-Hadron scatte...","[Eboli O.J.P., Gonzalez-Garcia M.C., Lietti S....",34,"[Ahmadov F., Aleksandrov I.N., Bednyakov V.A.,..."
1,Towards the detection of light and heavy relic...,10.1016/j.ppnp.2011.01.050,2011,The standard Big Bang cosmology predicts that ...,"[Neutrino capture, Relic neutrinos, Sterile ne...","[Giunti C., Kim C.W., Fundamentals of Neutrino...",1,[Šimkovic F.]
2,Spatial characteristics of thin-film straw det...,,1998,Spatial characteristics of a straw detector wi...,[],[],0,"[Bychkov V.N., Kekelidze G.D., Lobastov S.P., ..."
3,Measurement of the underlying event in jet eve...,10.1140/epjc/s10052-014-2965-5,2014,Distributions sensitive to the underlying even...,[],[The underlying event in hard interactions at ...,31,"[Ahmadov F., Aleksandrov I.N., Bednyakov V.A.,..."
4,Bubble and kink solitons in the φ6-model of no...,10.1016/0375-9601(93)91074-F,1993,We have studied the φ6-model in the parameter ...,[],"[Kosevich, Et al., Sov. J. Low Temp. Phys., 2,...",12,[Agüero Granados M.A.]


In [97]:
# Save the publications table; index=False leaves the row index out of the file.
# NOTE(review): keywords/refs/authors hold Python lists, which presumably end up
# as their text form in the CSV — confirm downstream readers parse them back.
df.to_csv("Diploma/publications.csv", index=False)

In [101]:
def convert_author_to_row(author: dict, include_external_authors = False) -> pd.Series:
    """
    Flatten one author record into a labelled pd.Series.

    Labels of the result and where each value is read from:
    uid           <- author["author_uuid"]
    name          <- author["name"]
    active_jinr   <- [start_year, end_year] of the affiliation whose "src"
                     starts (case-insensitively) with "joint institute for
                     nuclear research" or "jinr"; if several match, the last
                     one wins; [] when none matches
    articles_num, citations, h_index, coauthors, subject_areas
                  <- the "articles_number", "citations", "h_index",
                     "coauthors" and "subject_areas" entries of
                     author["metrics"]["Scopus"]

    include_external_authors is accepted but not used by this converter.
    """
    jinr_prefixes = ("joint institute for nuclear research", "jinr")

    author_id = author["author_uuid"]
    full_name = author["name"]

    jinr_years = []
    for affiliation in author["affiliations"]:
        # str.startswith accepts a tuple: true if any of the prefixes matches
        if affiliation["src"].lower().startswith(jinr_prefixes):
            span = affiliation["years"]
            jinr_years = [span["start_year"], span["end_year"]]

    scopus = author["metrics"]["Scopus"]
    labelled = [
        ("uid", author_id),
        ("name", full_name),
        ("active_jinr", jinr_years),
        ("articles_num", scopus["articles_number"]),
        ("citations", scopus["citations"]),
        ("h_index", scopus["h_index"]),
        ("coauthors", scopus["coauthors"]),
        ("subject_areas", scopus["subject_areas"]),
    ]
    labels, values = zip(*labelled)
    return pd.Series(list(values), index=list(labels))

In [91]:
# Build the authors table: read_folder converts each .json file in the folder
# with convert_author_to_row. Note that folder_path is rebound here.
folder_path = "Diploma/data/authors"
df_authors = read_folder(folder_path, convert_author_to_row)

In [94]:
# Preview the first rows of the authors table.
df_authors.head()

Unnamed: 0,uid,name,active_jinr,articles_num,citations,h_index,coauthors,subject_areas
0,1d610056-2aad-4c1c-b77c-26fefbc24495,Echenard B.,"[2020, 2020]",407,17096,72,5204,"[Physics and Astronomy, Mathematics, Engineeri..."
1,1ff61ff7-4d45-42e3-ad51-8935e7819686,Preinhaelter J.,"[1989, 1989]",49,1004,11,971,"[Physics and Astronomy, Energy, Engineering, M..."
2,6624fc7b-efc6-49e2-8c9b-3517c703fd1b,Aleksandrov I.V.,"[1987, 1987]",6,54,4,18,"[Physics and Astronomy, Engineering]"
3,b9c0efe6-ca1d-48e1-8186-e5ad35fcf3d3,Bogdanova Y.V.,"[2017, 2021]",3,156,2,17,"[Biochemistry, Genetics and Molecular Biology,..."
4,40f2dde7-a669-4b1e-8856-18fa9c0f7d48,Povolotskii A.M.,"[2001, 2001]",1,5,0,2,[Physics and Astronomy]


In [95]:
# Save the authors table; index=False leaves the row index out of the file.
df_authors.to_csv("Diploma/authors.csv", index=False)

sia lab65 exploration

In [3]:
# Load the SIA lab65 export; the cells below index it as a list of records.
file_name = "sia_lab65.json"
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# locale encoding when the records contain non-ASCII text.
with open(file_name, "r", encoding="utf-8") as f:
    data = json.load(f)

In [7]:
# Inspect which fields the first record exposes.
data[0].keys()

dict_keys(['acknowledgement', 'funding', 'images', 'keywords', 'references', 'affiliations', 'created_at', 'tasks_uuid', 'published', 'countries', 'abstract', 'title', 'tables', 'updated_at', 'collections', 'full_text', 'file_paths', 'countries_code', 'authors'])

In [24]:
def sia_to_row(article):
    """
    Flatten one SIA publication record into a labelled pd.Series.

    Labels of the result and where each value is read from:
    title    <- article["title"]
    doi      <- article["published"]["doi"]
    year     <- article["published"]["year"]
    abstract <- article["abstract"]
    keywords <- article["keywords"]
    refs     <- "src" of every entry in article["references"]
    authors  <- "name" of every entry in article["authors"]
    """
    heading = article["title"]
    pub_info = article["published"]
    doi = pub_info["doi"]
    pub_year = pub_info["year"]
    summary = article["abstract"]
    tags = article["keywords"]
    ref_sources = [ref["src"] for ref in article["references"]]
    author_names = [person["name"] for person in article["authors"]]

    labelled = [
        ("title", heading),
        ("doi", doi),
        ("year", pub_year),
        ("abstract", summary),
        ("keywords", tags),
        ("refs", ref_sources),
        ("authors", author_names),
    ]
    labels, values = zip(*labelled)
    return pd.Series(list(values), index=list(labels))

In [25]:
# Convert every SIA record to a row and stack them into one table: each Series
# becomes a column under axis=1, so the result is transposed.
# NOTE: this rebinds df, which earlier in the notebook held the publications
# table returned by read_folder.
series_list = [sia_to_row(article) for article in data]
df = pd.concat(series_list, axis=1).T


In [29]:
# Save the SIA table; index=False leaves the row index out of the file.
df.to_csv("sia_pubs.csv", index=False)