In [6]:
import pandas as pd
import numpy as np

Сперва загрузим данные. Их форматирование из json в удобные таблицы представлена в ноутбуке data_preparation.ipynb.

In [7]:
authors = pd.read_csv("../data/authors.csv")
articles = pd.read_csv("../data/publications.csv")
sia_articles = pd.read_csv("../data/sia_pubs.csv")

In [8]:
print(articles.shape)
print(sia_articles.shape)

(36008, 8)
(36008, 7)


In [9]:
df_1_sorted = articles.drop(columns=["citations_num"]).sort_values(by=list(articles.drop(columns=["citations_num"]).columns)).reset_index(drop=True)
df_2_sorted = sia_articles.sort_values(by=list(sia_articles.columns)).reset_index(drop=True)

if df_1_sorted.equals(df_2_sorted):
    print("DataFrames are identical when reordered.")
else:
    print("DataFrames are not identical when reordered.")

DataFrames are not identical when reordered.


Показалось, что таблицы содержат одинаковые значения, но это не так, поэтому объединю их в одну большую таблицу. Уберу информацию о числе цитирований, поскольку всё равно буду эту информацию добывать отдельно. Также удалю дупликаты на всякий случай

In [10]:
pubs = pd.concat([articles.drop(columns="citations_num"), sia_articles], ignore_index=True)

In [11]:
def get_nice_refs(refs: str):
    res = []
    bad_l = refs.lstrip("[").rstrip("]").split("'")
    for elem in bad_l:
        if len(elem) > 2:
            res.append(elem)
    return res
# Эти признаки почему-то получились в виде одной большой строки, а не списка значений. Пофиксим:
pubs.refs = pubs.refs.apply(get_nice_refs).apply(tuple)
pubs.authors = pubs.authors.apply(lambda x: (x.lstrip("[").rstrip("]").split(","))).apply(tuple)
pubs.keywords = pubs.keywords.apply(lambda x: (x.lstrip("[").rstrip("]").split(","))).apply(tuple)

In [12]:
print(pubs.shape)
pubs.drop_duplicates(inplace=True)
print(pubs.shape)

(72016, 7)
(72014, 7)


In [13]:
pubs.head()

Unnamed: 0,title,doi,year,abstract,keywords,refs,authors
0,Studies of Zγ production in association with a...,10.1007/JHEP07(2017)107,2017,The production of a Z boson and a photon in as...,"('Electroweak interaction', 'Hadron-Hadron sc...","(Eboli O.J.P., Gonzalez-Garcia M.C., Lietti S....","('Ahmadov F.', 'Aleksandrov I.N.', 'Bednyako..."
1,Towards the detection of light and heavy relic...,10.1016/j.ppnp.2011.01.050,2011,The standard Big Bang cosmology predicts that ...,"('Neutrino capture', 'Relic neutrinos', 'Ste...","(Giunti C., Kim C.W., Fundamentals of Neutrino...","('Šimkovic F.',)"
2,Spatial characteristics of thin-film straw det...,,1998,Spatial characteristics of a straw detector wi...,"(,)",(),"('Bychkov V.N.', 'Kekelidze G.D.', 'Lobastov..."
3,Measurement of the underlying event in jet eve...,10.1140/epjc/s10052-014-2965-5,2014,Distributions sensitive to the underlying even...,"(,)",(The underlying event in hard interactions at ...,"('Ahmadov F.', 'Aleksandrov I.N.', 'Bednyako..."
4,Bubble and kink solitons in the φ6-model of no...,10.1016/0375-9601(93)91074-F,1993,We have studied the φ6-model in the parameter ...,"(,)","(Kosevich, Et al., Sov. J. Low Temp. Phys., 2,...","('Agüero Granados M.A.',)"


In [14]:
pubs = pubs[pubs.doi.notna()]
pubs = pubs[pubs.title.notna()]

In [15]:
pubs.isna().sum()

title       0
doi         0
year        0
abstract    0
keywords    0
refs        0
authors     0
dtype: int64

In [16]:
pubs.shape

(64346, 7)

In [17]:
tp = pubs.shape[0] # total publications
nca = authors.shape[0] # number of contributing authors
sa = pubs.authors.apply(lambda x: len(x) == 1).sum() # sole-authored publications
ca = tp - sa # coauthored publications
nay = [pubs.year.min(), pubs.year.max()] # number of active years
pay = pubs.groupby(by="year", as_index=False).count()[["year", "doi"]].rename(columns={"doi": "count"}) # productivity per year
trc = pubs.refs.apply(lambda x: len(x)).sum()  # total references count
arc = pubs.refs.apply(lambda x: len(x)).mean() # mean references count

In [18]:
print(f"""
Total number of publications: {tp}
Number of contributing authors: {nca}
Number of sole-authored publications: {sa}
Number of co-authored publications: {ca}
Range of years in research: {nay[0]}-{nay[1]}
Max productivity was in year {pay[pay["count"] == pay["count"].max()]["year"].item()} with {pay[pay["count"] == pay["count"].max()]["count"].item()} published articles
Total number of references: {trc}
Average number of references: {arc:.3f}
""")


Total number of publications: 64346
Number of contributing authors: 10443
Number of sole-authored publications: 16185
Number of co-authored publications: 48161
Range of years in research: 1957-2024
Max productivity was in year 2018 with 3236 published articles
Total number of references: 2028914
Average number of references: 31.531



In [19]:
def calculate_collaboration_indexes(publications):
    N = len(publications)
    sum_authors = 0
    f_dict = {}
    for index, row in publications.iterrows():
        num_authors = len(row['authors'])
        sum_authors += num_authors
        if num_authors in f_dict:
            f_dict[num_authors] += 1
        else:
            f_dict[num_authors] = 1
    ci = sum_authors / N
    cc_sum = sum([f / k for k, f in f_dict.items()])
    cc = 1 - (cc_sum / N)
    return ci, cc

ci, cc = calculate_collaboration_indexes(pubs)
print(f"Collaboration Index: {ci}, Collaboration Coefficient: {cc}")


Collaboration Index: 70.30057812451435, Collaboration Coefficient: 0.5706021451648997


The Collaboration Index is a measure of the mean number of authors per paper. It is calculated as follows:
$$CI=\frac{\sum_{i=1}^{k}i\cdot f_i}{N}$$

where $f_i$ is the number of publications with $i$ authors, $k$ is the maximum number of authors for any publication, and $N$ is the total number of publications.

The Collaboration Coefficient reflects both single-authorship and multiple-authorship scenarios by giving each author a fraction of credit based on their contribution:
$$CC=1-\left(\frac{\sum_{j=1}^k\frac{f_j}{j}}{N}\right)$$

where $f_j$ represents the number of papers with $j$ authors, and $N$ is the total number of papers.





In [20]:
pubs.to_csv("clean_pubs.csv")

Здесь пока думаю закончить. Далее в тетрадке building_graph буду строить граф цитирований.