# Импортируем библиотеки

In [1]:
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from tqdm.notebook import tqdm

# Откроем данные сайта arxiv.org

In [2]:
# Dataset source: https://www.kaggle.com/Cornell-University/arxiv
# The snapshot is read with lines=True, i.e. one JSON record per line.
arxiv = pd.read_json('../data/arxiv-metadata-oai-snapshot.json', lines=True)

# Turn `update_date` into a proper datetime column ...
arxiv['update_date'] = pd.to_datetime(arxiv['update_date'])

# ... then order the rows by it and renumber the index from 0.
arxiv = arxiv.sort_values(by='update_date', ignore_index=True)

# Sanity check: table dimensions and the first few rows.
print(arxiv.shape)
arxiv.head()

(1897853, 14)


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,cond-mat/0101160,Gennadii Varzugin,"A.V. Rybin, G.G. Varzugin and J. Timonen",Singularity Formation and Collapse in the Attr...,"4 pages, 1 figure, v2. In this version of the ...",,,,cond-mat,,A generic mechanism of collapse in the Gross...,"[{'version': 'v1', 'created': 'Thu, 11 Jan 200...",2007-05-23,"[[Rybin, A. V., ], [Varzugin, G. G., ], [Timon..."
1,astro-ph/0109371,Michel Boer,"M. Boer (1), C. Thiebaut (1), A. Klotz (1), G....",Hands-On TAROT: Intercontinental use of the TA...,"4 pages, Based on a demonstration presented at...",,,,astro-ph,,The TAROT telescope has for primary goal the...,"[{'version': 'v1', 'created': 'Fri, 21 Sep 200...",2007-05-23,"[[Boer, M., ], [Thiebaut, C., ], [Klotz, A., ]..."
2,astro-ph/9709059,Liliya L. R. Williams,"Liliya L. R. Williams (1), Paul L. Schechter (...",Measurement of the Hubble Constant Via Gravita...,"15 pages, LaTeX, figues included; To appear in...",,,,astro-ph,,Gravitational lensing is now widely and succ...,"[{'version': 'v1', 'created': 'Mon, 8 Sep 1997...",2007-05-23,"[[Williams, Liliya L. R., ], [Schechter, Paul ..."
3,math/0501091,Thomas Puettmann,"Uwe Abresch (Ruhr-Universitaet Bochum), Carlos...",Wiedersehen metrics and exotic involutions of ...,"17 pages, 5 figures, a QuickTime movie visuali...",,,,math.GT math.DG,,"We provide explicit, simple, geometric formu...","[{'version': 'v1', 'created': 'Thu, 6 Jan 2005...",2007-05-23,"[[Abresch, Uwe, , Ruhr-Universitaet Bochum], [..."
4,math/0501090,Daniel Ruberman,Daniel Ruberman and Nikolai Saveliev,Casson--type invariants in dimension four,"30 pages, 5 figures. To appear in Proceedings ...",,,,math.GT,,This article surveys our ongoing project abo...,"[{'version': 'v1', 'created': 'Thu, 6 Jan 2005...",2007-05-23,"[[Ruberman, Daniel, ], [Saveliev, Nikolai, ]]"


# Преобразуем некоторые колонки в более читаемый вид
## Выделим дату публикации статьи

In [4]:
# At this point the dates are stored inside dicts, which in turn sit in a list
# (one dict per version of the paper).
arxiv.iloc[0]['versions']

[{'version': 'v1', 'created': 'Thu, 11 Jan 2001 21:11:52 GMT'},
 {'version': 'v2', 'created': 'Tue, 30 Jan 2001 17:01:25 GMT'}]

In [5]:
def extract_publication_date(list_of_versions: list):
    """Return the ``created`` timestamp of the first entry as a naive ``datetime``.

    ``list_of_versions`` is a list of dicts such as
    ``{'version': 'v1', 'created': 'Thu, 11 Jan 2001 21:11:52 GMT'}``.
    The trailing ``GMT`` is matched literally by the format string, so the
    returned value carries no timezone information.

    Raises ``IndexError`` for an empty list and ``ValueError`` when the
    timestamp does not match the expected format.
    """
    first_entry = list_of_versions[0]
    return datetime.strptime(first_entry['created'], '%a, %d %b %Y %H:%M:%S GMT')

def extract_last_update_date(list_of_versions: list):
    """Return the ``created`` timestamp of the final entry as a naive ``datetime``.

    ``list_of_versions`` is a list of dicts such as
    ``{'version': 'v2', 'created': 'Tue, 30 Jan 2001 17:01:25 GMT'}``;
    only the last element is read. The trailing ``GMT`` is matched literally,
    so the result carries no timezone information.

    Raises ``IndexError`` for an empty list and ``ValueError`` when the
    timestamp does not match the expected format.
    """
    timestamp_format = '%a, %d %b %Y %H:%M:%S GMT'
    latest_entry = list_of_versions[-1]
    return datetime.strptime(latest_entry['created'], timestamp_format)

In [6]:
# Register `progress_apply` on pandas objects so the per-row passes report progress.
tqdm.pandas()

# First-version and last-version timestamps, taken from the `versions` lists and
# stored as datetime columns.
# The column name 'pulication_date' (sic) is kept as is: later cells read it under
# exactly this spelling.
arxiv['pulication_date'] = pd.to_datetime(
    arxiv['versions'].progress_apply(extract_publication_date)
)
arxiv['last_update_date'] = pd.to_datetime(
    arxiv['versions'].progress_apply(extract_last_update_date)
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1897853.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1897853.0), HTML(value='')))




In [74]:
# Split both timestamps into calendar year and month columns.
created = arxiv['pulication_date'].dt
revised = arxiv['last_update_date'].dt

arxiv['publication_year'], arxiv['last_update_year'] = created.year, revised.year
arxiv['publication_month'], arxiv['last_update_month'] = created.month, revised.month

## Также выделим количество версий статьи 

In [7]:
# `versions` holds one list per paper, so `.str.len()` yields the number of
# version entries in each list.
arxiv['number_of_versions'] = arxiv['versions'].str.len()

In [8]:
# The raw `versions` lists are no longer needed once the derived columns exist.
del arxiv['versions']

## Выделим разницу между датой создания статьи и датой публикации на сайте

In [10]:
# Time between each submission timestamp and the record's `update_date`,
# expressed in whole (rounded) months.
#
# One month is taken as 2629746 seconds (= 30.436875 days = 365.2425 / 12 days),
# the length NumPy assigns to its 'M' unit. It is spelled out as a fixed Timedelta
# because dividing by np.timedelta64(1, 'M') is rejected by pandas 2.x, which no
# longer accepts the ambiguous 'M' / 'Y' units.
average_month = pd.Timedelta(seconds=2629746)

arxiv['publication_date_diff'] = (
    (arxiv['update_date'] - arxiv['pulication_date']) / average_month
).round(0)
arxiv['last_update_date_diff'] = (
    (arxiv['update_date'] - arxiv['last_update_date']) / average_month
).round(0)

## Преобразуем авторов

In [11]:
# As the output shows, each author is a list of separate strings nested inside an
# outer list; the next cells flatten this into one string per author.
arxiv.iloc[0]['authors_parsed']

[['Rybin', 'A. V.', ''], ['Varzugin', 'G. G.', ''], ['Timonen', 'J.', '']]

In [12]:
def preprocess_authours(list_of_authors: list):
    """Flatten parsed author entries into one display string per author.

    Each element of ``list_of_authors`` is a list of strings describing one
    author, e.g. ``['Rybin', 'A. V.', '']``. The last element of every entry is
    dropped and the remaining non-empty parts are joined with single spaces.

    Empty parts are skipped so that an entry such as
    ``['Abresch', 'Uwe', '', 'Ruhr-Universitaet Bochum']`` yields
    ``'Abresch Uwe'`` instead of ``'Abresch Uwe '`` with a dangling separator.

    Args:
        list_of_authors: list of per-author lists of strings.

    Returns:
        A list with one string per author, in input order.
    """
    return [
        " ".join(part for part in author[:-1] if part)
        for author in list_of_authors
    ]

In [13]:
# Register `progress_apply` so the per-row pass below reports progress.
tqdm.pandas()

# Overwrite `authors` with the flattened names built from `authors_parsed`,
# then discard the nested source column.
arxiv['authors'] = arxiv['authors_parsed'].progress_apply(preprocess_authours)
del arxiv['authors_parsed']

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1897853.0), HTML(value='')))




## Выделим количество авторов

In [14]:
# `authors` holds one list of names per paper here, so `.str.len()` gives the
# number of authors.
arxiv['number_of_authors'] = arxiv['authors'].str.len()

## Выделим количество категорий

In [61]:
# `categories` is a whitespace-separated string of category codes
# (e.g. 'math.OA math.FA'); count the codes per paper.
# NOTE: this has to run while the column still contains codes - once the codes are
# replaced by multi-word names, splitting on whitespace would overcount.
arxiv['number_of_categories'] = arxiv['categories'].str.split().str.len()

## Переименуем темы в более развернутый вид

In [15]:
# Look at one raw value: several category codes separated by spaces.
arxiv.iloc[11]['categories']

'math.OA math.FA'

In [69]:
# Mapping from arXiv category code to a human-readable category name.
# Entry order is kept exactly as originally written. Note that keys contain '.'
# and some keys are prefixes of others (e.g. 'astro-ph' and 'astro-ph.CO').
category_map_old = {
    'astro-ph': 'Astrophysics',
    'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
    'astro-ph.EP': 'Earth and Planetary Astrophysics',
    'astro-ph.GA': 'Astrophysics of Galaxies',
    'astro-ph.HE': 'High Energy Astrophysical Phenomena',
    'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
    'astro-ph.SR': 'Solar and Stellar Astrophysics',
    'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
    'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
    'cond-mat.mtrl-sci': 'Materials Science',
    'cond-mat.other': 'Other Condensed Matter',
    'cond-mat.quant-gas': 'Quantum Gases',
    'cond-mat.soft': 'Soft Condensed Matter',
    'cond-mat.stat-mech': 'Statistical Mechanics',
    'cond-mat.str-el': 'Strongly Correlated Electrons',
    'cond-mat.supr-con': 'Superconductivity',
    'cs.AI': 'Artificial Intelligence',
    'cs.AR': 'Hardware Architecture',
    'cs.CC': 'Computational Complexity',
    'cs.CE': 'Computational Engineering, Finance, and Science',
    'cs.CG': 'Computational Geometry',
    'cs.CL': 'Computation and Language',
    'cs.CR': 'Cryptography and Security',
    'cs.CV': 'Computer Vision and Pattern Recognition',
    'cs.CY': 'Computers and Society',
    'cs.DB': 'Databases',
    'cs.DC': 'Distributed, Parallel, and Cluster Computing',
    'cs.DL': 'Digital Libraries',
    'cs.DM': 'Discrete Mathematics',
    'cs.DS': 'Data Structures and Algorithms',
    'cs.ET': 'Emerging Technologies',
    'cs.FL': 'Formal Languages and Automata Theory',
    'cs.GL': 'General Literature',
    'cs.GR': 'Graphics',
    'cs.GT': 'Computer Science and Game Theory',
    'cs.HC': 'Human-Computer Interaction',
    'cs.IR': 'Information Retrieval',
    'cs.IT': 'Information Theory',
    'cs.LG': 'Machine Learning',
    'cs.LO': 'Logic in Computer Science',
    'cs.MA': 'Multiagent Systems',
    'cs.MM': 'Multimedia',
    'cs.MS': 'Mathematical Software',
    'cs.NA': 'Numerical Analysis',
    'cs.NE': 'Neural and Evolutionary Computing',
    'cs.NI': 'Networking and Internet Architecture',
    'cs.OH': 'Other Computer Science',
    'cs.OS': 'Operating Systems',
    'cs.PF': 'Performance',
    'cs.PL': 'Programming Languages',
    'cs.RO': 'Robotics',
    'cs.SC': 'Symbolic Computation',
    'cs.SD': 'Sound',
    'cs.SE': 'Software Engineering',
    'cs.SI': 'Social and Information Networks',
    'cs.SY': 'Systems and Control',
    'econ.EM': 'Econometrics',
    'eess.AS': 'Audio and Speech Processing',
    'eess.IV': 'Image and Video Processing',
    'eess.SP': 'Signal Processing',
    'gr-qc': 'General Relativity and Quantum Cosmology',
    'hep-ex': 'High Energy Physics - Experiment',
    'hep-lat': 'High Energy Physics - Lattice',
    'hep-ph': 'High Energy Physics - Phenomenology',
    'hep-th': 'High Energy Physics - Theory',
    'math.AC': 'Commutative Algebra',
    'math.AG': 'Algebraic Geometry',
    'math.AP': 'Analysis of PDEs',
    'math.AT': 'Algebraic Topology',
    'math.CA': 'Classical Analysis and ODEs',
    'math.CO': 'Combinatorics',
    'math.CT': 'Category Theory',
    'math.CV': 'Complex Variables',
    'math.DG': 'Differential Geometry',
    'math.DS': 'Dynamical Systems',
    'math.FA': 'Functional Analysis',
    'math.GM': 'General Mathematics',
    'math.GN': 'General Topology',
    'math.GR': 'Group Theory',
    'math.GT': 'Geometric Topology',
    'math.HO': 'History and Overview',
    'math.IT': 'Information Theory',
    'math.KT': 'K-Theory and Homology',
    'math.LO': 'Logic',
    'math.MG': 'Metric Geometry',
    'math.MP': 'Mathematical Physics',
    'math.NA': 'Numerical Analysis',
    'math.NT': 'Number Theory',
    'math.OA': 'Operator Algebras',
    'math.OC': 'Optimization and Control',
    'math.PR': 'Probability',
    'math.QA': 'Quantum Algebra',
    'math.RA': 'Rings and Algebras',
    'math.RT': 'Representation Theory',
    'math.SG': 'Symplectic Geometry',
    'math.SP': 'Spectral Theory',
    'math.ST': 'Statistics Theory',
    'math-ph': 'Mathematical Physics',
    'nlin.AO': 'Adaptation and Self-Organizing Systems',
    'nlin.CD': 'Chaotic Dynamics',
    'nlin.CG': 'Cellular Automata and Lattice Gases',
    'nlin.PS': 'Pattern Formation and Solitons',
    'nlin.SI': 'Exactly Solvable and Integrable Systems',
    'nucl-ex': 'Nuclear Experiment',
    'nucl-th': 'Nuclear Theory',
    'physics.acc-ph': 'Accelerator Physics',
    'physics.ao-ph': 'Atmospheric and Oceanic Physics',
    'physics.app-ph': 'Applied Physics',
    'physics.atm-clus': 'Atomic and Molecular Clusters',
    'physics.atom-ph': 'Atomic Physics',
    'physics.bio-ph': 'Biological Physics',
    'physics.chem-ph': 'Chemical Physics',
    'physics.class-ph': 'Classical Physics',
    'physics.comp-ph': 'Computational Physics',
    'physics.data-an': 'Data Analysis, Statistics and Probability',
    'physics.ed-ph': 'Physics Education',
    'physics.flu-dyn': 'Fluid Dynamics',
    'physics.gen-ph': 'General Physics',
    'physics.geo-ph': 'Geophysics',
    'physics.hist-ph': 'History and Philosophy of Physics',
    'physics.ins-det': 'Instrumentation and Detectors',
    'physics.med-ph': 'Medical Physics',
    'physics.optics': 'Optics',
    'physics.plasm-ph': 'Plasma Physics',
    'physics.pop-ph': 'Popular Physics',
    'physics.soc-ph': 'Physics and Society',
    'physics.space-ph': 'Space Physics',
    'q-bio.BM': 'Biomolecules',
    'q-bio.CB': 'Cell Behavior',
    'q-bio.GN': 'Genomics',
    'q-bio.MN': 'Molecular Networks',
    'q-bio.NC': 'Neurons and Cognition',
    'q-bio.OT': 'Other Quantitative Biology',
    'q-bio.PE': 'Populations and Evolution',
    'q-bio.QM': 'Quantitative Methods',
    'q-bio.SC': 'Subcellular Processes',
    'q-bio.TO': 'Tissues and Organs',
    'q-fin.CP': 'Computational Finance',
    'q-fin.EC': 'Economics',
    'q-fin.GN': 'General Finance',
    'q-fin.MF': 'Mathematical Finance',
    'q-fin.PM': 'Portfolio Management',
    'q-fin.PR': 'Pricing of Securities',
    'q-fin.RM': 'Risk Management',
    'q-fin.ST': 'Statistical Finance',
    'q-fin.TR': 'Trading and Market Microstructure',
    'quant-ph': 'Quantum Physics',
    'stat.AP': 'Applications',
    'stat.CO': 'Computation',
    'stat.ME': 'Methodology',
    'stat.ML': 'Machine Learning',
    'stat.OT': 'Other Statistics',
    'stat.TH': 'Statistics Theory',
}

In [70]:
def _expand_category_codes(codes: str) -> str:
    """Replace every whitespace-separated category code in ``codes`` by its full name.

    Codes that have no entry in ``category_map_old`` are kept unchanged. The
    resulting names are joined with single spaces.
    """
    return ' '.join(category_map_old.get(code, code) for code in codes.split())


# Look codes up as whole tokens rather than via `replace(..., regex=True)`: with
# regex replacement the keys act as unanchored patterns, so 'astro-ph' rewrites the
# prefix of 'astro-ph.CO' (giving 'Astrophysics.CO') and '.' matches any character.
# Missing values are passed through untouched.
arxiv['categories'] = arxiv['categories'].map(_expand_category_codes, na_action='ignore')

In [72]:
# Check the table's dimensions and preview the first rows after the transformations.
print(arxiv.shape)
arxiv.head()

(1897853, 20)


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,update_date,pulication_date,last_update_date,number_of_versions,publication_date_diff,last_update_date_diff,number_of_authors,number_of_categories,groups
0,cond-mat/0101160,Gennadii Varzugin,"[Rybin A. V., Varzugin G. G., Timonen J.]",Singularity Formation and Collapse in the Attr...,"4 pages, 1 figure, v2. In this version of the ...",,,,cond-mat,,A generic mechanism of collapse in the Gross...,2007-05-23,2001-01-11 21:11:52,2001-01-30 17:01:25,2,76.0,76.0,3,1,cond-mat
1,astro-ph/0109371,Michel Boer,"[Boer M., Thiebaut C., Klotz A., Buchholtz G.,...",Hands-On TAROT: Intercontinental use of the TA...,"4 pages, Based on a demonstration presented at...",,,,Astrophysics,,The TAROT telescope has for primary goal the...,2007-05-23,2001-09-21 08:10:16,2001-09-21 08:10:16,1,68.0,68.0,7,1,Astrophysics
2,astro-ph/9709059,Liliya L. R. Williams,"[Williams Liliya L. R., Schechter Paul L.]",Measurement of the Hubble Constant Via Gravita...,"15 pages, LaTeX, figues included; To appear in...",,,,Astrophysics,,Gravitational lensing is now widely and succ...,2007-05-23,1997-09-08 12:14:51,1997-09-08 12:14:51,1,116.0,116.0,2,1,Astrophysics
3,math/0501091,Thomas Puettmann,"[Abresch Uwe , Duran Carlos , Puettmann Thomas...",Wiedersehen metrics and exotic involutions of ...,"17 pages, 5 figures, a QuickTime movie visuali...",,,,Geometric Topology Differential Geometry,,"We provide explicit, simple, geometric formu...",2007-05-23,2005-01-06 20:52:36,2005-07-18 19:27:49,2,28.0,22.0,4,2,Geometric Topology Differential Geometry
4,math/0501090,Daniel Ruberman,"[Ruberman Daniel, Saveliev Nikolai]",Casson--type invariants in dimension four,"30 pages, 5 figures. To appear in Proceedings ...",,,,Geometric Topology,,This article surveys our ongoing project abo...,2007-05-23,2005-01-06 19:59:38,2005-01-06 19:59:38,1,28.0,28.0,2,1,Geometric Topology


# Сохраним таблицу

In [75]:
# Write the prepared table to CSV without the row index.
# NOTE(review): `authors` holds Python lists at this point, so it is presumably
# written as the lists' string representation - confirm downstream readers parse it.
arxiv.to_csv('../data/prepared_arxiv.csv', index=False)