In [12]:
import json
import pandas as pd

def parse_article_data(file_path):
    article_list = []

    with open(file_path, "r") as file:
        for line in file:
            data = json.loads(line)
            article_id = data["id"]

            article_list.append(
                {
                    "ID": article_id,
                    "title": data["title"],
                    "categories": data["categories"],
                    "versions": data["versions"],
                }
            )

    return article_list


def create_article_dataframe(article_data_list):
    return pd.DataFrame.from_records(article_data_list)


# Usage example
file_path = "arxiv-metadata-oai-snapshot.json"
article_data = parse_article_data(file_path)
article_df = create_article_dataframe(article_data)

In [13]:
article_df.head(10)

Unnamed: 0,ID,title,categories,versions
0,704.0001,Calculation of prompt diphoton production cros...,hep-ph,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007..."
1,704.0002,Sparsity-certifying Graph Decompositions,math.CO cs.CG,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200..."
2,704.0003,The evolution of the Earth-Moon system based o...,physics.gen-ph,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007..."
3,704.0004,A determinant of Stirling cycle numbers counts...,math.CO,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200..."
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,math.CA math.FA,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007..."
5,704.0006,Bosonic characters of atomic Cooper pairs acro...,cond-mat.mes-hall,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200..."
6,704.0007,Polymer Quantum Mechanics and its Continuum Limit,gr-qc,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200..."
7,704.0008,Numerical solution of shock and ramp compressi...,cond-mat.mtrl-sci,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200..."
8,704.0009,"The Spitzer c2d Survey of Large, Nearby, Inste...",astro-ph,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007..."
9,704.001,"Partial cubes: structures, characterizations, ...",math.CO,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200..."


In [14]:
def filter_cs_articles(article_df: pd.DataFrame, start_year: int, end_year: int) -> pd.DataFrame:
    cs_articles = article_df[article_df['categories'].str.contains('^cs', regex=True)]

    # Extract the version and created date for the first version
    cs_articles[['version', 'created']] = cs_articles['versions'].apply(
        lambda x: pd.Series([x[0]['version'], x[0]['created']])
    )

    cs_articles['year'] = pd.to_datetime(
        cs_articles['created'], format='%a, %d %b %Y %H:%M:%S GMT'
    ).dt.year

    cs_articles = cs_articles[
        (cs_articles["year"] >= start_year) & (cs_articles["year"] <= end_year)
    ]

    cs_articles = cs_articles.drop(
        columns=["versions", "version", "created"], axis=1
    )

    return cs_articles

cs_articles = filter_cs_articles(article_df, 2018, 2022)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [15]:
cs_articles.head()

Unnamed: 0,ID,title,categories,year
929308,1801.00377,Help Me Find a Job: A Graph-based Approach for...,cs.IR cs.SI,2018
929315,1801.00384,Error-Robust Multi-View Clustering,cs.LG,2018
929316,1801.00385,Interactive Co-Design of Form and Function for...,cs.RO,2018
929318,1801.00387,Diversity Analysis of Millimeter-Wave Massive ...,cs.IT math.IT,2018
929319,1801.00388,Beyond Word Embeddings: Learning Entity and Co...,cs.CL cs.AI cs.IR cs.SI,2018


In [16]:
cs_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 259577 entries, 929308 to 1795204
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ID          259577 non-null  object
 1   title       259577 non-null  object
 2   categories  259577 non-null  object
 3   year        259577 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.9+ MB


In [18]:
import plotly.express as px

# Count the number of articles by year
year_counts = cs_articles['year'].value_counts()

# Sort the counts by year
year_counts = year_counts.sort_index()

# Create a bar plot of the year counts
fig = px.bar(
    x=year_counts.index,
    y=year_counts.values,
    color=year_counts.index,
    title='Number of Articles by Year'
)

# Set the x and y axis labels
fig.update_xaxes(title_text='Year')
fig.update_yaxes(title_text='Number of Articles')

# Set the title of the plot
fig.update_layout(
    title={
        'text': 'Number of Computer Science Articles on arXiv.org by Year',
        'font': {'size': 16, 'family': 'Helvetica'}
    }
)

# Set the x and y axis ticks font size
fig.update_xaxes(tickfont=dict(size=12))
fig.update_yaxes(tickfont=dict(size=12))

# Show the plot
fig.show()

In [29]:
import pandas as pd
import plotly.express as px

# Define the years to loop over
years = [2022, 2021, 2020, 2019, 2018]

# Create an empty list to store the dataframes for each year
dfs = []

# Loop over each year
for year in years:
    # Count the number of articles by category for the current year
    category_counts = pd.Series(
        cs_articles[cs_articles['year'] == year]['categories'].str.split(expand=True).values.ravel()).value_counts()

    # Keep only the top 10 categories
    category_counts = category_counts[:15]

    # Create a dataframe for the current year
    df = pd.DataFrame(
        {'Category': category_counts.index, 
        'Number of Articles': category_counts.values, 'Year': year}
        )

    # Append the dataframe to the list of dataframes
    dfs.append(df)

# Concatenate the dataframes into a single dataframe
df = pd.concat(dfs, ignore_index=True)

# Create an animated histogram using Plotly
fig = px.histogram(
    df, 
    x='Category', 
    y='Number of Articles', 
    color='Category', 
    animation_frame='Year', 
    nbins=len(df['Category'].unique())
    )

# Set the layout of the plot
fig.update_layout(
    title='Top 15 Categories by Number of Articles', 
    xaxis_title='Category', 
    yaxis_title='Number of Articles', 
    yaxis=dict(range=[0, 24000])
    )

fig.show()


'https://plotly.com/~tapetrova/56/'

In [21]:
%pip install chart_studio

Note: you may need to restart the kernel to use updated packages.


In [23]:
cs_articles.to_csv('cs_articles.csv', sep = ",")