In [None]:
import os
import json
import math
import numpy as np
import scipy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:
# Box plot of the data

In [None]:
def normalize_age(ref_years, year, threshold=20):
    """Convert reference publication years into ages relative to ``year``.

    Args:
        ref_years: Iterable of publication years. Entries that are falsy
            (e.g. ``None``, ``0``, ``""``) or NaN are skipped.
        year: The year the ages are measured from.
        threshold: Ages strictly greater than this are clamped by treating
            the reference as published in ``year - threshold``.

    Returns:
        A list with one age (``year`` minus publication year) per usable
        entry, in input order.
    """
    def _is_usable(value):
        # Falsy entries are rejected before float() is ever attempted.
        return bool(value) and not math.isnan(float(value))

    ages = []
    for raw_year in ref_years:
        if not _is_usable(raw_year):
            continue
        published = int(raw_year)
        if year - published > threshold:
            # Too old: pretend it was published exactly `threshold` years ago.
            published = int(year - threshold)
        ages.append(year - published)
    return ages

In [None]:
# Gather the normalized reference ages for each yearly output file of one category.
cat = "cs.AI"
threshold = 50

results = {}
for year in range(2013, 2024):
    output_path = os.path.join(os.getcwd(), f"../data/{cat}/{year}/output.json")
    # A year without an output.json simply gets no entry in `results`.
    if not os.path.exists(output_path):
        continue

    with open(output_path, "r") as fp:
        content = json.load(fp)
        # Flatten the "year" field of every reference of every paper in the file.
        reference_year = [
            ref["year"]
            for paper in content
            for ref in paper["reference"]
        ]

        reference_age = normalize_age(reference_year, year, threshold)
        results[year] = reference_age

In [None]:
# One column per year (keyed by the year as a string) holding that year's
# reference ages. Wrapping each list in pd.Series lets lists of different
# lengths share one frame; shorter columns are padded with NaN.
# Every column is built from its own year's entry in `results`; a missing year
# raises KeyError rather than silently reusing another year's data.
df = pd.DataFrame({
    str(year): pd.Series(results[year])
    for year in range(2013, 2024)
})

In [None]:
# Preview the first rows of the per-year reference-age table.
df.head()

In [None]:
# Figure scaffolding: seaborn theme, background colour, and the fig/ax pair
# that the following cells style and draw on.
sns.set(style='whitegrid')
facecolor = '#eaeaf2'
# Turn pyplot's interactive mode off.
# NOTE(review): presumably so the figure is not rendered until it has been
# fully styled in later cells — confirm.
plt.ioff()
fig, ax = plt.subplots(figsize=(10, 6), facecolor=facecolor)

In [None]:
# Shared text styling for the figure.
font_color = '#525252'
csfont = {'fontname':'Georgia'}
hfont = {'fontname':'Calibri'}

# The plotted values are reference ages (citing year minus cited year), so the
# y axis is labelled in years.
ax.set_ylabel('Citation age (years)', fontsize=16, color=font_color, **hfont)
# Apply the same font styling to the tick labels of both axes.
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set(fontsize=16, color=font_color, **hfont)

In [None]:
# Figure-level title plus an axes-level subtitle. The explicit `fig` / `ax`
# objects are used so the text lands on this figure regardless of which figure
# pyplot currently considers active.
title = 'Citation Age, 2013–2023'
fig.suptitle(title, y=.97, fontsize=22, color=font_color, **csfont)
subtitle = 'Source: Arxiv'
ax.set_title(subtitle, fontsize=18, pad=10, color=font_color, **hfont)
# Leave headroom above the axes for the two title lines.
fig.subplots_adjust(top=0.85)

In [None]:
# Recolour each box's marker line so its face and edge match the box fill.
# NOTE: only has an effect once the box plot has been drawn on `ax`.
# Depending on the Matplotlib version, the box patches are exposed through
# `ax.artists` (older releases) or `ax.patches` (newer releases), so fall back
# to the latter when the former is empty.
boxes = list(ax.artists) or list(ax.patches)
for i, box in enumerate(boxes):
    col = box.get_facecolor()
    # Assumes 6 Line2D objects per box with the outlier markers at offset 5 —
    # TODO confirm against the installed seaborn/matplotlib versions.
    plt.setp(ax.lines[i*6+5], mfc=col, mec=col)

In [None]:
# Write a value label on each box, taken from one of the box's lines.
# NOTE: requires the box plot to already be drawn on `ax`; before that,
# `lines` is empty and the tick positions are not the box positions.
lines = ax.get_lines()
categories = ax.get_xticks()

# The loop variable is deliberately not named `cat`: that name holds the
# category string used to build the data paths and must not be overwritten.
for tick in categories:
    # Assumes 6 lines per box with (presumably) the median line at offset 4 —
    # TODO confirm against the installed seaborn/matplotlib versions.
    y = round(lines[4 + int(tick) * 6].get_ydata()[0], 1)
    ax.text(
        tick,
        y,
        f'{y}',
        ha='center',
        va='center',
        fontweight='semibold',
        size=12,
        color='white',
        bbox=dict(facecolor='#828282', edgecolor='#828282')
    )
    

In [None]:
# Draw one box per year column of `df` onto the prepared axes. Passing `ax`
# explicitly avoids depending on which axes pyplot currently treats as active.
# NOTE(review): `fliersize` and flierprops['markersize'] both describe the
# outlier marker size — confirm which value is intended.
ax = sns.boxplot(data=df,
                 ax=ax,
                 palette='Set3',
                 linewidth=1.2,
                 fliersize=2,
                 order=['2013', '2014', '2015', '2016', '2017', '2018','2019','2020','2021','2022','2023'],
                 flierprops=dict(marker='o', markersize=4))

In [None]:
# regression

In [None]:
# correlation test

In [None]:
# Number of papers increases 

In [None]:
# Load the statistics table of the category used for the regression fit.
cat = "cs.CV"
stats_path = os.path.join(os.getcwd(), f"../data/{cat}/statistics.csv")
# First row is the header; first column becomes the index.
statistics = pd.read_csv(stats_path, header=0, index_col=0)
print(statistics.head())

In [None]:
# Fit a straight line through the first 11 "Mean" values, using their
# positions 0..10 as the x coordinate, then plot data and fit together.
x = list(range(11))
y = statistics["Mean"][0:11]
coef = np.polyfit(x, y, 1)  # degree-1 polynomial: [slope, intercept]
print(coef)
poly1d_fn = np.poly1d(coef)

plt.plot(x, y, 'yo')              # observed values: yellow circle markers
plt.plot(x, poly1d_fn(x), '--k')  # fitted line: black dashed
plt.xlim(-1, 11)
plt.ylim(0, 12)