In [1]:
import pandas as pd

In [2]:
# load the preprocessed dataset from the working directory into a DataFrame
DATA_FILE = 'preprocessed_data.csv'
df = pd.read_csv(DATA_FILE)

In [3]:
# total number of rows in the dataset
row_total = df.shape[0]
print('Total number of rows:', row_total)

Total number of rows: 19631


In [4]:
# list the column labels available in the loaded dataset
df.columns

Index(['Article Title', 'Source Title', 'Language',
       'Times Cited, All Databases', 'Highly Cited Status', 'Hot Paper Status',
       'Publication Year', 'Decade', 'Group', 'WoS Categories new',
       'Research Areas new', 'Keywords Plus lemmatized',
       'Author Keywords lemmatized', 'All Keywords', 'Addresses new',
       'Affiliations new', 'Abstract lemmatized'],
      dtype='object')

In [5]:
# number of papers per publication year, busiest year first
df['Publication Year'].value_counts(sort=True, ascending=False)

2023    3705
2022    3359
2021    2806
2024    2657
2020    2172
2019    1513
2018     885
2017     515
2016     492
2015     248
2014     245
2012     133
2008     125
2010     125
2013     109
2009      90
2011      75
2007      60
2006      44
2003      41
2005      33
2004      29
2001      25
2002      24
2000      23
1998      19
1995      15
1997      14
1996       9
1994       8
1999       7
1992       6
1991       5
1993       3
1989       3
1986       2
1987       2
1990       2
1988       2
1973       1
Name: Publication Year, dtype: int64

In [6]:
# Show the title(s) of the paper(s) published in the earliest year of the dataset.
# The year is taken from the data instead of being typed in by hand, so this
# cell keeps matching its description if the dataset is regenerated.
earliest_year = df['Publication Year'].min()
# a single .loc call selects rows and column together (no chained indexing)
df.loc[df['Publication Year'] == earliest_year, 'Article Title']

1531    MACHINE LEARNING OF CORRELATION
Name: Article Title, dtype: object

In [7]:
# largest and smallest citation count over all papers
times_cited = df['Times Cited, All Databases']
print('Max times of cited:', times_cited.max())
print('Min times of cited:', times_cited.min())

Max times of cited: 4493
Min times of cited: 0


In [8]:
# rows whose citation count equals the overall maximum (mask evaluated once)
times_cited = df['Times Cited, All Databases']
top_cited = df[times_cited == times_cited.max()]
# find the title of the most cited paper (first match if several papers tie)
print('Title of the most cited paper:', top_cited['Article Title'].values[0])
# publication year of the most cited paper
print('Publication year of the most cited paper:', top_cited['Publication Year'].values[0])

Title of the most cited paper: Thumbs up? Sentiment classification using machine learning techniques
Publication year of the most cited paper: 2002


In [9]:
# papers per year in chronological order; ordering by count is skipped
# because the result is re-ordered by year (the index) right away
publication_data = df['Publication Year'].value_counts(sort=False).sort_index()
publication_data

1973       1
1986       2
1987       2
1988       2
1989       3
1990       2
1991       5
1992       6
1993       3
1994       8
1995      15
1996       9
1997      14
1998      19
1999       7
2000      23
2001      25
2002      24
2003      41
2004      29
2005      33
2006      44
2007      60
2008     125
2009      90
2010     125
2011      75
2012     133
2013     109
2014     245
2015     248
2016     492
2017     515
2018     885
2019    1513
2020    2172
2021    2806
2022    3359
2023    3705
2024    2657
Name: Publication Year, dtype: int64

In [10]:
# draw a line plot to show the number of papers published each year with pyecharts
import os

import pyecharts.options as opts
from pyecharts.charts import Line

# make sure the output folder exists before rendering, so the cell also
# works on a fresh checkout (render is not assumed to create it)
os.makedirs("visualize", exist_ok=True)

# years become strings because they are used as labels of a category axis
x_data = [str(year) for year in publication_data.index.tolist()]
y_data = publication_data.values.tolist()

# Title and axis names are set so the saved HTML chart can be read on its own.
# The render call stays the last expression, so the cell still displays the
# path of the written file.
(
    Line()
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Publications per Year", pos_left="center"),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            name="Publication Year",
            name_location="middle",
            name_gap=30,
        ),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            name="Number of Papers",
        ),
    )
    .add_xaxis(xaxis_data=x_data)
    .add_yaxis(
        series_name="",
        y_axis=y_data,
        symbol="emptyCircle",
        is_symbol_show=True,
        is_smooth=True,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .render("visualize/Publications_Distribution.html")
)

'/Users/ZOU/Desktop/code/visualize/Publications_Distribution.html'

In [11]:
# collapse the duplicated label 'English; English' into plain 'English'
df['Language'] = df['Language'].replace({'English; English': 'English'})

In [12]:
# number of papers per language, most common language first
df['Language'].value_counts(sort=True, ascending=False)

English        19317
Spanish          105
Russian           56
Turkish           45
Portuguese        37
Chinese           19
German            15
French            12
Italian            5
Croatian           3
Czech              3
Unspecified        2
Bulgarian          2
Slovenian          2
Dutch              2
Slovak             1
Malay              1
Japanese           1
Greek              1
Korean             1
Swedish            1
Name: Language, dtype: int64

In [13]:
# change df['Language'].value_counts() into a list of [language, count] lists
language_counts = df['Language'].value_counts()
lang_count = language_counts.values.tolist()
lang = [[name, total] for name, total in zip(language_counts.index.tolist(), lang_count)]

# remove English
lang_no_Eng = [pair for pair in lang if pair[0] != 'English']

In [14]:
from pyecharts.charts import Pie

# pie chart of the [language, count] pairs in lang_no_Eng,
# written to an HTML file; the calls run in the same order as a fluent chain
language_pie = Pie()
language_pie.add("", lang_no_Eng, center=["40%", "50%"])
language_pie.set_global_opts(
    title_opts=opts.TitleOpts(title="Language Distribution"),
    legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
)
# each slice label shows "<name>: <value>"
language_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
# c receives whatever render() returns
c = language_pie.render("visualize/Language_Distribution_N.html")

In [15]:
# count how many publications each source title has, most productive first
df['Source Title'].value_counts(sort=True, ascending=False)

IEEE JOURNAL OF SELECTED TOPICS IN APPLIED EARTH OBSERVATIONS AND REMOTE SENSING                                                                        364
ISPRS INTERNATIONAL JOURNAL OF GEO-INFORMATION                                                                                                          310
FRONTIERS IN PSYCHOLOGY                                                                                                                                 242
FRONTIERS IN HUMAN NEUROSCIENCE                                                                                                                         192
ISPRS JOURNAL OF PHOTOGRAMMETRY AND REMOTE SENSING                                                                                                      182
                                                                                                                                                       ... 
TILBURG LAW REVIEW-JOURNAL OF INTERNATIONAL AND EUROPEAN LAW    

In [16]:
# total citations received by each source title, largest total first
(
    df.groupby('Source Title')['Times Cited, All Databases']
    .sum()
    .sort_values(ascending=False)
)

Source Title
ISPRS JOURNAL OF PHOTOGRAMMETRY AND REMOTE SENSING                                                                                  17008
IEEE JOURNAL OF SELECTED TOPICS IN APPLIED EARTH OBSERVATIONS AND REMOTE SENSING                                                    11033
EUROPEAN JOURNAL OF OPERATIONAL RESEARCH                                                                                             7740
ACCIDENT ANALYSIS AND PREVENTION                                                                                                     4621
PROCEEDINGS OF THE 2002 CONFERENCE ON EMPIRICAL METHODS IN NATURAL LANGUAGE PROCESSING                                               4514
                                                                                                                                    ...  
PROCEEDINGS OF THE 2022 ACM CONFERENCE ON INTERNATIONAL COMPUTING EDUCATION RESEARCH, ICER 2022, VOL. 1                                 0
GEOBIA 2010: GEOGRAPH

In [17]:
# draw a scatter plot of citation count against publication year with pyecharts
# (papers with 0 citations are left out)
from pyecharts.charts import Scatter

# filter the cited papers once and reuse the result for both axes
cited_papers = df[df['Times Cited, All Databases'] != 0]
x_data = cited_papers['Publication Year'].values.tolist()
y_data = cited_papers['Times Cited, All Databases'].values.tolist()

# The x-axis limits follow the year range of the whole dataset instead of
# hard-coded years, so newly added years are not clipped. They are cast to
# built-in int so the chart options contain only plain Python numbers.
first_year = int(df['Publication Year'].min())
last_year = int(df['Publication Year'].max())

# scatter receives whatever render() returns
scatter = (
    Scatter()
    .add_xaxis(xaxis_data=x_data)
    .add_yaxis(
        series_name="",
        y_axis=y_data,
        symbol_size=5,
        label_opts=opts.LabelOpts(is_show=False),
        # mark the highest citation count on the chart
        markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max")]),
    )
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(
            type_="value",
            name="Publication Year",
            name_location="middle",
            name_gap=30,
            min_=first_year,
            max_=last_year,
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            name="Times Cited",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
    )
    .render("visualize/Citation_Scatter.html")
)

In [29]:
# citation count at the 97.5th percentile of all papers
df['Times Cited, All Databases'].quantile(q=0.975)

100.0