In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
try:
    df = pd.read_csv('data/publications.csv')
except FileNotFoundError:
    print('file do not found!')

In [3]:
print('first 5 rows')
display(df.head(5))

print('last 5 rows')
display(df.tail(5))

first 5 rows


Unnamed: 0,Name,Web of Science Documents,Times Cited,Collab-CNCI,Rank,% Docs Cited,Category Normalized Citation Impact,% Documents in Top 1%,% Documents in Top 10%,Documents in Top 1%,Documents in Top 10%,year
0,SWITZERLAND,24154,2705248,0.946748,8,97.93,1.024815,0.89,10.87,97,230,2023
1,CHINA,2185,157320,1.575928,44,99.6,0.900623,2.98,19.26,323,121,2014
2,CHINA,6896,744768,1.032983,42,95.23,1.679004,1.08,11.36,455,662,2013
3,UNITED KINGDOM,2399,177526,1.586585,3,99.21,1.444246,1.63,10.2,98,2463,2005
4,ITALY,10753,301084,0.812773,2,98.35,1.252122,0.81,17.43,440,134,2004


last 5 rows


Unnamed: 0,Name,Web of Science Documents,Times Cited,Collab-CNCI,Rank,% Docs Cited,Category Normalized Citation Impact,% Documents in Top 1%,% Documents in Top 10%,Documents in Top 1%,Documents in Top 10%,year
995,UNITED KINGDOM,22195,2130720,1.276037,46,97.97,0.971705,2.9,20.73,274,1803,2024
996,BRAZIL,27344,1832048,1.565469,42,99.16,1.57703,1.39,22.49,143,1514,2020
997,SWITZERLAND,14360,1033920,0.853179,44,96.86,1.258788,2.95,15.25,224,830,2005
998,SWITZERLAND,5423,591107,0.838366,8,97.8,1.508564,0.87,18.58,151,707,2014
999,CHINA,23053,2996890,1.13527,36,96.31,1.458377,0.5,23.07,214,2073,2014


In [4]:
#Shape of the dataset
row, col = df.shape

print(f'dataset contain {row} X {col}')

dataset contain 1000 X 12


In [5]:
missing_count = df.isnull().sum().sum()
print(f'Total missing values : {missing_count}')

duplicate_count = df.duplicated()
print(duplicate_count)

Total missing values : 0
0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Name                                 1000 non-null   object 
 1   Web of Science Documents             1000 non-null   int64  
 2   Times Cited                          1000 non-null   int64  
 3   Collab-CNCI                          1000 non-null   float64
 4   Rank                                 1000 non-null   int64  
 5   % Docs Cited                         1000 non-null   float64
 6   Category Normalized Citation Impact  1000 non-null   float64
 7   % Documents in Top 1%                1000 non-null   float64
 8   % Documents in Top 10%               1000 non-null   float64
 9   Documents in Top 1%                  1000 non-null   int64  
 10  Documents in Top 10%                 1000 non-null   int64  
 11  year                           

In [7]:
#Extract Unique Countries
df['Name'].unique()


array(['SWITZERLAND', 'CHINA', 'UNITED KINGDOM', 'ITALY', 'SPAIN',
       'AUSTRALIA', 'BRAZIL', 'INDIA', 'CANADA', 'ENGLAND', 'SOUTH KOREA',
       'SWEDEN', 'GERMANY', 'FRANCE', 'JAPAN', 'USA', 'NETHERLANDS'],
      dtype=object)

In [8]:
#In each year 
repeated_countries = pd.crosstab(df['year'], df['Name'])
print(repeated_countries)

Name  AUSTRALIA  BRAZIL  CANADA  CHINA  ENGLAND  FRANCE  GERMANY  INDIA  \
year                                                                      
2003          6       0       5      2        4       3        3      2   
2004          2       3       5      4        1       1        1      2   
2005          5       3       2      2        4       5        1      0   
2006          5       1       6      1        1       5        2      2   
2007          2       2       2     10        2       4        5      3   
2008          6       4       4      2        4       4        1      4   
2009          1       4       2      3        0       0        0      4   
2010          3       2       1      2        1       1        2      5   
2011          1       5       3      3        2       4        3      0   
2012          4       6       0      1        2       1        1      3   
2013          0       1       1      2        1       1        2      3   
2014          0       2  

In [9]:
df.describe()

Unnamed: 0,Web of Science Documents,Times Cited,Collab-CNCI,Rank,% Docs Cited,Category Normalized Citation Impact,% Documents in Top 1%,% Documents in Top 10%,Documents in Top 1%,Documents in Top 10%,year
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,14861.699,1296497.0,1.214932,24.722,97.41069,1.291637,1.7676,17.58979,261.327,1497.457,2013.86
std,8390.150609,967063.3,0.230261,14.108145,1.419199,0.234461,0.71711,4.3631,136.904576,844.902713,6.748477
min,512.0,21846.0,0.800182,1.0,95.0,0.900623,0.5,10.02,12.0,111.0,2003.0
25%,7616.75,507670.0,1.029402,12.0,96.15,1.08702,1.13,13.77,142.0,736.75,2008.0
50%,14711.0,1064920.0,1.214383,25.0,97.385,1.292028,1.81,17.39,261.5,1481.0,2014.0
75%,22022.25,1899791.0,1.415986,37.0,98.6525,1.499628,2.39,21.64,382.0,2202.25,2020.0
max,29959.0,4327668.0,1.599646,49.0,99.89,1.698257,3.0,24.99,499.0,2999.0,2025.0


In [10]:
# 1. Renaming Columns for better readability
df.rename(columns={
    'Name': 'Country',
    'Category Normalized Citation Impact': 'CNCI',
    'Web of Science Documents': 'Documents',
    'year': 'Year'
}, inplace=True)

# 2. Precision Formatting
df = df.round({
    'Collab-CNCI': 2,
    'CNCI': 2,
    '% Docs Cited': 2,
    '% Documents in Top 1%': 2,
    '% Documents in Top 10%': 2
})

print("Columns Renamed and Precision Set.")
display(df.head(3))

Columns Renamed and Precision Set.


Unnamed: 0,Country,Documents,Times Cited,Collab-CNCI,Rank,% Docs Cited,CNCI,% Documents in Top 1%,% Documents in Top 10%,Documents in Top 1%,Documents in Top 10%,Year
0,SWITZERLAND,24154,2705248,0.95,8,97.93,1.02,0.89,10.87,97,230,2023
1,CHINA,2185,157320,1.58,44,99.6,0.9,2.98,19.26,323,121,2014
2,CHINA,6896,744768,1.03,42,95.23,1.68,1.08,11.36,455,662,2013


In [11]:
df[(df['Country'].isin(['UNITED KINGDOM', 'ENGLAND'])) & (df['Year'] == 2017)]

Unnamed: 0,Country,Documents,Times Cited,Collab-CNCI,Rank,% Docs Cited,CNCI,% Documents in Top 1%,% Documents in Top 10%,Documents in Top 1%,Documents in Top 10%,Year
21,ENGLAND,19860,2919420,1.21,40,95.67,1.17,0.78,23.87,487,1734,2017
53,UNITED KINGDOM,25789,2811001,1.06,3,96.24,1.22,1.93,21.11,465,928,2017
458,ENGLAND,14562,1266894,1.25,16,99.05,0.93,1.55,12.52,246,1317,2017
469,ENGLAND,29654,3232286,1.56,44,96.54,1.1,2.24,15.46,397,1881,2017
479,ENGLAND,17013,1344027,1.33,32,99.17,1.06,2.84,20.34,138,1006,2017
664,UNITED KINGDOM,5051,136377,1.16,7,95.94,0.98,2.78,22.31,457,347,2017


In [12]:
df.loc[df['Country'] == 'ENGLAND', 'Country'] = 'UNITED KINGDOM'

df[df['Year'] == 2017].head()

Unnamed: 0,Country,Documents,Times Cited,Collab-CNCI,Rank,% Docs Cited,CNCI,% Documents in Top 1%,% Documents in Top 10%,Documents in Top 1%,Documents in Top 10%,Year
14,SOUTH KOREA,16657,1565758,1.06,28,96.75,1.63,1.18,19.72,130,2775,2017
21,UNITED KINGDOM,19860,2919420,1.21,40,95.67,1.17,0.78,23.87,487,1734,2017
28,USA,11812,425232,1.6,30,99.67,1.63,1.43,10.23,289,2644,2017
41,BRAZIL,28327,3257605,1.49,5,98.35,1.39,2.86,24.16,485,995,2017
53,UNITED KINGDOM,25789,2811001,1.06,3,96.24,1.22,1.93,21.11,465,928,2017


In [13]:
agg_rules = {
    'Documents' : 'sum',
    'Times Cited' : 'sum',
    '% Docs Cited' : 'mean',
    'Collab-CNCI' : 'mean',
    'Rank' : 'min',
    'CNCI' : 'mean',
    '% Documents in Top 1%' : 'mean',
    '% Documents in Top 10%' : 'mean',
    'Documents in Top 1%' : 'sum',
    'Documents in Top 10%' : 'sum'
}


df_clean = df.groupby(['Country', 'Year'], as_index = False).agg(agg_rules)
df = df_clean.round(2)

df.head(30)


Unnamed: 0,Country,Year,Documents,Times Cited,% Docs Cited,Collab-CNCI,Rank,CNCI,% Documents in Top 1%,% Documents in Top 10%,Documents in Top 1%,Documents in Top 10%
0,AUSTRALIA,2003,73479,3965411,96.88,1.33,1,1.39,1.76,13.41,1952,7645
1,AUSTRALIA,2004,34122,4026396,97.88,1.34,1,1.47,2.02,11.29,561,2594
2,AUSTRALIA,2005,76888,4458568,97.31,1.34,7,1.15,1.83,16.21,1177,8504
3,AUSTRALIA,2006,69315,5190781,96.39,1.2,12,1.24,1.47,19.8,1369,6318
4,AUSTRALIA,2007,11637,1443993,98.12,1.35,24,1.36,1.68,14.6,715,764
5,AUSTRALIA,2008,124250,9851009,96.71,1.1,5,1.37,2.48,20.33,1875,7333
6,AUSTRALIA,2009,22569,767346,99.72,1.08,21,1.4,2.82,21.41,87,2861
7,AUSTRALIA,2010,39018,2236904,96.43,1.24,27,1.25,2.31,16.38,579,4933
8,AUSTRALIA,2011,8580,909480,95.52,1.21,19,1.69,1.61,10.72,209,186
9,AUSTRALIA,2012,64809,4733799,97.47,1.32,8,1.16,1.8,20.84,1536,7240


In [14]:
#Now, Create Horizontal Bar Chart for Volume and Quality
df_volume = df.groupby('Country')['Documents'].sum().reset_index()
top_volume = df_volume.sort_values(by='Documents', ascending=False).head(10)
df_quality = df.groupby('Country')['CNCI'].mean().reset_index()
top_quality = df_quality.sort_values(by='CNCI', ascending=False).head(10)

#Volume Chart
fig_vol = px.bar(
    top_volume,
    x='Documents',
    y='Country',
    orientation = 'h',
    text_auto='.2s',
    color='Documents',
    color_continuous_scale='thermal',
    labels = {
        'Documents' : 'Total Documents',
        'Country' : 'Country Name'
    }
)

fig_vol.update_layout(
    yaxis = dict(autorange='reversed'),
    coloraxis_showscale = False
)
fig_vol.show()

#Quality Chart
fig_qual = px.bar(
    top_quality,
    x='CNCI',
    y='Country',
    orientation='h',
    text_auto = '.2s',
    color='CNCI',
    color_continuous_scale='plasma',
    labels = {
        'CNCI' : 'Total CNCI',
        'Country' : 'Country Name'
    }
)

fig_qual.update_layout(
    yaxis = dict(autorange='reversed'),
    coloraxis_showscale = False
)

fig_qual.add_vline(
    x=1.0,
    line_dash='dash',
    line_color = 'red',
    annotation_text = 'Global Avg (1.0)'
)

fig_qual.show()

In [38]:
#Find Distribution of Histogram

#-step1 : find mean and median
mean_val = df['% Docs Cited'].mean()
median_val = df['% Docs Cited'].median()

# step2 : Create histogram
fig_dist = px.histogram(
    df,
    x='% Docs Cited',
    nbins=50,
    marginal='box',
    title='Distribution of % Docs Cited',
    color_discrete_sequence=['green'],
    opacity=0.5
)


fig_dist.add_vline(x=mean_val, line_dash='dash', line_color='red')
fig_dist.add_vline(x=median_val, line_dash='dot', line_color='blue')


fig_dist.add_annotation(
    x=mean_val, 
    y=1.15,
    yref='paper',
    text=f"Mean : {mean_val:.1f}%",
    showarrow=False,
    font=dict(color='red')
)

fig_dist.add_annotation(
    x=median_val,
    y=1.05,
    yref='paper',
    text=f"Median : {median_val:.1f}%",
    showarrow=False,
    font=dict(color='blue')
)

fig_dist.update_layout(
    xaxis_title = 'Percentage of Doc Cited',
    yaxis_title = 'Count',
    bargap = 0.1
)

fig_dist.show()

