### Daeun Kim (Candidate of JPA), AI and Data Engineer

In [1]:
import requests
import pandas as pd
import re
from tqdm import tqdm
import spacy
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

### Data part

In [2]:
# Base API endpoint
api_endpoint = 'https://search.worldbank.org/api/v2/wds'

# Parameters for the API request
params = {
    'format': 'json',
    'fct': 'docty_exact,count_exact,lang_exact,disclstat_exact',
    'rows': 500,  # Max rows per request
    'apilang': 'en',
    'str_docdt': '2010-01-01',
    'end_docdt': '2024-07-01',
    'majdocty_key': '906674',  # Economic and Sector Work (ESW)
    'order': 'desc',
    'os': 0,  
    'srt': 'docdt',
    'lang_key':'120701',  # English
}

# Initialize variables
all_documents = []
offset = 0
rows_per_request = 500  # Adjust based on API limitations

# First request to get the total number of records
params['os'] = offset
response = requests.get(api_endpoint, params=params)
data = response.json()

total_records = int(data['total'])
print(f'Total records found: {total_records}')

# Fetch all records
while offset < total_records:
    params['os'] = offset
    response = requests.get(api_endpoint, params=params)
    data = response.json()
    documents = data.get('documents', {})
    for doc_id, doc_info in documents.items():
        all_documents.append(doc_info)
    offset += rows_per_request
    print(f'Fetched {len(all_documents)} records so far...')

print(f'Total records fetched: {len(all_documents)}')


Total records found: 2920
Fetched 501 records so far...
Fetched 1002 records so far...
Fetched 1503 records so far...
Fetched 2004 records so far...
Fetched 2505 records so far...
Fetched 2926 records so far...
Total records fetched: 2926


In [3]:


df = pd.DataFrame(columns=['id','title','year'])

for doc in tqdm(all_documents):
    title = doc.get('display_title')
    date = doc.get('docdt')
    id = doc.get('id')

    df = pd.concat([df, pd.DataFrame({'id': [id],'title': [title], 'year': [date] })],ignore_index=True)

df.head()

100%|██████████| 2926/2926 [00:00<00:00, 9747.46it/s] 


Unnamed: 0,id,title,year
0,34366193,Suriname - Poverty and Equity Assessment,2024-07-01T00:00:00Z
1,34369413,Serbia Policy Notes 2024,2024-06-30T00:00:00Z
2,34351433,Maldives - Country Climate and\n De...,2024-06-30T00:00:00Z
3,34354089,Microfinance in the Kyrgyz Republic\n ...,2024-06-28T00:00:00Z
4,34354258,Operational Note - How to use the\n ...,2024-06-28T00:00:00Z


In [4]:
df.isna().sum()

id       6
title    6
year     6
dtype: int64

In [5]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.isna().sum()

id       0
title    0
year     0
dtype: int64

In [6]:
def clean_text(text):
    text = re.sub(r'\n', '', text)
    text = text.replace('  ', ' ')
    return text.strip()

df['title'] = df['title'].apply(clean_text)

In [7]:
df.head()

Unnamed: 0,id,title,year
0,34366193,Suriname - Poverty and Equity Assessment,2024-07-01T00:00:00Z
1,34369413,Serbia Policy Notes 2024,2024-06-30T00:00:00Z
2,34351433,Maldives - Country Climate and Developmen...,2024-06-30T00:00:00Z
3,34354089,Microfinance in the Kyrgyz Republic : Pov...,2024-06-28T00:00:00Z
4,34354258,Operational Note - How to use the Bulsho ...,2024-06-28T00:00:00Z


In [None]:
'''
import nltk
import locationtagger

nltk.downloader.download('maxent_ne_chunker')
nltk.downloader.download('words')
nltk.downloader.download('treebank')
nltk.download('maxent_ne_chunker_tab')
nltk.downloader.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

df['country'] = None

for i in tqdm(range(len(df))):
    text = df.loc[i, 'title']

    # Extract country names
    location_ent = locationtagger.find_locations(text = text)

    if location_ent.countries:
        df.loc[i, 'country'] = location_ent.countries[0]


df.head()
'''

In [8]:
nlp = spacy.load('en_core_web_trf')

# add a country column to the dataframe
df['country'] = None

# extract country names from the title
for i in tqdm(range(len(df))):
    try:
        title = df.loc[i,'title']
    except Exception as e:
        print(df.loc[i])

    doc = nlp(title)
    for ent in doc.ents:
        if ent.label_ == 'LOC':
            df.loc[i,'country'] = ent.text
            break
        if ent.label_ == 'GPE':
            df.loc[i,'country'] = ent.text
            break

  model.load_state_dict(torch.load(filelike, map_location=device))
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision)

In [9]:
df.loc[~df['country'].isna()].shape
df.head(10)

Unnamed: 0,id,title,year,country
0,34366193,Suriname - Poverty and Equity Assessment,2024-07-01T00:00:00Z,Suriname
1,34369413,Serbia Policy Notes 2024,2024-06-30T00:00:00Z,Serbia
2,34351433,Maldives - Country Climate and Developmen...,2024-06-30T00:00:00Z,
3,34354089,Microfinance in the Kyrgyz Republic : Pov...,2024-06-28T00:00:00Z,the Kyrgyz Republic
4,34354258,Operational Note - How to use the Bulsho ...,2024-06-28T00:00:00Z,Somalia
5,34354314,A Local Social Contract Platform for Soma...,2024-06-28T00:00:00Z,Somalia
6,34351594,Unleashing Aspirations While Ensuring Opp...,2024-06-27T00:00:00Z,Latin America
7,34351724,Maghreb PASA Synthesis Policy Note - Magh...,2024-06-27T00:00:00Z,
8,34350917,"Green, Resilient, Inclusive, and Learning...",2024-06-26T00:00:00Z,Brazil
9,34350771,Paraguay - Country Economic Memorandum : ...,2024-06-26T00:00:00Z,


In [10]:
import plotly.express as px

In [11]:
valid_data = df.loc[~df['year'].isna()]
valid_data = valid_data.loc[~valid_data['country'].isna()]

In [12]:
valid_data.shape

(2270, 4)

In [13]:
valid_data.head()

Unnamed: 0,id,title,year,country
0,34366193,Suriname - Poverty and Equity Assessment,2024-07-01T00:00:00Z,Suriname
1,34369413,Serbia Policy Notes 2024,2024-06-30T00:00:00Z,Serbia
3,34354089,Microfinance in the Kyrgyz Republic : Pov...,2024-06-28T00:00:00Z,the Kyrgyz Republic
4,34354258,Operational Note - How to use the Bulsho ...,2024-06-28T00:00:00Z,Somalia
5,34354314,A Local Social Contract Platform for Soma...,2024-06-28T00:00:00Z,Somalia


In [14]:
doc_counts = valid_data.groupby(['country', 'year']).size().reset_index(name='count')
doc_counts.head()

Unnamed: 0,country,year,count
0,Accra,2010-06-01T00:00:00Z,1
1,Aden,2012-01-01T00:00:00Z,1
2,Afghanistan,2010-01-01T00:00:00Z,1
3,Afghanistan,2010-04-01T00:00:00Z,2
4,Afghanistan,2011-02-01T00:00:00Z,1


In [15]:
doc_counts['year'] = pd.to_datetime(doc_counts['year']).dt.year
print(doc_counts[['year']].head()) 
print(doc_counts['year'].unique())

# Calculate the number of documents (publications) per year
total_docs_per_year = doc_counts.groupby('year')['count'].transform('sum')
print(total_docs_per_year.head()) 

# Calculate percentage
doc_counts['percentage'] = (doc_counts['count'] / total_docs_per_year) * 100
print(doc_counts[['year', 'country', 'count', 'percentage']].head())


   year
0  2010
1  2012
2  2010
3  2010
4  2011
[2010 2012 2011 2013 2014 2015 2017 2024 2022 2018 2019 2021 2020 2023
 2016]
0    373
1    252
2    373
3    373
4    264
Name: count, dtype: int64
   year      country  count  percentage
0  2010        Accra      1    0.268097
1  2012         Aden      1    0.396825
2  2010  Afghanistan      1    0.268097
3  2010  Afghanistan      2    0.536193
4  2011  Afghanistan      1    0.378788


### Visualization Part

In [19]:
#save file 
#doc_counts.to_csv('doc_counts.csv', index=False)

In [None]:
# use file
'''
import pandas as pd

doc_counts = pd.read_csv('doc_counts.csv')
doc_counts['year'] = pd.to_datetime(doc_counts['year']).dt.year
doc_counts.head()
'''

In [16]:
# percentage - bar
import plotly.express as px

year_order = list(range(2010, 2025))
doc_counts['year'] = pd.Categorical(doc_counts['year'], categories=year_order, ordered=True)

fig_percentage = px.bar(
    doc_counts,
    x='country',
    y='percentage',
    animation_frame='year',
    animation_group='country',
    color='country',
    range_y=[0, doc_counts['percentage'].max()],  # 퍼센티지는 항상 0에서 100 사이
    title='Evolution of the Percentage of Documents by Country and Year',
    labels={'percentage': 'Percentage of Documents (%)', 'country': 'Country'},
    category_orders={"year": year_order}
)

# update layout
fig_percentage.update_layout(
    height=800,
    width=1200,
    showlegend=False,  
    transition={'duration': 50},
    xaxis={'categoryorder': 'total descending'}, #sequence
)

fig_percentage.layout.updatemenus[0].buttons[0].args[1]['frame']['redraw'] = False
fig_percentage.layout.updatemenus[0].buttons[0].args[1]['mode'] = 'immediate'


fig_percentage.frames = sorted(fig_percentage.frames, key=lambda frame: int(frame.name))
fig_percentage.show()


In [18]:
# percentage - pi
import plotly.graph_objects as go
import pandas as pd


year_order = list(range(2010, 2025))
doc_counts['year'] = pd.Categorical(doc_counts['year'], categories=year_order, ordered=True)

# base pi-chart
first_year = year_order[0]
first_data = doc_counts[doc_counts['year'] == first_year]

fig = go.Figure(
    data=[go.Pie(labels=first_data['country'], values=first_data['percentage'], hole=0.3)],
    layout=go.Layout(
        title=f'Evolution of the Percentage of Documents by Country and Year: {first_year}',
        height=800,
        width=1200
    )
)

# animated
frames = []
for year in year_order:
    year_data = doc_counts[doc_counts['year'] == year]
    frames.append(go.Frame(
        data=[go.Pie(labels=year_data['country'], values=year_data['percentage'], hole=0.3)],
        layout=go.Layout(title=f'Evolution of the Percentage of Documents by Country and Year: {year}')
    ))

fig.frames = frames

fig.update_layout(
    updatemenus=[dict(type="buttons", showactive=False, buttons=[dict(label="Play", method="animate", args=[None, {"frame": {"duration": 500, "redraw": True}, "fromcurrent": True, "transition": {"duration": 300}}])])],
)

# setting layout
fig.update_layout(
    title='Evolution of the Percentage of Documents by Country and Year',
    height=800,
    width=1200,
    showlegend=True,
)

fig.show()


In [19]:
# validation - percentage
yearly_percentage_sum = doc_counts.groupby('year')['percentage'].sum()
print("Yearly percentage sums:")
print(yearly_percentage_sum)


Yearly percentage sums:
year
2010    100.0
2011    100.0
2012    100.0
2013    100.0
2014    100.0
2015    100.0
2016    100.0
2017    100.0
2018    100.0
2019    100.0
2020    100.0
2021    100.0
2022    100.0
2023    100.0
2024    100.0
Name: percentage, dtype: float64






In [23]:
# counts - bar
year_order = list(range(2010, 2025))
doc_counts['year'] = pd.Categorical(doc_counts['year'], categories=year_order, ordered=True)

fig_counts = px.bar(
    doc_counts,
    x='country',
    y='count',
    animation_frame='year',
    animation_group='country',
    color='country',
    range_y=[0, doc_counts['count'].max()],
    title='Evolution of the Number of Documents by Country and Year',
    labels={'counts': 'Number of Documents', 'country': 'Country'}
)

fig_counts.update_layout(
    width=1200,
    height=800,
    showlegend=False,
    xaxis={'categoryorder': 'total descending'},
    transition={'duration': 500}
)

fig_counts.frames = sorted(fig_counts.frames, key=lambda frame: int(frame.name))

fig_counts.layout.updatemenus[0].buttons[0].args[1]['frame']['redraw'] = False
fig_counts.layout.updatemenus[0].buttons[0].args[1]['mode'] = 'immediate'


fig_counts.show()


In [24]:
# validation counts 
import plotly.express as px
import pandas as pd

doc_counts = doc_counts.sort_values(by=['year', 'count'], ascending=[True, False])
print("Sorted data by year and count:")
print(doc_counts[['year', 'country', 'count']].head())


# Group the data by year and select the top 10 countries by percentage for each year
top_10_per_year = doc_counts.groupby('year').apply(lambda x: x.nlargest(10, 'percentage')).reset_index(drop=True)

# Add a ranking column for each year to maintain country order based on percentage
top_10_per_year['rank'] = top_10_per_year.groupby('year')['percentage'].rank(ascending=False, method='first')

# Create an animated bar chart for percentages
fig_percentage = px.bar(
    top_10_per_year,
    x='country',
    y='percentage',
    animation_frame='year',
    animation_group='country',
    color='country',
    range_y=[0, doc_counts['percentage'].max()],
    title='Evolution of the Percentage of Documents by Country and Year (Top 10)',
    labels={'percentage': 'Percentage of Documents', 'country': 'Country'},
)

# Update layout
fig_percentage.update_layout(
    height=800,
    width=1200,
    showlegend=False,
    transition={'duration': 500}
)

# Sort countries by percentage rank for each year in the animation and ensure leftmost country has the highest percentage
for frame in fig_percentage.frames:
    frame.data = sorted(
        frame.data, 
        key=lambda d: top_10_per_year[
            (top_10_per_year['year'] == int(frame.name)) & 
            (top_10_per_year['country'] == d['name'])
        ]['rank'].values[0]
    )

    # Reorder x-axis based on rank (ensuring highest percentage is leftmost)
    frame.layout.xaxis.categoryorder = 'array'
    frame.layout.xaxis.categoryarray = top_10_per_year[
        top_10_per_year['year'] == int(frame.name)
    ].sort_values(by='rank')['country'].tolist()

# Show the figure
fig_percentage.show()


Sorted data by year and count:
      year             country  count
474   2010         El Salvador      7
185   2010              Bosnia      6
1423  2010  Sub-Saharan Africa      5
209   2010              Brazil      4
425   2010             Dongnai      4








