In [None]:
import math
import time

import pandas as pd
import requests
from tqdm import tqdm

## Loading the dataset and getting unique affiliations

In [None]:
# Load every paper, then keep one row per distinct affiliation string so each
# affiliation only has to be geocoded once.
df = pd.read_csv('papers.csv')
unique_affiliations_df = pd.DataFrame({'Affiliation': df['Affiliation'].unique()})

# Persist the list; the lookup cells below re-read it from this file.
unique_affiliations_df.to_csv('unique_affiliations.csv')

## Getting countries of unique affiliations

### Using openstreetmap

In [None]:
# Look up the country of every unique affiliation with the Nominatim
# (OpenStreetMap) search API and store it in a new 'country' column.
# Affiliations that cannot be resolved get the literal string 'None', which
# the Elsevier cell below uses to pick the rows it should retry.
# NOTE: pandas >= 2.0 parses the text "None" as NaN when a CSV is read back,
# so any reader of this file should treat NaN and 'None' alike.
unique_affiliations_df = pd.read_csv('unique_affiliations.csv')
unique_affiliations_df['country'] = None

NOMINATIM_SEARCH_URL = "https://nominatim.openstreetmap.org/search"
# Nominatim's usage policy requires an identifying User-Agent and at most one
# request per second; replace the value below with your own app/contact info.
NOMINATIM_HEADERS = {"User-Agent": "affiliation-country-lookup (research notebook)"}
NOMINATIM_MIN_INTERVAL_S = 1.0
NOMINATIM_TIMEOUT_S = 30

none_count = 0

for i in tqdm(range(len(unique_affiliations_df))):
    country = None
    try:
        institute_name = unique_affiliations_df.loc[i, 'Affiliation']
        # `params=` URL-encodes the name, so '&', '#', '?' etc. inside an
        # affiliation can no longer corrupt the query string.
        response = requests.get(
            NOMINATIM_SEARCH_URL,
            params={'q': institute_name, 'format': 'json', 'accept-language': 'en'},
            headers=NOMINATIM_HEADERS,
            timeout=NOMINATIM_TIMEOUT_S,
        )
        if response.status_code == 200:
            results = response.json()
            if len(results) > 0:
                # display_name is a comma-separated address; its last
                # component is taken as the country.
                country = results[0]['display_name'].split(',')[-1].strip()
        else:
            print("API ERROR")
    except Exception:
        # Best effort: a failed request or malformed payload only marks this
        # row as unresolved. KeyboardInterrupt is deliberately not caught.
        print(f"ERROR: {i}")

    if country is None:
        # Covers empty results, non-200 responses and exceptions alike, so
        # every unresolved row is counted and handed to the fallback lookup.
        country = 'None'
        none_count += 1
    # Single .loc write instead of chained df['country'][i] assignment.
    unique_affiliations_df.loc[i, 'country'] = country

    time.sleep(NOMINATIM_MIN_INTERVAL_S)

print(f"None count: {none_count}")
unique_affiliations_df.to_csv('unique_affiliations.csv')

### Using elsevier

In [None]:
# Second pass: retry the affiliations the OpenStreetMap lookup could not
# resolve, this time against the Elsevier affiliation search API. The retried
# rows (resolved or not) are written to none_df.csv.
unique_affiliations_df = pd.read_csv('unique_affiliations.csv')

# Unresolved rows were saved with the literal marker 'None'. pandas >= 2.0
# parses that text as NaN on read, and rows the first pass never filled are
# empty (NaN) as well, so both representations are selected here.
country_column = unique_affiliations_df['country']
unresolved = country_column.isna() | (country_column == 'None')
none_df = unique_affiliations_df[unresolved].reset_index(drop=True).copy()
# Guarantee an object column so country strings can be written even if every
# selected value was read back as a float NaN.
none_df['country'] = none_df['country'].astype(object)

ELSEVIER_SEARCH_URL = "https://api.elsevier.com/content/search/affiliation"
ELSEVIER_TIMEOUT_S = 30
# api_key hidden
api_key = "***************"

none_count = 0

for i in tqdm(range(len(none_df))):
    country = None
    try:
        institute_name = none_df.loc[i, 'Affiliation']
        # `params=` URL-encodes the query so special characters in the
        # affiliation name cannot break the request.
        response = requests.get(
            ELSEVIER_SEARCH_URL,
            params={'query': f"affil({institute_name})", 'apiKey': api_key},
            timeout=ELSEVIER_TIMEOUT_S,
        )
        data = response.json()
        entries = data['search-results']['entry']
        if len(entries) > 0:
            country = entries[0]['country']
    except Exception:
        # Best effort: network errors, non-JSON bodies and entries without a
        # 'country' field all leave the row unresolved; it is counted below.
        country = None

    if country is None:
        none_count += 1
    else:
        none_df.loc[i, 'country'] = country

print(f"None count: {none_count}")
none_df.to_csv('none_df.csv')

### Combining the results

In [None]:
# Merge both lookup passes into one affiliation -> country mapping.
university_affiliation_df = pd.read_csv('unique_affiliations.csv')
none_df = pd.read_csv('none_df.csv')

# Seed the mapping with the results stored in unique_affiliations.csv ...
university_affiliation_dict = dict(
    zip(university_affiliation_df['Affiliation'], university_affiliation_df['country'])
)
# ... then overlay none_df.csv, whose entries win for any shared affiliation.
university_affiliation_dict.update(zip(none_df['Affiliation'], none_df['country']))

In [None]:
# Attach a country to every paper by looking its affiliation up in the
# affiliation -> country mapping built in the previous cell.
papers = pd.read_csv('papers.csv')

# The column is built in one pass instead of one .loc write per row.
# dict.get leaves the country empty (None) for an affiliation that is absent
# from the mapping - e.g. a missing affiliation - instead of raising KeyError
# and aborting the whole cell.
papers['country'] = [
    university_affiliation_dict.get(institute_name)
    for institute_name in tqdm(papers['Affiliation'])
]

#save papers_with_country
papers.to_csv('papers_with_country.csv')

Using OpenStreetMap and Elsevier, we were able to determine the country for 93% of the contributions. The affiliations of the remaining 7% of the papers could not be found in either database. We will use papers_with_country.csv for the rest of the analysis.

## Analysis

### Number of contributions of each country from 2006 to 2021

This section exports each country's total number of contributions from 2006 to 2021 to a CSV file. It also exports the logarithm of each country's contribution count for every year from 2006 to 2021 to a CSV file.

In [None]:
# Tally how many papers each country has, then write the tally to disk.
country_counts = papers['country'].value_counts()
country_counts.to_csv('country_count.csv')

The code below reads the dataset of papers with country information, counts the contributions made by the top 15 countries in each year, and saves the results in two separate CSV files: one with the raw counts and one with their logarithm.

In [None]:
# read papers_with_country
papers = pd.read_csv('papers_with_country.csv')
# Year columns keep the order in which years first appear in the file.
years = papers['Year'].unique()

#top 15 countries
# 'None' is the marker for papers without a resolved country. It is dropped
# before ranking; errors='ignore' keeps this working when the marker is absent
# (pandas >= 2.0 reads the text "None" back as NaN, which value_counts
# already excludes), where `del top_15['None']` would raise KeyError.
country_counts = papers['country'].value_counts().drop('None', errors='ignore')
top_15 = dict(country_counts.head(15))

# One row per top country, one column per year, cells = number of papers.
# crosstab counts every (country, year) pair in a single pass; reindex then
# selects the top countries in rank order and the years in their original
# order, filling pairs that never occur with 0.
df_country_year = (
    pd.crosstab(papers['country'], papers['Year'])
    .reindex(index=list(top_15), columns=list(years), fill_value=0)
    .rename_axis(index='country', columns=None)
    .reset_index()
)
# The row index is written on purpose: the slope cell skips the first CSV
# column when it reads this file back.
df_country_year.to_csv('country_year_contributions.csv')

#also save log data
df_country_year_log = df_country_year.copy()
for year in years:
    # log(0) is undefined, so empty cells stay 0.
    df_country_year_log[year] = df_country_year_log[year].apply(lambda x: 0 if x == 0 else math.log(x))
df_country_year_log.to_csv('country_year_contributions_log.csv')


https://datawrapper.dwcdn.net/YX27D/1/
https://datawrapper.dwcdn.net/E4Qdp/1/

### Rate of change of contribution of each country from 2006 to 2021

The code below computes, for each country, the year-over-year change (slope) in the number of papers published, shifts every value by the absolute value of the smallest slope so that no value is negative, takes the logarithm of the shifted values, and saves the results in a CSV file.

In [None]:
df_country_year = pd.read_csv('country_year_contributions.csv')

# The first CSV column is the saved row index; after it come 'country' and
# then one column per year.
columns = df_country_year.columns[1:]
year_columns = list(columns[1:])

# Slope = each year column minus the column to its left.
# NOTE(review): this is a chronological rate of change only if the year
# columns are stored in ascending order - confirm against the input file.
year_deltas = df_country_year[year_columns].diff(axis=1)
# The first year has no predecessor; its slope is recorded as 0.
year_deltas[year_columns[0]] = 0

# Smallest slope over all countries and years (the first-year placeholder is
# excluded), capped at the starting value 100000.
minn_slope = min(100000, year_deltas[year_columns[1:]].min().min())

#add minn_slope to all values
# Every year column, including the first-year placeholder, is shifted by
# |minn_slope| so the smallest value is >= 0 before the logarithm is taken.
df_country_year_slope = pd.concat(
    [df_country_year[['country']], year_deltas + abs(minn_slope)],
    axis=1,
)

for year in year_columns:
    # log(0) is undefined, so zeros are kept as 0.
    df_country_year_slope[year] = df_country_year_slope[year].apply(lambda x: 0 if x == 0 else math.log(x))

df_country_year_slope.to_csv('country_year_contributions_slope.csv')

The following chart was created with Datawrapper.
The chart indicates that before 2015, most countries experienced a constant rate of growth in contributions, with the United States and the United Kingdom having higher growth rates than the others. However, in 2021 (the last year in the data), the number of contributions decreased, resulting in lower growth rates for all countries. India had a constant growth rate until 2020, but it experienced a decrease in the number of contributions in 2021. In terms of AI development breakthroughs, it is worth noting that 2015 was a significant year for the field of artificial intelligence: it was the year DeepMind's AlphaGo became the first program to defeat a professional Go player (European champion Fan Hui), before going on to defeat Lee Sedol, a world champion in the ancient board game of Go, in March 2016.

https://datawrapper.dwcdn.net/GHIt6/1/

<iframe title="Rate of increase of contributions in logarithmic scale" aria-label="Interactive line chart" id="datawrapper-chart-GHIt6" src="https://datawrapper.dwcdn.net/GHIt6/1/" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="400" data-external="1"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r=0;r<e.length;r++)if(e[r].contentWindow===a.source){var i=a.data["datawrapper-height"][t]+"px";e[r].style.height=i}}}))}();</script>