# Notebook to Parse Countries from Journal Article Titles

## 1. Imports

In [1]:
# import non-standard library
import pandas as pd

# standard library imports
import re
import typing

## 2. Read datasets

Read the article title, publication date, and publication year columns from the dataset, then print its shape and a 3-row sample.

In [29]:
# load only the columns needed for the analysis
target_cols = ('Article Title', 'Publication Date', 'Publication Year')
df = pd.read_csv(
    filepath_or_buffer='../data/COVID-data.csv',
    usecols=target_cols,
)

# report (rows, columns), then preview three randomly chosen rows
print(df.shape)
df.sample(n=3)

(61005, 3)


Unnamed: 0,Article Title,Publication Date,Publication Year
52900,Severe flu management: a point of view,FEB,2020.0
45272,Coming to Terms with Nonsteroidal Anti-Inflamm...,,2012.0
55934,Inhibition of Protein Kinase R Activation and ...,DEC,2009.0


Read the data file containing the list of unique country names and preview a sample.

In [30]:
# load the country list, then preview three randomly chosen rows
countries = pd.read_csv(filepath_or_buffer='../data/country_list.csv')
countries.sample(n=3)

Unnamed: 0,Country
163,Senegal
99,Kyrgyzstan
0,Afghanistan


Get unique countries from file

In [31]:
# collapse the 'Country' column into a set for fast membership checks
country_column = countries['Country']
country_set = set(country_column.to_numpy())
country_set

{'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Czechoslovakia (former)',
 "Democratic People's Republic of Korea",
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'G

## 3. Find countries in Titles

In [32]:
def search_for_countries(
    titles: typing.Iterable[typing.Optional[str]],
    countries: typing.Set[str]
) -> typing.List[
    typing.Optional[str]
]:
    """Finds matches of 'countries' in 'titles'.

    Each title is scanned for whole-name, case-sensitive occurrences of
    the given countries. Because the whole title is scanned (rather than
    its whitespace-separated tokens), multi-word names such as
    'Costa Rica' and names that touch punctuation such as 'China:' or
    "China's" are found as well.

    Args:
        titles: Titles to search through. Entries that are not strings
            (e.g. None or NaN for a missing title) yield None.
        countries: Set of unique countries to search for. Entries that
            are not non-empty strings are ignored.

    Returns:
        list: One item per title, in input order: the distinct countries
            found, sorted alphabetically and dash-joined, or None when
            no country was found. Note that a country name which itself
            contains '-' cannot be told apart from the separator.

    """
    names = [c for c in countries if isinstance(c, str) and c]
    if not names:
        # An empty alternation would match the empty string everywhere.
        return [None for _ in titles]

    # Try longer names first so that a name embedded in a longer one
    # ('Congo' in 'Democratic Republic of the Congo') does not pre-empt it.
    names.sort(key=lambda name: (-len(name), name))

    # The look-arounds reject matches inside a larger word, so 'Niger'
    # is not reported for 'Nigeria' nor 'Dominica' for 'Dominican'.
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(map(re.escape, names)) + r')(?!\w)'
    )

    results = []
    for title in titles:
        if isinstance(title, str):
            found = set(pattern.findall(title))
        else:
            found = set()
        # Sorting makes the joined value independent of set iteration order.
        results.append('-'.join(sorted(found)) if found else None)
    return results

Run search.

In [28]:
# look up countries in every article title
countries_in_titles = search_for_countries(
    titles=df['Article Title'],
    countries=country_set,
)

# preview the first three titles that produced a match
list(filter(None, countries_in_titles))[:3]

['France', 'Austria', 'Germany']

Create a new dataframe from the original data and the search results, then export it to a file.

In [36]:
# combine the original columns with the country matches, row for row
final = pd.DataFrame({
    'Article Title': df['Article Title'].to_numpy(),
    'Countries in Title': countries_in_titles,
    'Publication Date': df['Publication Date'].to_numpy(),
    'Publication Year': df['Publication Year'].to_numpy(),
})

# preview three randomly chosen rows
final.sample(n=3)

Unnamed: 0,Article Title,Countries in Title,Publication Date,Publication Year
28720,CT Scanning in Suspected Stroke or Head Trauma...,,JUL 1,2020.0
49537,Thoracoscopic repair of esophageal atresia wit...,,,
2137,Gastrointestinal endoscopy during COVID-19 pan...,,MAY,2020.0


In [42]:
# write the combined table to CSV, leaving out the row index
final.to_csv(
    path_or_buf='../data/Countries_in_Titles.csv',
    index=False,
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=41871a82-6b9f-4a18-a04e-ef9382306e39' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>