In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np

#%matplotlib inline

# Use seaborn's 'paper' plotting context for the figures drawn below.
sns.set_context('paper')

# Directory layout, expressed relative to the notebook's working directory.
data_dir = Path('../data')
input_dir, output_dir = data_dir / 'input', data_dir / 'output'
figure_dir = Path("../figures")

# Load the records stored in the output directory.
df = pd.read_parquet(output_dir.joinpath('04_language_detection.parquet'))

In [None]:
# Display the column labels of the loaded records.
df.columns

### Completeness and Quality issues

Dividing issues between 'Completeness' and 'Quality' and getting the sum of issues for each record

In [3]:
# Flag columns counted as "completeness" issues.
completeness = [
    'Author Missing',
    'Article Language Missing',
    'Journal Language Missing',
    'Abstract Missing',
    'Article Title Missing',
    'Journal Title Missing',
    'Affiliation Missing',
]

# Flag columns counted as "quality" issues.
quality = [
    'Institutions as Authors',
    'Author Use of Honorific',
    'Author Name in All Caps',
    'Multilingual Abstract',
    'Abstract Language Match',
    'Non-ASCII Characters',
    'Article-Journal Language Match',
    'Title Language Match',
]

# Row-wise sum of each group of flags gives the per-record issue count.
for total_column, flag_columns in (('Completeness Errors', completeness),
                                   ('Quality Errors', quality)):
    df[total_column] = df[flag_columns].sum(axis=1)

### Renaming

Renaming the columns relevant to the visualizations for cleaner appearance in viz.

In [4]:
# Named matplotlib colours kept for plotting.
palette = ['coral', 'thistle', 'skyblue', 'lightslategray']

# Copy of the records with a display-friendly label for the language-type column.
df2 = df.rename(columns={'lang_type': 'Language Type'})

### Completeness Issues

**Figure 2**

Visualization of completeness issues: a grouped bar chart showing how the prevalence of each issue differs between language types.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')

# Completeness flags shown in Figure 2, in the order they appear on the x-axis.
issue_columns = ['Author Missing', 'Affiliation Missing', 'Abstract Missing',
                 'Journal Language Missing', 'Article Language Missing']

# fillna() without inplace returns a new frame, so nothing is written into a
# selection of df2 (the in-place fill on that selection triggers
# SettingWithCopyWarning). Missing affiliation flags count as 0 and records
# without a language type are grouped under 'Uncategorized'.
comp_df = (
    df2[['Language Type'] + issue_columns]
    .fillna({'Affiliation Missing': 0, 'Language Type': 'Uncategorized'})
)

# Mean of every flag per language type (the share of flagged records, assuming
# the flags are 0/1), reshaped to long form and scaled to percent for plotting.
grouped = comp_df.groupby('Language Type')[issue_columns].mean(numeric_only=True).reset_index()
melted = grouped.melt(id_vars='Language Type', var_name='issue', value_name='prevalence')
melted['prevalence'] = melted['prevalence'] * 100

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='issue', y='prevalence', hue='Language Type', data=melted,
            palette=['thistle', 'coral', 'skyblue', 'silver'],
            hue_order=['Monolingual English', 'Monolingual Non-English', 'Multilingual', 'Uncategorized'],
            ax=ax)

# Add labels to the bars
for container in ax.containers:
    ax.bar_label(container, size=8, fmt='%.1f')

plt.xlabel('Issue', labelpad=12.5)
plt.ylabel('Prevalence')
plt.xticks(rotation=20)
ax.legend(title='Language Type', bbox_to_anchor=(0, 1), loc='upper left', frameon=False)

# Save the figure to disk, then display it
plt.tight_layout()
plt.savefig(figure_dir / 'comp_lang_type.png', format='png', dpi=600, bbox_inches='tight')
plt.show()


### Quality

A viz similar to above, but pertaining to the quality issues.

In [None]:
# Author-related quality flags shown on the x-axis, in display order. The
# grouping key 'Language Type' is deliberately kept out of this list so the
# aggregation no longer relies on numeric_only=True silently discarding it.
issue_columns = ['Author Use of Honorific', 'Author Initials',
                 'Institutions as Authors', 'Author Name in All Caps']

# fillna() without inplace returns a new frame, so nothing is written into a
# selection of df2 (the in-place fill on that selection triggers
# SettingWithCopyWarning). Records without a language type are grouped under
# 'Uncategorized'.
qual_df = df2[['Language Type'] + issue_columns].fillna({'Language Type': 'Uncategorized'})

# Mean of every flag per language type, reshaped to long form and scaled to percent.
grouped = qual_df.groupby('Language Type', observed=False)[issue_columns].mean(numeric_only=True).reset_index()
melted = grouped.melt(id_vars='Language Type', var_name='issue', value_name='prevalence')
melted['prevalence'] = melted['prevalence'] * 100

fig, ax = plt.subplots(figsize=(10, 6))
# Draw on the axes created above explicitly instead of relying on the implicit
# "current axes", matching the completeness figure.
sns.barplot(x='issue', y='prevalence', hue='Language Type', data=melted,
            palette=['thistle', 'coral', 'skyblue', 'silver'],
            hue_order=['Monolingual English', 'Monolingual Non-English', 'Multilingual', 'Uncategorized'],
            ax=ax)

# Add labels to the bars
for container in ax.containers:
    ax.bar_label(container, size=8, fmt='%.1f')

plt.xlabel('Issue')
plt.ylabel('Prevalence')
plt.xticks(rotation=20)
ax.legend(title='Language Type', bbox_to_anchor=(0, 1), loc='upper left', frameon=False)

plt.savefig(figure_dir / 'quality_lang_type.png', format='png', dpi=600, bbox_inches='tight')
plt.show()

## Summary statistics

In [None]:
lang_groups = ['Monolingual English', 'Monolingual Non-English', 'Multilingual']
total_records = df.shape[0]

# One summary line per language type: record, publisher and journal counts,
# mean records per publisher, and the group's share of the whole sample.
for group_name in lang_groups:
    subset = df.loc[df.lang_type == group_name]
    n_records = len(subset)
    n_publishers = subset.publisher_name.nunique()
    n_journals = subset.journal_title.nunique()
    mean_per_publisher = round(n_records / n_publishers, 1)
    share_of_total = round((n_records / total_records) * 100, 1)
    print(f'{group_name} -- \n')
    print(f'Records: {n_records}, Publishers: {n_publishers}, Mean: {mean_per_publisher}, Journals: {n_journals}, Percent of Total: {share_of_total}\n')

In [None]:
# Same summary as above for the records that have no language type assigned.
uncategorized = df.loc[df.lang_type.isna()]
n_records = len(uncategorized)
n_publishers = uncategorized.publisher_name.nunique()
n_journals = uncategorized.journal_title.nunique()
mean_per_publisher = round(n_records / n_publishers, 1)
share_of_total = round((n_records / df.shape[0]) * 100, 1)
print('Uncategorized -- \n')
print(f'Records: {n_records}, Publishers: {n_publishers}, Mean: {mean_per_publisher}, Journals: {n_journals}, Percent of Total: {share_of_total}\n')

### Completeness issues

In [None]:
issues = ['Author Missing', 'Affiliation Missing', 'Abstract Missing',
          'Article Language Missing', 'Article Title Missing']
n_total = df2.shape[0]

# Records carrying at least one of the completeness flags listed above.
comp = df2.loc[df2[issues].eq(1).any(axis=1)]
prev_comp = (len(comp) / n_total) * 100
print(f'Prevalence of Completeness Issues: {prev_comp}\n')

# Share of the whole sample affected by each individual flag.
for issue in issues:
    n_flagged = len(df2.loc[df2[issue] == 1])
    prev = (n_flagged / n_total) * 100
    print(f'Prevalence of {issue} Completeness Issues: {round(prev,2)}\n')

### Quality Issues

The first set is the prevalence within the entire sample, (# of Records with Issue)/(# of Records in the sample).

The second set looks at each issue as (# of Records with issue)/(# of Records with data within the Field) e.g. (# of Multilingual Abstracts)/(# of Records that have abstracts)

In [None]:
issues = ['Author Use of Honorific',
          'Institutions as Authors', 'Author Name in All Caps',
          'Multilingual Abstract', 'Title Language Match',
          'Author Initials']
n_total = df2.shape[0]

# Status and Language categories. "Value disagrees with parameters of field" form.
# Excludes initials in author name.
status_lang_columns = [column for column in issues if column != 'Author Initials']
status_lang = df2.loc[df2[status_lang_columns].eq(1).any(axis=1)]
prev_status = (len(status_lang) / n_total) * 100
print(f'Prevalence of Status and Language Issues: {prev_status}\n')

# Share of the whole sample affected by each individual flag.
for issue in issues:
    n_flagged = len(df2.loc[df2[issue] == 1])
    prev = (n_flagged / n_total) * 100
    print(f'Prevalence of {issue} Quality Issues: {round(prev,2)}\n')

In [None]:
# Maps each issue flag (a column of df2) to the field whose non-null records
# form the denominator for that issue.
# The title flag is keyed as 'Title Language Match', the column name used by the
# other cells of this notebook; the previous key 'Title Language Mismatch' is
# only produced by a rename that is commented out, so looking it up failed.
issue_dict = {'Multilingual Abstract': 'abstracts',
              'Author Use of Honorific': 'authors',
              'Institutions as Authors': 'authors',
              'Author Name in All Caps': 'authors',
              'Title Language Match': 'journal_lang',
              'Author Initials': 'authors',
              'Affiliation Missing': 'authors'}

for issue, field in issue_dict.items():
    n_flagged = len(df2.loc[df2[issue] == 1])
    # Only records that actually have data in the related field are counted.
    n_with_field = len(df.loc[df[field].notnull()])
    prev = (n_flagged / n_with_field) * 100
    print(f'Prevalence of {issue} Quality Issues: {round(prev,2)}\n')


### Author Sequence Separation

To get numbers about which specific author sequence issue is occurring (all first authors or no first authors), we'll run the function again, but this time it returns specific information: `All First` or `No First`.

In [3]:
def sequence_checker(authorList):
    """Classify the author-sequence issue of one record's author list.

    Parameters
    ----------
    authorList
        Iterable of author entries, each indexable by the key ``'sequence'``.

    Returns
    -------
    'No First'
        No entry has ``sequence == 'first'`` (this includes an empty list).
    'All First'
        The list has more than one entry and more than one of them has
        ``sequence == 'first'``.
    0
        No issue detected.
    None
        The input could not be inspected (e.g. it is ``None`` or an entry
        lacks the ``'sequence'`` key).
    """
    try:
        first_count = sum(1 for author in authorList if author['sequence'] == 'first')
        if first_count == 0:
            return 'No First'  # no first author
        if first_count > 1 and len(authorList) > 1:
            return 'All First'  # multiple first authors
        return 0  # no issue
    except Exception:
        # Best-effort: malformed or missing author data yields None. Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate so a
        # long-running map over the records can still be interrupted.
        return None


    

In [None]:
# Label every record's author list with its sequence issue, then tabulate the labels.
df['author_sequence'] = df['authors'].map(sequence_checker)
df['author_sequence'].value_counts()

In [None]:
options = ['All First', 'No First']

# Two denominators: the whole sample, and only the records that have author data.
author_excl = df.loc[df.authors.notnull()]
n_all = df.shape[0]
n_with_authors = author_excl.shape[0]

for label in options:
    n_flagged = len(df.loc[df.author_sequence == label])
    prev_all = (n_flagged / n_all) * 100
    prev_excl = (n_flagged / n_with_authors) * 100
    print(f'Prevalence (pop=entire sample) of {label} Sequence issue: {prev_all}\n')
    print(f'Prevalence (records with authors) of {label} Sequence issue: {prev_excl}\n')

### Category Prevalence

Status, Language categories

Table 4

In [None]:
status = ['Affiliation Missing', 'Institutions as Authors', 'Author Use of Honorific',
          'Author Name in All Caps']

# Records carrying at least one of the status flags.
status_df = df2.loc[df2[status].eq(1).any(axis=1)]
prev_status = (len(status_df) / len(df2)) * 100
print(f'Prevalence of Status Issues: {prev_status}\n')

In [None]:
# Language-related flags. The names follow the columns used when the quality
# totals are computed ('... Language Match'); the previous '... Mismatch'
# spellings are only produced by a rename that is commented out, so looking
# them up in df2 failed.
lang = ['Multilingual Abstract', 'Abstract Language Match', 'Article-Journal Language Match',
        'Title Language Match']

# Records carrying at least one of the language flags.
lang_df = df2.loc[df2[lang].eq(1).any(axis=1)]
prev_lang = (lang_df.shape[0] / df2.shape[0]) * 100
print(f'Prevalence of Language Issues: {prev_lang}\n')

In [None]:
# Keep only the records whose language type is 'Multilingual' and list their columns.
is_multilingual = df['lang_type'] == 'Multilingual'
multi = df[is_multilingual]
multi.columns

In [None]:
def has_english(record):
    """Collect the distinct language values recorded for one record.

    Despite its name, this function does not test for English: it gathers the
    stated journal and article languages, the detected title language, and the
    stated and detected abstract languages, and returns them de-duplicated.

    Parameters
    ----------
    record
        Mapping-like row (e.g. a DataFrame row) with the keys
        ``'abstract_langs'``, ``'journal_lang'``, ``'article_lang'``,
        ``'detected_lang_abstract'`` and ``'detected_lang_title'``. The two
        abstract fields may be ``None`` or an iterable of languages.

    Returns
    -------
    list or None
        The distinct values in first-seen order (journal, article, title,
        stated abstract languages, detected abstract languages), or ``None``
        when the record cannot be read (for example, a key is missing).
    """
    try:
        lang_list = [
            record['journal_lang'],
            record['article_lang'],
            record['detected_lang_title'],
        ]
        for field in ('abstract_langs', 'detected_lang_abstract'):
            values = record[field]
            if values is not None:
                lang_list.extend(values)
        # dict.fromkeys de-duplicates like a set but keeps first-seen order, so
        # the returned list is the same from run to run.
        return list(dict.fromkeys(lang_list))
    except Exception:
        # Best-effort: unreadable records yield None. Unlike a bare `except:`,
        # KeyboardInterrupt/SystemExit still propagate.
        return None

# Attach each record's distinct language values as a new column. assign()
# returns a new frame, so nothing is written into the filtered selection that
# `multi` was taken from (the direct column assignment on that selection
# triggers SettingWithCopyWarning).
multi = multi.assign(set_langs=multi.apply(has_english, axis=1))

# One row per (record, language) pair, then count how often each language occurs.
exp_Langs = multi.explode('set_langs')
exp_Langs.set_langs.value_counts()