In [1]:
import time

import csv
import requests

import modin.pandas as pd
import numpy as np

from tqdm import tqdm  
from pathlib import Path

from bs4 import BeautifulSoup


# Root folder holding the input CSV files. `Path` keeps a leading "~" as a
# literal path component, so expand it to the user's home directory here;
# otherwise plain `open()` / `Path.exists()` calls on this path would look
# for a directory that is literally named "~".
data_dir = Path('~/Docs/Metadata For Everyone/data').expanduser()


In [57]:
# Read the headerless CSV, discard its leading column (label 0) and name
# the two columns that remain.
df = pd.read_csv(data_dir / 'allv3.csv', header=None).drop(columns=[0])
df.columns = ['doi', 'xml']



In [58]:
def get_langs(xml):
    """Collect the language attributes found in one metadata XML document.

    Parameters
    ----------
    xml
        Markup handed to ``BeautifulSoup`` with the ``'xml'`` parser.

    Returns
    -------
    dict
        Contains only the keys that apply:

        * ``'journal'`` -- ``language`` attribute of the first
          ``journal_metadata`` element (``None`` if the attribute is absent).
          The key is omitted when no such element is found.
        * ``'article_lang'`` -- ``language`` attribute of the first
          ``journal_article`` element, with the same rules as above.
        * ``'abstracts'`` -- list of the non-null ``xml:lang`` values of all
          ``jats:abstract`` elements, or ``None`` when none of them carries
          the attribute. The key is omitted when there is no abstract.
        * ``'err'`` -- class name of any exception raised while parsing.
          Keys filled in before the failure are kept.
    """
    ret = {}
    try:
        soup = BeautifulSoup(xml, 'xml')

        journal = soup.find('journal_metadata')
        if journal:
            ret['journal'] = journal.get('language')

        article = soup.find('journal_article')
        if article:
            ret['article_lang'] = article.get('language')

        abstracts = soup.find_all('jats:abstract')
        if abstracts:
            # Gather the language tags in one pass, skipping abstracts that
            # have no xml:lang attribute.
            langs = [
                lang
                for lang in (abstract.get('xml:lang') for abstract in abstracts)
                if lang is not None
            ]
            # An empty result is stored as None rather than [].
            ret['abstracts'] = langs or None

    except Exception as e:
        # Deliberately best-effort: one bad record must not abort a map over
        # the whole column, so only the error type is recorded.
        ret['err'] = type(e).__name__

    return ret

In [59]:
# Parse each record's XML and keep the resulting language dict in a new column.
df['lang_dict'] = df['xml'].map(get_langs)

[36m(raylet)[0m Spilled 33083 MiB, 195 objects, write throughput 981 MiB/s.


In [60]:
# Expand the 'lang_dict' column: each key of the per-row dicts becomes its own column
df_normalized = pd.json_normalize(df['lang_dict'])

# Swap the dict column for the expanded columns, placed side by side with the rest of df
# NOTE(review): an axis=1 concat pairs rows by index label -- presumably both frames
# still carry the same default positional index at this point; confirm
df = pd.concat([df.drop(columns='lang_dict'), df_normalized], axis=1)




In [61]:
# Number of language tags collected per record; anything that is not a list counts as 0.
df['num_abstract_langs'] = df['abstracts'].map(
    lambda langs: 0 if type(langs) != list else len(langs)
)

In [62]:
# Normalise the column so every entry is either a non-empty list or None.
df.loc[:, 'abstracts'] = df['abstracts'].map(
    lambda langs: None if type(langs) != list or len(langs) == 0 else langs
)



In [63]:
# Drop the raw XML column and key the frame by DOI.
df = df.drop(columns='xml').set_index('doi')

In [66]:
# Per-column number of records that have a value for each language field.
df[['journal', 'article_lang', 'abstracts']].count()

journal         416974
article_lang     23899
abstracts        36445
dtype: int64

In [67]:
# How many records carry 0, 1, 2, ... abstract language tags.
df.groupby('num_abstract_langs').size()

num_abstract_langs
0    493599
1     32756
2      3074
3       542
4        48
5        18
6         2
7         5
dtype: int64

In [69]:
def article_lang_in_abstract(row):
    """Tell whether a record's article language is among its abstract languages.

    Parameters
    ----------
    row
        Mapping-like row exposing ``'article_lang'`` and ``'abstracts'``.

    Returns
    -------
    bool or None
        ``True``/``False`` when both fields are present, ``None`` when either
        one is missing (``None``, NaN, or empty).
    """
    article_lang = row['article_lang']
    abstracts = row['abstracts']

    for value in (article_lang, abstracts):
        # NaN is truthy, so a plain truthiness test would treat a missing
        # field as present. Check for None / float NaN explicitly.
        if value is None or (isinstance(value, float) and value != value):
            return None

    # Empty string / empty list also count as "no information".
    if not article_lang or not abstracts:
        return None

    return article_lang in abstracts
# Evaluate the check row by row and keep the outcome as a new column.
df['article_lang_in_abstract'] = df.apply(article_lang_in_abstract, axis=1)

In [72]:
# Tally of the True/False outcomes; rows whose flag is null form no group, so they are absent from this count.
df.groupby('article_lang_in_abstract').size()

article_lang_in_abstract
False      50
True     2600
dtype: int64

In [73]:
# Number of records for which the check could not be evaluated (flag is null).
df['article_lang_in_abstract'].isna().sum()

527394

In [75]:
# Same tally, with null flags shown as their own 'None' group. Only the flag
# column is filled, so the other columns of the frame are not copied and
# rewritten just to produce this count.
df[['article_lang_in_abstract']].fillna('None').groupby('article_lang_in_abstract').size()

article_lang_in_abstract
False        50
True       2600
None     527394
dtype: int64