## COLX 531: BioLaySumm 2024
## Team Burrito

### Milestone 1: Data Inspection

In [2]:
# import relevant libraries
import pandas as pd
from nltk.tokenize import word_tokenize

### Data loading

In [3]:
# Relative path of the folder that holds the eLife and PLOS ".jsonl" files.
DATA_FOLDER = "../data/"

# File stems (no extension), eLife splits first and PLOS splits second.
# Order matters: later cells number the entries by position and slice the
# per-split results as [0:3] (eLife) and [3:6] (PLOS).
FILES = (
    ["eLife_train", "eLife_val", "eLife_test"]
    + ["PLOS_train", "PLOS_val", "PLOS_test"]
)

In [4]:
# Load every split listed in FILES into a DataFrame, keyed by its file stem.
# The files are JSON Lines (one JSON record per line), hence lines=True.
data_dict = {}
for f in FILES:
    file_path = f"{DATA_FOLDER}{f}.jsonl"
    # Echo the path being read so a missing or misplaced file is easy to spot.
    print(file_path)
    data_dict[f] = pd.read_json(file_path, lines=True)

../data/eLife_train.jsonl
../data/eLife_val.jsonl
../data/eLife_test.jsonl
../data/PLOS_train.jsonl
../data/PLOS_val.jsonl
../data/PLOS_test.jsonl


### Examples

In [5]:
# Preview the first rows of the eLife training split; as the last expression
# of the cell, the resulting frame is rendered as the cell output.
data_dict["eLife_train"].head()

Unnamed: 0,lay_summary,article,headings,keywords,id
0,"In the USA , more deaths happen in the winter ...","In temperate climates , winter deaths exceed s...","[Abstract, Introduction, Results, Discussion, ...",[epidemiology and global health],elife-35500-v1
1,Most people have likely experienced the discom...,Whether complement dysregulation directly cont...,"[Abstract, Introduction, Results, Discussion, ...","[microbiology and infectious disease, immunolo...",elife-48378-v2
2,The immune system protects an individual from ...,Variation in the presentation of hereditary im...,"[Abstract, Introduction, Results, Discussion, ...","[microbiology and infectious disease, immunolo...",elife-04494-v1
3,The brain adapts to control our behavior in di...,Rapid and flexible interpretation of conflicti...,"[Abstract, Introduction, Results, Discussion, ...",[neuroscience],elife-12352-v2
4,Cells use motor proteins that to move organell...,Myosin 5a is a dual-headed molecular motor tha...,"[Abstract, Introduction, Results, Discussion, ...",[structural biology and molecular biophysics],elife-05413-v2


In [6]:
# Preview the first rows of the PLOS training split; as the last expression
# of the cell, the resulting frame is rendered as the cell output.
data_dict["PLOS_train"].head()

Unnamed: 0,lay_summary,article,headings,keywords,id
0,"In the kidney , structures known as nephrons a...","Kidney function depends on the nephron , which...","[Abstract, Introduction, Results, Discussion, ...","[developmental biology, danio (zebrafish), ver...",journal.pgen.0030189
1,Many species of bats in North America have bee...,White-nose syndrome is one of the most lethal ...,"[Abstract, Introduction, Results, Discussion, ...","[sequencing techniques, fungal spores, vertebr...",journal.ppat.1006076
2,The burden of dengue has been increasing over ...,Sustainable dengue intervention requires the p...,"[Abstract, Introduction, Methods, Results, Dis...","[invertebrates, medicine and health sciences, ...",journal.pntd.0007498
3,Estrogen exposure is the most important risk f...,Despite the central role of estrogen exposure ...,"[Abstract, Introduction, Results, Discussion, ...","[oncology/breast cancer, oncology/gynecologica...",journal.pgen.1001012
4,Melioidosis is a severe tropical infection cau...,Macrophage migration inhibitory factor ( MIF )...,"[Abstract, Introduction, Methods, Results, Dis...",[immunology/cellular microbiology and pathogen...,journal.pntd.0000605


### Descriptive statistics

In [7]:
# Display label for each split: its 1-based position in FILES followed by the
# file stem with "_" replaced by a space, e.g. "eLife_train" -> "1. eLife train".
name_mapping = {
    split: f"{rank}. {split.replace('_', ' ')}"
    for rank, split in enumerate(FILES, start=1)
}

In [8]:
# Token-count summary (count / min / mean / max) for the text columns of every
# split, using NLTK's word_tokenize. One DataFrame per split is appended to
# df_list, in the same order as data_dict; its rows are the text columns.
df_list = []
for split_name, split_df in data_dict.items():
    # The two test splits are summarised on the article only
    # (presumably they ship without lay summaries -- TODO confirm).
    if split_name in ["eLife_test", "PLOS_test"]:
        text_cols = ["article"]
    else:
        text_cols = ["lay_summary", "article"]

    per_column_stats = [
        split_df[col]
        .apply(lambda text: len(word_tokenize(text)))
        .agg(["count", "min", "mean", "max"])
        for col in text_cols
    ]
    df_list.append(pd.DataFrame(per_column_stats))

In [18]:
# eLife and PLOS word metrics
def create_table(df_list, start_index, end_index, name_mapping):
    table = (
        pd.concat(
            df_list[start_index:end_index],
            axis=0,
            keys=list(name_mapping.values())[start_index:end_index],
        )
        .sort_index(level=[1, 0])
        .reset_index(names=["dataset", "text"])
    ).style.format(decimal=".", thousands=",", precision=0)
    return table

In [19]:
# df_list follows the order of FILES: positions 0-2 are the eLife splits and
# positions 3-5 the PLOS splits.
# NOTE: despite the _df suffix, both results are pandas Styler objects, since
# that is what create_table returns.
elife_df = create_table(
    df_list, start_index=0, end_index=3, name_mapping=name_mapping
)
plos_df = create_table(
    df_list, start_index=3, end_index=6, name_mapping=name_mapping
)

### Convert tables to LaTeX

In [20]:
# Render the styled eLife table as LaTeX source; print() writes it out as
# plain text instead of an escaped string repr.
print(elife_df.to_latex())

\begin{tabular}{lllrrrr}
 & dataset & text & count & min & mean & max \\
0 & 1. eLife train & article & 4,346 & 329 & 10,428 & 29,653 \\
1 & 2. eLife val & article & 241 & 3,459 & 10,254 & 23,462 \\
2 & 3. eLife test & article & 142 & 2,551 & 9,094 & 17,220 \\
3 & 1. eLife train & lay_summary & 4,346 & 180 & 386 & 688 \\
4 & 2. eLife val & lay_summary & 241 & 237 & 394 & 679 \\
\end{tabular}



In [22]:
# Render the styled PLOS table as LaTeX source; print() writes it out as
# plain text instead of an escaped string repr.
print(plos_df.to_latex())

\begin{tabular}{lllrrrr}
 & dataset & text & count & min & mean & max \\
0 & 4. PLOS train & article & 24,773 & 762 & 6,981 & 27,378 \\
1 & 5. PLOS val & article & 1,376 & 774 & 6,968 & 20,999 \\
2 & 6. PLOS test & article & 142 & 1,631 & 7,145 & 18,850 \\
3 & 4. PLOS train & lay_summary & 24,773 & 4 & 196 & 523 \\
4 & 5. PLOS val & lay_summary & 1,376 & 55 & 196 & 385 \\
\end{tabular}

