# Comparing datasets

In [3]:
import polars as pl

In [35]:
# Relative path of the folder that holds the processed parquet files.
ROOT_DATA_DIR = "../data/"

# Build both input paths up front so the locations are visible in one place.
czi_parquet_path = ROOT_DATA_DIR + "CZI/processed-CZI-dataset.parquet"
kg_pmc_parquet_path = ROOT_DATA_DIR + "software-kg-pmc/processed-pmc-kg-dataset.parquet"

# Load each processed dataset into its own Polars DataFrame.
czi_df = pl.read_parquet(czi_parquet_path)
kg_pmc_df = pl.read_parquet(kg_pmc_parquet_path)


## Standardising the processed datasets

In [36]:
# Per-column summary of the CZI frame as loaded (count, null_count, mean/std, min/max, quartiles).
czi_df.describe()

statistic,article_doi,software_id_CZI,software_rrid,software_name,software_url,context,article_pmcid
str,str,str,str,str,f64,str,f64
"""count""","""16056698""","""16158993""","""4223665""","""16153163""",9465310.0,"""16158993""",16158993.0
"""null_count""","""102295""","""0""","""11935328""","""5830""",6693683.0,"""0""",0.0
"""mean""",,,,,,,5912500.0
"""std""",,,,,,,1734700.0
"""min""",""" 10.1186/1477-5956-10-26""","""SM0""","""SCR_000004""",""" MGA""",,""" # 198 genes mapped to this te…",176545.0
"""25%""",,,,,,,4540975.0
"""50%""",,,,,,,6126143.0
"""75%""",,,,,,,7415876.0
"""max""","""10.9745/GHSP-D-21-00144""","""SM999999""","""SCR_021924""","""鼠源及人源化BCMA CAR-T的转染效率""",,"""𝜀c regressions and comparisons…",8510840.0


In [37]:
# Per-column summary of the KG-PMC frame as loaded (count, null_count, mean/std, min/max, quartiles).
kg_pmc_df.describe()

statistic,article_id,software_id,top_software_name,all_software_names,software_url,mention_types
str,str,f64,str,str,str,str
"""count""","""7466923""",7466923.0,"""7466923""","""7466923""","""550716""","""7466923"""
"""null_count""","""0""",0.0,"""0""","""0""","""6916207""","""0"""
"""mean""",,1490200.0,,,,
"""std""",,141624.705752,,,,
"""min""","""Eur_J_Neurosci_2015_Nov_6_42(1…",25.0,"""!""","""!""","""""","""Creation"""
"""25%""",,1476301.0,,,,
"""50%""",,1482531.0,,,,
"""75%""",,1513869.0,,,,
"""max""","""PMC99051""",1778958.0,"""⋆""","""⋆""","""∼/pyneal/pyneal_scanner""","""Usage; Mention; Deposition; Cr…"


### Standardising naming and type

In [None]:
# Standardise column names and the PMCID format so the two datasets can be compared.
#
# Every step below is written to be safe on re-run: each rename map is filtered
# against the columns that currently exist, so a second execution of this cell
# finds nothing left to rename instead of failing on the already-renamed columns.

kg_pmc_renames = {
    # Updating to reflect what type of ID it is, to match CZI.
    # NOTE(review): describe() above reports a minimum article_id of
    # "Eur_J_Neurosci_2015_Nov_6_42(1…", so not every value looks like a PMCID - TODO confirm.
    "article_id": "article_pmcid",
    # Updating to reflect that it is the dataset specific id.
    # TODO: check if this is mention id or "local" software id - perhaps rename to something more general
    "software_id": "software_id_kg_pmc",
}
kg_pmc_df = kg_pmc_df.rename(
    {old: new for old, new in kg_pmc_renames.items() if old in kg_pmc_df.columns}
)

czi_renames = {
    "software_name": "top_software_name",  # Update to match format of the KG-PMC dataset
    "software_id_CZI": "software_id_czi",
}
czi_df = czi_df.rename(
    {old: new for old, new in czi_renames.items() if old in czi_df.columns}
)

# Update the CZI dataset to have pmcid on the format "PMCxxxxxxx" like somisci.
# The raw column is numeric (describe() above reports a mean/std for it); after the
# prefix is added it is a string column, so the dtype check stops a re-run from
# producing "PMCPMC..." values.
if czi_df.schema["article_pmcid"] != pl.Utf8:
    czi_df = czi_df.with_columns(
        ("PMC" + pl.col("article_pmcid").cast(pl.Utf8)).alias("article_pmcid")
    )

### Standardising order

In [39]:
# Column order for the standardised frames: the four columns both datasets
# share come first, followed by the columns unique to each dataset.
czi_shared_columns = [
    "article_pmcid",
    "software_id_czi",
    "top_software_name",
    "software_url",
]
czi_only_columns = [
    "context",
    "article_doi",
    "software_rrid",
]

kg_pmc_shared_columns = [
    "article_pmcid",
    "software_id_kg_pmc",
    "top_software_name",
    "software_url",
]
kg_pmc_only_columns = [
    "all_software_names",
    "mention_types",
]

stand_czi_df = czi_df.select(czi_shared_columns + czi_only_columns)
stand_kg_pmc_df = kg_pmc_df.select(kg_pmc_shared_columns + kg_pmc_only_columns)

In [40]:
# First row of describe() is the "count" statistic: non-null values per column.
stand_kg_pmc_df.describe().head(1)

statistic,article_pmcid,software_id_kg_pmc,top_software_name,software_url,all_software_names,mention_types
str,str,f64,str,str,str,str
"""count""","""7466923""",7466923.0,"""7466923""","""550716""","""7466923""","""7466923"""


In [41]:
# First row of describe() is the "count" statistic: non-null values per column.
stand_czi_df.describe().head(1)

statistic,article_pmcid,software_id_czi,top_software_name,software_url,context,article_doi,software_rrid
str,str,str,str,f64,str,str,str
"""count""","""16158993""","""16158993""","""16153163""",9465310.0,"""16158993""","""16056698""","""4223665"""


In [42]:
# Preview the standardised CZI frame (the rendered table is truncated to the first and last rows).
# NOTE(review): software_url is list[str] here, while describe() reports it as a plain
# string column in KG-PMC - the two are not directly comparable yet; TODO confirm handling.
stand_czi_df

article_pmcid,software_id_czi,top_software_name,software_url,context,article_doi,software_rrid
str,str,str,list[str],str,str,str
"""PMC8475362""","""SM0""","""Olympus CellSens""",,"""Then, all items were photograp…","""10.1186/s43591-021-00017-9""",
"""PMC8475362""","""SM1""","""OPUS""",,"""Spectra were then vector norma…","""10.1186/s43591-021-00017-9""",
"""PMC8475362""","""SM2""","""R package DHARMa""",,"""Model fit was assessed through…","""10.1186/s43591-021-00017-9""",
"""PMC8475362""","""SM3""","""R""","[null, null, … ""https://github.com/ncornwell/R""]","""Analyses and plotting were per…","""10.1186/s43591-021-00017-9""",
"""PMC8475362""","""SM3""","""R""","[null, null, … ""https://github.com/dmpe/R""]","""Analyses and plotting were per…","""10.1186/s43591-021-00017-9""",
…,…,…,…,…,…,…
"""PMC6683272""","""SM53566""","""MetaVision""",,"""All data were obtained by revi…","""10.3390/nu11071443""",
"""PMC6683272""","""SM4442""",,,"""All data were obtained by revi…","""10.3390/nu11071443""",
"""PMC6683272""","""SM53019""","""iMDsoft""",,"""All data were obtained by revi…","""10.3390/nu11071443""",
"""PMC6683272""","""SM165""","""SPSS""","[""['http://www-01.ibm.com/software/uk/analytics/spss/']"", ""['https://www.ibm.com/products/software']"", … ""https://scicrunch.org/browse/resources/SCR_002865""]","""Statistical analysis was perfo…","""10.3390/nu11071443""","""SCR_002865"""


## Comparing the processed datasets

Sanity check: verify that adding the `PMC` prefix to `article_pmcid` in the CZI dataset worked, by counting how many article IDs the two datasets have in common.

In [None]:
# Distinct article IDs per dataset. Each set is computed once and reused for
# both the count and the join, instead of de-duplicating every column twice.
czi_pmcids = stand_czi_df.select("article_pmcid").unique()
kg_pmc_pmcids = stand_kg_pmc_df.select("article_pmcid").unique()

czi_unique = czi_pmcids.height
pmc_unique = kg_pmc_pmcids.height

# Inner join keeps only the IDs present in both datasets.
common_pmcids = czi_pmcids.join(
    kg_pmc_pmcids,
    on="article_pmcid",
    how="inner",
)

# If the "PMC" prefix had not been applied correctly, no ID would match.
assert common_pmcids.height > 0, "no shared article_pmcid values - check the PMC prefix step"

print(f"CZI unique: {czi_unique:,}")
print(f"PMC unique: {pmc_unique:,}")
print(f"Common PMCIDs: {common_pmcids.height:,}")

CZI unique: 1,697,560
PMC unique: 1,910,273
Common PMCIDs: 1,316,205


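Looks reasonable! About 1.32M article IDs are shared, i.e. roughly 78% of the CZI dataset's 1.70M unique IDs and roughly 69% of the KG-PMC dataset's 1.91M, so the two ID formats now line up.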