# Gather Targeted Publication Mentions

Gather targeted mentions of software within publications (journals, preprints, etc) for software landscape analysis related to Cytomining ecosystem.


In [1]:
import os
from typing import List

import awkward as ak
import duckdb
import numpy as np
import pandas as pd
from biorxiv_retriever import BiorxivRetriever
from box import Box
from scholarly import scholarly
from thefuzz import fuzz

In [2]:
# gather projects data
projects = Box.from_yaml(filename="data/target-projects.yaml").projects

# gather the lowercase loi focus project names from targets
loi_target_projects = [
    project["name"].lower()
    for project in projects.to_list()
    if "loi-focus" in project["category"]
]
loi_target_projects

['pycytominer', 'cytosnake', 'cytotable', 'idr_stream']

In [3]:
# create a str for targeting the specific projects
project_sql_str = ", ".join(["'" + project + "'" for project in loi_target_projects])

# filter results of github stats to find the project creation date for use in filtering below
with duckdb.connect() as ddb:
    loi_target_project_years = ddb.query(
        f"""
    SELECT 
        ghstats."Project Name",
        ghstats."Date Created"
    FROM read_parquet('data/project-github-metrics.parquet') as ghstats
    WHERE LOWER(ghstats."Project Name") in ({project_sql_str})
    """,
    ).df()

loi_target_project_years

Unnamed: 0,Project Name,Date Created
0,pycytominer,2019-07-03 12:22:51-06:00
1,CytoSnake,2022-02-15 11:02:45-07:00
2,CytoTable,2022-09-08 09:46:25-06:00
3,IDR_stream,2022-08-09 15:16:48-06:00


In [4]:
# add a year created
loi_target_project_years["Date Created Year"] = loi_target_project_years[
    "Date Created"
].dt.year
loi_target_project_years = loi_target_project_years[
    ["Project Name", "Date Created Year"]
]
loi_target_project_years

Unnamed: 0,Project Name,Date Created Year
0,pycytominer,2019
1,CytoSnake,2022
2,CytoTable,2022
3,IDR_stream,2022


In [5]:
# instantiate record data as a list of record dictionaries
pub_metrics = loi_target_project_years.to_dict(orient="records")
pub_metrics

[{'Project Name': 'pycytominer', 'Date Created Year': 2019},
 {'Project Name': 'CytoSnake', 'Date Created Year': 2022},
 {'Project Name': 'CytoTable', 'Date Created Year': 2022},
 {'Project Name': 'IDR_stream', 'Date Created Year': 2022}]

In [6]:
# expand the records with Google Scholar results from Scholarly pkg
pub_metrics = [
    dict(
        project,
        **{
            "google_scholar_search_results": [
                result
                for result in scholarly.search_pubs(
                    # wrap the query in quotes to isolate as exact matches only
                    query=f'"{project["Project Name"]}"',
                    # specify a minimum year for the query
                    # (we shouldn't include results which were published
                    # before the project existed)
                    year_low=project["Date Created Year"],
                )
            ]
        },
    )
    for project in pub_metrics
]

# show the len of the results for each project
{
    project["Project Name"]: len(project["google_scholar_search_results"])
    for project in pub_metrics
}

{'pycytominer': 20, 'CytoSnake': 0, 'CytoTable': 0, 'IDR_stream': 0}

In [7]:
# expand the records with Bioarxiv results from Scholarly pkg
bioarxiv_retriever = BiorxivRetriever()

pub_metrics = [
    dict(
        project,
        **{
            "bioarxiv_search_results": [
                # exclude full_text from the data we store (only use for filtering)
                {key: val for key, val in paper.items() if key != "full_text"}
                # gather results from bioarxiv search query
                for paper in bioarxiv_retriever.query(
                    f'"{project["Project Name"]}"', metadata=True, full_text=True
                )
                # only include the paper result if the project name is found within the full text
                if project["Project Name"].lower() in paper["full_text"].lower()
            ],
        },
    )
    for project in pub_metrics
]

# show the len of the results for each project
{
    project["Project Name"]: len(project["bioarxiv_search_results"])
    for project in pub_metrics
}

100%|███████████████████████████████████████████████████████| 13/13 [00:21<00:00,  1.69s/it]
0it [00:00, ?it/s]
100%|█████████████████████████████████████████████████████████| 6/6 [00:16<00:00,  2.68s/it]
0it [00:00, ?it/s]


{'pycytominer': 13, 'CytoSnake': 0, 'CytoTable': 0, 'IDR_stream': 0}

In [8]:
# preview the nested data structure so far for the work ahead
ak.Array(pub_metrics)

In [9]:
# form data for total counts of publications
pub_metrics = [
    dict(
        project,
        **{
            # form a list of all unique article titles
            "all_article_titles": list(
                # convert to a set in order to dedupe exact matches
                set(
                    [
                        article
                        # use differing data structures to create the list of titles
                        for article in list(
                            [
                                article["bib"]["title"]
                                for article in project["google_scholar_search_results"]
                            ]
                            + [
                                article["title"]
                                for article in project["bioarxiv_search_results"]
                            ]
                        )
                    ]
                )
            ),
        },
    )
    for project in pub_metrics
]
pub_metrics[0]["all_article_titles"]

['High-content microscopy reveals a morphological signature of bortezomib resistance',
 'A genome-wide atlas of human cell morphology',
 'Evolution and impact of high content imaging',
 'Optimizing the Cell Painting assay for image-based profiling',
 'Morphology and gene expression profiling provide complementary information for mapping cell state',
 'Biological Cartography: Building and Benchmarking Representations of Life',
 'Evaluating batch correction methods for image-based cell profiling',
 'Predicting cell health phenotypes using image-based morphology profiling',
 'Self-supervision advances morphological profiling by unlocking powerful image representations',
 'Three million images and morphological profiles of cells treated with matched chemical and genetic perturbations',
 'Merging Bioactivity Predictions from Cell Morphology and Chemical Fingerprint Models Using Similarity to Training Data',
 'Predicting the Mitochondrial Toxicity of Small Molecules: Insights from Mechanisti

In [10]:
# gather distinct data using record linkage levenshtein distance
def return_distinct_values_by_threshold(
    string_list: List[str], threshold: int
) -> List[str]:
    """
    Finds and returns a new list of distinct values based on
    record linkage via Levenshtein distance.
    """
    distinct_list = []

    # compare every value pair-wise
    for i in range(len(string_list)):
        for j in range(i + 1, len(string_list)):
            if (
                # if the value is distinct (below the threshold of similarity)
                # and not yet included in the similar list, append it to the result
                fuzz.ratio(string_list[i], string_list[j]) <= threshold
                and string_list[i] not in distinct_list
            ):
                distinct_list.append((string_list[i]))

    return distinct_list


pub_metrics = [
    dict(
        project,
        **{
            # form a list of all unique article titles
            "all_article_titles_record_linked_removed": return_distinct_values_by_threshold(
                project["all_article_titles"], threshold=90
            ),
        },
    )
    for project in pub_metrics
]
pub_metrics[0]["all_article_titles_record_linked_removed"]

['High-content microscopy reveals a morphological signature of bortezomib resistance',
 'A genome-wide atlas of human cell morphology',
 'Evolution and impact of high content imaging',
 'Optimizing the Cell Painting assay for image-based profiling',
 'Morphology and gene expression profiling provide complementary information for mapping cell state',
 'Biological Cartography: Building and Benchmarking Representations of Life',
 'Evaluating batch correction methods for image-based cell profiling',
 'Predicting cell health phenotypes using image-based morphology profiling',
 'Self-supervision advances morphological profiling by unlocking powerful image representations',
 'Three million images and morphological profiles of cells treated with matched chemical and genetic perturbations',
 'Merging Bioactivity Predictions from Cell Morphology and Chemical Fingerprint Models Using Similarity to Training Data',
 'Predicting the Mitochondrial Toxicity of Small Molecules: Insights from Mechanisti

In [11]:
# gather counts from the data and export to file
pub_metrics = [
    dict(
        project,
        **{
            "google_scholar_count": len(project["google_scholar_search_results"]),
            "bioarxiv_count": len(project["bioarxiv_search_results"]),
            "total_pub_count": len(project["all_article_titles"]),
            "total_pub_count_non_record_linked": len(
                project["all_article_titles_record_linked_removed"]
            ),
        },
    )
    for project in pub_metrics
]
ak.to_parquet(
    ak.Array(pub_metrics), "data/loi-target-project-publication-metrics.parquet"
)

<pyarrow._parquet.FileMetaData object at 0x130e67590>
  created_by: parquet-cpp-arrow version 13.0.0
  num_columns: 29
  num_rows: 4
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 0

In [12]:
# depict results from the file
with duckdb.connect() as ddb:
    pub_totals = ddb.query(
        f"""
    SELECT 
        pubstats."Project Name",
        pubstats."Date Created Year",
        pubstats."google_scholar_count",
        pubstats."bioarxiv_count",
        pubstats."total_pub_count",
        pubstats."total_pub_count_non_record_linked"
    FROM read_parquet('data/loi-target-project-publication-metrics.parquet') as pubstats
    """,
    ).df()
pub_totals

Unnamed: 0,Project Name,Date Created Year,google_scholar_count,bioarxiv_count,total_pub_count,total_pub_count_non_record_linked
0,pycytominer,2019,20,13,21,20
1,CytoSnake,2022,0,0,0,0
2,CytoTable,2022,0,0,0,0
3,IDR_stream,2022,0,0,0,0
