In [8]:
import pandas as pd
import os

# --- Inputs -----------------------------------------------------------------
# Directory of previously annotated CSV exports, and the freshly fetched table.
old_data_dir = "old_data"
new_data_file = "data/clinic_genetic.csv"

# --- Outputs ----------------------------------------------------------------
# Merged table (new rows + carried-over annotations) and the leftover old rows.
new_data_file_out = "data/clinic_genetic_formatted.csv"
extra_data_file_out = "data/clinic_genetic_extra.csv"


from utils_fetch import start_date, end_date, query_terms_list, year_list, DATA_DIR

In [9]:
# Read all CSVs in the old data directory.
# os.listdir() returns entries in arbitrary order; sort them so the row order of
# `old_data` (and therefore which duplicate wins in `.first()` below) is the
# same on every machine and every run.
old_dataframes = []
for file in sorted(os.listdir(old_data_dir)):
    if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(old_data_dir, file))
        old_dataframes.append(df)

# Fail with an explicit message instead of pd.concat's generic error.
if not old_dataframes:
    raise FileNotFoundError(f"No CSV files found in {old_data_dir!r}")

# Concatenate all old dataframes, allowing for differing columns
old_data = pd.concat(old_dataframes, ignore_index=True)

# Normalised title used as the de-duplication key: lower-cased, surrounding
# whitespace and periods stripped, quote characters removed.
old_data['title_form'] = old_data.title.apply(lambda x: str(x).lower().strip().strip('.').replace('\"', '').replace('\'', ''))

# Collapse duplicates by the normalised title only (the abstract is NOT part of
# the key). GroupBy.first() keeps, per column, the first non-null value in the
# group, so a kept row may combine values from several duplicate rows.
grouped_old_data = old_data.groupby(['title_form',], as_index=False).first()

# Report the columns whose values disagree inside each duplicate group.
# Values are compared as parsed, so the same label read with different types
# from different files is reported as differing.
for _, group in old_data.groupby(['title_form',]):
    if len(group) > 1:
        differing_rows = group.loc[:, group.nunique() > 1]
        if not differing_rows.empty:
            print("Differing rows for title/abstract group:\n", differing_rows)

Differing rows for title/abstract group:
                                                  title relevance
16   A Combined Manual Annotation and Deep-Learning...       2.0
256  A Combined Manual Annotation and Deep-Learning...         2
Differing rows for title/abstract group:
     relevance
219       2.0
257         2
Differing rows for title/abstract group:
     relevance
187       1.0
233         1
Differing rows for title/abstract group:
     relevance
191       1.0
234         1
Differing rows for title/abstract group:
     relevance
175       1.0
235         1
Differing rows for title/abstract group:
     relevance
9         2.0
258         2
Differing rows for title/abstract group:
     relevance
177       1.0
236         1
Differing rows for title/abstract group:
     relevance
10        2.0
259         2
Differing rows for title/abstract group:
     relevance                                           ai_topic
218       1.0                         ChatGPT; systematic review
252

In [10]:
# Spot-check two de-duplicated titles (index labels 1 and 2).
tuple(grouped_old_data.loc[[1, 2], 'title'])

('A Combined Manual Annotation and Deep-Learning Natural Language Processing Study on Accurate Entity Extraction in Hereditary Disease Related Biomedical Literature.',
 'A comparative evaluation of ChatGPT 3.5 and ChatGPT 4 in responses to selected genetics questions')

In [11]:
# Columns carried forward from the old annotations, in output order.
cols_to_use = [
    'title', 'abstract', 'source',
    'relevance', 'ai_topic', 'medicine_topic', 'notes',
    'used?', 'what section used',
    'title_form',
]
# Restrict the de-duplicated table to exactly these columns.
grouped_old_data = grouped_old_data.loc[:, cols_to_use]

print(grouped_old_data.shape)
grouped_old_data

(228, 10)


Unnamed: 0,title,abstract,source,relevance,ai_topic,medicine_topic,notes,used?,what section used,title_form
0,A case of malignant pheochromocytoma with neur...,Neurofibromatosis type 1 is a hereditary condi...,PubMed,0.0,,,,,,a case of malignant pheochromocytoma with neur...
1,A Combined Manual Annotation and Deep-Learning...,We report a combined manual annotation and dee...,PubMed,2.0,BERT-based and DistilBERT-based NER models; ML...,Entity extraction in hereditary disease-relate...,BERT-based NER achieved high F1-scores for gen...,1,"pre: KNLR, disc",a combined manual annotation and deep-learning...
2,A comparative evaluation of ChatGPT 3.5 and Ch...,To evaluate the efficacy of ChatGPT 4 (GPT-4) ...,PubMed,2.0,"GPT-3.5, GPT-4; ML Task: Genetic information d...","Genetic conditions (BRCA1, HFE, MLH1) and gene...",GPT-4 performs better than GPT-3.5 in genetic ...,1,"edu, disc",a comparative evaluation of chatgpt 3.5 and ch...
3,A Comparative Sentiment Analysis of Greek Clin...,In addressing the critical role of emotional c...,PubMed,1.0,"BERT, RoBERTa, GPT-2, XLNet; ML Task: Sentimen...",Sentiment analysis of patient-clinician conver...,Importance of different language incorporation...,,,a comparative sentiment analysis of greek clin...
4,A complex mechanism translating variation of a...,Linking genes to traits is a central goal in b...,biorxiv,0.0,,,,,,a complex mechanism translating variation of a...
...,...,...,...,...,...,...,...,...,...,...
223,Utilization of a Third-party Partnership in Te...,To meet the increasing demands of genetic risk...,PubMed,0.0,,,,,,utilization of a third-party partnership in te...
224,VarChat: the generative AI assistant for the i...,"In the modern era of genomic research, the sci...",PubMed,2.0,Generative AI; ML Task: Literature search and ...,Interpretation of genomic variants using summa...,VarChat summarizes genomic variant data from s...,1,ana: GVI,varchat: the generative ai assistant for the i...
225,Virtual Labs and Designer Bugs - Generative AI...,AI technologies can pose a major national secu...,PubMed,1.0,No specific transformer-based models like GPT ...,Genetic research with a focus on synthetic bio...,AI's role in genetic research and biological s...,,,virtual labs and designer bugs - generative ai...
226,Weakly Supervised Classification for Nasophary...,Pathological examination of nasopharyngeal car...,PubMed,1.0,Vision Transformer (T2T-ViT); ML Task: Image c...,Cancer diagnosis using whole slide images (WS...,Proposes a weakly supervised transformer frame...,,,weakly supervised classification for nasophary...


In [12]:
# Read the new data and derive the same normalised title key used for the old
# data (lower-cased, stripped of surrounding whitespace/periods, quotes removed).
new_data = pd.read_csv(new_data_file)
new_data['title_form'] = new_data.title.apply(lambda x: str(x).lower().strip().strip('.').replace('\"', '').replace('\'', ''))
print(new_data.shape)

# Add missing columns from old data to new data so every annotation column
# exists on the left side of the merge (and the right side gets the '_old' suffix).
for col in grouped_old_data.columns:
    if col not in new_data.columns:
        new_data[col] = None

# Bring in the old annotations where the normalised title matches.
# validate='many_to_one' raises if the old table ever has duplicate keys,
# which would otherwise silently multiply rows of the new data.
merged_data = new_data.merge(
    grouped_old_data, on=['title_form'], how='left', suffixes=('', '_old'),
    validate='many_to_one',
)
# Keep values already present in the new data; fill gaps from the old data.
for col in grouped_old_data.columns:
    if col not in ['title', 'title_form']:
        merged_data[col] = merged_data[col].combine_first(merged_data[col + '_old'])

# Flag rows that were matched to the old data. The match must use the same key
# as the merge (title_form): comparing raw titles would mark rows that received
# old annotations as "not existing" whenever casing/punctuation differs.
merged_data['existed_in_old_data'] = merged_data['title_form'].isin(grouped_old_data['title_form'])
merged_data = merged_data[['title', 'abstract', 'source', 'title_form', 'relevance', 'ai_topic',
       'medicine_topic', 'notes', 'used?', 'what section used', 'existed_in_old_data']]

# Old rows with no counterpart in the new data, again judged by the merge key
# so that a row already merged above is not reported as extra.
extra_data = grouped_old_data[
    ~grouped_old_data['title_form'].isin(new_data['title_form'])
]

(576, 4)


In [13]:
# Persist the merged table, then report how many rows carried over old annotations.
merged_data.to_csv(new_data_file_out, sep=',', index=False)
print(f"Prev anno: {merged_data['existed_in_old_data'].sum()}")
merged_data

Prev anno: 95


Unnamed: 0,title,abstract,source,title_form,relevance,ai_topic,medicine_topic,notes,used?,what section used,existed_in_old_data
0,Natural Language Processing and Schizophrenia:...,(1) Background: Approximately 1% of the global...,PubMed,natural language processing and schizophrenia:...,,,,,,,False
1,Harnessing generative AI to annotate the sever...,0.1There are thousands of human phenotypes whi...,medrxiv,harnessing generative ai to annotate the sever...,,,,,,,False
2,Leveraging hierarchical structures for genetic...,1.Initially introduced in 1909 by William Bate...,"PubMed,medrxiv",leveraging hierarchical structures for genetic...,,,,,,,False
3,16S rRNA gene sequencing for bacterial identif...,16S rRNA gene sequence is the most common hous...,biorxiv,16s rrna gene sequencing for bacterial identif...,,,,,,,False
4,A Systematic Review of Testing and Evaluation ...,1ImportanceLarge Language Models (LLMs) can as...,medrxiv,a systematic review of testing and evaluation ...,1.0,Various Large Language Models (LLMs) like GPT ...,"Healthcare applications, especially in clinica...",Evaluates LLMs in healthcare for diverse tasks...,,,True
...,...,...,...,...,...,...,...,...,...,...,...
571,A self-supervised framework for learning whole...,Whole slide imaging is fundamental to biomedic...,arXiv,a self-supervised framework for learning whole...,,,,,,,False
572,Machine learning-based donor permission extrac...,With more clinical trials are offering optiona...,PubMed,machine learning-based donor permission extrac...,,,,,,,False
573,Accuracy of generative artificial intelligence...,With the increasing development of artificial ...,PubMed,accuracy of generative artificial intelligence...,,,,,,,False
574,Enhancing human phenotype ontology term extrac...,With the increasing utilization of exome and g...,PubMed,enhancing human phenotype ontology term extrac...,,,,,,,False


In [14]:
# Persist the old rows that have no counterpart in the new data and show their count.
extra_data.to_csv(extra_data_file_out, sep=',', index=False)
print(f"Prev anno, not_used: {extra_data.shape}")
extra_data

Prev anno, not_used: (133, 10)


Unnamed: 0,title,abstract,source,relevance,ai_topic,medicine_topic,notes,used?,what section used,title_form
0,A case of malignant pheochromocytoma with neur...,Neurofibromatosis type 1 is a hereditary condi...,PubMed,0.0,,,,,,a case of malignant pheochromocytoma with neur...
1,A Combined Manual Annotation and Deep-Learning...,We report a combined manual annotation and dee...,PubMed,2.0,BERT-based and DistilBERT-based NER models; ML...,Entity extraction in hereditary disease-relate...,BERT-based NER achieved high F1-scores for gen...,1,"pre: KNLR, disc",a combined manual annotation and deep-learning...
3,A Comparative Sentiment Analysis of Greek Clin...,In addressing the critical role of emotional c...,PubMed,1.0,"BERT, RoBERTa, GPT-2, XLNet; ML Task: Sentimen...",Sentiment analysis of patient-clinician conver...,Importance of different language incorporation...,,,a comparative sentiment analysis of greek clin...
5,A foundational large language model for edible...,Significant progress has been made in the fiel...,"PubMed,biorxiv",1.0,Large language model (AgroNT); ML Task: Genomi...,Genomic prediction for plant species using DNA...,AgroNT predicts regulatory annotations and fun...,,,a foundational large language model for edible...
6,A Metabolic Biomarker Panel for Congenital Hea...,BackgroundCongenital heart disease (CHD) repre...,medrxiv,0.0,,,,,,a metabolic biomarker panel for congenital hea...
...,...,...,...,...,...,...,...,...,...,...
216,Unraveling the Enigma of Aortic Dissection: Fr...,Aortic dissection (AD) presents a critical med...,PubMed,0.0,,,,,,unraveling the enigma of aortic dissection: fr...
217,UnSegGNet: Unsupervised Image Segmentation usi...,"Image segmentation, the process of partitionin...",arXiv,1.0,"Graph Neural Networks (GNN), Vision Transforme...",Unsupervised medical image segmentation using ...,GNN-based unsupervised segmentation performs w...,1,ana: MIA,unseggnet: unsupervised image segmentation usi...
219,Unveiling the Link Between Celiac Disease and ...,Celiac disease (CD) is a systemic autoimmune d...,PubMed,0.0,,,,,,unveiling the link between celiac disease and ...
221,Using Large Language Models to Annotate Comple...,Social Determinants of Health (SDoH) are an im...,PubMed,0.0,"GPT-3.5, GPT-4; ML Task: Annotating social det...",SDOH identification (housing instability) fro...,GPT-4 outperformed manual annotation and NER m...,,,using large language models to annotate comple...


In [17]:
# Number of distinct raw titles in the merged table.
len(set(merged_data['title'].tolist()))

576

## CHECK

In [40]:
import pandas as pd
from collections import Counter

In [72]:
# Directory holding the manually exported review sheets (single place to edit).
check_dir = '/home/toharhymes/Downloads'

# generated_df = pd.read_csv(os.path.join(DATA_DIR, "clinic_genetic.csv"))
# `init_df` is referenced by the title-set comparisons later in this section,
# so it has to be loaded here for the notebook to run on a fresh kernel.
init_df = pd.read_csv(f'{check_dir}/clinic_genetic_formatted - raw_clinic_genetic_formatted.csv')
shortened_df = pd.read_csv(f'{check_dir}/clinic_genetic_formatted - raw_ST1.csv')
upd_df = pd.read_csv(f'{check_dir}/clinic_genetic_formatted - ST1_with_added_(ST2).csv')

# Row/column counts, then distinct-title counts, for the two review sheets.
print(shortened_df.shape, upd_df.shape)
print(len(set(shortened_df.title)), len(set(upd_df.title)))

(551, 16) (307, 22)
551 307


In [73]:
# Keep only rows whose relevance label is the string '2'.
r_shortened_df = shortened_df.loc[shortened_df['relevance'].eq('2')]

In [74]:
# Row counts of the updated sheet and of the relevance-filtered sheet.
tuple(len(frame['title']) for frame in (upd_df, r_shortened_df))

(307, 165)

In [75]:
# Distinct raw titles in each sheet (equal to the row counts when titles are unique).
tuple(len(set(frame['title'])) for frame in (upd_df, r_shortened_df))

(307, 165)

In [76]:
# Distinct titles after lower-casing, to reveal duplicates that differ only by case.
len({title.lower() for title in upd_df.title}), len({title.lower() for title in r_shortened_df.title})

(307, 165)

In [77]:
# Titles present in the updated sheet but absent from the shortened sheet.
set(upd_df['title']).difference(shortened_df['title'])

{'Accurate proteome-wide missense variant effect prediction with AlphaMissense',
 'DUVEL: an active-learning annotated biomedical corpus for the recognition of oligogenic combinations',
 'GENA-LM: A Family of Open-Source Foundational',
 'GeneGPT: augmenting large language models with domain tools for improved access to biomedical information',
 'Identifying facial phenotypes of genetic disorders using deep learning'}

In [33]:
# Number of titles common to the shortened and initial sheets.
# Requires `init_df` to be loaded in the kernel.
len(set(shortened_df['title']).intersection(init_df['title']))

552

In [34]:
# Titles found in the shortened sheet but not in the initial sheet.
# Requires `init_df` to be loaded in the kernel.
set(shortened_df['title']).difference(init_df['title'])

{'Genetic and phenotypic analysis of the virulence plasmid of a non-Shigatoxigenic enteroaggregative ',

In [36]:
# Titles found in the initial sheet but missing from the shortened sheet.
# Requires `init_df` to be loaded in the kernel.
set(init_df['title']).difference(shortened_df['title'])

{'Assessing the utility of large language models for phenotype-driven gene prioritization in the diagnosis of rare genetic disease',
 'Empowering Personalized Pharmacogenomics with Generative AI Solutions',
 'Enhancing phenotype recognition in clinical notes using large language models: PhenoBCBERT and PhenoGPT',
 'Enhancing recognition and interpretation of functional phenotypic sequences through fine-tuning pre-trained genomic models',
 'Evaluating GPT and BERT models for protein-protein interaction identification in biomedical text',
 "Feasibility of Identifying Factors Related to Alzheimer's Disease and Related Dementia in Real-World Data",
 'Generating 3D brain tumor regions in MRI using vector-quantization Generative Adversarial Networks',
 'Genetic and phenotypic analysis of the virulence plasmid of a non-Shigatoxigenic enteroaggregative \tEscherichia coli O104:H4 outbreak strain',
 'Identification of parthenogenesis-inducing effector proteins in Wolbachia',
 'Multi-view graph l