In [9]:
import pandas as pd
import os

# Define paths to directories and files
old_data_dir = 'old_data'
new_data_file = 'data/clinic_genetic.csv'
new_data_file_out = 'data/clinic_genetic_formatted.csv'
extra_out = 'data/clinic_genetic_extra.csv'




In [2]:
# Read all CSVs in the old data directory and concatenate
old_dataframes = []
for file in os.listdir(old_data_dir):
    if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(old_data_dir, file))
        old_dataframes.append(df)

# Concatenate all old dataframes, allowing for differing columns
old_data = pd.concat(old_dataframes, ignore_index=True)

# Group by 'title' and 'abstract', keeping the first occurrence but detecting differing rows
grouped_old_data = old_data.groupby(['title',], as_index=False).first()
for _, group in old_data.groupby(['title',]):
    if len(group) > 1:
        differing_rows = group.loc[:, group.nunique() > 1]
        if not differing_rows.empty:
            print("Differing rows for title/abstract group:\n", differing_rows)

Differing rows for title/abstract group:
     relevance
191       1.0
234         1
Differing rows for title/abstract group:
     relevance
175       1.0
235         1
Differing rows for title/abstract group:
     relevance
219       2.0
257         2
Differing rows for title/abstract group:
     relevance
187       1.0
233         1
Differing rows for title/abstract group:
     relevance
9         2.0
258         2
Differing rows for title/abstract group:
     relevance
177       1.0
236         1
Differing rows for title/abstract group:
     relevance
10        2.0
259         2
Differing rows for title/abstract group:
     relevance
210       2.0
260         2
Differing rows for title/abstract group:
     relevance                                           ai_topic
218       1.0                         ChatGPT; systematic review
252         1  ChatGPT; ML Task: Applications in bioinformati...
Differing rows for title/abstract group:
     relevance
201       2.0
264         2
Differi

In [3]:
cols_to_use = ['title', 'abstract', 'source', 'relevance', 'ai_topic',
       'medicine_topic', 'notes', 'used?', 'what section used']
grouped_old_data = grouped_old_data[cols_to_use]

print(grouped_old_data.shape)
grouped_old_data

(229, 9)


Unnamed: 0,title,abstract,source,relevance,ai_topic,medicine_topic,notes,used?,what section used
0,"""Having cancer is very expensive"": A qualitati...",To examine patient barriers and facilitators t...,PubMed,0.0,,,,,
1,A Combined Manual Annotation and Deep-Learning...,We report a combined manual annotation and dee...,PubMed,2,BERT-based and DistilBERT-based NER models; ML...,Entity extraction in hereditary disease-relate...,BERT-based NER achieved high F1-scores for gen...,1,"pre: KNLR, disc"
2,A Combined Manual Annotation and Deep-Learning...,We report a combined manual annotation and dee...,PubMed,2.0,BERT-based and DistilBERT-based NER models; ML...,Entity extraction in hereditary disease-relate...,BERT-based NER achieved high F1-scores for gen...,,
3,A Comparative Sentiment Analysis of Greek Clin...,In addressing the critical role of emotional c...,PubMed,1.0,"BERT, RoBERTa, GPT-2, XLNet; ML Task: Sentimen...",Sentiment analysis of patient-clinician conver...,Importance of different language incorporation...,,
4,A Metabolic Biomarker Panel for Congenital Hea...,BackgroundCongenital heart disease (CHD) repre...,medrxiv,0.0,,,,,
...,...,...,...,...,...,...,...,...,...
224,Utilization of a Third-party Partnership in Te...,To meet the increasing demands of genetic risk...,PubMed,0.0,,,,,
225,VarChat: the generative AI assistant for the i...,"In the modern era of genomic research, the sci...",PubMed,2.0,Generative AI; ML Task: Literature search and ...,Interpretation of genomic variants using summa...,VarChat summarizes genomic variant data from s...,1,ana: GVI
226,Virtual Labs and Designer Bugs - Generative AI...,AI technologies can pose a major national secu...,PubMed,1.0,No specific transformer-based models like GPT ...,Genetic research with a focus on synthetic bio...,AI's role in genetic research and biological s...,,
227,Weakly Supervised Classification for Nasophary...,Pathological examination of nasopharyngeal car...,PubMed,1.0,Vision Transformer (T2T-ViT); ML Task: Image c...,Cancer diagnosis using whole slide images (WS...,Proposes a weakly supervised transformer frame...,,


In [4]:
# Read the new data
new_data = pd.read_csv(new_data_file)
print(new_data.shape)

# Add missing columns from old data to new data
for col in grouped_old_data.columns:
    if col not in new_data.columns:
        new_data[col] = None

# Update values in new data with old data where title and abstract match
merged_data = new_data.merge(
    grouped_old_data, on=['title'], how='left', suffixes=('', '_old')
)
for col in grouped_old_data.columns:
    if col not in ['title']:
        merged_data[col] = merged_data[col].combine_first(merged_data[col + '_old'])

# Add a new column to indicate existence in old data
merged_data['existed_in_old_data'] = merged_data['title'].isin(grouped_old_data['title']) 

# Create an extra dataframe with rows not in the new data based on 'title' and 'abstract'
extra_data = grouped_old_data[
    ~grouped_old_data.set_index(['title']).index.isin(
        new_data.set_index(['title']).index
    )
]

(1830, 3)


In [11]:
merged_data.to_csv(new_data_file_out, index=False, sep=',')
print(f"Prev anno: {sum(merged_data.existed_in_old_data)}")
merged_data

Prev anno: 193


Unnamed: 0,title,abstract,source,relevance,ai_topic,medicine_topic,notes,used?,what section used,abstract_old,source_old,relevance_old,ai_topic_old,medicine_topic_old,notes_old,used?_old,what section used_old,existed_in_old_data
0,Dynamic Evolution of SARS-CoV-2 in West Sumatr...,&lt;b&gt;Background and Objective:&lt;/b&gt; T...,PubMed,,,,,,,,,,,,,,,False
1,Natural Language Processing and Schizophrenia:...,(1) Background: Approximately 1% of the global...,PubMed,,,,,,,,,,,,,,,False
2,MIRACUM-Pipe: An Adaptable Pipeline for Next-G...,(1) Background: Next-generation sequencing (NG...,PubMed,,,,,,,,,,,,,,,False
3,Diagnostic Challenges in ABCA4-Associated Reti...,(1) Purpose: ABCA4-associated retinal degenera...,PubMed,,,,,,,,,,,,,,,False
4,Harnessing generative AI to annotate the sever...,0.1There are thousands of human phenotypes whi...,medrxiv,,,,,,,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1825,Zellweger Syndrome: A Case Report,Zellweger syndrome is an autosomal recessive d...,PubMed,,,,,,,,,,,,,,,False
1826,The clinical impact of mRNA therapeutics in th...,mRNA-based therapeutics have revolutionized me...,PubMed,,,,,,,,,,,,,,,False
1827,Contribution of Genetic Test to Early Diagnosi...,"the deficiency of 5,10-Methylenetetrahydrofola...",PubMed,,,,,,,,,,,,,,,False
1828,"Team Approach: Diagnosis, Management, and Prev...",» Sudden cardiac events during sports competit...,PubMed,,,,,,,,,,,,,,,False


In [7]:
merged_data.to_csv(new_data_file_out, index=False, sep=',')
print(f"Prev anno, not_ised: {sum(merged_data.existed_in_old_data)}")
merged_data

(36, 9)