Selection of a diverse corpus for SDRF-Proteomics:
1. Different journals = different style of writing
    High impact journals: Nature, MCP
    Proteomics focused: JPR
    Technical: Nature Methods, Analytical Chemistry
2. Different labelling:
    LFQ, TMT, SILAC
3. One experiment vs multiple experiments

In [1]:
from Bio import Entrez

def fetch_journal_from_pmid(pmid_list, email="tineclae.claeys@ugent.be"):
    """Fetches journal names for a list of PubMed IDs."""
    Entrez.email = email  # Required by NCBI
    
    journals = {}
    for pmid in pmid_list:
        try:
            handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
            records = handle.read()
            handle.close()
            
            # Extract journal name
            for line in records.split("\n"):
                if line.startswith("JT  - "):  # Journal title line
                    journal_name = line.replace("JT  - ", "").strip()
                    journals[pmid] = journal_name
                    break
        except Exception as e:
            print(f"Error fetching PMID {pmid}: {e}")
            journals[pmid] = None  # Mark as None if fetching fails

    return journals


In [2]:
import os
ann_folder = "/home/compomics/git/Eubic_Dev_hackathon_2025/Combined_all/open_access"
ann_files = [f for f in os.listdir(ann_folder) if f.endswith('.ann')]
pmid_list = [f.split('.')[0] for f in ann_files]

journal_info = fetch_journal_from_pmid(pmid_list)
print(journal_info)


{'35121989': 'Nature cancer', '36653852': 'Acta neuropathologica communications', '36292738': 'Genes', '36499720': 'International journal of molecular sciences', '32785189': 'Biomedicines', '31816881': 'Microorganisms', '34128081': 'Acta neuropathologica', '37266577': 'eLife', '37301900': 'Communications biology', '36240066': 'The Biochemical journal', '35680998': 'Nature ecology & evolution', '32817103': 'mBio', '32095117': 'Cancer cell international', '35236848': 'Nature communications', '28675934': 'Journal of proteome research', '26076430': 'Nature biotechnology', '36982475': 'International journal of molecular sciences', '37834386': 'International journal of molecular sciences', '37778379': 'Philosophical transactions of the Royal Society of London. Series B, Biological', '36979426': 'Biomolecules', '28071813': 'Journal of orthopaedic research : official publication of the Orthopaedic', '28439285': 'Frontiers in plant science', '35677457': 'Data in brief', '36220102': 'Molecular c

In [12]:
print(set(list(journal_info.values())))


{'Journal of orthopaedic research : official publication of the Orthopaedic', 'eLife', 'Acta neuropathologica', 'Nature', 'Current biology : CB', 'Frontiers in immunology', 'International journal of molecular sciences', 'Molecular & cellular proteomics : MCP', 'Scientific data', 'Computational and structural biotechnology journal', 'Antioxidants (Basel, Switzerland)', 'Microbiome', 'Microorganisms', 'Cellular oncology (Dordrecht, Netherlands)', 'Cells', 'Genome medicine', 'mBio', 'PloS one', 'Nature ecology & evolution', 'Nature microbiology', 'Molecular therapy. Nucleic acids', 'Journal of proteomics', 'Clinical proteomics', 'Nature cancer', 'Toxicological sciences : an official journal of the Society of Toxicology', 'Proceedings of the National Academy of Sciences of the United States of America', 'Frontiers in cellular and infection microbiology', 'JCI insight', 'Biomedicines', 'bioRxiv : the preprint server for biology', 'Science advances', 'The Biochemical journal', 'Bioinformatic

In [36]:
from collections import Counter
import pandas as pd


journal_selection = ['Nature', 'JPR', 'Journal of proteome research', 'MCP', 'Molecular & cellular proteomics', 'Proteomics', 'Cell', 'Protein', 'Journal of proteomics', 'Clinical proteomics', 'Nucleic acids research']

# Select from journal all journals that mention any word in journal_selection even a substring
selected_journals = {k: v for k, v in journal_info.items() if any(j in v for j in journal_selection)}
value_counts = Counter(selected_journals.values())
selectiondf = pd.DataFrame(value_counts.items(), columns=["Journal", "Count"]).sort_values("Count", ascending=False)
print(f"This accounts for a total of {selectiondf.Count.sum()} papers")

#find key values in journal_info that match a journal in selectiondf.Journal
to_annotate = []
for k,v in journal_info.items():
    if v in selectiondf.Journal.tolist():
        to_annotate.append(k)
print(len(to_annotate))

This accounts for a total of 29 papers
29


In [37]:
import os

homedir = '/home/compomics/git/Eubic_Dev_hackathon_2025'
file_dict = {}

# Walk through all subdirectories
for root, _, files in os.walk(homedir):
    for f in to_annotate:
        filename = f + '.ann'
        if filename in files:  # Check if the file exists in the current folder
            file_path = os.path.join(root, filename)
            file_size = os.path.getsize(file_path)
            file_dict[file_path] = file_size

print(file_dict)  # Output the dictionary


{'/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/35680998.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/35236848.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/37330510.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/35037038.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/21183079.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/37681923.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/36097293.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/26725330.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/37207340.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/35236989.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/37328468.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_PRIDE/33630598.ann': 0, '/home/compomics/git/Eubic_Dev_hackathon_2025/Unique_github/35121989.ann': 

In [43]:
import pandas as pd
df = pd.DataFrame.from_dict(file_dict, orient='index', columns=['File Size'])
df.sort_values(by='File Size', ascending=False)
df.to_csv('29papers.csv')

In [44]:
to_annotate

['35121989',
 '35680998',
 '35236848',
 '28675934',
 '26076430',
 '28680058',
 '32860207',
 '30814501',
 '31699905',
 '31395880',
 '37330510',
 '35037038',
 '21183079',
 '31699904',
 '29681787',
 '32284562',
 '28091603',
 '37681923',
 '36097293',
 '26725330',
 '26149123',
 '33855848',
 '24657495',
 '37207340',
 '25100859',
 '35236989',
 '25218447',
 '37328468',
 '33630598']

In [53]:
# copy all files in /home/compomics/git/Eubic_Dev_hackathon_2025/Combined_all/open_acces with a filename, independent of extension, that occurs in the to_annotate list
import os
import shutil

source_dir = '/home/compomics/git/Eubic_Dev_hackathon_2025/Combined_all/open_access/'
destination_dir = '/home/compomics/git/Eubic_Dev_hackathon_2025/Combined_all/journalbasedselection'  # Change to your desired copy location

# Check if source directory is accessible
if not os.path.exists(source_dir):
    print(f"Error: Source directory '{source_dir}' does not exist.")
    exit(1)
if not os.access(source_dir, os.R_OK):
    print(f"Error: No read access to source directory '{source_dir}'.")
    exit(1)

# Create destination folder if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

# Walk through all subdirectories
moved_files = 0
for root, _, files in os.walk(source_dir):
    for file in files:
        filename_without_ext, ext = os.path.splitext(file)  # Split filename and extension
        if filename_without_ext in to_annotate and ext in {".ann", ".txt"}:
            src_path = os.path.join(root, file)
            dest_path = os.path.join(destination_dir, file)
            
            # Check if file is accessible before moving
            if os.access(src_path, os.R_OK):
                shutil.move(src_path, dest_path)
                moved_files += 1
            else:
                print(f"Warning: No read access to '{src_path}', skipping.")

if moved_files == 0:
    print("No matching files found or moved.")
else:
    print(f"File moving complete. {moved_files} files moved.")

File moving complete. 58 files moved.
