In [1]:
import pandas as pd
import re

In [2]:
urls = None
df = pd.read_csv('exp_input/proteomexchange_search.tsv', sep='\t')
publication_datasets = df[['publication','identifier','repository','title','keywords']]

In [3]:
# get an idea of values in df publication
publication_datasets['publication'].value_counts()

publication
Dataset with its publication pending                                                                                                                                                                      11719
no publication                                                                                                                                                                                             2550
<a href="http://www.ncbi.nlm.nih.gov/pubmed/35084980" target="_blank">Melani et al. (2022)</a>                                                                                                               56
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28267743" target="_blank">Matsumoto et al. (2017)</a>                                                                                                            28
<a href="http://www.ncbi.nlm.nih.gov/pubmed/28071820" target="_blank">Kreutz et al. (2017)</a>                                                              

In [4]:
#  clean the `publication` column by filtering out unwanted values like `"Dataset with its publication pending"`, `"no publication"`, and any HTML tags. 
# Remove rows with unwanted values
filtered_df = publication_datasets[~publication_datasets['publication'].isin(["Dataset with its publication pending", "no publication"])]
len(filtered_df)

25868

In [5]:
# Ensure filtered_df is a copy, not a slice
filtered_df = filtered_df.copy()

# Create a new column for the links
filtered_df.loc[:, 'publication_link'] = None

# Extract links from 'publication'
for i, row in filtered_df.iterrows():
    pub = str(row['publication'])  # Ensure string type
    if "href" in pub:
        match = re.findall(r'href=[\'"]([^\'"]+)[\'"]', pub)  # Extract href links
        filtered_df.at[i, 'publication_link'] = match if match else None
    else:
        filtered_df.at[i, 'publication_link'] = None

# Drop rows with missing links safely
filtered_df = filtered_df.dropna(subset=['publication_link']).reset_index(drop=True)

In [6]:
m = 0
for i,row in filtered_df.iterrows():
    id = row['identifier']
    m+=len(row['publication_link'])
print(f"Total number of publications: {m}")
print(f"Average number of publications per dataset: {m/len(filtered_df)}")

Total number of publications: 39347
Average number of publications per dataset: 1.53125


In [7]:
filtered_df.columns

Index(['publication', 'identifier', 'repository', 'title', 'keywords',
       'publication_link'],
      dtype='object')

In [8]:
# group by publication_link, concat identifier

filtered_df = filtered_df.explode('publication_link')  # Split lists into rows

df_grouped = filtered_df.groupby('publication_link').agg({
    'identifier': lambda x: ','.join(map(str, x.fillna('n/a'))),
    'repository': lambda x: ','.join(map(str, x.fillna('n/a'))),
    'title': lambda x: ','.join(map(str, x.fillna('n/a'))),
    'keywords': lambda x: ','.join(map(str, x.fillna('n/a')))  # Replace NaN with 'n/a'
}).reset_index()

df_grouped

Unnamed: 0,publication_link,identifier,repository,title,keywords
0,http://dx.doi.org/10.1002/CAM4.3825,PXD023689,PRIDE,Proteomics reveals the function reverse of MPS...,"prostate CAFs, prostate cancer cells, secretomics"
1,http://dx.doi.org/10.1002/CBIC.202000870,"PXD023060,PXD023059,PXD023056,PXD023057,PXD023058","PRIDE,PRIDE,PRIDE,PRIDE,PRIDE",Rapid and high coverage profile of human cyste...,"Cysteine, FAIMS, SP3,Cysteine, FAIMS, SP3,Cyst..."
2,http://dx.doi.org/10.1002/PMIC.202000214,PXD020638,PRIDE,Combining label-free and label-based accurate ...,"Beef meat quality, Data-Independent Acquisitio..."
3,http://dx.doi.org/10.1002/PMIC.202000240,PXD023907,PRIDE,Fusarium oxysporum f. sp. vasinfectum (UP00003...,"cotton, fungi, fusarium oxysporum"
4,http://dx.doi.org/10.1002/PMIC.202100036,PXD022191,PRIDE,Spectral library for SARS-COVID proteome and i...,"HLA, ProteomeXchange project tag: Covid-19, SA..."
...,...,...,...,...,...
34874,https://www.ncbi.nlm.nih.gov/pubmed/4127907,PXD000055,PRIDE,Casein Kinase 1 functions in eukaryotic non-tr...,"Ostreococcus tauri, Phosphoproteomics, casein ..."
34875,https://www.ncbi.nlm.nih.gov/pubmed/5974087,PXD009498,PRIDE,SILAC APMS analysis of CCDC103 interactome,"APMS, Hela cells, LTQ-Orbitrap, SILAC"
34876,https://www.ncbi.nlm.nih.gov/pubmed/6260970,PXD001626,PRIDE,Widespread somatic L1 retrotransposition occur...,"L1, Retrotransposon, adenoma, carcinoma, color..."
34877,https://www.ncbi.nlm.nih.gov/pubmed/9537338,PXD036326,PRIDE,Arterial-Venous Endothelial Cell Cycle Protein...,"HUVEC, S/G2/M, cell cycle, early G1, endotheli..."


In [9]:
# count different identifiers for all publication_link
df_grouped['identifier_count'] = df_grouped['identifier'].apply(lambda x: len(x.split(',')))
df_grouped

Unnamed: 0,publication_link,identifier,repository,title,keywords,identifier_count
0,http://dx.doi.org/10.1002/CAM4.3825,PXD023689,PRIDE,Proteomics reveals the function reverse of MPS...,"prostate CAFs, prostate cancer cells, secretomics",1
1,http://dx.doi.org/10.1002/CBIC.202000870,"PXD023060,PXD023059,PXD023056,PXD023057,PXD023058","PRIDE,PRIDE,PRIDE,PRIDE,PRIDE",Rapid and high coverage profile of human cyste...,"Cysteine, FAIMS, SP3,Cysteine, FAIMS, SP3,Cyst...",5
2,http://dx.doi.org/10.1002/PMIC.202000214,PXD020638,PRIDE,Combining label-free and label-based accurate ...,"Beef meat quality, Data-Independent Acquisitio...",1
3,http://dx.doi.org/10.1002/PMIC.202000240,PXD023907,PRIDE,Fusarium oxysporum f. sp. vasinfectum (UP00003...,"cotton, fungi, fusarium oxysporum",1
4,http://dx.doi.org/10.1002/PMIC.202100036,PXD022191,PRIDE,Spectral library for SARS-COVID proteome and i...,"HLA, ProteomeXchange project tag: Covid-19, SA...",1
...,...,...,...,...,...,...
34874,https://www.ncbi.nlm.nih.gov/pubmed/4127907,PXD000055,PRIDE,Casein Kinase 1 functions in eukaryotic non-tr...,"Ostreococcus tauri, Phosphoproteomics, casein ...",1
34875,https://www.ncbi.nlm.nih.gov/pubmed/5974087,PXD009498,PRIDE,SILAC APMS analysis of CCDC103 interactome,"APMS, Hela cells, LTQ-Orbitrap, SILAC",1
34876,https://www.ncbi.nlm.nih.gov/pubmed/6260970,PXD001626,PRIDE,Widespread somatic L1 retrotransposition occur...,"L1, Retrotransposon, adenoma, carcinoma, color...",1
34877,https://www.ncbi.nlm.nih.gov/pubmed/9537338,PXD036326,PRIDE,Arterial-Venous Endothelial Cell Cycle Protein...,"HUVEC, S/G2/M, cell cycle, early G1, endotheli...",1


In [10]:
df_grouped['identifier_count'].sum()

39347

In [11]:
# set lower case for the identifier and publication link fields
df_grouped = df_grouped.apply(lambda x: x.str.lower() if x.name in ['identifier', 'publication_link'] else x)
df_grouped

Unnamed: 0,publication_link,identifier,repository,title,keywords,identifier_count
0,http://dx.doi.org/10.1002/cam4.3825,pxd023689,PRIDE,Proteomics reveals the function reverse of MPS...,"prostate CAFs, prostate cancer cells, secretomics",1
1,http://dx.doi.org/10.1002/cbic.202000870,"pxd023060,pxd023059,pxd023056,pxd023057,pxd023058","PRIDE,PRIDE,PRIDE,PRIDE,PRIDE",Rapid and high coverage profile of human cyste...,"Cysteine, FAIMS, SP3,Cysteine, FAIMS, SP3,Cyst...",5
2,http://dx.doi.org/10.1002/pmic.202000214,pxd020638,PRIDE,Combining label-free and label-based accurate ...,"Beef meat quality, Data-Independent Acquisitio...",1
3,http://dx.doi.org/10.1002/pmic.202000240,pxd023907,PRIDE,Fusarium oxysporum f. sp. vasinfectum (UP00003...,"cotton, fungi, fusarium oxysporum",1
4,http://dx.doi.org/10.1002/pmic.202100036,pxd022191,PRIDE,Spectral library for SARS-COVID proteome and i...,"HLA, ProteomeXchange project tag: Covid-19, SA...",1
...,...,...,...,...,...,...
34874,https://www.ncbi.nlm.nih.gov/pubmed/4127907,pxd000055,PRIDE,Casein Kinase 1 functions in eukaryotic non-tr...,"Ostreococcus tauri, Phosphoproteomics, casein ...",1
34875,https://www.ncbi.nlm.nih.gov/pubmed/5974087,pxd009498,PRIDE,SILAC APMS analysis of CCDC103 interactome,"APMS, Hela cells, LTQ-Orbitrap, SILAC",1
34876,https://www.ncbi.nlm.nih.gov/pubmed/6260970,pxd001626,PRIDE,Widespread somatic L1 retrotransposition occur...,"L1, Retrotransposon, adenoma, carcinoma, color...",1
34877,https://www.ncbi.nlm.nih.gov/pubmed/9537338,pxd036326,PRIDE,Arterial-Venous Endothelial Cell Cycle Protein...,"HUVEC, S/G2/M, cell cycle, early G1, endotheli...",1


In [12]:
# save to csv
df_grouped.to_csv('exp_input/publication_data_citations_PXD.csv', index=False)