# Database parser  <br>
largely based on the script of Tine Claeys (see folder *Reference*)

In this notebook, the MySQL database is set up.

In [2]:
import pandas as pd
from collections import defaultdict
import mysql.connector
import glob
import os
import csv
import numpy as np
from master_functions import dbf
pd.set_option('display.max_columns', 10)

In [25]:
# Ad-hoc check on a single ionbot result file: keep only the rows flagged as
# best PSM and display the distinct q-values that occur among them.
df = pd.read_csv('/home/compomics/mounts/conode55/pride/PRIDE_DATA/PXD003903/IONBOT_v0.7.0/PT4530-16.mgf.gzip/PT4530-16.mgf.gzip.ionbot.csv')
df = df.loc[df['best_psm'] == 1]
df["q_value"].unique()

array([0.016, 0.019])

In [2]:
# Open the MySQL connection used by every cell below.
# Each setting can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks
# reproduce the original local development setup.
conn = mysql.connector.connect(user=os.environ.get('MYSQL_USER', 'root'),
                               password=os.environ.get('MYSQL_PASSWORD', 'password'),
                               host=os.environ.get('MYSQL_HOST', '127.0.0.1'),
                               port=os.environ.get('MYSQL_PORT', '3306'),
                               database=os.environ.get('MYSQL_DATABASE', 'expression_atlas_cells'))
# buffered=True: the cursor is created with the connector's buffered option
mycursor = conn.cursor(buffered=True)

# check the connection
if conn.is_connected():
    print("connection succesfull")
else:
    print("no connection")

connection succesfull


1. Only projects with confirmed cell annotation will be accessible in the database. The metadata of each file can be found in file *"file_annotation_update.csv"* and was added manually by checking the corresponding paper and the PRIDE-info page for the project.<br>
<br>
reasons for dropping projects:
    - not LFQ
    - enrichment-experiment
    - no HCD
    - the file annotation cannot be unraveled to link it to the studied sample  
    - the sample is not an established cell line

In [20]:
def excel_file_parser(file_path):
    '''Parse the manual annotation file into one assay-level DataFrame.

    Steps:
    - read the ";"-separated annotation file and keep the columns PXD, RAW,
      Useable, cell_line, disease, tissue_type, treatment, sub_cell, pool_id;
    - keep only the rows whose "Useable" value is "yes";
    - replace missing values by "/";
    - strip everything after the first "." from the RAW name;
    - pass the frame through ``dbf.input_cell_info`` (per the original notes
      this imputes annotation for cell lines already in the database);
    - add a "file_path" column with the result of ``dbf.find_file_path`` for
      each (PXD, RAW) pair (per the original notes this is NaN when no
      .ionbot.csv file is found).

    The boolean mask ``file_path.isna()`` is printed as a quick overview.

    Parameters
    ----------
    file_path : str
        Path to the ";"-separated annotation file.

    Returns
    -------
    pandas.DataFrame
        The annotated assays, one row per useable RAW file.
    '''
    db_f = dbf()

    wanted_columns = "PXD RAW Useable cell_line disease tissue_type treatment sub_cell pool_id".split()
    manual_meta = pd.read_csv(file_path, sep = ";")[wanted_columns]

    # .copy() gives an independent frame: the assignments below must not act on
    # a view of the unfiltered data (chained assignment / SettingWithCopy).
    manual_meta = manual_meta[manual_meta.Useable == "yes"].copy()
    manual_meta = manual_meta.fillna("/")

    manual_meta["RAW"] = manual_meta["RAW"].apply(lambda x: x.split(".")[0])

    manual_meta = db_f.input_cell_info(manual_meta)

    manual_meta["file_path"] = manual_meta.apply(lambda x: db_f.find_file_path(x["PXD"], x["RAW"]), axis = 1)

    print(manual_meta.file_path.isna())

    return manual_meta

In [21]:
# Parse the manual annotation file; excel_file_parser also prints the NaN mask of file_path.
parsed_manual_meta = excel_file_parser("/home/compomics/Sam/git/python/master_thesis/Metadata/annotation_excel4.csv")

connection succesfull
Following cell lines still need annotation:  ['MSC', '/']
0       False
1       False
2       False
3       False
4       False
        ...  
8178    False
8179    False
8180    False
8181    False
8182    False
Name: file_path, Length: 4684, dtype: bool


In [22]:
# Per project: number of RAW files without a file_path (missing) and with one (useable).
print(f'Missing files: {parsed_manual_meta[parsed_manual_meta.file_path.isna()].groupby(["PXD"]).RAW.count()}, \n\nUseable files: {parsed_manual_meta[parsed_manual_meta.file_path.notna()].groupby(["PXD"]).RAW.count()}')

Missing files: PXD
PXD000612      96
PXD003668      50
PXD003903     284
PXD004452     605
PXD005354    1168
PXD008222     324
Name: RAW, dtype: int64, 

Useable files: PXD
PXD000612     36
PXD001441      6
PXD001468     24
PXD001511     27
PXD001668     96
PXD001952      8
PXD001974     16
PXD002117     18
PXD002613     34
PXD003530      9
PXD003596     12
PXD003668     40
PXD003790     60
PXD003896     12
PXD003903     52
PXD004051     16
PXD004182      4
PXD004452    128
PXD004900      6
PXD004940     45
PXD005354    392
PXD005507     18
PXD005912     12
PXD005940    216
PXD005946    732
PXD006112     15
PXD006653      4
PXD008222     92
PXD009185     27
Name: RAW, dtype: int64


In [23]:
# Overall number of rows without / with a file_path.
print('Missing:\t', parsed_manual_meta[parsed_manual_meta.file_path.isna()].shape[0], "\nNot missing:\t" ,parsed_manual_meta[parsed_manual_meta.file_path.notna()].shape[0])

Missing:	 2527 
Not missing:	 2157


In [66]:
# Show at most 20 rows, then count the RAW entries per tissue_type.
pd.set_option('display.max_rows', 20)
parsed_manual_meta.groupby(["tissue_type"]).count().RAW

tissue_type
/                    75
T-cell               48
blood                48
bone                 18
brain               296
breast              588
cervix              487
colon              1807
embryonic            32
epithelium           12
fibroblast          304
kidney              310
liver                18
lung                223
ovary               198
plasma cell          12
prostate             48
skeletal muscle      16
skin                144
Name: RAW, dtype: int64

In [4]:
# Unique PXD accessions in the parsed annotation, and how many there are.
projects = parsed_manual_meta.PXD.unique().tolist()
len(projects)

29

In [76]:
# Create a spreadsheet template for the project-level metadata: a header row
# followed by one row per PXD accession (only column A is filled in).
# NOTE(review): this writes "project_annotation2.xlsx" while the next code cell
# reads "project_annotation2.csv" - presumably the sheet is completed and
# exported by hand; confirm.
import openpyxl
wb = openpyxl.Workbook()
sheet = wb["Sheet"]
sheet["A1"], sheet["B1"], sheet["C1"], sheet["D1"] = 'accession', 'digestion', 'instrumentNames', 'PMID'

for i in projects:
    # max_row + 1 addresses the first row below the rows written so far
    sheet.cell(row = sheet.max_row +1, column = 1).value = i

wb.save("project_annotation2.xlsx")

2. More metadata information on project level is added and includes the following:
- project_id (automatically generated)
- PXD_accession
- digestion method
- instrument name
- PubMed ID

In [11]:
#Metadata will be added later
# Load the project annotation, keep at most the first 15 columns, cast every
# value to str and rename the columns "experimentTypes" -> "digestion" and
# "references" -> "PMID" (only applied if those columns are present).
meta = pd.read_csv("project_annotation2.csv", sep = ";")
meta = meta.iloc[:, 0:15]
meta = meta.astype(str)
meta.rename(columns = {"experimentTypes": "digestion", "references": "PMID"}, inplace = True)
meta.head()

Unnamed: 0,accession,digestion,instrumentNames,PMID
0,PXD001468,/,/,/
1,PXD004051,/,/,/
2,PXD001952,/,/,/
3,PXD000612,/,/,/
4,PXD002117,/,/,/


In [57]:
# Only the assays for which a file path was found are inserted. The remaining
# rows are written to "assays_not_found.csv" so they can be handled later.
# Note: parsed_manual_meta is overwritten with the filtered frame.
parsed_manual_meta[parsed_manual_meta.file_path.isna()].to_csv("assays_not_found.csv")
parsed_manual_meta = parsed_manual_meta[parsed_manual_meta.file_path.notna()]
parsed_manual_meta.shape

(2157, 10)

In [61]:
# Persist the filtered annotation (without the index) so it can be reloaded later.
parsed_manual_meta.to_csv("parsed_manual_meta.csv", index = False)

In [3]:
# Reload the annotation that was saved to "parsed_manual_meta.csv".
parsed_manual_meta = pd.read_csv("parsed_manual_meta.csv")
parsed_manual_meta.head()

Unnamed: 0,PXD,RAW,Useable,cell_line,disease,tissue_type,treatment,sub_cell,pool_id,file_path
0,PXD001468,b1906_293T_proteinID_01A_QE3_122212,yes,HEK293,healthy,kidney,/,/,1,/home/compomics/mounts/conode53/pride/PRIDE_DA...
1,PXD001468,b1922_293T_proteinID_02A_QE3_122212,yes,HEK293,healthy,kidney,/,/,1,/home/compomics/mounts/conode53/pride/PRIDE_DA...
2,PXD001468,b1923_293T_proteinID_03A_QE3_122212,yes,HEK293,healthy,kidney,/,/,1,/home/compomics/mounts/conode53/pride/PRIDE_DA...
3,PXD001468,b1924_293T_proteinID_04A_QE3_122212,yes,HEK293,healthy,kidney,/,/,1,/home/compomics/mounts/conode53/pride/PRIDE_DA...
4,PXD001468,b1925_293T_proteinID_05A_QE3_122212,yes,HEK293,healthy,kidney,/,/,1,/home/compomics/mounts/conode53/pride/PRIDE_DA...


In [12]:
# Fill the 'project' table through the dbf helper (from master_functions).
dbf().build_project_table(meta, projects)

connection succesfull
Projects checked.
29 projects added in table 'project'.


In [13]:
# Fill the 'cell' table through the dbf helper (from master_functions).
dbf().build_cell_table(parsed_manual_meta)

connection succesfull
111 cell entries in file.
111 entries added in table 'cell'.


In [4]:
# Hand the annotation to the dbf helper that builds the assay / cell link (from master_functions).
dbf().build_assay_cell_table(parsed_manual_meta)

connection succesfull
2157


In [5]:
# The filenames for which the parser failed are stored in "parser_failed_ae4.csv".
dbf().find_ionbot_files(parsed_manual_meta)

connection succesfull
parsed check
b1906_293T_proteinID_01A_QE3_122212 was stored
parsed check
b1922_293T_proteinID_02A_QE3_122212 was stored
parsed check
b1923_293T_proteinID_03A_QE3_122212 was stored
parsed check
b1924_293T_proteinID_04A_QE3_122212 was stored
parsed check
b1925_293T_proteinID_05A_QE3_122212 was stored
parsed check
b1926_293T_proteinID_06A_QE3_122212 was stored
parsed check
b1927_293T_proteinID_07A_QE3_122212 was stored
parsed check
b1928_293T_proteinID_08A_QE3_122212 was stored
parsed check
b1929_293T_proteinID_09A_QE3_122212 was stored
parsed check
b1930_293T_proteinID_10A_QE3_122212 was stored
parsed check
b1931_293T_proteinID_11A_QE3_122212 was stored
parsed check
b1932_293T_proteinID_12A_QE3_122212 was stored
parsed check
b1937_293T_proteinID_01B_QE3_122212 was stored
parsed check
b1938_293T_proteinID_02B_QE3_122212 was stored
parsed check
b1939_293T_proteinID_03B_QE3_122212 was stored
parsed check
b1940_293T_proteinID_04B_QE3_122212 was stored
parsed check
b1941

---
---

In [32]:
def build_project_table(meta_df, list_of_pxds):
    """Insert the selected projects into the ``project`` table.

    Only rows of ``meta_df`` whose 'accession' occurs in ``list_of_pxds`` are
    used. The columns accession, digestion, instrumentNames and PMID are cast
    to str and written to PXD_accession, experiment_type, instrument and pmid.
    Every insert is committed immediately; the number of inserted rows is
    printed at the end.

    Relies on the notebook-level ``mycursor`` and ``conn``.
    """
    insert_sql = "INSERT INTO project(PXD_accession, experiment_type, instrument, pmid) VALUES (%s, %s, %s, %s)"

    is_selected = meta_df['accession'].isin(list_of_pxds)
    selected = meta_df[is_selected][['accession', 'digestion', 'instrumentNames', 'PMID']].astype(str)

    inserted = 0
    # records behave like tuples, which map directly onto the SQL placeholders
    for record in selected.to_records(index=False):
        mycursor.execute(insert_sql, list(record))
        conn.commit()
        inserted += 1

    print(f"{inserted} projects added in table 'project'.")

In [35]:
# Insert the project rows with the notebook-local build_project_table.
build_project_table(meta, projects)

52 projects added in table 'project'.


3. Prior to linking the assay to the cell_line, the cell table must be made using all the cell lines that are present in the *'file_annotation_update.csv'* file.<br>
The cell table needs:
- cell_line
- disease
- tissue_type
- treatment
- sub_cell

In [6]:
# Distinct combinations of the five cell annotation columns; missing values become "/".
# NOTE(review): `manual_meta` is not defined by any visible top-level cell
# (it is only a local name inside excel_file_parser) - this cell presumably
# relies on earlier kernel state; confirm.
manual_cell_lines = manual_meta["cell_line disease tissue_type treatment sub_cell".split()]
manual_cell_lines = manual_cell_lines.drop_duplicates()
manual_cell_lines = manual_cell_lines.fillna("/")
manual_cell_lines.head()

Unnamed: 0,cell_line,disease,tissue_type,treatment,sub_cell
36,Hep3B,hepatocellular carcinoma,liver,/,/
84,MHCC97,hepatocellular carcinoma,liver,/,MHCC97H
140,Jurkat,T-cell leukemia,blood,/,/
239,LAN5,neuroblastoma,brain,/,/
243,LAN5,neuroblastoma,brain,A? Peptide,/


In [15]:
# Number of distinct cell lines per tissue type.
print("Amount of different cell lines for each tissue:")
manual_cell_lines.groupby(["tissue_type"]).cell_line.nunique()

Amount of different cell lines for each tissue:


tissue_type
blood             8
bone              1
brain             5
breast            5
cervix            1
colon             4
fetal kidney      1
kidney            3
liver             4
lung              1
ovary             1
prostate          2
skin              1
trophoblast       2
umbilical vein    1
Name: cell_line, dtype: int64

In [7]:
# (rows, columns) of the de-duplicated cell annotation.
manual_cell_lines.shape

(66, 5)

In [8]:
def build_cell_table(cell_df):
    """Insert every row of ``cell_df`` into the ``cell`` table.

    The row values are bound, in column order, to cell_line, disease,
    tissue_type, treatment and sub_cell. Each insert is committed right away
    and the number of inserted rows is printed afterwards.

    Relies on the notebook-level ``mycursor`` and ``conn``.
    """
    insert_sql = "INSERT INTO cell(cell_line, disease, tissue_type, treatment, sub_cell) VALUES (%s, %s, %s, %s, %s)"

    inserted = 0
    # records behave like tuples, which map directly onto the SQL placeholders
    for record in cell_df.to_records(index=False):
        mycursor.execute(insert_sql, list(record))
        conn.commit()
        inserted += 1

    print(f"{inserted} entries adde in table 'cell'.")

In [9]:
# Insert the distinct cell annotations with the notebook-local build_cell_table.
build_cell_table(manual_cell_lines)

66 entries adde in table 'cell'.


4. Load the **modification table** from the file Tables/modifications.csv into the database
--> not done yet

In [25]:
# Placeholder: loading the modification table is not implemented yet.
pass

5. Load the **protein** table<br>

NOTE: The protein table is loaded after the protein identifications have been extracted from the ionbot output files. <br>
Here, the UniProt CSV file, which contains information on each protein, is merged with the UniProt IDs that were extracted from the ionbot output files.

In [6]:
#Use uniprot files to list uniprot_id, length, description and sequence
tine_file_path = "/home/compomics/Sam/git/python/Reference/Database/"
uniprot = pd.read_csv(tine_file_path + "Uniprot.csv", sep = ",")

# the sequence file uses ";" as separator, unlike Uniprot.csv
seq = pd.read_csv(tine_file_path + "Uniprot_sequences.csv", sep = ";")

# left join on uniprot_id keeps every row of Uniprot.csv, then exact duplicate rows are dropped
uni_seq = pd.merge(uniprot, seq, on = "uniprot_id", how = "left")
uni_seq = uni_seq.drop_duplicates()

In [7]:
#Link and save all the corresponding uniprot_ids in the database with the length, description and sequence of the uniprot files
query = "SELECT DISTINCT(uniprot_id) from protein"
mysqlData = pd.read_sql_query(query, conn)

print(mysqlData.shape)
# empty placeholder columns; after the merge they carry the "_y" suffix and are dropped again
mysqlData['description'] = np.nan
mysqlData['length'] = np.nan
mysqlData['sequence'] = np.nan

# NOTE(review): this is a left join from uni_seq, so every UniProt row is kept
# whether or not its uniprot_id occurs in the database - confirm this is intended.
total_ids = pd.merge(uni_seq, mysqlData, how = "left", on = "uniprot_id")
print(total_ids.shape)

# keep the UniProt-side ("_x") columns under their original names
total_ids = total_ids.drop(columns=['description_y', 'length_y', 'sequence_y'])
total_ids = total_ids.rename(columns={'description_x' : 'description', 'length_x' : 'length', 'sequence_x': 'sequence'})

total_ids.to_csv("protein_table_uniprot.csv", sep = ",", index = False)
total_ids.head()

(13431, 1)
(20385, 7)


Unnamed: 0,uniprot_id,description,length,sequence
0,Q8N7X0,Androglobin (Calpain-7-like protein),1667,MASKQTKKKEVHRINSAHGSDKSKDFYPFGSNVQSGSTEQKKGKFP...
1,Q5T1N1,Protein AKNAD1,836,MDEADFSEHTTYKQEDLPYDGDLSQIKIGNDYSFTSKKDGLEVLNQ...
2,Q92667,"A-kinase anchor protein 1, mitochondrial (A-ki...",903,MAIQFRSLFPLALPGMLALLGWWWFFSRKKGHVSSHDEQQVEAGAV...
3,Q5VUY0,Arylacetamide deacetylase-like 3 (EC 3.1.1.-),407,MWDLALIFLAAACVFSLGVTLWVICSHFFTVHIPAAVGHPVKLRVL...
4,P62736,"Actin, aortic smooth muscle (Alpha-actin-2) (C...",377,MCEEEDSTALVCDNGSGLCKAGFAGDDAPRAVFPSIVGRPRHQGVM...


In [8]:
# Reload the protein table written to "protein_table_uniprot.csv".
uniprot_df = pd.read_csv("protein_table_uniprot.csv", sep = ",")
# Identifiers that are excluded from the protein table further down. The text
# block is split on the ",,,<newline>" that ends every line but the last.
non_human = """ADH1_YEAST,,,
ALBU_BOVIN,,,
ALDOA_RABIT,,,
BGAL_ECOLI,,,
CAH2_BOVIN,,,
CAS1_BOVIN,,,
CAS2_BOVIN,,,
CASB_BOVIN,,,
CASK_BOVIN,,,
CYC_HORSE,,,
DHE3_BOVIN,,,
GAG_SCVLA,,,
GFP_AEQVI,,,
K1C15_SHEEP,,,
K1M1_SHEEP,,,
K1M2_SHEEP,,,
K2M1_SHEEP,,,
K2M2_SHEEP,,,
K2M3_SHEEP,,,
KRA3_SHEEP,,,
KRA61_SHEEP,,,
LALBA_BOVIN,,,
LYSC_CHICK,,,
LYSC_LYSEN,,,
MYG_HORSE,,,
REF_HEVBR,,,
SRPP_HEVBR,,,
TRY1_BOVIN,,,
TRYP_PIG,,,
Q9BZD3""".split(",,,\n")

#TODO Q9BZD3 is omitted because no sequence was found. Resolve this.

def build_protein_table(uniprot_df):
    """Fill in description, length and sequence for proteins already in the table.

    ``uniprot_df`` must have exactly four columns, in the order uniprot_id,
    description, length, sequence. For each row an UPDATE on ``protein`` is
    executed (matching on uniprot_id, with length converted to float) and
    committed. The number of processed rows is printed at the end.

    Relies on the notebook-level ``mycursor`` and ``conn``.
    """
    update_sql = "UPDATE protein SET description = %s, length = %s, sequence = %s WHERE uniprot_id = %s"

    processed = 0
    for uni_id, desc, length, sequence in uniprot_df.to_records(index=False):
        processed += 1
        # placeholder order follows the SQL: the id goes last (WHERE clause)
        mycursor.execute(update_sql, [desc, float(length), sequence, uni_id])
        conn.commit()
    print(processed)

# rows with at least one missing value (not used again in the visible cells)
check_na = uniprot_df[uniprot_df.isna().any(axis=1)]
# drop every row whose uniprot_id is listed in non_human
uniprot_df = uniprot_df[~uniprot_df['uniprot_id'].isin(non_human)]    
print(uniprot_df.tail())

      uniprot_id                                        description  length  \
20380     Q8N895  Zinc finger protein 366 (Dendritic cell-specif...     744   
20381     Q9UK55  Protein Z-dependent protease inhibitor (PZ-dep...     444   
20382     Q96MX3   Zinc finger protein 48 (Zinc finger protein 553)     618   
20383     A6NGD5  Zinc finger and SCAN domain-containing protein...     496   
20384     Q9Y4E5  E3 SUMO-protein ligase ZNF451 (EC 2.3.2.-) (Co...    1061   

                                                sequence  
20380  MQKEMKMIKDEDVHFDLAVKKTPSFPHCLQPVASRGKAPQRHPFPE...  
20381  MKVVPSLLLSVLLAQVWLVPGLAPSPQSPETPAPQNQTSRVVQAPK...  
20382  MERAVEPWGPDLHRPEEREPQRGARTGLGSENVISQPNEFEHTPQE...  
20383  MAANCTSSWSLGESCNSPGSEPPQSMPSPATQLGNHDSDPETCHVN...  
20384  MGDPGSEIIESVPPAGPEASESTTDENEDDIQFVSEGPLRPVLEYI...  


In [9]:
# Replace missing values by the string 'NaN' before the rows are written to the database.
uniprot_df = uniprot_df.replace(np.nan,'NaN')

In [10]:
# Update the protein rows with the UniProt information; prints the number of processed rows.
build_protein_table(uniprot_df)

20384


6. The *file_annotation_update.csv* file will be used to build the **assay table**:
- assay_id: auto_incremented
- project_id from the project_table, linked to the PXD from the file
- filename
<br>

Simultaneously, the link assay-cell line must be made.

In [19]:
# Number of RAW entries in the annotation.
len(list(manual_meta.RAW))

2219

In [20]:
def build_assay_cell_table(assay_df):
    """Insert one assay per row of ``assay_df`` and link it to its cell entry.

    ``assay_df`` must have exactly eight columns, in the order: accession,
    filename, useable, cell_line, disease, tissue_type, treatment, sub_cell.

    For every row the project_id (looked up by PXD accession) and the cell_id
    (looked up by cell_line, treatment, disease and sub_cell) are resolved
    first. Only when both exist are the ``assay`` row and the
    ``cell_to_assay`` link inserted, and both are committed together, so a
    failed lookup can no longer leave an assay without a cell link.
    The number of inserted assays is printed at the end.

    Relies on the notebook-level ``mycursor`` and ``conn``.

    Raises
    ------
    ValueError
        If no project or no cell entry matches a row. Rows processed before
        the failing one stay committed.
    """
    count = 0
    for record in assay_df.to_records(index = False):
        # useable and tissue_type are part of the row layout but not needed here
        (accession, filename, useable, cell_line, disease, tissue_type, treatment, sub_cell) = record

        #select project_id
        mycursor.execute("SELECT project_id FROM project where PXD_accession = %s", (accession,))
        project_row = mycursor.fetchone()
        if project_row is None:
            raise ValueError(f"No project found for accession {accession!r} (assay {filename!r}).")
        (projectID,) = project_row

        #select cellID
        # NOTE(review): tissue_type is not part of this lookup - confirm that
        # (cell_line, treatment, disease, sub_cell) identifies a cell row uniquely.
        mycursor.execute("SELECT cell_id FROM cell WHERE cell_line = %s AND treatment = %s AND disease = %s AND sub_cell = %s", (cell_line, treatment, disease, sub_cell))
        cell_row = mycursor.fetchone()
        if cell_row is None:
            raise ValueError(
                f"No cell entry found for cell_line={cell_line!r}, treatment={treatment!r}, "
                f"disease={disease!r}, sub_cell={sub_cell!r} (assay {filename!r})."
            )
        (cellID,) = cell_row

        #insert into assay table
        mycursor.execute("insert into assay(project_id, filename) VALUES(%s, %s)", (projectID, filename))
        #store this automatically generated assay ID for the cell_to_assay table
        assayID = mycursor.lastrowid

        #insert cellID and assayID in cell_to_assay
        mycursor.execute("INSERT INTO cell_to_assay(assay_id, cell_id) VALUES(%s, %s)", (assayID, cellID))
        # one commit per assay keeps the assay row and its link together
        conn.commit()
        count += 1
    print(count)

In [21]:
# Insert the assays and their cell links; prints the number of inserted assays.
build_assay_cell_table(manual_meta)

2219


7. The following step is using the ionbot information to fill in the **peptide_to_assay**, **peptide** and **peptide_to_modifications** tables.

In [6]:
def ionbot_parse(file):
    """Read one ionbot result CSV and keep the validated, unambiguous PSMs.

    A row is kept when all of the following hold:
    - best_psm equals 1;
    - q_value is at most 0.01;
    - DB equals 'T' (anything else is treated as a decoy hit);
    - proteins starts with 'sp' (SwissProt entries only);
    - proteins does not contain '||', i.e. the peptide is not linked to
      several proteins.

    Returns
    -------
    False
        When no row survives the filters.
    (DataFrame, dict)
        Otherwise: the surviving rows and a dict mapping each matched_peptide
        to its number of rows (spectral count), ordered from the highest to
        the lowest count. 'parsed check' is printed in this case.
    """
    psms = pd.read_csv(file, sep=',')

    passes_thresholds = (
        (psms['best_psm'] == 1)
        & (psms['q_value'] <= 0.01)
        & (psms['DB'] == 'T')
    )
    targets = psms.loc[passes_thresholds]

    is_swissprot = targets['proteins'].astype(str).str.startswith('sp')
    swissprot = targets[is_swissprot]

    # literal match (regex=False): as a regex, '|' would match every protein entry
    is_shared = swissprot['proteins'].str.contains('||', regex=False)
    df_validated = swissprot[~is_shared]

    if df_validated.empty:
        return False

    # spectral count = number of validated rows per peptide
    tally = {}
    for peptide in df_validated['matched_peptide'].tolist():
        tally[peptide] = tally.get(peptide, 0) + 1
    by_count_desc = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    spectral_counts = dict(by_count_desc)

    print('parsed check')
    return df_validated, spectral_counts

In [21]:
def ionbot_store(file, filename):
    """Store the validated PSMs of one ionbot result file in the database.

    Parameters
    ----------
    file : str
        Path of the ionbot CSV; handed to ``ionbot_parse``.
    filename : str
        Path or name of the same file. It is reduced to the part after the
        last '/' and before the first '.', which must match ``assay.filename``.

    Behaviour
    ---------
    - Prints a message and returns when the assay is not in the ``assay``
      table or when ``ionbot_parse`` returns False.
    - Otherwise, for every validated row: upserts the peptide, the protein
      (accession = second '|'-separated field of ``proteins``) and the
      peptide-protein link; stores the modification when its id is present in
      ``modifications``; and stores the peptide's spectral count in
      ``peptide_to_assay``. Every statement is committed individually.

    Relies on the notebook-level ``mycursor``, ``conn`` and ``ionbot_parse``.
    """
    # the assay must already exist in the assay table; look up its id
    filename = filename.split('/')[-1].split('.')[0]

    mycursor.execute("SELECT assay_id FROM assay WHERE filename = %s", (filename,))
    assayIDtup = mycursor.fetchone()
    if assayIDtup is None:
        print('{} is not in assays'.format(filename))
        return
    (assayID,) = assayIDtup
    parser = ionbot_parse(file)
    if parser is False:
        print(f"parser failed for {filename}.")
        return
    df_validated, spectral_counts = parser

    # one iteration per validated row (protein, peptide sequence, modifications)
    df_validated_store = df_validated[['proteins', 'matched_peptide', 'modifications']]
    for protID, peptide_sequence, modification in df_validated_store.to_numpy():
        pepseq = (peptide_sequence,)

        # peptide storage - peptide ID
        sequence = "INSERT INTO peptide(peptide_sequence) VALUES (%s) " \
                    "ON DUPLICATE KEY UPDATE peptide_sequence=peptide_sequence"
        mycursor.execute(sequence, pepseq)
        conn.commit()

        # retrieve peptide_id, do not generate a new id each time!
        mycursor.execute("SELECT peptide_id FROM peptide WHERE peptide_sequence = %s", pepseq)
        pepIDtup = mycursor.fetchone()
        (pepID,) = pepIDtup

        # link uniProtID = protein in assay to peptide
        proteinID = "INSERT INTO protein(uniprot_id) VALUES (%s) ON DUPLICATE KEY UPDATE uniprot_id=uniprot_id"
        uniprot_accession = protID.split('|')[1]
        mycursor.execute(proteinID, (uniprot_accession,))
        conn.commit()

        # relation peptide to protein
        pepToProt = "INSERT INTO peptide_to_protein(uniprot_id, peptide_id) VALUES (%s,%s) " \
                    "ON DUPLICATE KEY UPDATE peptide_id=peptide_id, uniprot_id=uniprot_id"
        mycursor.execute(pepToProt, (uniprot_accession, pepID))
        conn.commit()

        # NOTE(review): only the first location and the first bracketed id of the
        # modifications value are used; if a value can hold several
        # modifications, the others are not stored - confirm against the
        # ionbot output format.
        if not pd.isnull(modification):
            location = (modification.split('|')[0],)
            mod_key = (modification[modification.find("[")+1:modification.find("]")],)

            #retrieve modID; unknown ids are skipped silently
            mycursor.execute("SELECT mod_id FROM modifications WHERE mod_id = %s", mod_key)
            modIDtup = mycursor.fetchone()
            if modIDtup is not None:
                # relation peptide_to_modification
                peptoMod = "INSERT INTO peptide_modifications(peptide_id, location, mod_id, assay_id) VALUES (%s, %s, %s, %s)" \
                            "ON DUPLICATE KEY UPDATE peptide_id = peptide_id, mod_id = mod_id, assay_id=assay_id"
                peptoModvalues = pepIDtup + location + modIDtup + assayIDtup
                mycursor.execute(peptoMod, peptoModvalues)
                conn.commit()

        # spectral count for peptide: direct dictionary lookup; infinity is kept
        # as the fallback for a peptide that has no count
        count = spectral_counts.get(peptide_sequence, float('inf'))
        peptideToAssay = "INSERT INTO peptide_to_assay(peptide_id, assay_id, quantification) VALUES (%s, %s, %s) " \
                            "ON DUPLICATE KEY UPDATE peptide_id=peptide_id, assay_id=assay_id"
        mycursor.execute(peptideToAssay, (pepID, assayID, count))
        conn.commit()
    print('{} was stored'.format(filename))

In [8]:
def find_ionbot_files53(projects, project_counter):
    """Hand the IONBOT_v0.6.2 result files on conode53 to ``ionbot_store``.

    For each accession in ``projects`` the IONBOT_v0.6.2 folder is globbed for
    ``*.mgf.ionbot.csv`` files and every non-empty file is passed to
    ``ionbot_store``. Per project, ``project_counter[pxd]`` and the number of
    files passed on are printed; the total number of glob matches is printed
    at the end.
    """
    matches_total = 0
    for pxd in projects:
        pattern = '/home/compomics/mounts/conode53/*/PRIDE_DATA/' + str(pxd) + '/IONBOT_v0.6.2/*.mgf.ionbot.csv'
        handled = set()
        passed_on = 0
        for csv_path in glob.glob(pattern):
            matches_total += 1
            if csv_path in handled:
                continue
            handled.add(csv_path)
            if os.path.getsize(csv_path) == 0:
                # zero-byte result files are skipped
                continue
            ionbot_store(csv_path, str(csv_path))
            passed_on += 1
        print(f"\nAmount RAW-files of project {pxd}:", project_counter[pxd])
        print(f"Added RAW-files of project {pxd}: {passed_on}")
    print(matches_total)

In [9]:
def find_ionbot_files54(projects, project_counter):
    """Hand the IONBOT_v0.6.2 result files on conode54 to ``ionbot_store``.

    For each accession in ``projects`` the IONBOT_v0.6.2 folder is globbed for
    ``*.mgf.ionbot.csv`` files and every non-empty file is passed to
    ``ionbot_store``. Per project, ``project_counter[pxd]`` and the number of
    files passed on are printed; the total number of glob matches is printed
    at the end.
    """
    matches_total = 0
    for pxd in projects:
        pattern = '/home/compomics/mounts/conode54/*/PRIDE_DATA/' + str(pxd) + '/IONBOT_v0.6.2/*.mgf.ionbot.csv'
        handled = set()
        passed_on = 0
        for csv_path in glob.glob(pattern):
            matches_total += 1
            if csv_path in handled:
                continue
            handled.add(csv_path)
            if os.path.getsize(csv_path) == 0:
                # zero-byte result files are skipped
                continue
            ionbot_store(csv_path, str(csv_path))
            passed_on += 1
        print(f"\nAmount RAW-files of project {pxd}:", project_counter[pxd])
        print(f"Added RAW-files of project {pxd}: {passed_on}")
    print(matches_total)

In [10]:
def find_ionbot_files55(projects, project_counter):
    """Hand the IONBOT_v0.6.3 result files on conode55 to ``ionbot_store``.

    For each accession in ``projects`` the IONBOT_v0.6.3 folder under
    conode55/pride is globbed for ``*.mgf.ionbot.csv`` files and every
    non-empty file is passed to ``ionbot_store``. Per project,
    ``project_counter[pxd]`` and the number of files passed on are printed;
    the total number of glob matches is printed at the end.
    """
    matches_total = 0
    for pxd in projects:
        pattern = '/home/compomics/mounts/conode55/pride/PRIDE_DATA/' + str(pxd) + '/IONBOT_v0.6.3/*.mgf.ionbot.csv'
        handled = set()
        passed_on = 0
        for csv_path in glob.glob(pattern):
            matches_total += 1
            if csv_path in handled:
                continue
            handled.add(csv_path)
            if os.path.getsize(csv_path) == 0:
                # zero-byte result files are skipped
                continue
            ionbot_store(csv_path, str(csv_path))
            passed_on += 1
        print(f"\nAmount RAW-files of project {pxd}:", project_counter[pxd])
        print(f"Added RAW-files of project {pxd}: {passed_on}\n")
    print(matches_total)

In [11]:
# Number of RAW entries per PXD accession; used as the expected file count per project.
project_amounts = manual_meta.groupby(["PXD"]).RAW.count()

In [33]:
# Store the IONBOT_v0.6.2 results found on conode53.
find_ionbot_files53(projects, project_amounts)

parsed check
H6-2 was stored
parsed check
H12-2 was stored
parsed check
H14-2 was stored
parsed check
3B15-1 was stored
parsed check
H16-1 was stored
parsed check
3B3-2 was stored
parsed check
3B15-2 was stored
parsed check
H11-1 was stored
parsed check
H3-1 was stored
parsed check
3B18-1 was stored
parsed check
3B1-2 was stored
parsed check
H22-1 was stored
parsed check
3B14-1 was stored
parsed check
3B23-2 was stored
parsed check
3B7-2 was stored
parsed check
3B4-2 was stored
parsed check
3B11-1 was stored
parsed check
H5-1 was stored
parsed check
3B20-1 was stored
parsed check
H11-2 was stored
parsed check
3B21-1 was stored
parsed check
3B8-2 was stored
parsed check
3B19-2 was stored
parsed check
3B5-2 was stored
parsed check
H10-1 was stored
parsed check
3B5-1 was stored
parsed check
H17-2 was stored
parsed check
3B13-2 was stored
parsed check
H8-1 was stored
parsed check
3B13-1 was stored
parsed check
3B16-2 was stored
parsed check
H2-2 was stored
parsed check
H9-1 was stored
pars

In [34]:
# Store the IONBOT_v0.6.2 results found on conode54.
find_ionbot_files54(projects, project_amounts)


Amount RAW-files of project PXD000533: 96
Added RAW-files of project PXD000533: 0

Amount RAW-files of project PXD004280: 42
Added RAW-files of project PXD004280: 0

Amount RAW-files of project PXD002842: 8
Added RAW-files of project PXD002842: 0

Amount RAW-files of project PXD003594: 32
Added RAW-files of project PXD003594: 0

Amount RAW-files of project PXD008996: 24
Added RAW-files of project PXD008996: 0

Amount RAW-files of project PXD006035: 92
Added RAW-files of project PXD006035: 0

Amount RAW-files of project PXD008719: 24
Added RAW-files of project PXD008719: 0

Amount RAW-files of project PXD006591: 6
Added RAW-files of project PXD006591: 0

Amount RAW-files of project PXD003406: 26
Added RAW-files of project PXD003406: 0

Amount RAW-files of project PXD003407: 26
Added RAW-files of project PXD003407: 0

Amount RAW-files of project PXD001327: 8
Added RAW-files of project PXD001327: 0

Amount RAW-files of project PXD002057: 6
Added RAW-files of project PXD002057: 0

Amount 

In [35]:
# Store the IONBOT_v0.6.3 results found on conode55.
find_ionbot_files55(projects, project_amounts)


Amount RAW-files of project PXD000533: 96
Added RAW-files of project PXD000533: 0


Amount RAW-files of project PXD004280: 42
Added RAW-files of project PXD004280: 0


Amount RAW-files of project PXD002842: 8
Added RAW-files of project PXD002842: 0


Amount RAW-files of project PXD003594: 32
Added RAW-files of project PXD003594: 0


Amount RAW-files of project PXD008996: 24
Added RAW-files of project PXD008996: 0


Amount RAW-files of project PXD006035: 92
Added RAW-files of project PXD006035: 0


Amount RAW-files of project PXD008719: 24
Added RAW-files of project PXD008719: 0


Amount RAW-files of project PXD006591: 6
Added RAW-files of project PXD006591: 0


Amount RAW-files of project PXD003406: 26
Added RAW-files of project PXD003406: 0


Amount RAW-files of project PXD003407: 26
Added RAW-files of project PXD003407: 0


Amount RAW-files of project PXD001327: 8
Added RAW-files of project PXD001327: 0


Amount RAW-files of project PXD002057: 6
Added RAW-files of project PXD002057:

Because the peptides for some assays were not extracted, these "missing assays" will be searched below.

In [4]:
def formatter(list_of_tuples, query = None):
    """Flatten database result rows and optionally use them in an ``IN`` query.

    Parameters
    ----------
    list_of_tuples : iterable of iterables
        Rows as returned by ``cursor.fetchall()``, e.g. ``[(1,), (2,)]``.
    query : str, optional
        SQL text containing a single ``%s`` where the comma-separated
        placeholders of the ``IN (...)`` list belong.

    Returns
    -------
    list
        Without ``query``: the flattened values, e.g. ``[1, 2]``.
        With ``query``: the rows returned by executing the query with the
        flattened values bound to it. When there are no values an empty list
        is returned without querying, because ``IN()`` is not valid SQL.

    Relies on the notebook-level ``mycursor`` when a query is executed.
    """
    flat_values = [value for row in list_of_tuples for value in row]
    if query is None:
        return flat_values
    if not flat_values:
        return []
    # one "%s" placeholder per value; the values themselves are bound by the driver
    placeholders = ", ".join(["%s"] * len(flat_values))
    mycursor.execute(query % placeholders, flat_values)
    return mycursor.fetchall()

In [5]:
# Find the assays that have no peptides linked to them yet and list the
# projects they belong to.
# (the bare len(...) below is not the last expression of the cell, so its value is not displayed)
len(list(manual_meta.RAW))

# every assay id known to the database
mycursor.execute("SELECT assay_id FROM assay")
assay_ids = mycursor.fetchall()

# assay ids that have at least one row in peptide_to_assay
mycursor.execute("SELECT DISTINCT(assay_id) FROM peptide_to_assay")
pep_assay_ids = mycursor.fetchall()
print(len(pep_assay_ids))

# both lists hold 1-tuples, so the tuples can be compared directly
unused_assays = [x for x in assay_ids if x not in pep_assay_ids]
print(f"{len(unused_assays)} assays not loaded.")

project_ids = formatter(unused_assays, "SELECT DISTINCT project_id FROM assay WHERE assay_id IN(%s)")
print(project_ids)

# note: pxd_not_found is a list of 1-tuples, e.g. [('PXD003594',), ...]
pxd_not_found = formatter(project_ids, "SELECT PXD_accession FROM project WHERE project_id IN(%s)")
print(pxd_not_found)
print(len(pxd_not_found))

859
1360 assays not loaded.
[(1818,), (1831,), (1832,), (1833,), (1834,), (1837,), (1839,), (1840,), (1841,), (1842,), (1844,), (1847,), (1848,), (1850,), (1851,), (1854,), (1855,), (1856,), (1857,), (1858,), (1859,), (1860,), (1864,), (1865,), (1866,)]
[('PXD003594',), ('PXD008381',), ('PXD000442',), ('PXD000449',), ('PXD009149',), ('PXD000443',), ('PXD002032',), ('PXD000335',), ('PXD006614',), ('PXD009600',), ('PXD010306',), ('PXD000447',), ('PXD003587',), ('PXD008693',), ('PXD000157',), ('PXD008967',), ('PXD000999',), ('PXD000396',), ('PXD002389',), ('PXD000895',), ('PXD002039',), ('PXD000900',), ('PXD014381',), ('PXD002395',), ('PXD001874',)]
25


The directories where the ionbot result files could be located are searched by listing the subdirectories of each PXD for which the ionbot result files were not loaded.

In [47]:
# `pxd_not_found` is returned by the database as a list of 1-tuples
# (e.g. [('PXD003594',), ...]) while directory names are plain strings, so the
# accessions are flattened into a set before matching. Plain strings are
# accepted as well, so the cell also works when the list was flattened before.
pxd_names = {
    accession
    for entry in pxd_not_found
    for accession in (entry if isinstance(entry, (tuple, list)) else (entry,))
}

# list the content of every PRIDE_DATA/<PXD> directory of a project without loaded results
pxd_dir = glob.glob("/home/compomics/mounts/*/*/PRIDE_DATA/*")
for i in pxd_dir:
    if os.path.basename(os.path.normpath(i)) in pxd_names:
        print(i)
        print('SUBDIRECTORIES')
        for x in glob.glob(i + "/*"):
            print(x)
        print()

/home/compomics/mounts/conode53/pride/PRIDE_DATA/PXD003594
SUBDIRECTORIES
/home/compomics/mounts/conode53/pride/PRIDE_DATA/PXD003594/RAW
/home/compomics/mounts/conode53/pride/PRIDE_DATA/PXD003594/IONBOT_v0.6.2
/home/compomics/mounts/conode53/pride/PRIDE_DATA/PXD003594/MGF
/home/compomics/mounts/conode53/pride/PRIDE_DATA/PXD003594/metadata
/home/compomics/mounts/conode53/pride/PRIDE_DATA/PXD003594/assay_mapping.txt
/home/compomics/mounts/conode53/pride/PRIDE_DATA/PXD003594/IONBOT_v0.6.0
/home/compomics/mounts/conode53/pride/PRIDE_DATA/PXD003594/IONBOT_v0.3.0

/home/compomics/mounts/conode54/pride2/PRIDE_DATA/PXD004824
SUBDIRECTORIES
/home/compomics/mounts/conode54/pride2/PRIDE_DATA/PXD004824/assay_mapping.txt
/home/compomics/mounts/conode54/pride2/PRIDE_DATA/PXD004824/RAW
/home/compomics/mounts/conode54/pride2/PRIDE_DATA/PXD004824/MGF
/home/compomics/mounts/conode54/pride2/PRIDE_DATA/PXD004824/metadata
/home/compomics/mounts/conode54/pride2/PRIDE_DATA/PXD004824/IONBOT

/home/compomics/m

NOTE: The following is still a work in progress to locate the ionbot result files of these PXDs. <br>Another solution is explored in the notebook *'database_compomics_parser'*, whereby the results are extracted from the CompOmics PostgreSQL database at https://github.ugent.be/compomics/open-modification-pride <br>
All ionbot results from each PXD could be acquired from this database at once, instead of searching through the directories on the vm. This will be further explored.

In [3]:
# Open question: why are these PXDs empty?
empty_conode55 = [
    f"PXD{number}"
    for number in ("000999", "000396", "002389", "000895", "002039", "000900", "014381", "002395", "001874")
]
empty_conode54 = [
    f"PXD{number}"
    for number in ("000442", "000449", "002032", "000335", "006614", "010306", "000447", "000157", "008967")
]

# Folder IONBOT_v0.6.0 was used because the IONBOT_v0.6.2 folder yielded no
# uniquely identified peptides for a protein.
try_v0_6_0_conode54 = [
    f"PXD{number}"
    for number in ("008381", "009149", "000443", "009600", "003587", "008693")
]

# Folder "IONBOT" was used for ionbot.csv files
#try_IONBOT_conode54 = "PXD004824 PXD003438 PXD001352 PXD000661".split(" ")

In [28]:
def find_extra_ionbot_files53(projects, project_counter, ionbot_folder='IONBOT',
                              pride_data_glob='/home/compomics/mounts/conode54/*/PRIDE_DATA/'):
    '''Passes every non-empty ionbot result file of the given projects to ionbot_store.

    For each project, the files matching
    <pride_data_glob>/<pxd>/<ionbot_folder>/*.mgf.ionbot.csv are collected with glob.
    Files of size 0 are skipped; every other file is handed to ionbot_store(file, file).
    Per project, the expected amount (project_counter[pxd]) and the number of files handed
    to ionbot_store are printed; at the end the total number of matched files is printed.

    NOTE(review): despite the "53" in the name, the default location is on conode54.
    NOTE(review): the "Added" count goes up for every non-empty file passed to ionbot_store,
    whether or not ionbot_store actually stored it - presumably why it can exceed the
    expected amount; confirm against ionbot_store.

    Parameters:
    - projects: iterable of project accessions (e.g. "PXD004824").
    - project_counter: object indexed with each accession; only used for the printed summary.
    - ionbot_folder: name of the result folder inside each project directory.
    - pride_data_glob: glob pattern of the PRIDE_DATA directories that are searched.

    Returns None; all results are printed.'''
    number_of_files = 0
    for pxd in projects:
        number_of_files_per_project = 0
        path = os.path.join(pride_data_glob, str(pxd), ionbot_folder, '*.mgf.ionbot.csv')
        # A set keeps the "already handled" check constant-time.
        read_files = set()
        for file in glob.glob(path):
            number_of_files += 1
            if file in read_files:
                continue
            read_files.add(file)
            # Empty result files are counted in the total but never parsed.
            if os.path.getsize(file) != 0:
                ionbot_store(file, file)
                number_of_files_per_project += 1
        print(f"\nAmount RAW-files of project {pxd}:", project_counter[pxd])
        print(f"Added RAW-files of project {pxd}: {number_of_files_per_project}\n")
    print(number_of_files)

In [32]:
def find_extra_ionbot_files54(projects, project_counter, ionbot_folder='IONBOT_v0.6.0',
                              pride_data_glob='/home/compomics/mounts/conode54/*/PRIDE_DATA/'):
    '''Passes every non-empty ionbot result file of the given projects to ionbot_store.

    For each project, the files matching
    <pride_data_glob>/<pxd>/<ionbot_folder>/*.mgf.ionbot.csv are collected with glob.
    Files of size 0 are skipped; every other file is handed to ionbot_store(file, file).
    Per project, the expected amount (project_counter[pxd]) and the number of files handed
    to ionbot_store are printed; at the end the total number of matched files is printed.

    NOTE(review): the "Added" count goes up for every non-empty file passed to ionbot_store,
    whether or not ionbot_store actually stored it; confirm against ionbot_store.

    Parameters:
    - projects: iterable of project accessions (e.g. "PXD008381").
    - project_counter: object indexed with each accession; only used for the printed summary.
    - ionbot_folder: name of the result folder inside each project directory.
    - pride_data_glob: glob pattern of the PRIDE_DATA directories that are searched.

    Returns None; all results are printed.'''
    number_of_files = 0
    for pxd in projects:
        number_of_files_per_project = 0
        path = os.path.join(pride_data_glob, str(pxd), ionbot_folder, '*.mgf.ionbot.csv')
        # A set keeps the "already handled" check constant-time.
        read_files = set()
        for file in glob.glob(path):
            number_of_files += 1
            if file in read_files:
                continue
            read_files.add(file)
            # Empty result files are counted in the total but never parsed.
            if os.path.getsize(file) != 0:
                ionbot_store(file, file)
                number_of_files_per_project += 1
        print(f"\nAmount RAW-files of project {pxd}:", project_counter[pxd])
        print(f"Added RAW-files of project {pxd}: {number_of_files_per_project}\n")
    print(number_of_files)

In [33]:
# Process the projects for which the IONBOT_v0.6.0 result folder is used.
find_extra_ionbot_files54(projects=try_v0_6_0_conode54, project_counter=project_amounts)

parser failed
parser failed
parser failed
parser failed
parser failed
parsed check
160116_K052_OffLRP_RP_f18 was stored
parsed check
20160205_PGM_K052_SCX_RP_04 was stored
parsed check
20160205_PGM_K052_SCX_RP_19 was stored
parsed check
160116_K052_OffLRP_RP_f08 was stored
parsed check
20160205_PGM_K052_SCX_RP_11 was stored
parsed check
160116_K052_OffLRP_RP_f02 was stored
parser failed
parser failed
parser failed
parsed check
20160205_PGM_K052_SCX_RP_17 was stored
parser failed
parsed check
20160205_PGM_K052_SCX_RP_07 was stored
parsed check
160116_K052_OffLRP_RP_f04 was stored
parsed check
20160205_PGM_K052_SCX_RP_15 was stored
parser failed
parsed check
20160205_PGM_K052_SCX_RP_10 was stored
parsed check
20160205_PGM_K052_SCX_RP_03 was stored
parser failed
parser failed
parser failed
parsed check
20160205_PGM_K052_SCX_RP_08 was stored
parsed check
20160205_PGM_K052_SCX_RP_05 was stored
parsed check
20160205_PGM_K052_SCX_RP_06 was stored
parser failed
parsed check
20160205_PGM_K052_S

In [29]:
# Projects that are passed to find_extra_ionbot_files53.
processing_files = ["PXD004824", "PXD003438", "PXD001352", "PXD000661"]
find_extra_ionbot_files53(projects=processing_files, project_counter=project_amounts)

qExHF01_02581 is not in assays
qExHF01_02605 is not in assays
qExHF01_02597 is not in assays
qExHF01_02595 is not in assays
qExHF01_02602 is not in assays
qExHF01_02584 is not in assays
qExHF01_02583 is not in assays
qExHF01_02590 is not in assays
qExHF01_02603 is not in assays
qExHF01_02594 is not in assays
qExHF01_02582 is not in assays
qExHF01_02607 is not in assays
parsed check
qExHF01_02606 was stored
qExHF01_02596 is not in assays
qExHF01_02593 is not in assays
qExHF01_02586 is not in assays
qExHF01_02604 is not in assays
qExHF01_02588 is not in assays
qExHF01_02587 is not in assays
qExHF01_02598 is not in assays
parsed check
qExHF01_02592 was stored
qExHF01_02585 is not in assays
qExHF01_02608 is not in assays
parsed check
qExHF01_02580 was stored
qExHF01_02600 is not in assays
qExHF01_02601 is not in assays
qExHF01_02591 is not in assays

Amount RAW-files of project PXD004824: 3
Added RAW-files of project PXD004824: 27

20150203_SCX_N2a_Mem_2 is not in assays
parsed check
20141