In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import xmltodict
from IPython.display import clear_output

## Data Import

Read the eSol dataset (obtained from tanpaku) from the tab-separated file in `Raw/`.

In [None]:
# Load the tab-separated eSol export into a DataFrame.
raw_sol_data = pd.read_csv(
    'Raw/esol_data.tab',
    sep='\t',
)

In [None]:
# Preview the first five records to check that the file was parsed as expected.
raw_sol_data.head(n=5)

Unnamed: 0,JW_ID,ECK number,B number,Gene name K-12,Locus name K-12,Synonyms of locus names K-12,Solubility(%),Yield(uM),Yield(ug/ml),Minus Sol,...,TF(ug/ml),GroE(ug/ml),KJE(ug/ml),Calculated MW(kDa),Calculated pI,Type of gene product,Gene product description,Cell location,Structure (PDB) id,SCOP assignment
0,JW0002,ECK0003,b0003,thrB,thrB,,32.0,2.3,78.0,,...,,,,33.6,6.2,e,homoserine kinase,Cytoplasmic,,54211; Ribosomal protein S5 domain 2-like 5506...
1,JW0003,ECK0004,b0004,thrC,thrC,,18.0,1.7,78.0,46.53,...,35.81,36.71,38.33,47.056,5.71,e,threonine synthase,Cytoplasmic,,53686; Tryptophan synthase beta subunit-like P...
2,JW0004,ECK0005,b0005,yaaX,yaaX,,78.0,1.2,14.0,,...,,,,11.4,10.9,o,predicted protein,Periplasmic,,
3,JW0005,ECK0006,b0006,yaaA,yaaA,,7.0,2.4,71.0,14.01,...,29.98,34.34,54.96,29.555,8.74,o,conserved protein,Cytoplasmic,,
4,JW0007,ECK0008,b0008,talB,talB,yaaK,85.0,2.7,94.0,,...,,,,35.2,5.8,e,transaldolase B,Cytoplasmic,1ONR,51569; Aldolase


In [None]:
# Summarise the frame instead of displaying it again: the row count, the
# column dtypes and the non-null count per column show how many records lack
# a solubility value.
raw_sol_data.info()

Remove the records that have no solubility value, since we cannot train or test on them.

In [None]:
# Keep only the records whose solubility value is present.
raw_sol_data = raw_sol_data[raw_sol_data['Solubility(%)'].notna()]

Add an `Organism` column to the raw eSol data; it is used as the organism filter when querying UniProt.

In [None]:
# Build a new frame with the constant Organism column instead of assigning
# into the filtered frame in place: in-place assignment on the result of a
# row filter can raise SettingWithCopyWarning.
raw_sol_data = raw_sol_data.assign(Organism='Escherichia coli (strain K12)')

## Retrieving and Merging with UniProt Records

First, set up a base function that retrieves the record for a specific gene and organism from the UniProt database.

In [None]:
def retrieve_data_from_uniprot(gene_name, organism, timeout=30):
    """Fetch the first UniProt entry matching a gene name and an organism.

    Sends a GET request to the EBI Proteins API and parses the XML response
    with ``xmltodict``.

    Args:
        gene_name: Value sent as the ``gene`` query parameter.
        organism: Value sent as the ``organism`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed ``entry`` element of the response. When the response
        holds several entries, only the first one is returned.

    Raises:
        requests.RequestException: On connection problems, timeouts or an
            HTTP error status.
        KeyError: If the response contains no entry for the query.
    """
    query = {
        "offset": 0,
        "gene": gene_name,
        "organism": organism
    }

    response = requests.get(
        "https://www.ebi.ac.uk/proteins/api/proteins",
        params=query,
        # Ask for XML explicitly: the parsing below relies on the XML layout.
        headers={"Accept": "application/xml"},
        # Without a timeout a single stalled connection blocks the whole run.
        timeout=timeout,
    )
    # Fail on 4xx/5xx responses instead of trying to parse an error document.
    response.raise_for_status()

    content = xmltodict.parse(response.content)

    entry = (content.get('uniprot') or {}).get('entry')
    if not entry:
        raise KeyError(
            "No UniProt entry found for gene {!r} in organism {!r}".format(gene_name, organism)
        )

    if isinstance(entry, list):
        return entry[0]
    return entry

`update_progress` simply prints a text progress bar.

In [None]:
def update_progress(actual, total, clear=True, title="Progress"):
    """Print a 100-character text progress bar followed by a percentage.

    Args:
        actual: Number of items handled so far.
        total: Total number of items. A total of 0 is rendered as 0 %
            instead of raising ``ZeroDivisionError``.
        clear: When true, the current cell output is cleared before the bar
            is printed, so that repeated calls overwrite each other.
        title: Label printed in front of the bar.
    """
    bar_length = 100

    # Nothing to divide by when there is no work at all: show an empty bar.
    progress = (actual / total) if total else 0.0

    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    # Clamp the ratio to the range [0, 1] so the bar never under- or overflows.
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))

    if clear:
        clear_output(wait = True)

    text = "{0}: [{1}] {2:.1f}%".format(title, "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

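`retrieve_and_merge` retrieves the UniProt record for each row of the eSol dataset and merges the two, adding the sequence information and the UniProt identifiers. Rows for which the lookup fails are collected separately.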

In [None]:
def retrieve_and_merge(esol_dataset):
    """Enrich every eSol record with data from its UniProt entry.

    For each row, the UniProt entry is looked up by the row's
    ``'Gene name K-12'`` and ``'Organism'`` values. The eSol measurements are
    combined with the sequence, sequence length, sequence mass, organism
    name, accession and entry name taken from UniProt.

    Args:
        esol_dataset: DataFrame holding the eSol records. It must contain
            the ``'Gene name K-12'`` and ``'Organism'`` columns as well as
            the eSol columns that are copied into the result.

    Returns:
        A ``(merged, unknown)`` tuple of DataFrames. ``merged`` holds one row
        per successfully enriched record and carries a fresh 0..n-1 index.
        ``unknown`` holds the original rows (original index and columns) for
        which the lookup or the merge raised an exception.
    """
    # eSol columns that are copied unchanged into the merged record.
    passthrough_columns = [
        'Solubility(%)', 'Yield(uM)', 'Yield(ug/ml)', 'Minus Sol', 'TF Sol',
        'GroE Sol', 'KJE Sol', 'Minus(uM)', 'TF(uM)', 'GroE(uM)', 'KJE(uM)',
        'Minus(ug/ml)', 'TF(ug/ml)', 'GroE(ug/ml)', 'KJE(ug/ml)',
        'Calculated MW(kDa)', 'Calculated pI', 'Type of gene product',
        'Gene product description', 'Cell location', 'Structure (PDB) id',
        'SCOP assignment',
    ]
    # Columns filled from the UniProt entry.
    uniprot_columns = ['Sequence', 'Sequence length', 'Sequence mass',
                       'Organism', 'UP_ID', 'UP_NAME']
    merged_columns = ['Gene Name'] + passthrough_columns + uniprot_columns

    # Collect plain Python records and build each DataFrame once at the end;
    # concatenating inside the loop copies the whole frame on every row.
    merged_records = []
    unknown_rows = []

    total = len(esol_dataset)
    success = 0
    failed = 0

    for progress, (_, row) in enumerate(esol_dataset.iterrows(), start=1):
        try:
            uniprot_dict = retrieve_data_from_uniprot(gene_name=row['Gene name K-12'], organism=row["Organism"])

            record = {'Gene Name': row['Gene name K-12']}
            for column in passthrough_columns:
                record[column] = row[column]

            sequence = uniprot_dict['sequence']
            record['Sequence'] = sequence['#text']
            record['Sequence length'] = sequence['@length']
            record['Sequence mass'] = sequence['@mass']
            record['Organism'] = uniprot_dict['organism']['name']['#text']
            # NOTE(review): 'accession' is presumably a list when the entry
            # has several accessions - confirm whether only the first is wanted.
            record['UP_ID'] = uniprot_dict['accession']
            record['UP_NAME'] = uniprot_dict['name']
        except Exception:
            # Best effort: any failure (network error, missing entry,
            # unexpected entry layout) moves the row to the unknown set
            # instead of aborting the whole run.
            failed += 1
            unknown_rows.append(row)
        else:
            merged_records.append(record)
            success += 1

        update_progress(progress, total, clear=True)
        update_progress(success, total, title="Sucess Ratio", clear=False)
        update_progress(failed, total, title="Failure Ratio", clear=False)

    merged = pd.DataFrame(merged_records, columns=merged_columns)
    # Built from the row Series, so the original index labels are preserved.
    unknown = pd.DataFrame(unknown_rows, columns=esol_dataset.columns)

    return merged, unknown

In [None]:
# Run the UniProt lookup for every eSol record; rows that could not be merged
# end up in unknown_data.
complete_data, unknown_data = retrieve_and_merge(esol_dataset=raw_sol_data)

Progress: [####################################################################################################] 100.0%
Sucess Ratio: [####################################################################################################] 99.6%
Failure Ratio: [----------------------------------------------------------------------------------------------------] 0.4%


Save the generated datasets to disk.

In [None]:
# Create the output folder first so that the results of the long retrieval
# run are not lost to a missing directory.
Path('Processed Data').mkdir(parents=True, exist_ok=True)

complete_data.to_csv('Processed Data/complete_data.csv', index=False)
unknown_data.to_csv('Processed Data/missing_data.csv', index=False)

See Data/Processed Data for the complete and missing data.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8ea8948f-b978-492f-a6dd-8ac5feb4b472' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>