In [1]:
import pandas as pd
import sbol2
import os
from __future__ import absolute_import, division, print_function
import numpy as np
import os
from abc import abstractmethod, ABCMeta

import seqtrainer.dataset_builder as dataset_builder
from rdflib import Graph
from rdflib.query import ResultRow

# Render every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

# All locations are derived from the directory the kernel is running in.
current_dir = os.path.abspath('')
data_path = os.path.join(current_dir, os.pardir, 'data')

# Sub-folders of the data directory used by the cells below.
original_data_path = os.path.join(data_path, 'original_data')
sbol_path = os.path.join(data_path, 'sbol_data')
model_data_path = os.path.join(data_path, 'processed_data', 'replicated_models')

# Model outputs are addressed relative to the working directory, not current_dir.
model_output_path = os.path.join(os.pardir, 'model_outputs')


Read in the data representing tested promoter sequences in E. coli.

In [2]:
# Load the space-delimited expression table, drop rows without a 'start'
# coordinate, then keep the first 100 remaining rows.
df3 = (
    pd.read_csv(
        os.path.join(original_data_path, "fLP3_Endo2_lb_expression_formatted.txt"),
        sep=" ",
    )
    .dropna(subset=['start'])
    .iloc[:100]
)
df3

Unnamed: 0,name,tss_name,tss_position,strand,start,end,variant,RNA_exp_sum_1_1,RNA_exp_sum_1_2,RNA_exp_sum_2_1,RNA_exp_sum_2_2,RNA_exp_sum_1,RNA_exp_sum_2,RNA_exp_sum_ave,DNA_1,DNA_2,DNA_ave,expn_med,num_barcodes_mapped,num_barcodes_integrated,category
0,"TSS_11125_storz_regulondb,2945404,+",TSS_11125_storz_regulondb,2945404.0,+,2945284.0,2945434.0,GCAAATTTTGCACAAAAAATAGGCTTTAGTGATTTGTTTTTGTTCA...,43.807236,55.833039,54.661823,49.577995,49.820137,52.119909,50.970023,31.625906,35.101752,33.363829,52.010547,50,28,tss
1,"TSS_1218_storz,289782,+",TSS_1218_storz,289782.0,+,289662.0,289812.0,GTATCTGCCTCCGATTCTCTGCAGAAGCAGAAAGACATTGGATCGA...,0.562627,0.549348,0.816074,0.585183,0.555988,0.700629,0.628308,36.298833,31.606403,33.952618,0.565387,53,26,tss
2,"TSS_12352_storz,3253602,-",TSS_12352_storz,3253602.0,-,3253572.0,3253722.0,CGTTTGTCTGCGCTGTGTGCCGCAACGACCGCAGCAATGGGGGCCG...,0.516503,0.717569,0.658121,0.587724,0.617036,0.622923,0.619979,72.105393,72.498105,72.301749,0.577755,108,54,tss
3,"TSS_6478_storz,1684674,-",TSS_6478_storz,1684674.0,-,1684644.0,1684794.0,GCAGATACAACTCACACAATGCACCCGCTGTGTGAAATAAACAGAG...,1.006087,1.084363,0.763596,0.900281,1.045225,0.831939,0.938582,35.065895,37.408634,36.237265,0.936175,56,27,tss
4,"TSS_2956_storz,770940,+",TSS_2956_storz,770940.0,+,770820.0,770970.0,TTATAAAGATATGACCAAGTTCTGGGGCAAGTTGTTTGGTATCAAC...,0.715541,0.744280,0.679240,0.634099,0.729911,0.656670,0.693290,79.950519,78.564136,79.257327,0.632548,116,59,tss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,"TSS_1359_storz,321859,+",TSS_1359_storz,321859.0,+,321739.0,321889.0,ATCATGTTCTGAGTAATCTCGACGCTTATCTGTACCAGCTCTCAGA...,0.943290,0.657469,0.714856,0.705305,0.800380,0.710081,0.755230,41.395351,42.477685,41.936518,0.781225,39,28,tss
99,"TSS_7812_storz,2040839,+",TSS_7812_storz,2040839.0,+,2040719.0,2040869.0,GAAAATCGTGGAGTTTCCTTGGTTTCAATGTGTCTCCCTTTGTTAC...,0.822293,0.706276,0.622189,0.890778,0.764285,0.756484,0.760384,44.509809,41.027238,42.768523,0.761780,58,30,tss
100,"TSS_13505_storz,3489911,-",TSS_13505_storz,3489911.0,-,3489881.0,3490031.0,ATCAAAAATGAAGCCGATAACGGCCTGCGCAACACGCGTGGCACCA...,0.739661,0.683316,0.671462,0.699112,0.711488,0.685287,0.698388,78.602098,85.062707,81.832402,0.696464,102,53,tss
101,"TSS_11114_storz,2944085,-",TSS_11114_storz,2944085.0,-,2944055.0,2944205.0,GATCGGGCCGGAAGCCGGACACCGCGCAGGTTGGTACAACCACTAT...,1.189162,1.298429,1.038564,1.163454,1.243796,1.101009,1.172402,56.544554,60.487481,58.516017,1.094582,95,40,tss


Use pysbol2 to manipulate the dataframe to create an sbol file for each row in the dataframe, and save it to a folder.

In [None]:
def create_sbol_files(df, file_path, output_prefix_name, media="LB"):
    """Write one SBOL2 document per dataframe row describing a promoter sample design.

    For every row a fresh ``sbol2.Document`` is built containing media, chassis,
    strain and sample-design ModuleDefinitions, a promoter ComponentDefinition
    with its Sequence, a circuit ComponentDefinition annotated with the promoter
    range, a GFP ComponentDefinition, and a Measurement attached to the GFP
    FunctionalComponent. The document is validated with ``doc.validate()`` and
    written to ``<file_path>/<output_prefix_name>_<row index>.xml`` only when
    the report is exactly ``'Valid.'``; otherwise the row is skipped and the
    validation report is printed.

    Args:
        df: DataFrame whose rows provide the columns ``"variant"``, ``"start"``,
            ``"end"``, ``"strand"`` and ``"expn_med"``.
        file_path: Directory the XML files are written to. It is created if missing.
        output_prefix_name: File name prefix placed before ``_<row index>.xml``.
        media: displayId used for the media ModuleDefinition.

    Returns:
        None.
    """
    sbol2.setHomespace('http://github.com/cywlol/promoters')

    # doc.write() does not create directories, so make sure the target exists.
    if file_path:
        os.makedirs(file_path, exist_ok=True)

    for i, row in df.iterrows():
        doc = sbol2.Document()
        doc.displayId = "E_coli_promoters"

        # Media and chassis are the same for every row but must live in each document.
        media_md = sbol2.ModuleDefinition(media)
        media_md.addRole("http://identifiers.org/ncit/NCIT:C48164")
        media_md.version = "1"
        doc.addModuleDefinition(media_md)

        chassis_md = sbol2.ModuleDefinition("E_coli_chassis")
        chassis_md.addRole("http://identifiers.org/ncit/NCIT:C14419")
        chassis_md.wasDerivedFrom = ["https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=511145"]
        chassis_md.version = "1"
        doc.addModuleDefinition(chassis_md)

        fragment_seq = row["variant"]
        start = row["start"]
        end = row["end"]
        strand = row["strand"]
        # The stored measurement value is the row's "expn_med" column.
        expression_value = row["expn_med"]

        # Establish the identities of each component, ensuring that each one is unique
        promoter_label_CD = f"promoter_definition_{i}"
        promoter_seq_label = f"promoter_seq_{i}"
        sample_design_label_MD = f"sample_design_definition_{i}"
        chassis_module_label = f"chassis_module_{i}"
        circuit_label_CD = f"circuit_definition_{i}"
        strain_label_MD = f"strain_definition_{i}"
        # Spelling ("cicuit") is left untouched so generated identifiers do not change.
        circuit_label_FC = f"cicuit_fc_{i}"
        measurement_label = f"exp_ave_{i}"
        gfp_fc_label = f"gfp_{i}"
        promoter_location_label = f"promoter_{i}_location"
        strain_module_label = f"strain_module_{i}"
        media_module_label = f"media_module_{i}"
        gfp_component_label = f"gfp_{i}"

        # Strain module definition, containing the chassis as a sub-module
        strain_md = sbol2.ModuleDefinition(strain_label_MD)
        strain_md.addRole("http://identifiers.org/ncit/NCIT:C14419")
        strain_md.version = "1"
        doc.addModuleDefinition(strain_md)

        chassis_module = sbol2.Module(chassis_module_label)
        chassis_module.definition = chassis_md.identity
        strain_md.modules.add(chassis_module)

        # Sample design
        sample_md = sbol2.ModuleDefinition(sample_design_label_MD)
        sample_md.addRole("http://identifiers.org/obo/OBI:0000073")
        sample_md.version = "1"
        doc.addModuleDefinition(sample_md)

        # The sample design is composed of the strain and the media
        strain_module = sbol2.Module(strain_module_label)
        strain_module.definition = strain_md.identity
        media_module = sbol2.Module(media_module_label)
        media_module.definition = media_md.identity

        sample_md.modules.add(media_module)
        sample_md.modules.add(strain_module)

        # Promoter and its sequence
        promoter_cd = sbol2.ComponentDefinition(promoter_label_CD, sbol2.BIOPAX_DNA)
        promoter_cd.roles = [sbol2.SO_PROMOTER]
        promoter_cd.version = "1"
        doc.addComponentDefinition(promoter_cd)
        seq = sbol2.Sequence(promoter_seq_label, fragment_seq, sbol2.SBOL_ENCODING_IUPAC)
        seq.version = "1"

        doc.addSequence(seq)
        promoter_cd.sequences = seq

        # Circuit: holds the promoter as a sub-component plus its location annotation
        circuit = sbol2.ComponentDefinition(circuit_label_CD, sbol2.BIOPAX_DNA)
        circuit.roles = ["https://identifiers.org/so/SO:0000804"]
        circuit.version = "1"

        annotation = sbol2.SequenceAnnotation(promoter_location_label)
        annotation.version = "1"
        # NOTE(review): "start"/"end" are passed through exactly as read from the
        # dataframe (the displayed table shows them as floats) - confirm that
        # sbol2.Range stores them as integers.
        rng = sbol2.Range("prange", start, end)
        rng.version = "1"

        # Only the "-" strand gets an explicit orientation; any other value keeps the default.
        if strand == "-":
            rng.orientation = sbol2.SBOL_ORIENTATION_REVERSE_COMPLEMENT

        annotation.locations.add(rng)
        circuit.sequenceAnnotations.add(annotation)
        promoter_comp = circuit.components.create('promoter')
        promoter_comp.definition = promoter_cd.identity
        doc.addComponentDefinition(circuit)

        # The circuit is exposed as a functional component of the strain
        circuit_fc = sbol2.FunctionalComponent(circuit_label_FC)
        circuit_fc.version = "1"
        circuit_fc.definition = circuit.identity
        circuit_fc.direction = sbol2.SBOL_DIRECTION_OUT
        circuit_fc.access = sbol2.SBOL_ACCESS_PUBLIC
        strain_md.functionalComponents.add(circuit_fc)

        measurement = sbol2.Measurement(measurement_label, value=expression_value, unit="http://www.ontology-of-units-of-measure.org/resource/om-2/RatioUnit")
        measurement.version = "1"

        gfp_protein = sbol2.ComponentDefinition(gfp_component_label, sbol2.BIOPAX_PROTEIN)
        gfp_protein.roles = [sbol2.SO_CDS]
        gfp_protein.version = "1"
        doc.addComponentDefinition(gfp_protein)

        gfp_fc = sbol2.FunctionalComponent(gfp_fc_label)
        gfp_fc.version = "1"
        gfp_fc.definition = gfp_protein.identity
        gfp_fc.direction = sbol2.SBOL_DIRECTION_OUT
        gfp_fc.access = sbol2.SBOL_ACCESS_PUBLIC

        # The measurement is registered on the document and attached to the GFP
        # functional component, in that order.
        doc.add(measurement)
        gfp_fc.measurements.add(measurement)

        sample_md.functionalComponents.add(gfp_fc)

        report = doc.validate()
        if report == 'Valid.':
            doc.write(os.path.join(file_path, output_prefix_name + "_" + str(i) + ".xml"))
        else:
            # Surface the reason instead of silently producing no file for this row.
            print(f"Skipping row {i}: SBOL validation did not pass: {report}")

# Generate the SBOL files for df3 into sbol_path, using the default media label ("LB").
create_sbol_files(df3, sbol_path, "sample_design")


Find the uri of the label to predict for your machine learning problem. 

In [3]:
# Inspect one generated SBOL file to list the property URIs that could serve as the label.
uris = dataset_builder.find_possible_y_uris(os.path.join(sbol_path, "sample_design_6.xml"))
uris

['http://sbols.org/v2#version',
 'http://sbols.org/v2#start',
 'http://sbols.org/v2#end',
 'http://www.ontology-of-units-of-measure.org/resource/om-2/hasNumericalValue']

We will use the expression of the promoter as our label, so we will store the last uri in the previous list.

In [4]:
# Label URI: the last entry of the `uris` list displayed above.
desired_y_uri = "http://www.ontology-of-units-of-measure.org/resource/om-2/hasNumericalValue"
desired_y_uri


'http://www.ontology-of-units-of-measure.org/resource/om-2/hasNumericalValue'

Get a list of the paths to all the SBOL files we want to use for training.

In [5]:
# Keep only the SBOL XML files (the folder may contain other entries) and sort
# them: os.listdir() returns names in arbitrary order, which would otherwise make
# the row order of the resulting dataset differ between machines.
paths = sorted(
    os.path.join(sbol_path, file_name)
    for file_name in os.listdir(sbol_path)
    if file_name.lower().endswith(".xml")
)
paths

['c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\notebooks\\..\\data\\sbol_data\\sample_design_0.xml',
 'c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\notebooks\\..\\data\\sbol_data\\sample_design_1.xml',
 'c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\notebooks\\..\\data\\sbol_data\\sample_design_10.xml',
 'c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\notebooks\\..\\data\\sbol_data\\sample_design_100.xml',
 'c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\notebooks\\..\\data\\sbol_data\\sample_design_101.xml',
 'c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\notebooks\\..\\data\\sbol_data\\sample_design_102.xml',
 'c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\notebooks\\..\\data\\sbol_data\\sample_design_11.xml',
 'c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\notebooks\\..\\data\\sbol_data\\sample_design_12.xml',
 'c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\notebooks\\..\\data\\sbol_data\\sample_design_13.xml',
 'c:\\Users\\Sai\\Documents\\GitHub\\SBOLtrainer\\note

Use the 'build_dataset' function to build a dataset containing the sequence and label from each SBOL file.

In [6]:
# Build the table for all SBOL files; the displayed result has 'sequence' and 'target' columns.
df = dataset_builder.build_dataset(paths, desired_y_uri)
df

Unnamed: 0,sequence,target
0,GCAAATTTTGCACAAAAAATAGGCTTTAGTGATTTGTTTTTGTTCA...,52.010547
1,GTATCTGCCTCCGATTCTCTGCAGAAGCAGAAAGACATTGGATCGA...,0.565387
2,CTTCCAGGCGGGTGGGGTCAATGTCCATCAGGGCAATATGCGCCGT...,0.690511
3,ATCAAAAATGAAGCCGATAACGGCCTGCGCAACACGCGTGGCACCA...,0.696464
4,GATCGGGCCGGAAGCCGGACACCGCGCAGGTTGGTACAACCACTAT...,1.094582
...,...,...
95,TCGTCTATCCAGTGATTCGCCCCCTGGTCGTGATAATCACCGCCGC...,0.745525
96,TCACTGCTTAGCACCCATAAGGTGCTGCGTAATACCTATTTTCTGC...,0.635995
97,TCATCCTTTAGGCATTTGCACAATGCCGTACGTTACGTACTTCCTT...,0.602889
98,ATCATGTTCTGAGTAATCTCGACGCTTATCTGTACCAGCTCTCAGA...,0.781225


Individual label values for one file can be extracted using the 'get_y_label' function.

In [7]:
# Read the label stored under desired_y_uri from a single SBOL file.
y = dataset_builder.get_y_label(os.path.join(sbol_path, "sample_design_6.xml"), uri=desired_y_uri)
y

0.615704421698235

Individual sequences for one file can be extracted using the 'get_sequence_from_sbol' function.

In [8]:
# Read the sequence string stored in a single SBOL file.
sequence = dataset_builder.get_sequence_from_sbol(os.path.join(sbol_path, "sample_design_6.xml"))
sequence

'AAACATTAAAGGGTGGTATTTCAAGGTCGGCTCCATGCAGACTGGCGTCCACACTTCTAAGCCTCCCACCTATCCTACACATCAAGGCTCAATGTTCAGTGTCAAGCTATAGTAAAGGTTCACGGGGTCTTTCCGTCTTGCCGCGGGTAC'

Save the dataset for preprocessing in 'preprocessing_example.ipynb'

In [9]:
# Persist the dataset without the DataFrame index column.
# NOTE(review): the "dataset_builder" folder under data_path must already exist;
# nothing in this notebook creates it.
dataset_path = os.path.join(data_path, "dataset_builder", "unprocessed_dataset.csv")
df.to_csv(dataset_path, index=False)

In [None]:
# NOTE(review): leftover scratch cell. `a` is not defined in any cell shown in
# this notebook, and the `df` built above displays only 'sequence' and 'target'
# columns, so the 'RNA_exp_ave' / 'fragment' lookups presumably target an older
# dataframe - confirm or remove.
X = np.array(a)
# Collapse every axis after the first into a single feature axis.
X1 = X.reshape(X.shape[0], -1)
X2 = df.drop(['RNA_exp_ave', 'fragment'], axis=1).values
# Place the flattened array and the remaining dataframe columns side by side.
X = np.concatenate([X1, X2], axis=1)
# The label is log(1 + value); note this rebinds the `y` assigned in an earlier cell.
y = np.log1p(df['RNA_exp_ave'])

In [None]:
 # NOTE(review): 'RNA_exp_ave' is not among the columns displayed for `df` above - confirm.
 df['RNA_exp_ave'].describe()

In [None]:
# NOTE(review): `train_test_split` and `xgb` are not imported in the import cell
# shown at the top of this notebook; this cell fails on a fresh kernel.
# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)


In [None]:
# Display the training features produced by the split cell above.
X_train

In [None]:
# Two candidate values for each of four hyper-parameters (16 combinations).
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 7],
    'learning_rate': [0.01, 0.1],
    'colsample_bytree': [0.8, 1.0]
}

# NOTE(review): `GridSearchCV` is not imported in the cells shown (presumably
# scikit-learn's) - confirm before relying on this cell.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  
    cv=3,
    verbose=1,
)

# `grid` is not referenced again in the cells shown; later cells use `grid_search`.
grid = grid_search.fit(X_train, y_train)


In [None]:
# Report the winning hyper-parameters and score the best estimator on the held-out split.
# NOTE(review): `mean_squared_error` and `r2_score` are not imported in the cells shown.
print("Best parameters:", grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r_sq = r2_score(y_test, y_pred)
print("Test MSE:", mse)
print("R sq:", r_sq)

In [None]:
# Rebind xgb_model to a regressor with hand-picked hyper-parameters
# (each value also appears in the param_grid above).
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, colsample_bytree=1, learning_rate=0.1, max_depth=3, n_estimators=100)

In [None]:
# Fit the fixed-parameter model and score it on the held-out split.
xgb_model.fit(X_train, y_train)

preds = xgb_model.predict(X_test)

# Overwrites the r_sq computed for the grid-search model.
r_sq = r2_score(y_test, preds)

In [None]:
# Display the most recently computed R^2 value.
r_sq

In [72]:
# NOTE(review): `X_sequences_ohe` and `y_scaled` are not defined in any cell shown,
# and this cell's execution count (72) is out of sequence with the cells above -
# it appears to come from an earlier session. This rebinds the split variables.
X_train, X_test, y_train, y_test = train_test_split(X_sequences_ohe, y_scaled, test_size=0.2, random_state=42)

In [73]:
# Convert both feature splits to NumPy arrays.
X_train = np.array(X_train)
X_test = np.array(X_test)

In [74]:
# Flatten each sample to one feature vector (all axes after the first are merged).
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)


In [75]:
# Flatten the label arrays to one dimension.
y_train = y_train.ravel()
y_test = y_test.ravel()

In [59]:
# Display the training labels.
# NOTE(review): this cell's execution count (59) precedes the surrounding cells (72-77),
# so the stored output may not reflect the current y_train.
y_train

array([-0.09600204,  0.05270001, -0.07746396, ..., -0.10162205,
       -0.09888044, -0.10658665])

In [76]:
# NOTE(review): `RandomForestRegression` is neither defined nor imported in the cells
# shown; presumably a project wrapper exposing train()/predict() - confirm its origin.
rfc = RandomForestRegression()
rfc.train(X_train, y_train)


In [77]:
# Predict on the held-out split; rebinds `preds` from the XGBoost cell above.
preds = rfc.predict(X_test)


In [None]:
# R^2 of the current `preds` against the held-out labels (r2_score is not imported in the cells shown).
r_sq = r2_score(y_test, preds)


In [83]:
# Mean squared error on the held-out split.
# NOTE(review): this cell ran as In [83], after the R^2 cell below (In [82]).
mean_squared_error(y_test, preds)


0.7960463681705944

In [82]:
# Display R^2 on the held-out split (same computation as the r_sq assignment above).
r2_score(y_test, preds)


0.20183621111801853

In [None]:
 
# def create_synbio_collection(email, password, file_path, id, name, description, version='1'):
#     response = requests.post(
#         "https://synbiohub.org/login",
#         headers={"Accept": "text/plain"},
#         data={"email": email, "password": password}
#     )
        
#     if response.ok:
#         token = response.text.strip() # theres a whitespace before the token for some reason
#         response = requests.post(
#         'https://synbiohub.org/submit',
#         headers={
#             'X-authorization': token,
#             'Accept': 'text/plain'
#         },
#         files={
#         'files': open(file_path,'rb'),
#         },
#         data={
#             'id': id,
#             'version' : version,
#             'name' :  name,
#             'description' : description,
#             'citations' : '',
#             'overwrite_merge' : '0'
#         },
    
#     )
#     else:
#         print("Login failed:", response.status_code)
#         print(response.text)

def partshop_pull_collection(email, password, synbio_username, collection_name, file_path, version=1):
    """Download a user's SynBioHub collection and save it as an SBOL file.

    Logs in to https://synbiohub.org, pulls the collection identified by
    ``synbio_username``, ``collection_name`` and ``version`` into a new
    ``sbol2.Document``, prints every object of the document, and writes the
    document to ``file_path``.

    Returns:
        The ``sbol2.Document`` the collection was pulled into.
    """
    part_shop = sbol2.PartShop("https://synbiohub.org")
    pulled_doc = sbol2.Document()
    part_shop.login(email, password)

    # The collection object itself lives at <collection_name>_collection inside the collection.
    collection_uri = f"https://synbiohub.org/user/{synbio_username}/{collection_name}/{collection_name}_collection/{version}"
    part_shop.pull(collection_uri, pulled_doc)

    # Echo what was retrieved before saving it.
    for sbol_obj in pulled_doc:
        print(sbol_obj)

    pulled_doc.write(file_path)
    return pulled_doc

def partshop_pull_sample_designs(email, password, synbio_username, collection_name, sample_design_labels, output_dir, prefix, version=1):
    """Pull each sample design from SynBioHub into its own SBOL file.

    Logs in once, then for every label pulls
    ``<collection>/<prefix>_<label>/<version>`` into a fresh ``sbol2.Document``
    and writes it to ``<output_dir>/<label>.xml``.

    Returns:
        None.
    """
    part_shop = sbol2.PartShop("https://synbiohub.org")
    part_shop.login(email, password)

    for sample_design_label in sample_design_labels:
        # A new Document per label, so each output file holds only that pull.
        design_doc = sbol2.Document()
        # Example of the URI shape:
        # https://synbiohub.org/user/cywong/Ecolipromoterexpdataalt/ModuleDefinition_sample_design_definition_0/1
        design_uri = f"https://synbiohub.org/user/{synbio_username}/{collection_name}/{prefix}_{sample_design_label}/{version}"
        part_shop.pull(design_uri, design_doc)
        target_file = os.path.join(output_dir, sample_design_label + ".xml")
        design_doc.write(target_file)

def new_partshop_pull_sample_designs(email, password, synbio_username, collection_name, sample_design_labels, output_dir, prefix, version=1):
    """Pull several sample designs from SynBioHub into one combined SBOL file.

    All URIs ``<collection>/<prefix>_<label>/<version>`` are pulled into a single
    ``sbol2.Document`` with one ``PartShop.pull`` call, and the document is
    written to ``<output_dir>/<last label>.xml``.

    Args:
        email: SynBioHub login e-mail.
        password: SynBioHub login password.
        synbio_username: User name that appears in the SynBioHub URIs.
        collection_name: Collection id that appears in the SynBioHub URIs.
        sample_design_labels: Iterable of label strings; must not be empty.
        output_dir: Directory the combined file is written to.
        prefix: Text placed before ``_<label>`` in each URI.
        version: Version segment of each URI.

    Raises:
        ValueError: If ``sample_design_labels`` is empty. This is checked before
            any login or network request is made.

    Returns:
        None.
    """
    # Materialise once so generators work and the last label can be looked up explicitly.
    labels = list(sample_design_labels)
    if not labels:
        raise ValueError("sample_design_labels must contain at least one label")

    shop = sbol2.PartShop("https://synbiohub.org")
    doc = sbol2.Document()
    shop.login(email, password)

    uris = [
        f"https://synbiohub.org/user/{synbio_username}/{collection_name}/{prefix}_{sample_design_label}/{version}"
        for sample_design_label in labels
    ]

    shop.pull(uris, doc)
    # The combined file keeps its historical name (the last label); previously this
    # relied on the loop variable leaking out of the for-loop.
    doc.write(os.path.join(output_dir, labels[-1] + ".xml"))
        
def partshop_submit(email, password, synbio_username, collection_name, file_path, version=1):
    """Submit the SBOL document stored at ``file_path`` to a SynBioHub collection.

    The previous body passed the collection URI and the Document to
    ``PartShop.submit`` in swapped positions, submitted a freshly created
    (empty) Document, and then wrote that Document over ``file_path``. This
    version loads ``file_path`` and submits its contents; the file is only read.

    Args:
        email: SynBioHub login e-mail.
        password: SynBioHub login password.
        synbio_username: User name that appears in the collection URI.
        collection_name: Collection id that appears in the collection URI.
        file_path: Path of the SBOL file to upload.
        version: Version segment of the collection URI.

    Returns:
        The value returned by ``PartShop.submit``.
    """
    shop = sbol2.PartShop("https://synbiohub.org")

    # Load the content to upload before talking to the server.
    doc = sbol2.Document()
    doc.read(file_path)

    shop.login(email, password)

    collection_uri = f"https://synbiohub.org/user/{synbio_username}/{collection_name}/{collection_name}_collection/{version}"
    # PartShop.submit expects the Document first and the target collection second.
    return shop.submit(doc, collection_uri)

In [None]:

# Credentials are taken from the environment; both are None when the variable is unset.
env_email = os.environ.get("SYNBIO_EMAIL")
env_password = os.environ.get("SYNBIO_PASSWORD")

# SynBioHub account and collection metadata.
# NOTE: `id` shadows the Python builtin of the same name for the rest of the notebook.
username = "cywong"
id = "Ecolipromoterexpdataalt"
name = "E coli promoter data exploration alternative"
description = "A collection containing the extracted E coli data from paper"

# File names used by the upload/download helpers.
ecoli_genome_file_name = "E. coli.fasta"
output_name = 'ecolipromoter_one.xml'
sbol_file_name = output_name
imported_sbol_file_name = "promoters_import.xml"
# # # #partshop_attach_exp_data(username, id, attachment_file_names, env_email, env_password, exp_labels)
# partshop_attach_genome_to_md(username, id, os.path.join(attachments_path, ecoli_genome_file_name), env_email, env_password, chassis_label)
# create_synbio_collection(env_email, env_password, os.path.join(sbol_path, sbol_file_name), id, name, description)