In [28]:
import pandas as pd
import sbol2
import os
import requests
from dotenv import load_dotenv
import zipfile

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# All project directories are resolved relative to the current working
# directory (os.path.abspath('') returns it), one level above it.
current_dir = os.path.abspath('')
data_path = os.path.join(current_dir, '..', 'data')  # input expression tables
attachments_path = os.path.join(current_dir, '..', 'attachments')  # per-sample CSVs and files to upload
pulled_attachments_path = os.path.join(current_dir, '..', 'pulled_attachments')  # attachments downloaded back from SynBioHub
sbol_path = os.path.join(current_dir, '..', 'sbol_data')  # generated SBOL documents

# Load variables from a .env file into the process environment; the SynBioHub
# credentials are read from the environment later with os.getenv.
load_dotenv()

True

In [3]:
# Expression levels from the genomic fragment MPRA (random 200–300 bp sheared
# fragments) in LB media. The file is space-delimited.
df = pd.read_csv(os.path.join(data_path, "frag-rLP5_LB_expression.txt"), delimiter=" ")
# Expression levels from the genomic fragment MPRA (random 200–300 bp sheared
# fragments) in M9 media at rLP5. The file is space-delimited.
df2 = pd.read_csv(os.path.join(data_path, "frag-rLP5-M9_expression.txt"), delimiter=" ")

In [44]:
def create_sbol_doc(df, file_path, media="LB", max_rows=6):
    """Build an SBOL2 document for promoter fragments and write it to ``file_path``.

    For every processed row the document receives a promoter
    ComponentDefinition with its Sequence, a strain ModuleDefinition
    (chassis + promoter), a sample-design ModuleDefinition (strain + media)
    and an ExperimentalData object derived from the sample design. The row's
    RNA expression columns are also written to a one-row CSV in
    ``attachments_path`` so they can be attached after the document is
    uploaded.

    Args:
        df: DataFrame with the columns ``fragment``, ``RNA_exp_1``,
            ``RNA_exp_2`` and ``RNA_exp_ave``. The index label of each row is
            used to build the object display ids, so it must be unique.
        file_path: Destination of the serialized SBOL document.
        media: Display id of the media ModuleDefinition; also embedded in the
            attachment file names.
        max_rows: Number of leading rows to convert (non-negative). Defaults
            to 6, which matches the previous hard-coded cut-off (stop after
            index label 5) for a default integer index. Pass ``None`` to
            convert the whole table.

    Returns:
        Tuple ``(exp_data_labels, attachment_file_names, doc, chassis_label)``:
        the ExperimentalData display ids, the CSV file names written (same
        order), the ``sbol2.Document`` and the chassis display id.
    """
    doc = sbol2.Document()
    sbol2.setHomespace('http://github.com/cywlol/promoters')
    doc.displayId = "E_coli_promoters"
    # save labels for later when we do the attachments through api
    exp_data_labels = []
    attachment_file_names = []

    media_label_MD = media
    chassis_label_MD = "E_coli_chassis"

    # define the media once, then have it as a reference for each promoter row
    media_md = sbol2.ModuleDefinition(media_label_MD)
    doc.addModuleDefinition(media_md)
    media_md.addRole("http://identifiers.org/ncit/NCIT:C48164")

    # define the chassis once, then have it as a reference for each promoter row
    chassis_md = sbol2.ModuleDefinition(chassis_label_MD)
    doc.addModuleDefinition(chassis_md)
    chassis_md.addRole("http://identifiers.org/ncit/NCIT:C14419")

    # link the chassis to its NCBI taxonomy entry (the genome file itself is
    # attached separately after upload)
    chassis_md.wasDerivedFrom = ["https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=511145"]

    # Make sure both output locations exist before anything is written.
    os.makedirs(attachments_path, exist_ok=True)
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    rows = df if max_rows is None else df.head(max_rows)

    for i, row in rows.iterrows():
        fragment_seq = row["fragment"]

        # Establish the identities of each component, ensuring that each one is unique
        promoter_label_CD = f"promoter_{i}"
        promoter_seq_label = f"promoter_seq_{i}"
        sample_design_label_CD = f"sample_design_{i}"
        strain_label_MD = f"strain_{i}"
        exp_label = f'sample_{i}_expression_data'
        attachment_file_name = f'frag_{media}_exp_sample_{i}.csv'

        # Promoter and Sequence
        promoter_cd = sbol2.ComponentDefinition(promoter_label_CD, sbol2.BIOPAX_DNA)
        promoter_cd.roles = [sbol2.SO_PROMOTER]
        seq = sbol2.Sequence(promoter_seq_label, fragment_seq, sbol2.SBOL_ENCODING_IUPAC)
        doc.addSequence(seq)
        promoter_cd.sequences = [seq.persistentIdentity]
        doc.addComponentDefinition(promoter_cd)

        # Strain = chassis module + promoter functional component
        strain_md = sbol2.ModuleDefinition(strain_label_MD)
        strain_md.addRole("http://identifiers.org/ncit/NCIT:C14419")
        doc.addModuleDefinition(strain_md)
        strain_c1 = strain_md.modules.create('chassis')
        strain_c1.definition = chassis_md.persistentIdentity
        strain_c2 = strain_md.functionalComponents.create('promoter')
        strain_c2.definition = promoter_cd.persistentIdentity

        # TODO: not modelled yet -- a SequenceAnnotation/Range giving the
        # promoter's genomic location (with reverse-complement orientation for
        # "-" strand rows) and an engineered-region ComponentDefinition
        # (SO:0000804) wrapping the promoter.

        # Sample Design = strain + media
        sample_md = sbol2.ModuleDefinition(sample_design_label_CD)
        doc.addModuleDefinition(sample_md)
        sample_md.addRole("http://identifiers.org/obo/OBI:0000073")

        m_strain = sample_md.modules.create('strain')
        m_strain.definition = strain_md.persistentIdentity

        m_media = sample_md.modules.create('media')
        m_media.definition = media_md.persistentIdentity

        # Create the attachment data as a one-row csv
        rna_series = row[["RNA_exp_1", "RNA_exp_2", "RNA_exp_ave"]]
        rna_df = pd.DataFrame([rna_series])
        rna_df.to_csv(os.path.join(attachments_path, attachment_file_name), index=False)

        # Experiment and Measurement Data; the CSV is attached to this object
        # later through the SynBioHub API, once the document is uploaded.
        exp = sbol2.ExperimentalData(exp_label)
        exp.wasDerivedFrom = sample_md.persistentIdentity
        doc.add(exp)

        exp_data_labels.append(exp_label)
        attachment_file_names.append(attachment_file_name)

    doc.write(file_path)
    return exp_data_labels, attachment_file_names, doc, chassis_label_MD
        
def partshop_attach_exp_data(synbio_username, collection_name, file_names, email, password, exp_data_labels, version=1):
    """Attach each CSV in ``attachments_path`` to its ExperimentalData object.

    Logs in to synbiohub.org through ``sbol2.PartShop`` (the login result is
    printed), then pairs ``exp_data_labels`` with ``file_names`` positionally
    and uploads every file to the matching
    ``.../<collection_name>/ExperimentalData_<label>/<version>`` URI.
    """
    part_shop = sbol2.PartShop("https://synbiohub.org")
    login_result = part_shop.login(email, password)
    print(login_result)

    # Objects of this type are addressed with an "ExperimentalData_" prefix.
    prefixed_labels = []
    for raw_label in exp_data_labels:
        prefixed_labels.append("ExperimentalData_" + raw_label)

    collection_root = f"https://synbiohub.org/user/{synbio_username}/{collection_name}"
    for prefixed_label, attachment_name in zip(prefixed_labels, file_names):
        local_file = os.path.join(attachments_path, attachment_name)
        target_uri = f"{collection_root}/{prefixed_label}/{version}"
        part_shop.attachFile(target_uri, local_file)

def partshop_attach_genome_to_md(synbio_username, collection_name, file_path, email, password, chassis_label, version=1):
    """Attach the file at ``file_path`` to the chassis ModuleDefinition.

    Logs in to synbiohub.org through ``sbol2.PartShop`` (the login result is
    printed) and uploads the file to
    ``.../<collection_name>/ModuleDefinition_<chassis_label>/<version>``.
    """
    part_shop = sbol2.PartShop("https://synbiohub.org")
    login_result = part_shop.login(email, password)
    print(login_result)

    # ModuleDefinitions are addressed with a "ModuleDefinition_" prefix.
    prefixed_label = "ModuleDefinition_" + chassis_label
    collection_root = f"https://synbiohub.org/user/{synbio_username}/{collection_name}"
    target_uri = f"{collection_root}/{prefixed_label}/{version}"

    part_shop.attachFile(target_uri, file_path)

def create_synbio_collection(email, password, file_path, id, name, description, version='1'):
    """Log in to synbiohub.org and submit an SBOL file as a collection.

    Args:
        email: SynBioHub account email.
        password: SynBioHub account password.
        file_path: Path of the SBOL file to upload.
        id: Collection id sent with the submission (the parameter name shadows
            the built-in ``id`` inside this function; kept for compatibility).
        name: Collection display name.
        description: Collection description.
        version: Collection version string.

    Returns:
        None. A failed login or a failed submission is reported with ``print``.
    """
    login_response = requests.post(
        "https://synbiohub.org/login",
        headers={"Accept": "text/plain"},
        data={"email": email, "password": password}
    )

    if not login_response.ok:
        print("Login failed:", login_response.status_code)
        print(login_response.text)
        return

    # The token comes back with surrounding whitespace, so strip it.
    token = login_response.text.strip()

    # Open the file in a context manager so the handle is closed once the
    # upload finishes (or fails).
    with open(file_path, 'rb') as sbol_file:
        submit_response = requests.post(
            'https://synbiohub.org/submit',
            headers={
                'X-authorization': token,
                'Accept': 'text/plain'
            },
            files={
                'files': sbol_file,
            },
            data={
                'id': id,
                'version': version,
                'name': name,
                'description': description,
                'citations': '',
                # NOTE(review): '0' is presumably the "do not overwrite an
                # existing collection" mode -- confirm against the API docs.
                'overwrite_merge': '0'
            },
        )

    # Surface a rejected submission instead of failing silently.
    if not submit_response.ok:
        print("Submit failed:", submit_response.status_code)
        print(submit_response.text)

def partshop_pull(collection_uri, email, password):
    """Pull a collection from synbiohub.org, print its objects and save it.

    Args:
        collection_uri: URI of the collection (or object) to pull.
        email: SynBioHub account email.
        password: SynBioHub account password.

    The pulled document is written to ``test_pulling.xml`` in the current
    working directory.
    """
    shop = sbol2.PartShop("https://synbiohub.org")
    doc = sbol2.Document()
    shop.login(email, password)
    # Pull through the logged-in PartShop; the previous code called an
    # undefined name (`igem`) here and raised NameError.
    shop.pull(collection_uri, doc)

    for obj in doc:
        print(obj)

    doc.write("test_pulling.xml")

def download_all_attachments(email, password, doc):
    """Download every attachment referenced by ``doc`` into ``pulled_attachments_path``.

    Args:
        email: SynBioHub account email.
        password: SynBioHub account password.
        doc: ``sbol2.Document`` whose ``attachments`` are downloaded.
    """
    shop = sbol2.PartShop("https://synbiohub.org")
    shop.login(email, password)
    # Make sure the download directory exists before the first download.
    os.makedirs(pulled_attachments_path, exist_ok=True)
    for attachment in doc.attachments:
        # The previous code passed an undefined name (`pulled_attachments`);
        # the configured directory is `pulled_attachments_path`.
        shop.downloadAttachment(attachment.identity, filepath=pulled_attachments_path)

In [42]:
# File name of the E. coli genome FASTA; it is joined with attachments_path
# when it is uploaded.
ecoli_genome_file_name = "E. coli.fasta"
# SynBioHub credentials are read from the environment; os.getenv returns None
# when a variable is not set.
env_email = os.getenv("SYNBIO_EMAIL")
env_password = os.getenv("SYNBIO_PASSWORD")
# SynBioHub user name, used to build https://synbiohub.org/user/<username>/... URIs.
username = "cywong"
# File name of the SBOL document written under sbol_path.
output_name = 'ecolipromoters.xml'
# Collection id, display name and description sent with the submission.
# NOTE(review): `id` shadows the built-in id() for the rest of the notebook;
# renaming it requires updating the later cells that pass it along.
id = "Ecolipromoterexpdata"
name = "E coli promoter data exploration"
description = "A collection containing the extracted E coli data from paper"
# Same file as output_name; used when the document is submitted.
sbol_file_name = output_name

In [40]:
# Build the SBOL document from the LB fragment table (media defaults to "LB"),
# write it under sbol_path, and keep the labels needed for the uploads below.
exp_labels, attachment_file_names, doc, chassis_label = create_sbol_doc(df, os.path.join(sbol_path, output_name))

In [33]:
# Submit the generated SBOL file to SynBioHub as a collection.
create_synbio_collection(env_email, env_password, os.path.join(sbol_path, sbol_file_name), id, name, description)

In [37]:
# Attach each per-sample expression CSV to its ExperimentalData object in the collection.
partshop_attach_exp_data(username, id, attachment_file_names, env_email, env_password, exp_labels)

<Response [200]>


In [43]:
# Attach the E. coli genome FASTA to the chassis ModuleDefinition.
partshop_attach_genome_to_md(username, id, os.path.join(attachments_path, ecoli_genome_file_name), env_email, env_password, chassis_label)

<Response [200]>


In [None]:
#partshop_pull("https://synbiohub.org/user/cywong/Ecolipromoterdata/Ecolipromoterdata_collection/1", env_email, env_password)

In [None]:
# doc = sbol2.Document()
# doc.read("test_pulling.xml")

# for cd in doc.componentDefinitions:
#     print("CD:", cd)
# for md in doc.moduleDefinitions:
#     print("MD:", md)

# for exp in doc.experimentalData:
#     print("Exp:", exp)
#     print("    Attachments:", exp.attachments)

# for attachment in doc.attachments:
#     print("Attachment: ", attachment.source) 
    
# for seq in doc.sequences:
#     print("Seq:", seq.elements) 

In [None]:
# download_all_attachments(env_email, env_password, doc)  # collection: https://synbiohub.org/user/cywong/Ecolipromoterdataencoding

In [None]:
# Scrambled-promoter expression table (tab-delimited).
df3 = pd.read_csv(os.path.join(data_path, "endo_scramble_expression_formatted_std.txt"), delimiter= "\t")
# Bare expression: Jupyter only renders it when it is the last statement of a cell.
df3

# Scramble library: 2,000 of the 17,000 TSSs the authors found were scrambled to locate the functional sites. The table includes controls, scrambled variants, and unscrambled promoters. Positive promoter controls are synthetic and therefore have no actual location data.
    # name: ID represented as {[TSS name][genomic position][strand + the scrambled region]}
    # tss_name: Name of the original (unscrambled) transcription start site (TSS) this variant is derived from.
    # tss_position: Start genome coordinate of the TSS.
    # strand: Strand direction
    # scramble_start: Start of the 10 bp scrambled window relative to var_left and var_right
    # scramble_end: End of the scrambled window relative to var_left and var_right
    # var_left: Genomic coordinate of the start (5' end) of the full 150 bp promoter variant.
    # var_right: Genomic coordinate of the end (3' end) of the full 150 bp promoter variant.
    # scramble_start_pos: Genomic coordinate where the scrambled 10 bp region begins.
    # scramble_end_pos: Genomic coordinate where the scrambled 10 bp region ends.
    # scramble_pos_rel_tss: Position of the scrambled region relative to the TSS.
    # variant: The sequence
    # RNA_exp_sum_1_1: Sum of RNA expression counts for barcodes in replicate 1, technical replicate 1 (normalized by DNA?)
    # RNA_exp_sum_1_2: Sum of RNA expression counts for barcodes in replicate 1, technical replicate 2.
    # RNA_exp_sum_2_1: Sum of RNA expression counts for barcodes in replicate 2, technical replicate 1.
    # RNA_exp_sum_2_2: Sum of RNA expression counts for barcodes in replicate 2, technical replicate 2.
    # RNA_exp_sum_1: Sum of RNA expression across both technical replicates for biological replicate 1.
    # RNA_exp_sum_2: Sum of RNA expression across both technical replicates for biological replicate 2.
    # RNA_exp_sum_ave: (RNA_exp_sum_1 + RNA_exp_sum_2) / 2
    # DNA_1: DNA-seq count in biological replicate 1 (how much of this fragment was integrated).
    # DNA_2: DNA-seq count in biological replicate 2
    # DNA_ave: (DNA_1 + DNA_2) / 2
    # expn_med: Median RNA expression across all barcodes, normalized by DNA abundance (RNA/DNA).
    # num_barcodes_integrated: Number of barcodes actually integrated and measured in DNA/RNA-seq.
    # category: Label indicating whether this is scrambled, unscrambled, negative, positive control
    # unscrambled_exp: Measured promoter activity of the wild-type (unscrambled) version of this promoter.
    # relative_exp: (expn_med / unscrambled_exp)

# TSS promoter library expression table for the fLP3 site (tab-delimited).
df4 = pd.read_csv(os.path.join(data_path, "fLP3_Endo2_lb_expression_formatted_std.txt"), delimiter="\t")
# Bare expression: Jupyter only renders it when it is the last statement of a cell.
df4
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the fLP3 site
    # name: Unique identifier for the promoter variant in the format [TSS_name,TSS_position,strand]
    # tss_name: The named identifier for the transcription start site (TSS), such as from RegulonDB or Storz 
    # tss_position: Genomic coordinate of the TSS — the position where transcription starts.
    # strand: Strand direction
    # start: Absolute genomic coordinate where the 150 bp promoter fragment starts.
    # end: Absolute genomic coordinate where the 150 bp promoter fragment ends.
    # variant: The promoter sequence
    # RNA_exp_sum_1_1: Sum of RNA-seq counts for this variant in biological replicate 1, technical replicate 1.
    # RNA_exp_sum_1_2: Sum of RNA-seq counts in biological replicate 1, technical replicate 2.
    # RNA_exp_sum_2_1: Sum of RNA-seq counts in biological replicate 2, technical replicate 1.
    # RNA_exp_sum_2_2: Sum of RNA-seq counts in biological replicate 2, technical replicate 2.
    # RNA_exp_sum_1: Total RNA expression for biological replicate 1 
    # RNA_exp_sum_2: Total RNA expression for biological replicate 2 
    # RNA_exp_sum_ave: Average of RNA expression across the two biological replicates.
    # DNA_1: DNA-seq count in biological replicate 1 
    # DNA_2: DNA-seq count in biological replicate 2.
    # DNA_ave: Average of DNA_1 and DNA_2
    # expn_med: Median RNA/DNA expression across barcodes 
    # num_barcodes_integrated: Number of barcodes that were actually integrated and sequenced.
    # category: Type of promoter: ['tss', 'neg_control', 'pos_control']
    # active: Binary classification — "active" or "inactive" promoter based on expn_med


# Peak-tiling library expression table (tab-delimited).
df5 = pd.read_csv(os.path.join(data_path,"peak_tile_expression_formatted_std.txt"), delimiter="\t")
# Bare expression: Jupyter only renders it when it is the last statement of a cell.
df5
# Tiling library: the authors first used sheared genomic fragments (~200–300 bp) to find candidate promoter regions, then designed a tiling oligo library of ~150 bp sequences overlapping by 10 bp and spanning all ~3,500 candidate promoter regions, in LB at the nth-ydgR intergenic locus.
    # variant: The 150 bp DNA sequence (oligo) used in the MPRA tile
    # name: Identifier for the tile, usually formatted as “peak_start_peak_end_strand_posStart-posEnd”.
    # peak_start: Genomic start coordinate of the candidate promoter region (the whole peak region).
    # peak_end: Genomic end coordinate of the candidate promoter region.
    # strand: Strand direction
    # tile_start: Start of this 150 bp oligo relative to the peak
    # tile_end: End of the tile 
    # RNA_exp_sum_1_1: Sum of RNA reads for this tile in biological replicate 1, technical replicate 1
    # RNA_exp_sum_1_2: Same as above, but technical replicate 2
    # RNA_exp_sum_2_1: Sum of RNA reads in biological replicate 2, technical replicate 1.
    # RNA_exp_sum_2_2: Same as above, but technical replicate 2.
    # RNA_exp_sum_1: Total RNA expression in biological replicate 1
    # RNA_exp_sum_2: Total RNA expression in biological replicate 2 
    # RNA_exp_sum_ave: Average RNA expression across the two biological replicates.
    # DNA_1: DNA read count in biological replicate 1
    # DNA_2: DNA read count in biological replicate 2.
    # DNA_ave: Average of DNA_1 and DNA_2
    # expn_med: Median RNA/DNA expression across all barcodes for this tile 
    # num_barcodes_integrated: Number of barcodes that were successfully integrated and sequenced.
    # category: ['tile', 'neg_control', 'random', 'pos_control']
    # peak_length: Length (in bp) of the peak region from which this tile was derived (e.g., 362 bp).
    # tile_start_relative: Position of the tile’s start within the peak (tile_start / peak_length)
    # start: Genomic start coordinate of the tile (based on tile_start + peak_start).
    # end: Genomic end coordinate of the tile.
    # active: Binary label indicating if the tile is active or inactive

# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the rLP5 site (tab-delimited).
df6 = pd.read_csv(os.path.join(data_path,"rLP5_Endo2_lb_expression_formatted_std.txt"), delimiter="\t")
# Bare expression: Jupyter only renders it when it is the last statement of a cell.
df6

# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the rLP6 site (tab-delimited).
df7 = pd.read_csv(os.path.join(data_path, "rLP6_Endo2_lb_expression_formatted_std.txt"), delimiter="\t")
# Bare expression: Jupyter only renders it when it is the last statement of a cell.
df7

        
def post_all_attachments(synbio_username, collection_name, file_names, email, password, exp_data_labels, version=1):
    """Upload one attachment per ExperimentalData object through the SynBioHub HTTP API.

    Logs in once, then pairs ``exp_data_labels`` with ``file_names``
    positionally and POSTs every file to
    ``.../<collection_name>/ExperimentalData_<label>/<version>/attach``.

    Args:
        synbio_username: SynBioHub user name that owns the collection.
        collection_name: Collection id used in the object URIs.
        file_names: Files to upload. A name that does not exist as given is
            looked up inside ``attachments_path``, which is where
            ``create_sbol_doc`` writes the CSVs it reports by bare name.
        email: SynBioHub account email.
        password: SynBioHub account password.
        exp_data_labels: ExperimentalData display ids, same order as
            ``file_names``.
        version: Object version used in the URIs.

    Returns:
        None. The target URI, status code and response body of every upload
        are printed; a failed login is printed as well.
    """
    login_response = requests.post(
        "https://synbiohub.org/login",
        headers={"Accept": "text/plain"},
        data={"email": email, "password": password}
    )

    if not login_response.ok:
        print("Login failed:", login_response.status_code)
        print(login_response.text)
        return

    # The token comes back with surrounding whitespace, so strip it. It is a
    # credential, so it is deliberately not printed: notebook outputs are
    # saved and shared together with the code.
    token = login_response.text.strip()
    exp_labels = ["ExperimentalData_" + label for label in exp_data_labels]

    for label, file_name in zip(exp_labels, file_names):
        attachment_uri = f"https://synbiohub.org/user/{synbio_username}/{collection_name}/{label}/{version}/attach"
        print(attachment_uri)

        # Accept a usable path as-is; otherwise fall back to the attachments
        # directory, consistent with partshop_attach_exp_data.
        local_path = file_name
        if not os.path.exists(local_path):
            local_path = os.path.join(attachments_path, file_name)

        with open(local_path, 'rb') as f:
            response = requests.post(
                attachment_uri,
                headers={
                    'X-authorization': token,
                    'Accept': 'text/plain'
                },
                files={'file': f}
            )

        print("Status Code:", response.status_code)
        print("Response Body:", response.text)

def post_attachment(attachment_uri, file_path, email, password):
    """Upload a single file to a SynBioHub attach endpoint.

    Args:
        attachment_uri: Full URL the file is POSTed to.
        file_path: Path of the file to upload.
        email: SynBioHub account email.
        password: SynBioHub account password.

    Returns:
        None. The status code and response body of the upload are printed; a
        failed login is printed as well.
    """
    login_response = requests.post(
        "https://synbiohub.org/login",
        headers={"Accept": "text/plain"},
        data={"email": email, "password": password}
    )

    if not login_response.ok:
        print("Login failed:", login_response.status_code)
        print(login_response.text)
        return

    # The token comes back with surrounding whitespace, so strip it. It is a
    # credential, so it is deliberately not printed: notebook outputs are
    # saved and shared together with the code.
    token = login_response.text.strip()

    with open(file_path, 'rb') as f:
        response = requests.post(
            attachment_uri,
            headers={
                'X-authorization': token,
                'Accept': 'text/plain'
            },
            files={'file': f}
        )

    print("Status Code:", response.status_code)
    print("Response Body:", response.text)

# Column descriptions for df and df2 (genomic fragment MPRA expression tables)

# fragment: 150–300 bp genomic DNA sequences that were randomly sheared and barcoded
# RNA_exp_1: RNA expression level (replicate 1) – normalized measurement of transcript abundance for this fragment in the first RNA-Seq replicate. (normalized by DNA)
# RNA_exp_2: RNA expression level (replicate 2) – same as above, but from a second biological replicate.
# RNA_exp_ave: Average RNA expression – mean of RNA_exp_1 and RNA_exp_2
# DNA_sum_1: DNA integration abundance (replicate 1) – quantifies how much of this DNA fragment (or barcode) was actually integrated in the first DNA-Seq sample. 
# DNA_sum_2: DNA integration abundance (replicate 2) – same as above, from the second replicate.
# DNA_ave: Average DNA integration level – mean of DNA_sum_1 and DNA_sum_2
# num_integrated_barcodes: Number of barcodes that were integrated into the genome for this fragment. Typically should match num_mapped_barcodes unless there are sequencing/integration issues.
# start: Start coordinate (in bp) in the E. coli MG1655 reference genome (U00096.2). 
# end: End coordinate in the E. coli MG1655 reference genome (U00096.2). 
# strand: DNA strand orientation (+ or -) 
# variation: Standard deviation or variability in RNA measurement across barcode replicates 