In [1]:
import pandas as pd
import sbol2
import os
pd.set_option('display.max_columns', None)

In [2]:
current_dir = os.path.abspath('')
data_path = os.path.join(current_dir, '..', 'data')

In [3]:
df = pd.read_csv(os.path.join(data_path, "frag-rLP5_LB_expression.txt"), delimiter=" ")
df

# Expression levels from genomic fragment MPRA (random 200–300 bp sheared fragments) in LB media	
    # fragment: 150–300 bp genomic DNA sequence that were randomly sheared and barcoded
    # RNA_exp_1: RNA expression level (replicate 1) – normalized measurement of transcript abundance for this fragment in the first RNA-Seq replicate. (normalized by DNA)
    # RNA_exp_2: RNA expression level (replicate 2) – same as above, but from a second biological replicate.
    # RNA_exp_ave: Average RNA expression – mean of RNA_exp_1 and RNA_exp_2
    # DNA_sum_1: DNA integration abundance (replicate 1) – quantifies how much of this DNA fragment (or barcode) was actually integrated in the first DNA-Seq sample. 
    # DNA_sum_2: DNA integration abundance (replicate 2) – same as above, from the second replicate.
    # DNA_ave: Average DNA integration level – mean of DNA_sum_1 and DNA_sum_2
    # num_integrated_barcodes: Number of barcodes that were integrated into the genome for this fragment. Typically should match num_mapped_barcodes unless there are sequencing/integration issues.
    # start: Start coordinate (in bp) in the E. coli MG1655 reference genome (U00096.2). 
    # end: End coordinate in the E. coli MG1655 reference genome (U00096.2). 
    # strand: DNA strand orientation (+ or -) 
    # variation: Standard deviation or variability in RNA measurement across barcode replicates 

Unnamed: 0,fragment,RNA_exp_1,RNA_exp_2,RNA_exp_ave,DNA_sum_1,DNA_sum_2,DNA_ave,num_mapped_barcodes,num_integrated_barcodes,start,end,strand,variation
0,CTTCCAACCCATGGCGCGTGCGTACATAAAGGTTTCGGACGCGCGC...,1.316755,1.635498,1.476127,1.006226,1.006226,1.006226,1,1,1838405,1838702,-,0.312743
1,TGACTTCTTCGTGAACTTGCTGGATATGCGTTACGAGTGGAAAGCG...,0.776481,0.881533,0.829007,2.227730,2.227730,2.227730,4,1,4133776,4134009,+,0.183064
2,TGGGGGGATGTATGGGTACGTTGTAATTAGGGATTTAACGAATTAG...,0.530471,1.426090,0.978280,2.721586,2.721586,2.721586,3,1,2862818,2863076,-,1.426718
3,AAAAATAAAATTCAGCAATCAATTAATGCCTTACATCAACATGGCA...,2.392780,2.118698,2.255739,5.572886,5.572886,5.572886,2,1,3798297,3798568,-,0.175510
4,TTTTTTGTCGTTGACCTCACCATGTCGATCACTGTGCCTGTATCCC...,0.664520,0.619367,0.641943,1.163899,1.163899,1.163899,2,1,3657099,3657389,+,0.101518
...,...,...,...,...,...,...,...,...,...,...,...,...,...
321118,ATGCCAGCAGGTCAGTGACCTGGCTGTGAATCGAGATAAAGCCGAT...,0.919931,0.906042,0.912986,9.649436,9.649436,9.649436,3,1,887886,888093,+,0.021949
321119,TGCCCGTTGATTTTCAGAGAAGGGGAATTAGTACAGCAGACGGGCG...,0.740166,0.790270,0.765218,1.315612,1.315612,1.315612,1,1,3055172,3055425,+,0.094497
321120,GGCCTGGTGGCCTGCATCGCGCTGGCGCTGATTATCGCCACGCTCG...,1.458680,1.492610,1.475645,9.895783,9.895783,9.895783,2,1,3844004,3844212,-,0.033173
321121,AAAATGAAATTGGGCAGTTGAAACCAGACGTTTCGCCCCTATTACA...,1.211114,0.487061,0.849087,2.830200,2.830200,2.830200,2,1,1862773,1863068,-,1.314162


In [None]:
df2 = pd.read_csv(os.path.join(data_path, "frag-rLP5-M9_expression.txt"), delimiter=" ")
df2
# Expression levels from genomic fragment MPRA (random 200–300 bp sheared fragments) in M9 media at rLP5

In [None]:
df4 = pd.read_csv(os.path.join(data_path, "endo_scramble_expression_formatted_std.txt"), delimiter= "\t")
df4

# Positive promoter controls are synthetic thus have no actual location data. This refers to the 2,000 out of the 17,000 TSS they found, scrambled them to find the functional sites, including controls, scrambled variants, and unscrambled
    # name: ID represented as {[TSS name][genomic position][strand + the scrambled region]}
    # tss_name: Name of the original (unscrambled) transcription start site (TSS) this variant is derived from.
    # tss_position: Start genome coordinate of the TSS.
    # strand: Strand direction
    # scramble_start: Start of the 10 bp scrambled window relative to var_left and var_right
    # scramble_end: End of the scrambled window relative to var_left and var_right
    # var_left: Genomic coordinate of the start (5' end) of the full 150 bp promoter variant.
    # var_right: Genomic coordinate of the end (3' end) of the full 150 bp promoter variant.
    # scramble_start_pos: Genomic coordinate where the scrambled 10 bp region begins.
    # scramble_end_pos: Genomic coordinate where the scrambled 10 bp region ends.
    # scramble_pos_rel_tss: Position of the scrambled region relative to the TSS.
    # variant: The sequence
    # RNA_exp_sum_1_1: Sum of RNA expression counts for barcodes in replicate 1, technical replicate 1 (normalized by DNA?)
    # RNA_exp_sum_1_2: Sum of RNA expression counts for barcodes in replicate 1, technical replicate 2.
    # RNA_exp_sum_2_1: Sum of RNA expression counts for barcodes in replicate 2, technical replicate 1.
    # RNA_exp_sum_2_2: Sum of RNA expression counts for barcodes in replicate 2, technical replicate 2.
    # RNA_exp_sum_1: Sum of RNA expression across both technical replicates for biological replicate 1.
    # RNA_exp_sum_2: Sum of RNA expression across both technical replicates for biological replicate 2.
    # RNA_exp_sum_ave: (RNA_exp_sum_1 + RNA_exp_sum_2)/2)
    # DNA_1: DNA-seq count in biological replicate 1 (how much of this fragment was integrated).
    # DNA_2: DNA-seq count in biological replicate 2
    # DNA_ave: (DNA_1 + DNA_2) / 2
    # expn_med: Median RNA expression across all barcodes, normalized by DNA abundance (RNA/DNA).
    # num_barcodes_integrated: Number of barcodes actually integrated and measured in DNA/RNA-seq.
    # category: Label indicating whether this is scrambled, unscrambled, negative, positive control
    # unscrambled_exp: Measured promoter activity of the wild-type (unscrambled) version of this promoter.
    # relative_exp: (expn_med / unscrambled_exp)

In [None]:
# extract the content of row 1 variant column
variant_sequence = df4.iloc[0]['variant']
variant_sequence

In [None]:
len(variant_sequence)

In [None]:
df5 = pd.read_csv(os.path.join(data_path, "fLP3_Endo2_lb_expression_formatted_std.txt"), delimiter="\t")
df5
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the fLP3 site
    # name: Unique identifier for the promoter variant in the format [TSS_name,TSS_position,strand]
    # tss_name: The named identifier for the transcription start site (TSS), such as from RegulonDB or Storz 
    # tss_position: Genomic coordinate of the TSS — the position where transcription starts.
    # strand: Strand direction
    # start: Absolute genomic coordinate where the 150 bp promoter fragment starts.
    # end: Absolute genomic coordinate where the 150 bp promoter fragment ends.
    # variant: The promoter sequence
    # RNA_exp_sum_1_1: Sum of RNA-seq counts for this variant in biological replicate 1, technical replicate 1.
    # RNA_exp_sum_1_2: Sum of RNA-seq counts in biological replicate 1, technical replicate 2.
    # RNA_exp_sum_2_1: Sum of RNA-seq counts in biological replicate 2, technical replicate 1.
    # RNA_exp_sum_2_2: Sum of RNA-seq counts in biological replicate 2, technical replicate 2.
    # RNA_exp_sum_1: Total RNA expression for biological replicate 1 
    # RNA_exp_sum_2: Total RNA expression for biological replicate 2 
    # RNA_exp_sum_ave: Average of RNA expression across the two biological replicates.
    # DNA_1: DNA-seq count in biological replicate 1 
    # DNA_2: DNA-seq count in biological replicate 2.
    # DNA_ave: Average of DNA_1 and DNA_2
    # expn_med: Median RNA/DNA expression across barcodes 
    # num_barcodes_integrated: Number of barcodes that were actually integrated and sequenced.
    # category: Type of promoter: ['tss', 'neg_control', 'pos_control']
    # active: Binary classification — "active" or "inactive" promoter based on expn_med


In [None]:
df6 = pd.read_csv(os.path.join(data_path,"peak_tile_expression_formatted_std.txt"), delimiter="\t")
df6["category"].unique()
# The authors first used sheared genomic fragments (~200–300 bp) to find candidate promoter regions, and then designed a tiling oligo library: ~150 bp sequences overlapping by 10 bp Spanning all ~3,500 candidate promoter region, located at LB and nth-ydgR intergenic locus 
    # variant: The 150 bp DNA sequence (oligo) used in the MPRA tile
    # name: Identifier for the tile, usually formatted as “peak_start_peak_end_strand_posStart-posEnd”.
    # peak_start: Genomic start coordinate of the candidate promoter region (the whole peak region).
    # peak_end: Genomic end coordinate of the candidate promoter region.
    # strand: Strand direction
    # tile_start: Start of this 150 bp oligo relative to the peak
    # tile_end: End of the tile 
    # RNA_exp_sum_1_1: Sum of RNA reads for this tile in biological replicate 1, technical replicate 1
    # RNA_exp_sum_1_2: Same as above, but technical replicate 2
    # RNA_exp_sum_2_1: Sum of RNA reads in biological replicate 2, technical replicate 1.
    # RNA_exp_sum_2_2: Same as above, but technical replicate 2.
    # RNA_exp_sum_1: Total RNA expression in biological replicate 1
    # RNA_exp_sum_2: Total RNA expression in biological replicate 2 
    # RNA_exp_sum_ave: Average RNA expression across the two biological replicates.
    # DNA_1: DNA read count in biological replicate 1
    # DNA_2: DNA read count in biological replicate 2.
    # DNA_ave: Average of DNA_1 and DNA_2
    # expn_med: Median RNA/DNA expression across all barcodes for this tile 
    # num_barcodes_integrated: Number of barcodes that were successfully integrated and sequenced.
    # category: ['tile', 'neg_control', 'random', 'pos_control']
    # peak_length: Length (in bp) of the peak region from which this tile was derived (e.g., 362 bp).
    # tile_start_relative: Position of the tile’s start within the peak (tile_start / peak_length)
    # start: Genomic start coordinate of the tile (based on tile_start + peak_start).
    # end: Genomic end coordinate of the tile.
    # active: Binary label indicating if the tile is active or inactive

In [None]:
df7 = pd.read_csv(os.path.join(data_path,"rLP5_Endo2_lb_expression_formatted_std.txt"), delimiter="\t")
df7
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the rLP5 site

In [None]:
df8= pd.read_csv(os.path.join(data_path, "rLP6_Endo2_lb_expression_formatted_std.txt"), delimiter="\t")
df8
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the rLP6 site

In [4]:
def loop_over_df(df, media="LB"):
    doc = sbol2.Document()
    sbol2.setHomespace('http://github.com/cywlol/promoters')
    version = '1.0'

    # create the expression data so we can add it as an attachment
    rna_df = df[["RNA_exp_1", "RNA_exp_2", "RNA_exp_ave"]]
    rna_df.loc[:, 'sample_id'] = [f'sample_{i}' for i in rna_df.index]
    rna_df.to_csv(f'frag_{media}_expression_data.csv', index=False)

    media_label_MD = media
    chassis_label_MD = "E_coli_chassis"
    strain_label_CD = "E_coli_strain"
    exp_label = "experiment_data"

    # define the experiment data once, then have it as a reference for each promoter row
    exp_label = "experiment_data"
    exp_attachment = sbol2.Attachment(exp_label)
    exp_attachment.name = 'fragmentation expression at rLP5'
    exp_attachment.description = 'CSV including the gene expression of the sequence: RNA_exp1, RNA_exp2, and its average.'
    exp_attachment.source = 'CSV_LINK_HERE' # update when added attachment to SBOL collection
    exp_attachment.format = 'https://identifiers.org/edam/format_3752'
    doc.addAttachment(exp_attachment)

    # define the media once, then have it as a reference for each promoter row
    media_md = sbol2.ModuleDefinition(media_label_MD)
    doc.addModuleDefinition(media_md)
    media_md.addRole("http://identifiers.org/ncit/NCIT:C48164") 
    
    # define the chassis once, then have it as a reference for each promoter row
    chassis_md = sbol2.ModuleDefinition(chassis_label_MD) 
    doc.addModuleDefinition(chassis_md)
    chassis_md.addRole("http://identifiers.org/ncit/NCIT:C14419")
    #attach genome 
    attachment_label = "ecoli_mg1655_fasta"
    fasta_attachment = sbol2.Attachment(attachment_label)
    fasta_attachment.name = 'E. coli MG1655 genome.'
    fasta_attachment.description = 'E. coli MG1655 genome sequence'
    fasta_attachment.source = 'https:/0www.ncbi.nlm.nih.gov/nuccore/U00096.3?report=fasta'
    fasta_attachment.format = 'https://identifiers.org/edam/format_1929'
    chassis_md.attachments = [fasta_attachment.persistentIdentity]
    doc.addAttachment(fasta_attachment)
    # add tax id as a  was_derived_from relation to https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=511145

    for i, row in df.iterrows():
        fragment_seq = row["fragment"]

        if (i == 3):
            break

        # Establish the identities of each component, ensuring that each one is unique
        promoter_label_CD = f"promoter_{i}"
        promoter_seq_label = f"promoter_seq_{i}"
        sample_design_label_CD = f"sample_design_{i}"
        strain_label_CD = f"strain_{i}"
        data_label = f"exp_data_{i}"
        #engr_region_label_CD = f"engr_region_{i}"
        #location_promoter_annotation = f"location_promoter_annotation_{i}"
        #location_promoter_label = f"location_promoter_label_{i}"
        
        
        
        # Promoter and Sequence 
        promoter_cd = sbol2.ComponentDefinition(promoter_label_CD, sbol2.BIOPAX_DNA)
        promoter_cd.roles = [sbol2.SO_PROMOTER]
        seq = sbol2.Sequence(promoter_seq_label, fragment_seq, sbol2.SBOL_ENCODING_IUPAC)
        doc.addSequence(seq)
        promoter_cd.sequences = [seq.persistentIdentity]
        doc.addComponentDefinition(promoter_cd)

            # define the strain once, then have it as a reference for each promoter row
        strain_md = sbol2.ModuleDefinition(strain_label_CD)
        strain_md.addRole("http://identifiers.org/ncit/NCIT:C14419")
        doc.addModuleDefinition(strain_md)
        strain_c1 = strain_md.modules.create('chassis')
        strain_c1.definition = chassis_md.persistentIdentity
        strain_c2 = strain_md.functionalComponents.create('promoter')
        strain_c2.definition = promoter_cd.persistentIdentity
    
        #annotation = sbol2.SequenceAnnotation("promoter_location")
        #range = sbol2.Range("prange", start, end)
    
        #if (strand == "-"):
        #    range.orientation = sbol2.SBOL_ORIENTATION_REVERSE_COMPLEMENT
                
        #annotation.locations.add(range)
        #promoter_cd.sequenceAnnotations.add(annotation)
        '''
        # Engineered Region 
        engineered_cd = sbol2.ComponentDefinition(engr_region_label_CD, sbol2.BIOPAX_DNA)
        engineered_cd.roles = ["https://identifiers.org/so/SO:0000804"]
        sub = engineered_cd.components.create('promoter')
        sub.definition = promoter_cd.persistentIdentity
        doc.addComponentDefinition(engineered_cd)
        # No sequence?
        '''
        
        #strain_c2 = strain_md.functionalComponents.create('engineered_region')
        #strain_c2.definition = strain_md.persistentIdentity
        
        # Sample Design  
        sample_md = sbol2.ModuleDefinition(sample_design_label_CD)
        doc.addModuleDefinition(sample_md)
        sample_md.addRole("http://identifiers.org/obo/OBI:0000073")

        m_strain = sample_md.modules.create('strain')
        m_strain.definition = strain_md.persistentIdentity
        
        m_media = sample_md.modules.create('media')
        m_media.definition = media_md.persistentIdentity
    
        # Experiment and Measurement Data
        exp = sbol2.ExperimentalData(f'sample_{i}_expression_data')
        exp.wasDerivedFrom.append(sample_md.persistentIdentity)
        exp.attachments = [exp_attachment.persistentIdentity]
        doc.add(exp)
    doc.write('test.xml')
        


In [6]:
loop_over_df(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
