In [1]:
import pandas as pd
import sbol2
pd.set_option('display.max_columns', None)

In [2]:
def loop_over_df(df, max_rows=10, output_path='test.xml'):
    """Build an SBOL2 document with one promoter sample design per row and write it.

    For every processed row this creates:
      * a media ModuleDefinition,
      * a chassis ModuleDefinition that references a genome FASTA Attachment,
      * a promoter ComponentDefinition with its Sequence and a location annotation,
      * an engineered-region ComponentDefinition that contains the promoter,
      * a strain ModuleDefinition (chassis + engineered region),
      * a sample-design ModuleDefinition (strain + media).
    Display IDs are suffixed with the row's index label so they stay unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "fragment", "start", "end" and "strand".
    max_rows : int or None, default 10
        Number of leading rows to convert; ``None`` converts every row.
    output_path : str, default 'test.xml'
        File the document is serialized to.

    Returns
    -------
    sbol2.Document
        The populated document (it is also written to ``output_path``).
    """
    # Configure the namespace before any SBOL object is constructed.
    sbol2.setHomespace('http://github.com/cywlol/promoters')
    doc = sbol2.Document()

    # Limit by position instead of comparing the index label to 10, so the
    # cap also works when the frame does not carry a 0..n-1 index.
    rows = df if max_rows is None else df.head(max_rows)

    for i, row in rows.iterrows():
        fragment_seq = row["fragment"]
        # Row values may be NumPy scalars; hand plain ints to the SBOL Range.
        start = int(row["start"])
        end = int(row["end"])
        strand = row["strand"]

        # Media (ModuleDefinition, so it is registered as one)
        media_md = sbol2.ModuleDefinition(f"media_{i}")
        media_md.roles = ["http://identifiers.org/NCIT/NCIT:C85504"]
        doc.addModuleDefinition(media_md)

        # Chassis, with the reference genome attached
        chassis_md = sbol2.ModuleDefinition(f"chassis_{i}")
        doc.addModuleDefinition(chassis_md)

        fasta_attachment = sbol2.Attachment(f"ecoli_mg1655_fasta_{i}")
        fasta_attachment.name = 'E. coli MG1655 genome'
        fasta_attachment.description = 'E. coli MG1655 genome sequence'
        fasta_attachment.source = 'https://www.ncbi.nlm.nih.gov/nuccore/U00096.3?report=fasta'
        fasta_attachment.format = 'https://identifiers.org/edam/format_1929'
        chassis_md.attachments = [fasta_attachment.persistentIdentity]
        # Generic top-level registration; works for any TopLevel type.
        doc.add(fasta_attachment)

        # Promoter and its sequence
        promoter_cd = sbol2.ComponentDefinition(f"promoter_{i}", sbol2.BIOPAX_DNA)
        promoter_cd.roles = [sbol2.SO_PROMOTER]
        seq = sbol2.Sequence(f"promoter_seq_{i}", fragment_seq, sbol2.SBOL_ENCODING_IUPAC)
        doc.addSequence(seq)
        promoter_cd.sequences = [seq.persistentIdentity]
        doc.addComponentDefinition(promoter_cd)

        # NOTE(review): start/end look like genome coordinates, yet the range is
        # attached to the promoter's own (short) sequence -- confirm this is the
        # intended way to record the genomic location.
        annotation = sbol2.SequenceAnnotation("promoter_location")
        location = sbol2.Range("prange", start, end)  # not named `range`: keep the builtin usable
        if strand == "-":
            location.orientation = sbol2.SBOL_ORIENTATION_REVERSE_COMPLEMENT
        annotation.locations.add(location)
        promoter_cd.sequenceAnnotations.add(annotation)

        # Engineered region containing the promoter (no sequence of its own yet)
        engineered_cd = sbol2.ComponentDefinition(f"engr_region_{i}", sbol2.BIOPAX_DNA)
        engineered_cd.roles = ["https://identifiers.org/so/SO:0000804"]
        sub = engineered_cd.components.create('promoter')
        sub.definition = promoter_cd.persistentIdentity
        doc.addComponentDefinition(engineered_cd)

        # Strain = chassis (sub-module) + engineered region (functional component)
        strain_md = sbol2.ModuleDefinition(f"strain{i}")
        doc.addModuleDefinition(strain_md)
        strain_chassis = strain_md.modules.create('chassis')
        strain_chassis.definition = chassis_md.persistentIdentity
        strain_region = strain_md.functionalComponents.create('engineered_region')
        strain_region.definition = engineered_cd.persistentIdentity

        # Sample design = strain + media. Both are ModuleDefinitions, so both
        # are instantiated as Modules rather than FunctionalComponents.
        sample_md = sbol2.ModuleDefinition(f"sample_design_{i}")
        doc.addModuleDefinition(sample_md)
        sample_strain = sample_md.modules.create('strain')
        sample_strain.definition = strain_md.persistentIdentity
        sample_media = sample_md.modules.create('media')
        sample_media.definition = media_md.persistentIdentity

        # TODO: attach the measurement columns (RNA_exp_*, DNA_*, variation,
        # num_integrated_barcodes) as experimental data derived from sample_md.

    doc.write(output_path)
    return doc
        


NameError: name 'df' is not defined

In [15]:
# `df` must already be loaded (see the read_csv cell for frag-rLP5_LB_expression.txt) before this runs.
loop_over_df(df)

In [2]:
import pandas as pd
import sbol2

# Sample dataframe for demonstration. The column names mirror the real
# frag-rLP5 expression tables, because those are the names loop_over_df reads.
data = {
    "fragment": ["ACGT" * 75] * 15,  # synthetic placeholder sequence, not genomic data
    "RNA_exp_1": [1.316755] * 15,
    "RNA_exp_2": [1.635498] * 15,
    "RNA_exp_ave": [1.476127] * 15,
    "DNA_sum_1": [1.006226] * 15,
    "DNA_sum_2": [1.006226] * 15,
    "DNA_ave": [1.006226] * 15,
    "num_mapped_barcodes": [1] * 15,
    "num_integrated_barcodes": [1] * 15,
    "start": [1838405 + i for i in range(15)],
    "end": [1838702 + i for i in range(15)],
    "strand": ["-"] * 15,
    "variation": [0.312743] * 15,
}
df = pd.DataFrame(data)

# SBOL setup
sbol2.setHomespace('http://github.com/cywlol/promoters')
doc = sbol2.Document()
version = '1.0'
media_label = "LB"

# Media (MD) -- shared by every sample design
media_cd = sbol2.ModuleDefinition(media_label, version)
media_cd.roles = ["http://identifiers.org/NCIT/NCIT:C85504"]
doc.addModuleDefinition(media_cd)

# Chassis -- shared by every strain, with the reference genome attached once
chassis_cd = sbol2.ModuleDefinition('Ecoli_K12', version)
doc.addModuleDefinition(chassis_cd)

fasta_attachment = sbol2.Attachment('ecoli_mg1655_fasta')
fasta_attachment.name = 'E. coli MG1655 genome'
fasta_attachment.description = 'E. coli MG1655 genome sequence'
fasta_attachment.source = 'https://www.ncbi.nlm.nih.gov/nuccore/U00096.3?report=fasta'
fasta_attachment.format = 'https://identifiers.org/edam/format_1929'
chassis_cd.attachments = [fasta_attachment.persistentIdentity]
# Generic top-level registration; works for any TopLevel type.
doc.add(fasta_attachment)


def loop_over_df(df, max_rows=10):
    """Add one promoter sample design per row of ``df`` to the module-level ``doc``.

    Each processed row contributes a promoter ComponentDefinition (with its
    Sequence and a location annotation), an engineered-region
    ComponentDefinition containing that promoter, a strain ModuleDefinition
    and a sample-design ModuleDefinition. The strain and sample design refer
    to the shared ``chassis_cd`` and ``media_cd`` created in this cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "fragment", "start", "end" and "strand".
    max_rows : int or None, default 10
        Number of leading rows to convert; ``None`` converts every row.

    Notes
    -----
    Relies on the module-level ``doc``, ``media_cd`` and ``chassis_cd``.
    Display IDs are suffixed with the row's index label, so calling this
    twice with the same rows on the same ``doc`` produces duplicate IDs.
    """
    # Limit by position rather than comparing the index label to 10.
    rows = df if max_rows is None else df.head(max_rows)

    for i, row in rows.iterrows():
        fragment_seq = row["fragment"]
        # Row values may be NumPy scalars; hand plain ints to the SBOL Range.
        start = int(row["start"])
        end = int(row["end"])
        strand = row["strand"]

        # Promoter and its sequence
        promoter_cd = sbol2.ComponentDefinition(f"promoter_{i}", sbol2.BIOPAX_DNA)
        promoter_cd.roles = [sbol2.SO_PROMOTER]
        seq = sbol2.Sequence(f"promoter_seq_{i}", fragment_seq, sbol2.SBOL_ENCODING_IUPAC)
        doc.addSequence(seq)
        promoter_cd.sequences = [seq.persistentIdentity]
        doc.addComponentDefinition(promoter_cd)

        # NOTE(review): start/end look like genome coordinates, yet the range is
        # attached to the promoter's own (short) sequence -- confirm this is the
        # intended way to record the genomic location.
        annotation = sbol2.SequenceAnnotation("promoter_location")
        location = sbol2.Range("prange", start, end)  # not named `range`: the sample data above needs the builtin
        if strand == "-":
            location.orientation = sbol2.SBOL_ORIENTATION_REVERSE_COMPLEMENT
        annotation.locations.add(location)
        promoter_cd.sequenceAnnotations.add(annotation)

        # Engineered region containing the promoter (no sequence of its own yet)
        engineered_cd = sbol2.ComponentDefinition(f"engr_region_{i}", sbol2.BIOPAX_DNA)
        engineered_cd.roles = ["https://identifiers.org/so/SO:0000804"]
        sub = engineered_cd.components.create('promoter')
        sub.definition = promoter_cd.persistentIdentity
        doc.addComponentDefinition(engineered_cd)

        # Strain = shared chassis (sub-module) + engineered region (functional component)
        strain_md = sbol2.ModuleDefinition(f"strain{i}")
        doc.addModuleDefinition(strain_md)
        strain_chassis = strain_md.modules.create('chassis')
        strain_chassis.definition = chassis_cd.persistentIdentity
        strain_region = strain_md.functionalComponents.create('engineered_region')
        strain_region.definition = engineered_cd.persistentIdentity

        # Sample design = strain + shared media. Both are ModuleDefinitions, so
        # both are instantiated as Modules rather than FunctionalComponents.
        sample_md = sbol2.ModuleDefinition(f"sample_design_{i}")
        doc.addModuleDefinition(sample_md)
        sample_strain = sample_md.modules.create('strain')
        sample_strain.definition = strain_md.persistentIdentity
        sample_media = sample_md.modules.create('media')
        sample_media.definition = media_cd.persistentIdentity

        # TODO: attach the measurement columns (RNA_exp_*, DNA_*, variation,
        # num_integrated_barcodes) as experimental data derived from sample_md.


loop_over_df(df)
doc.validate()
doc.write('fragmented_promoters.xml')

    

In [3]:
# NOTE(review): `df4` is only created by the read_csv cell for
# endo_scramble_expression_formatted_std.txt further down, so that cell has to run first.
# That table has a `variant` column rather than the `fragment`/`start`/`end` columns that
# loop_over_df reads, and this call passes a second positional argument -- confirm the
# intended input table and function signature.
loop_over_df(df4, [])

NameError: name 'df4' is not defined

In [13]:
# Load the space-delimited LB fragment expression table and list its column names.
df = pd.read_csv("frag-rLP5_LB_expression.txt", delimiter=" ")
df.columns
# Expression levels from genomic fragment MPRA (random 200–300 bp sheared fragments) in LB media	

Index(['fragment', 'RNA_exp_1', 'RNA_exp_2', 'RNA_exp_ave', 'DNA_sum_1',
       'DNA_sum_2', 'DNA_ave', 'num_mapped_barcodes',
       'num_integrated_barcodes', 'start', 'end', 'strand', 'variation'],
      dtype='object')

In [25]:
# Load the space-delimited M9 fragment expression table and display it.
df2 = pd.read_csv("frag-rLP5-M9_expression.txt", delimiter=" ")
df2
# Expression levels from genomic fragment MPRA (random 200–300 bp sheared fragments) in M9 media at rLP5

# fragment: 150–300 bp genomic DNA sequence that was randomly sheared and barcoded
# RNA_exp_1: RNA expression level (replicate 1) – normalized measurement of transcript abundance for this fragment in the first RNA-Seq replicate. (normalized by DNA)
# RNA_exp_2: RNA expression level (replicate 2) – same as above, but from a second biological replicate.
# RNA_exp_ave: Average RNA expression – mean of RNA_exp_1 and RNA_exp_2
# DNA_sum_1: DNA integration abundance (replicate 1) – quantifies how much of this DNA fragment (or barcode) was actually integrated in the first DNA-Seq sample. 
# DNA_sum_2: DNA integration abundance (replicate 2) – same as above, from the second replicate.
# DNA_ave: Average DNA integration level – mean of DNA_sum_1 and DNA_sum_2
# num_integrated_barcodes: Number of barcodes that were integrated into the genome for this fragment. Typically should match num_mapped_barcodes unless there are sequencing/integration issues.
# start: Start coordinate (in bp) in the E. coli MG1655 reference genome (U00096.2). 
# end: End coordinate in the E. coli MG1655 reference genome (U00096.2). 
# strand: DNA strand orientation (+ or -) 
# variation: Standard deviation or variability in RNA measurement across barcode replicates 



Unnamed: 0,fragment,RNA_exp_1,RNA_exp_2,RNA_exp_ave,DNA_sum_1,DNA_sum_2,DNA_ave,num_mapped_barcodes,num_integrated_barcodes,start,end,strand,variation
0,ATCTTTAAATATTTAAAAGAACTGGGAGTACCCGCGAGTGCCGCTG...,0.890051,0.922716,0.906383,1.233452,1.233452,1.233452,3,1,1892252,1892540,+,0.051998
1,AGCCGGGATCTCAACTGGCACACTGACAGCAGGCTCAGCAACAGCT...,0.417847,0.500726,0.459286,1.987298,1.987298,1.987298,1,1,3110405,3110614,+,0.261046
2,CTTCCAACCCATGGCGCGTGCGTACATAAAGGTTTCGGACGCGCGC...,1.652097,0.447871,1.049984,1.136735,1.136735,1.136735,1,1,1838405,1838702,-,1.883143
3,TGACTTCTTCGTGAACTTGCTGGATATGCGTTACGAGTGGAAAGCG...,0.667351,0.607438,0.637395,1.979343,1.979343,1.979343,4,1,4133776,4134009,+,0.135708
4,TGGGGGGATGTATGGGTACGTTGTAATTAGGGATTTAACGAATTAG...,0.573034,0.632320,0.602677,1.342102,1.342102,1.342102,3,1,2862818,2863076,-,0.142033
...,...,...,...,...,...,...,...,...,...,...,...,...,...
318452,ATGCCAGCAGGTCAGTGACCTGGCTGTGAATCGAGATAAAGCCGAT...,0.856074,0.842344,0.849209,6.054634,6.054634,6.054634,3,1,887886,888093,+,0.023327
318453,TGCCCGTTGATTTTCAGAGAAGGGGAATTAGTACAGCAGACGGGCG...,0.354607,0.589399,0.472003,2.168799,2.168799,2.168799,1,1,3055172,3055425,+,0.733025
318454,GGCCTGGTGGCCTGCATCGCGCTGGCGCTGATTATCGCCACGCTCG...,1.053131,1.695967,1.374549,10.049118,10.049118,10.049118,2,1,3844004,3844212,-,0.687423
318455,AAAATGAAATTGGGCAGTTGAAACCAGACGTTTCGCCCCTATTACA...,0.625138,1.328967,0.977053,3.753320,3.753320,3.753320,2,1,1862773,1863068,-,1.088059


In [22]:
# Load the tab-delimited scrambled-promoter expression table and display it.
df4 = pd.read_csv("endo_scramble_expression_formatted_std.txt", delimiter= "\t")
df4
# Positive promoter controls are synthetic thus have no actual location.
# This library covers about 2,000 of the 17,000 TSSs they found; these were scrambled to find the functional sites.
# The dataset includes the controls, the scrambled variants, and the unscrambled variants.

# name: ID represented as {[TSS name][genomic position][strand + the scrambled region]}
# tss_name: Name of the original (unscrambled) transcription start site (TSS) this variant is derived from.
# tss_position: Start genome coordinate of the TSS.
# strand: Strand direction
# scramble_start: Start of the 10 bp scrambled window relative to var_left and var_right
# scramble_end: End of the scrambled window relative to var_left and var_right
# var_left: Genomic coordinate of the start (5' end) of the full 150 bp promoter variant.
# var_right: Genomic coordinate of the end (3' end) of the full 150 bp promoter variant.
# scramble_start_pos: Genomic coordinate where the scrambled 10 bp region begins.
# scramble_end_pos: Genomic coordinate where the scrambled 10 bp region ends.
# scramble_pos_rel_tss: Position of the scrambled region relative to the TSS.
# variant: The sequence
# RNA_exp_sum_1_1: Sum of RNA expression counts for barcodes in replicate 1, technical replicate 1 (normalized by DNA?)
# RNA_exp_sum_1_2: Sum of RNA expression counts for barcodes in replicate 1, technical replicate 2.
# RNA_exp_sum_2_1: Sum of RNA expression counts for barcodes in replicate 2, technical replicate 1.
# RNA_exp_sum_2_2: Sum of RNA expression counts for barcodes in replicate 2, technical replicate 2.
# RNA_exp_sum_1: Sum of RNA expression across both technical replicates for biological replicate 1.
# RNA_exp_sum_2: Sum of RNA expression across both technical replicates for biological replicate 2.
# RNA_exp_sum_ave: (RNA_exp_sum_1 + RNA_exp_sum_2) / 2
# DNA_1: DNA-seq count in biological replicate 1 (how much of this fragment was integrated).
# DNA_2: DNA-seq count in biological replicate 2
# DNA_ave: (DNA_1 + DNA_2) / 2
# expn_med: Median RNA expression across all barcodes, normalized by DNA abundance (RNA/DNA).
# num_barcodes_integrated: Number of barcodes actually integrated and measured in DNA/RNA-seq.
# category: Label indicating whether this is scrambled, unscrambled, negative, positive control
# unscrambled_exp: Measured promoter activity of the wild-type (unscrambled) version of this promoter.
# relative_exp: (expn_med / unscrambled_exp)

Unnamed: 0,name,tss_name,tss_position,strand,scramble_start,scramble_end,var_left,var_right,scramble_start_pos,scramble_end_pos,scramble_pos_rel_tss,variant,RNA_exp_sum_1_1,RNA_exp_sum_1_2,RNA_exp_sum_2_1,RNA_exp_sum_2_2,RNA_exp_sum_1,RNA_exp_sum_2,RNA_exp_sum_ave,DNA_1,DNA_2,DNA_ave,expn_med,num_barcodes_mapped,num_barcodes_integrated,category,expn_med_fitted,expn_med_fitted_scaled,unscrambled_exp,relative_exp
0,"TSS_5574_storz,1423414,-_scrambled140-150",TSS_5574_storz,1423414.0,-,140.0,150.0,1423384.0,1423534.0,1423524.0,1423534.0,-110.0,GGATTAGTCCAGCCGAATTGCTTAATACTTTCTACCAGTTGTGCCA...,0.504318,0.689500,0.654339,0.636485,0.596909,0.645412,0.621161,36.404637,35.401688,35.903162,0.500461,39,23,scramble,2.539165,0.611763,0.652428,0.952075
1,"TSS_7055_storz_regulondb,1846816,+_scrambled70-80",TSS_7055_storz_regulondb,1846816.0,+,70.0,80.0,1846696.0,1846846.0,1846766.0,1846776.0,-50.0,CCATTTGATTAACTCCTGTCGTGATATTTATTCACAAAATTAACAC...,0.146131,0.113741,0.182050,0.181035,0.129936,0.181542,0.155739,31.003500,28.704602,29.854051,0.130204,25,12,scramble,1.691925,0.407636,0.258146,0.603299
2,"TSS_17515_regulondb,4477590,-_scrambled135-145",TSS_17515_regulondb,4477590.0,-,135.0,145.0,4477560.0,4477710.0,4477695.0,4477705.0,-105.0,ATACAGACAGTAAAATGTTCTGCCACGTTACAATACTCTTATCAGT...,1.110148,1.126184,1.072438,1.179107,1.118166,1.125772,1.121969,15.980134,17.984036,16.982085,0.888817,25,13,scramble,3.427820,0.825867,0.787990,1.423837
3,"TSS_4927_regulondb,1269614,+_scrambled10-20",TSS_4927_regulondb,1269614.0,+,10.0,20.0,1269494.0,1269644.0,1269504.0,1269514.0,-110.0,GCGGTAATAACCCTGCGATCGATCGGTGCTGCCAGGTCGTGCCAGA...,1.166634,1.650663,1.004031,1.357759,1.408649,1.180895,1.294772,7.739634,5.934540,6.837087,1.061866,15,6,scramble,3.823799,0.921270,1.344917,0.962715
4,"TSS_17369_storz,4439341,-_scrambled140-150",TSS_17369_storz,4439341.0,-,140.0,150.0,4439311.0,4439461.0,4439451.0,4439461.0,-110.0,TATATTGCAAGGACTGCGTAGGCCTGATCGGCATAGCGTATCAGGC...,0.582324,0.917035,0.719130,0.562781,0.749680,0.640956,0.695318,20.349647,20.971240,20.660443,0.767422,18,9,scramble,3.150038,0.758940,0.656802,1.058641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55312,pos_control_apFAB346,pos_control_apFAB346,,,,,,,,,,GGCGCGCCTTGACAATTAATCATCCGGCTCGTAATGTTTGTGGATAGCT,3.762168,2.852998,2.689369,4.148709,3.307583,3.419039,3.363311,2.017238,1.822945,1.920092,2.581585,5,3,pos_control,7.301296,1.759105,,
55313,pos_control_apFAB87,pos_control_apFAB87,,,,,,,,,,GGCGCGCCAAAAAATTTATTTGCTTATTAATCATCCGGCTCGCATA...,1.736385,1.421404,1.646361,1.548851,1.578895,1.597606,1.588251,4.714946,6.376120,5.545533,1.436969,7,4,pos_control,4.682128,1.128068,,
55314,pos_control_apFAB70,pos_control_apFAB70,,,,,,,,,,GGCGCGCCTTGACATCGCATCTTTTTGTACCTATAATGTGTGGATA...,10.670367,13.585706,10.683286,13.121773,12.128037,11.902529,12.015283,7.862836,7.335839,7.599337,11.978279,6,3,pos_control,28.803279,6.939591,,
55315,pos_control_apFAB49,pos_control_apFAB49,,,,,,,,,,GGCGCGCCAAAAAGAGTATTGACTTTTATCCCTTGCGGCGAATACT...,1.135329,1.039307,0.922069,0.733666,1.087318,0.827868,0.957593,3.780015,4.179774,3.979894,0.953658,5,3,pos_control,3.576194,0.861615,,


In [28]:
# Load the tab-delimited fLP3 TSS-library expression table and display it.
df5 = pd.read_csv("fLP3_Endo2_lb_expression_formatted_std.txt", delimiter="\t")
df5
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the fLP3 site
# name: Unique identifier for the promoter variant in the format [TSS_name,TSS_position,strand]
# tss_name: The named identifier for the transcription start site (TSS), such as from RegulonDB or Storz 
# tss_position: Genomic coordinate of the TSS — the position where transcription starts.
# strand: Strand direction
# start: Genomic coordinate where the 150 bp promoter fragment starts.
# end: Genomic coordinate where the 150 bp promoter fragment ends.
# variant: The promoter sequence
# RNA_exp_sum_1_1: Sum of RNA-seq counts for this variant in biological replicate 1, technical replicate 1.
# RNA_exp_sum_1_2: Sum of RNA-seq counts in biological replicate 1, technical replicate 2.
# RNA_exp_sum_2_1: Sum of RNA-seq counts in biological replicate 2, technical replicate 1.
# RNA_exp_sum_2_2: Sum of RNA-seq counts in biological replicate 2, technical replicate 2.
# RNA_exp_sum_1: Total RNA expression for biological replicate 1 
# RNA_exp_sum_2: Total RNA expression for biological replicate 2 
# RNA_exp_sum_ave: Average of RNA expression across the two biological replicates.
# DNA_1: DNA-seq count in biological replicate 1 
# DNA_2: DNA-seq count in biological replicate 2.
# DNA_ave: Average of DNA_1 and DNA_2
# expn_med: Median RNA/DNA expression across barcodes 
# num_barcodes_integrated: Number of barcodes that were actually integrated and sequenced.
# category: Type of promoter: ['tss', 'neg_control', 'pos_control']
# active: Binary classification — "active" or "inactive" promoter based on expn_med


Unnamed: 0,name,tss_name,tss_position,strand,start,end,variant,RNA_exp_sum_1_1,RNA_exp_sum_1_2,RNA_exp_sum_2_1,RNA_exp_sum_2_2,RNA_exp_sum_1,RNA_exp_sum_2,RNA_exp_sum_ave,DNA_1,DNA_2,DNA_ave,expn_med,num_barcodes_mapped,num_barcodes_integrated,category,expn_med_fitted,expn_med_fitted_scaled,active
0,"TSS_11125_storz_regulondb,2945404,+",TSS_11125_storz_regulondb,2945404.0,+,2945284.0,2945434.0,GCAAATTTTGCACAAAAAATAGGCTTTAGTGATTTGTTTTTGTTCA...,43.807236,55.833039,54.661823,49.577995,49.820137,52.119909,50.970023,31.625906,35.101752,33.363829,52.010547,50,28,tss,46.359818,46.076834,active
1,"TSS_1218_storz,289782,+",TSS_1218_storz,289782.0,+,289662.0,289812.0,GTATCTGCCTCCGATTCTCTGCAGAAGCAGAAAGACATTGGATCGA...,0.562627,0.549348,0.816074,0.585183,0.555988,0.700629,0.628308,36.298833,31.606403,33.952618,0.565387,53,26,tss,0.749499,0.744924,inactive
2,"TSS_12352_storz,3253602,-",TSS_12352_storz,3253602.0,-,3253572.0,3253722.0,CGTTTGTCTGCGCTGTGTGCCGCAACGACCGCAGCAATGGGGGCCG...,0.516503,0.717569,0.658121,0.587724,0.617036,0.622923,0.619979,72.105393,72.498105,72.301749,0.577755,108,54,tss,0.760464,0.755822,inactive
3,"TSS_6478_storz,1684674,-",TSS_6478_storz,1684674.0,-,1684644.0,1684794.0,GCAGATACAACTCACACAATGCACCCGCTGTGTGAAATAAACAGAG...,1.006087,1.084363,0.763596,0.900281,1.045225,0.831939,0.938582,35.065895,37.408634,36.237265,0.936175,56,27,tss,1.078232,1.071651,active
4,"TSS_2956_storz,770940,+",TSS_2956_storz,770940.0,+,770820.0,770970.0,TTATAAAGATATGACCAAGTTCTGGGGCAAGTTGTTTGGTATCAAC...,0.715541,0.744280,0.679240,0.634099,0.729911,0.656670,0.693290,79.950519,78.564136,79.257327,0.632548,116,59,tss,0.809042,0.804104,inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17616,pos_control_apFAB54,pos_control_apFAB54,,,,,GGCGCGCCTTGACATAAAGTCTAACCTATAGGATACTTACAGCCAT...,38.494902,57.975130,39.317072,44.515699,48.235016,41.916385,45.075701,3.128893,3.379367,3.254130,45.188710,28,5,pos_control,40.311705,40.065639,active
17617,pos_control_apFAB36,pos_control_apFAB36,,,,,GGCGCGCCAAAAAGAGTATTGACTATTAATCATCCGGCTCGTATAA...,58.847253,40.995121,61.542591,50.045329,49.921187,55.793960,52.857573,2.569912,3.470787,3.020350,54.347974,14,4,pos_control,48.432137,48.136503,active
17618,pos_control_apFAB82,pos_control_apFAB82,,,,,GGCGCGCCAAAAAATTTATTTGCTTTAAAGTCTAACCTATAGGCAT...,8.342111,7.818100,5.151892,7.300009,8.080105,6.225951,7.153028,5.793876,8.068636,6.931256,7.260103,13,5,pos_control,6.684909,6.644103,active
17619,pos_control_apFAB114,pos_control_apFAB114,,,,,GGCGCGCCTCGACATCAGGAAAATTTTTCTGATACTTACAGCCATG...,0.696238,0.544310,0.953141,0.916195,0.620274,0.934668,0.777471,11.040219,7.347735,9.193977,0.798860,20,8,pos_control,0.956492,0.950653,inactive


In [30]:
# Load the tab-delimited peak-tile expression table and list the distinct `category` values.
df6 = pd.read_csv("peak_tile_expression_formatted_std.txt", delimiter="\t")
df6["category"].unique()
# The authors first used sheared genomic fragments (~200–300 bp) to find candidate promoter regions, 
# and then designed a tiling oligo library: ~150 bp sequences overlapping by 10 bp, spanning all ~3,500 candidate promoter regions.
# LB and nth-ydgR intergenic locus 

# variant: The 150 bp DNA sequence (oligo) used in the MPRA tile
# name: Identifier for the tile, usually formatted as “peak_start_peak_end_strand_posStart-posEnd”.
# peak_start: Genomic start coordinate of the candidate promoter region (the whole peak region).
# peak_end: Genomic end coordinate of the candidate promoter region.
# strand: Strand direction
# tile_start: Start of this 150 bp oligo relative to the peak
# tile_end: End of the tile 
# RNA_exp_sum_1_1: Sum of RNA reads for this tile in biological replicate 1, technical replicate 1
# RNA_exp_sum_1_2: Same as above, but technical replicate 2
# RNA_exp_sum_2_1: Sum of RNA reads in biological replicate 2, technical replicate 1.
# RNA_exp_sum_2_2: Same as above, but technical replicate 2.
# RNA_exp_sum_1: Total RNA expression in biological replicate 1
# RNA_exp_sum_2: Total RNA expression in biological replicate 2 
# RNA_exp_sum_ave: Average RNA expression across the two biological replicates.
# DNA_1: DNA read count in biological replicate 1
# DNA_2: DNA read count in biological replicate 2.
# DNA_ave: Average of DNA_1 and DNA_2
# expn_med: Median RNA/DNA expression across all barcodes for this tile 
# num_barcodes_integrated: Number of barcodes that were successfully integrated and sequenced.
# category: ['tile', 'neg_control', 'random', 'pos_control']
# peak_length: Length (in bp) of the peak region from which this tile was derived (e.g., 362 bp).
# tile_start_relative: Position of the tile’s start within the peak (tile_start / peak_length)
# start: Genomic start coordinate of the tile (based on tile_start + peak_start).
# end: Genomic end coordinate of the tile.
# active: Binary label indicating if the tile is active or inactive

array(['tile', 'neg_control', 'random', 'pos_control'], dtype=object)

In [17]:
# Load the tab-delimited rLP5 TSS-library expression table and display it.
df7 = pd.read_csv("rLP5_Endo2_lb_expression_formatted_std.txt", delimiter="\t")
df7
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the rLP5 site

Unnamed: 0,name,tss_name,tss_position,strand,start,end,variant,RNA_exp_1,RNA_exp_2,RNA_exp_sum_ave,DNA_sum,expn_med1,expn_med2,expn_med,num_barcodes_mapped,num_barcodes_integrated,category,expn_med_fitted,expn_med_fitted_scaled,active
0,"TSS_11125_storz_regulondb,2945404,+",TSS_11125_storz_regulondb,2945404.0,+,2945284.0,2945434.0,GCAAATTTTGCACAAAAAATAGGCTTTAGTGATTTGTTTTTGTTCA...,54.590828,53.711399,54.151114,49.578588,55.212763,51.214980,53.213872,60,47,tss,53.213872,60.237193,active
1,"TSS_15364_storz,3953827,-",TSS_15364_storz,3953827.0,-,3953797.0,3953947.0,CACCGCCACGCCAGCAGCCAGACCGCCGCCGCCGACTGGCACAAAT...,0.748428,0.669080,0.708754,113.905547,0.611601,0.666221,0.638911,136,106,tss,0.638911,0.723236,inactive
2,"TSS_12352_storz,3253602,-",TSS_12352_storz,3253602.0,-,3253572.0,3253722.0,CGTTTGTCTGCGCTGTGTGCCGCAACGACCGCAGCAATGGGGGCCG...,0.706336,0.711005,0.708670,89.722359,0.639401,0.644214,0.641807,121,100,tss,0.641807,0.726515,inactive
3,"TSS_6478_storz,1684674,-",TSS_6478_storz,1684674.0,-,1684644.0,1684794.0,GCAGATACAACTCACACAATGCACCCGCTGTGTGAAATAAACAGAG...,1.061173,0.855839,0.958506,54.485220,0.978561,0.885794,0.932177,59,52,tss,0.932177,1.055209,active
4,"TSS_16748_storz,4275469,+",TSS_16748_storz,4275469.0,+,4275349.0,4275499.0,GTCAATATGCTCGTCAATCCATGCGATAAGATCCTGAATAATTTTC...,0.892963,0.802813,0.847888,48.016955,0.778401,0.718905,0.748653,49,41,tss,0.748653,0.847462,inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17762,neg_control_1215677:1215827,neg_control_1215677:1215827,,+,1215677.0,1215827.0,AGAATCAGCAGTGTTAGGGCAGGCTGTCACCAATTTAATGCTTTCA...,1.332975,1.417616,1.375296,23.106246,1.223201,1.279809,1.251505,41,27,neg_control,1.251505,1.416682,active
17763,neg_control_3720155:3720305,neg_control_3720155:3720305,,+,3720155.0,3720305.0,AGACAAACAGTCTCAAGCACCCGTGGCTATTCTAGCTTAATAAGTT...,0.921965,0.881224,0.901595,28.526390,0.856241,0.924306,0.890274,27,23,neg_control,0.890274,1.007774,active
17764,neg_control_4474820:4474970,neg_control_4474820:4474970,,+,4474820.0,4474970.0,CTTAAGTAATCGAGAAAAAACAAATTTAATACAAAGGCTATTTGAT...,0.469512,0.601874,0.535693,10.462048,0.407734,0.369723,0.388728,8,7,neg_control,0.388728,0.440034,inactive
17765,neg_control_533510:533660,neg_control_533510:533660,,+,533510.0,533660.0,ACCGGTCAGCAAAATGGCGGTTACAGTTCGTGAAGCGGCGCTGGTG...,0.889601,1.078358,0.983979,0.934222,1.070301,0.693230,0.881765,3,3,neg_control,0.881765,0.998143,inactive


In [1]:
# Load the tab-delimited rLP6 TSS-library expression table and display it.
# Needs the import cell at the top of the notebook (`import pandas as pd`) to have run first.
df8= pd.read_csv("rLP6_Endo2_lb_expression_formatted_std.txt", delimiter="\t")
df8
# Expression summary for the TSS promoter library (17,635 reported TSSs) in LB, integrated at the rLP6 site

NameError: name 'pd' is not defined