# Description

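Generates the augmented dataset `reverse_LaFleur_supp.csv`, which contains every original row plus a reverse-complement copy of each row. In a reversed copy the promoter is reverse-complemented, and the upstream and downstream DNA are reverse-complemented and swapped. Rows where Observed log(TX/Txref) == 0 are kept but no reversed copy is generated for them.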

Includes original columns:
* File Name
* Upstream DNA
* Promoter Sequence
* Downstream DNA
* Observed log(TX/Txref)

and new columns:
* Augmented Observed log(TX/Txref)
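    * Equal to Observed log(TX/Txref) for original rows (multiplied by 1) and sign-flipped for reversed rows (multiplied by -1)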
* Reversed
    * Boolean: True for reverse-complemented rows, False for original rows



In [6]:
import pandas as pd

In [None]:
# Read the combined LaFleur table and add the two bookkeeping columns:
#   * 'Reversed' is False for every row loaded from disk.
#   * 'Augmented Observed log(TX/Txref)' starts as a copy of the observed value.
original_df = pd.read_csv('../Data/Combined/LaFleur_supp.csv').assign(
    **{
        'Reversed': False,
        'Augmented Observed log(TX/Txref)': lambda frame: frame['Observed log(TX/Txref)'],
    }
)
original_df.head()

Unnamed: 0,File Name,Upstream DNA,Promoter Sequence,Downstream DNA,Observed log(TX/Txref),Reversed,Augmented Observed log(TX/Txref)
0,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,GAATTCGATCAAATTTCGAG,-3.386326,False,-3.386326
1,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATG...,GAATTCGATCAAATTTCGAG,-3.392439,False,-3.392439
2,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,GAATTCGATCAAATTTCGAG,-3.698903,False,-3.698903
3,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATA...,GAATTCGATCAAATTTCGAG,-3.979249,False,-3.979249
4,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATA...,GAATTCGATCAAATTTCGAG,-2.57763,False,-2.57763


In [None]:
def reverse_complement(seq):
    """Return the reverse complement of a DNA sequence.

    The sequence is walked from its last base to its first and each base is
    replaced by its pairing partner (A<->T, C<->G); ``N`` maps to itself.
    Bases are upper-cased before lookup, so the result is always upper case.

    Raises:
        KeyError: if ``seq`` contains a character other than A, C, G, T or N
            (in either case).
    """
    partner = dict(zip('ACGTN', 'TGCAN'))
    flipped = []
    for nucleotide in reversed(seq):
        flipped.append(partner[nucleotide.upper()])
    return ''.join(flipped)

def _reverse_complement_or_missing(value):
    """Reverse-complement a flanking sequence, passing missing values through.

    ``str(float('nan'))`` is ``'nan'``, which ``reverse_complement`` would turn
    into the made-up sequence ``'NTN'``. Returning the missing value unchanged
    keeps rows without flanking DNA from gaining fabricated flanks.
    """
    if pd.isna(value):
        return value
    return reverse_complement(str(value))


# Rows whose observed value is exactly 0 get no reversed copy.
rows_to_reverse = original_df.loc[original_df['Observed log(TX/Txref)'] != 0]

# One record per reversed row. Iterating the columns in lockstep avoids the
# per-row Series that iterrows() builds.
reversed_data = [
    {
        'File Name': file_name,
        # The flanks trade places in the reversed copy: the reverse complement
        # of the downstream DNA becomes the upstream DNA, and vice versa.
        'Upstream DNA': _reverse_complement_or_missing(downstream),
        'Promoter Sequence': reverse_complement(promoter),
        'Downstream DNA': _reverse_complement_or_missing(upstream),
        'Observed log(TX/Txref)': observed,
        # The augmented value for a reversed row is the negated observation.
        'Augmented Observed log(TX/Txref)': -observed,
        'Reversed': True,
    }
    for file_name, upstream, promoter, downstream, observed in zip(
        rows_to_reverse['File Name'],
        rows_to_reverse['Upstream DNA'],
        rows_to_reverse['Promoter Sequence'],
        rows_to_reverse['Downstream DNA'],
        rows_to_reverse['Observed log(TX/Txref)'],
    )
]

# Create a new dataframe from the reversed data
df_reversed = pd.DataFrame(reversed_data)
df_reversed.head()

Unnamed: 0,File Name,Upstream DNA,Promoter Sequence,Downstream DNA,Observed log(TX/Txref),Augmented Observed log(TX/Txref),Reversed
0,La Fleur et al (Fig 3a).csv,CTCGAAATTTGATCGAATTC,AACGTACAAACTAAGGTAGAGCCGCGGGGATTATAAGAGAAATAGG...,TTCTGGAATTTGGTACCGAG,-3.386326,3.386326,True
1,La Fleur et al (Fig 3a).csv,CTCGAAATTTGATCGAATTC,AACGTACAAACTAAGGTAGAGGCCGCCGCATTATAAGAGAAATAGG...,TTCTGGAATTTGGTACCGAG,-3.392439,3.392439,True
2,La Fleur et al (Fig 3a).csv,CTCGAAATTTGATCGAATTC,AACGTACAAACTAAGGTAGAGGCCCCCCGATTATAAGAGAAATAGG...,TTCTGGAATTTGGTACCGAG,-3.698903,3.698903,True
3,La Fleur et al (Fig 3a).csv,CTCGAAATTTGATCGAATTC,AACGTACAAACTAAGGTAGAGTACTTAATATTATAAGAGAAATAGG...,TTCTGGAATTTGGTACCGAG,-3.979249,3.979249,True
4,La Fleur et al (Fig 3a).csv,CTCGAAATTTGATCGAATTC,AACGTACAAACTAAGGTAGAGTGATTATTATTATAAGAGAAATAGG...,TTCTGGAATTTGGTACCGAG,-2.57763,2.57763,True


In [11]:
# Stack the original rows on top of their reversed copies; ignore_index
# renumbers the result so row labels are unique across both halves.
df = pd.concat((original_df, df_reversed), ignore_index=True)
# head(-10) displays every row except the final ten.
df.head(-10)

Unnamed: 0,File Name,Upstream DNA,Promoter Sequence,Downstream DNA,Observed log(TX/Txref),Reversed,Augmented Observed log(TX/Txref)
0,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,GAATTCGATCAAATTTCGAG,-3.386326,False,-3.386326
1,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATG...,GAATTCGATCAAATTTCGAG,-3.392439,False,-3.392439
2,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,GAATTCGATCAAATTTCGAG,-3.698903,False,-3.698903
3,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATA...,GAATTCGATCAAATTTCGAG,-3.979249,False,-3.979249
4,La Fleur et al (Fig 3a).csv,CTCGGTACCAAATTCCAGAA,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATA...,GAATTCGATCAAATTTCGAG,-2.577630,False,-2.577630
...,...,...,...,...,...,...,...
97512,De Novo Designs (Fig 5 b and c).csv,ATGGAACCTCGAAATTTGATCTTAAACAAAATTATTTGTAGAGGCT...,GAATTCTTATGCGTAATATTAAGAAGAGAGTCCAGGGCTGGAAAGG...,TGTTGGACCAAAACGAAAAAAGACGCTCGAAAGCGTCTCTTT,-2.413485,True,2.413485
97513,De Novo Designs (Fig 5 b and c).csv,ATGGAACCTCGAAATTTGATCTTAAACAAAATTATTTGTAGAGGCT...,GAATTCCCACGATGCACGATACCAGCTTGGGCCTTTCAGGGCAAAG...,TGTTGGACCAAAACGAAAAAAGACGCTCGAAAGCGTCTCTTT,-3.420203,True,3.420203
97514,De Novo Designs (Fig 5 b and c).csv,ATGGAACCTCGAAATTTGATCTTAAACAAAATTATTTGTAGAGGCT...,GAATTCGGTCTATTTTAGACGAAGGACGATCTAAGTCCAGTCTTGT...,TGTTGGACCAAAACGAAAAAAGACGCTCGAAAGCGTCTCTTT,-5.042672,True,5.042672
97515,De Novo Designs (Fig 5 b and c).csv,ATGGAACCTCGAAATTTGATCTTAAACAAAATTATTTGTAGAGGCT...,GAATTCTAGGTTGCGGTATACCACACGTAGGCCTAGGTGTCAAACC...,TGTTGGACCAAAACGAAAAAAGACGCTCGAAAGCGTCTCTTT,-6.237482,True,6.237482


In [12]:
# Write the augmented table to disk without the positional index column.
augmented_csv_path = '../Data/Augmented/reverse_LaFleur_supp.csv'
df.to_csv(augmented_csv_path, index=False)