# Description

Generates augmented reverse_LaFleur_h10_h35.csv data that includes reversed complements for only -10 and -35. Does not reverse if Observed log(TX/Txref) == 0.

In [1]:
import pandas as pd

In [2]:
# Load the data
original_df = pd.read_csv('../Data/LaFleur/41467_2022_32829_MOESM5_ESM.csv')[['UP', 'h35', 'spacs', 'h10', 'disc', 'ITR', 'Observed']]
original_df['Observed log(TX/Txref)'] = original_df['Observed']
original_df['Reversed'] = False
original_df['Promoter Sequence'] = original_df['UP'] + original_df['h35'] + original_df['spacs'] + original_df['h10'] + original_df['disc'] + original_df['ITR']
original_df['Sequence ID'] = original_df.index
original_df.head()

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed,Observed log(TX/Txref),Reversed,Promoter Sequence,Sequence ID
0,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CCCCGCGG,CTCTACCTTAGTTTGTACGTT,-3.386326,-3.386326,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,0
1,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGCGGCGG,CTCTACCTTAGTTTGTACGTT,-3.50314,-3.50314,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,1
2,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGCGCCCG,CTCTACCTTAGTTTGTACGTT,-4.207206,-4.207206,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,2
3,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,GCGGCGGC,CTCTACCTTAGTTTGTACGTT,-3.392439,-3.392439,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATG...,3
4,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGGGGGGC,CTCTACCTTAGTTTGTACGTT,-3.698903,-3.698903,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,4


In [3]:
def reverse_complement(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return ''.join(complement[base.upper()] for base in reversed(seq))

reversed_data = []
# Iterate over the rows of the dataframe
for i, row in original_df.iterrows():
    if row['Observed log(TX/Txref)'] != 0:
        rev_h35 = reverse_complement(row['h35'])
        rev_h10 = reverse_complement(row['h10'])

        rev_promoter = row['UP'] + rev_h35 + row['spacs'] + rev_h10 + row['disc'] + row['ITR']

        reversed_data.append({
            'UP': row['UP'],
            'h35': rev_h10,
            'spacs': row['spacs'],
            'h10': rev_h35,
            'disc': row['disc'],
            'ITR': row['ITR'],
            'Observed log(TX/Txref)': -row['Observed log(TX/Txref)'],
            'Reversed' : True,
            'Promoter Sequence' : rev_promoter,
            'Sequence ID' : row['Sequence ID']
        })

# Create a new dataframe from the reversed data
df_reversed = pd.DataFrame(reversed_data)
df_reversed.head()

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed log(TX/Txref),Reversed,Promoter Sequence,Sequence ID
0,TTTTCTATCTACGTAC,ATTATA,CTATTTCCTATTTCTCT,TGTCAA,CCCCGCGG,CTCTACCTTAGTTTGTACGTT,3.386326,True,TTTTCTATCTACGTACTGTCAACTATTTCCTATTTCTCTATTATAC...,0
1,TTTTCTATCTACGTAC,ATTATA,CTATTTCCTATTTCTCT,TGTCAA,CGCGGCGG,CTCTACCTTAGTTTGTACGTT,3.50314,True,TTTTCTATCTACGTACTGTCAACTATTTCCTATTTCTCTATTATAC...,1
2,TTTTCTATCTACGTAC,ATTATA,CTATTTCCTATTTCTCT,TGTCAA,CGCGCCCG,CTCTACCTTAGTTTGTACGTT,4.207206,True,TTTTCTATCTACGTACTGTCAACTATTTCCTATTTCTCTATTATAC...,2
3,TTTTCTATCTACGTAC,ATTATA,CTATTTCCTATTTCTCT,TGTCAA,GCGGCGGC,CTCTACCTTAGTTTGTACGTT,3.392439,True,TTTTCTATCTACGTACTGTCAACTATTTCCTATTTCTCTATTATAG...,3
4,TTTTCTATCTACGTAC,ATTATA,CTATTTCCTATTTCTCT,TGTCAA,CGGGGGGC,CTCTACCTTAGTTTGTACGTT,3.698903,True,TTTTCTATCTACGTACTGTCAACTATTTCCTATTTCTCTATTATAC...,4


In [4]:
# Merge the original and reversed dataframes
df = pd.concat([original_df, df_reversed], ignore_index=True)
df.head(-10)

Unnamed: 0,UP,h35,spacs,h10,disc,ITR,Observed,Observed log(TX/Txref),Reversed,Promoter Sequence,Sequence ID
0,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CCCCGCGG,CTCTACCTTAGTTTGTACGTT,-3.386326,-3.386326,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,0
1,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGCGGCGG,CTCTACCTTAGTTTGTACGTT,-3.503140,-3.503140,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,1
2,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGCGCCCG,CTCTACCTTAGTTTGTACGTT,-4.207206,-4.207206,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,2
3,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,GCGGCGGC,CTCTACCTTAGTTTGTACGTT,-3.392439,-3.392439,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATG...,3
4,TTTTCTATCTACGTAC,TTGACA,CTATTTCCTATTTCTCT,TATAAT,CGGGGGGC,CTCTACCTTAGTTTGTACGTT,-3.698903,-3.698903,False,TTTTCTATCTACGTACTTGACACTATTTCCTATTTCTCTTATAATC...,4
...,...,...,...,...,...,...,...,...,...,...,...
26946,TTTTCTATCTACGTAC,TCTTTA,TATTTCCTATTTCTG,TAATGA,ACGTGTT,CTCTACCTTAGTTTGTACGTT,,2.523616,True,TTTTCTATCTACGTACTAATGATATTTCCTATTTCTGTCTTTAACG...,13466
26947,TTTTCTATCTACGTAC,GACGTA,TATTTCCTATTTCTG,TCTGTC,TAGTGTT,CTCTACCTTAGTTTGTACGTT,,2.357943,True,TTTTCTATCTACGTACTCTGTCTATTTCCTATTTCTGGACGTATAG...,13467
26948,TTTTCTATCTACGTAC,GAGGTA,TATTTCCTATTTCTG,TCTGTC,TAGTGTT,CTCTACCTTAGTTTGTACGTT,,2.011013,True,TTTTCTATCTACGTACTCTGTCTATTTCCTATTTCTGGAGGTATAG...,13468
26949,TTTTCTATCTACGTAC,TATGTA,TATTTCCTATTTCTG,TCTGTC,TAGTGTT,CTCTACCTTAGTTTGTACGTT,,1.962556,True,TTTTCTATCTACGTACTCTGTCTATTTCCTATTTCTGTATGTATAG...,13469


In [5]:
df.to_csv('../Data/Augmented/reverse_LaFleur_h10_h35.csv', index=False)