In [None]:
import pandas as pd

def read_sequences_from_file(file_path):
    """Read sequence lines from a FASTA-style text file.

    Lines starting with '>' are treated as headers and skipped. Every other
    line is stripped of surrounding whitespace and returned as its own
    sequence, so a record wrapped over several lines yields several entries.
    Blank or whitespace-only lines are ignored instead of being returned as
    empty sequences.

    Args:
        file_path: Path of the file to read.

    Returns:
        list[str]: The non-empty, non-header lines in file order.
    """
    sequences = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('>'):
                continue
            sequence = line.strip()
            # A blank line (e.g. a trailing newline at end of file) is not a sequence.
            if sequence:
                sequences.append(sequence)
    return sequences

def find_first_sequence_with_gataa(sequences):
    """Return the first sequence that contains 'GATAA'.

    Args:
        sequences: Iterable of sequence strings, scanned in order.

    Returns:
        The first matching sequence, or None when no sequence contains the motif.
    """
    matches = (candidate for candidate in sequences if 'GATAA' in candidate)
    return next(matches, None)

def generate_substrings(base_str):
    """Return the unique proper substrings of ``base_str``, longest first.

    The full string itself is excluded. Substrings of equal length keep the
    order of their first (leftmost) occurrence in ``base_str``, so the result
    is identical on every run. De-duplicating through a ``set`` would leave
    ties in an order that depends on string hash randomization.

    Args:
        base_str: The string to take substrings from.

    Returns:
        list[str]: Unique non-empty substrings shorter than ``base_str``,
        sorted by decreasing length.
    """
    length = len(base_str)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique = dict.fromkeys(
        base_str[start:stop]
        for start in range(length)
        for stop in range(start + 1, length + 1)
        if stop - start != length
    )
    # sorted() is stable, so equal-length substrings stay in first-seen order.
    return sorted(unique, key=len, reverse=True)

def align_sequences(sequences, reference_sequence, motif='GATAA'):
    """Align sequences to a reference by the position of a shared motif.

    Each sequence is placed on a common coordinate axis so that its anchor
    lines up with the same text in ``reference_sequence``:

    * If the sequence contains ``motif``, the anchor is the first occurrence
      of the motif, matched against the motif's first occurrence in the
      reference.
    * Otherwise the anchor is the first (longest-first) entry of
      ``generate_substrings(motif)`` found in the sequence, matched against
      the first occurrence of that same substring in the reference.
      NOTE(review): that occurrence is located with ``str.find`` over the
      whole reference, so it is not necessarily inside the motif - confirm
      this is the intended anchoring for short fragments.
    * Sequences containing neither are left out of the result.

    Args:
        sequences: Iterable of sequence strings.
        reference_sequence: Sequence that defines the coordinate system; it
            must contain ``motif``.
        motif: Motif used as the alignment anchor (default ``'GATAA'``).

    Returns:
        pandas.DataFrame: One row per aligned sequence and one column per
        position (one character per cell, NaN where the sequence has no
        character). Columns are positions -7..14, followed by any further
        positions that occur, in ascending order.

    Raises:
        ValueError: If ``reference_sequence`` does not contain ``motif``.
    """
    ref_index = reference_sequence.find(motif)
    if ref_index == -1:
        raise ValueError(f"Reference sequence does not contain {motif!r}.")
    substrings = generate_substrings(motif)

    base_columns = list(range(-7, 15))

    # Collect one {position: character} record per aligned sequence and build
    # the frame once at the end. DataFrame.append was removed in pandas 2.0,
    # and growing a frame row by row is quadratic.
    rows = []
    for sequence in sequences:
        if motif in sequence:
            seq_anchor = sequence.find(motif)
            ref_anchor = ref_index
        else:
            fragment = next((sub for sub in substrings if sub in sequence), None)
            if fragment is None:
                # Nothing to anchor on: the sequence is not aligned.
                continue
            seq_anchor = sequence.find(fragment)
            ref_anchor = reference_sequence.find(fragment)

        offset = ref_anchor - seq_anchor
        rows.append({offset + i: base for i, base in enumerate(sequence)})

    # Positions outside the base window still get a column, after the window.
    extra_columns = sorted(
        {position for row in rows for position in row} - set(base_columns)
    )
    return pd.DataFrame(rows, columns=base_columns + extra_columns)

# Input FASTA file for this run.
input_file_path = 'Binned data/bin_10/bin_10.fasta'
# NOTE(review): this CSV path is assigned but not used anywhere in this cell.
output_file_path = 'aligned_sequences2.csv'

# Load every non-header line of the input file as a sequence.
sequences = read_sequences_from_file(input_file_path)

# The first sequence containing 'GATAA' serves as the alignment reference.
reference_sequence = find_first_sequence_with_gataa(sequences)
if not reference_sequence:
    # Without a reference there is nothing to align against, so stop here.
    raise ValueError("No sequence containing 'GATAA' was found in the file.")

# Align all sequences against the reference; the result is kept in df_aligned.
df_aligned = align_sequences(sequences, reference_sequence)

In [57]:
# Bare last expression: the notebook renders the aligned frame as the cell output.
df_aligned

Unnamed: 0,-7,-6,-5,-4,-3,-2,-1,0,1,2,...,5,6,7,8,9,10,11,12,13,14
0,,,,,,,,,,,...,A,A,A,G,C,G,G,,,
1,,,,,A,A,A,C,G,A,...,,,,,,,,,,
2,,,,,,C,T,C,G,A,...,A,,,,,,,,,
3,,,,,,,,G,T,A,...,A,A,A,,,,,,,
4,,,,,,,,,G,A,...,C,G,A,A,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4493,,,,,G,T,A,G,T,A,...,,,,,,,,,,
4494,,,,,,A,G,C,T,T,...,A,,,,,,,,,
4495,,,,,,,A,C,G,A,...,G,C,,,,,,,,
4496,,,,,,,,,,A,...,C,T,T,T,A,,,,,


# Generating a text file for WebLogo

In [None]:
# Keep positions 0-7 of the alignment, skipping any position that has no column.
columns_to_extract = [position for position in range(8) if position in df_aligned.columns]

# Cells without a character become '-' in the exported text.
extracted_data = df_aligned[columns_to_extract].fillna('-')

# Concatenate each row's characters into a single string per sequence.
formatted_data = extracted_data.astype(str).apply(''.join, axis=1)

# Write one aligned sequence per line.
output_file_path = 'BIN10_aligned.txt'
with open(output_file_path, 'w') as file:
    file.writelines(line + '\n' for line in formatted_data)