In [25]:
pip install seaborn


Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.3/293.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [20]:
# Builds two lists from the TSV result files -- the unique name fragments and the sorted unique sequences -- for use when building the tables below.

import os

def read_sequence_column(tsv_file):
    """Return a one-element set holding the 'Sequence' field of the file's first line.

    The position of the 'Sequence' column is looked up by name in the file's
    second line (index 1); the value is then read from the first line (index 0).

    Args:
        tsv_file: Path to a tab-separated text file.

    Returns:
        A set containing the single extracted string.

    Raises:
        IndexError: If the file has fewer than two lines, or the first line
            has too few fields to reach the 'Sequence' position.
        ValueError: If the second line has no 'Sequence' field.
    """
    with open(tsv_file, 'r') as handle:
        rows = list(handle)

    # The column names live on the second line of the file, not the first.
    column_names = rows[1].strip().split('\t')
    sequence_position = column_names.index('Sequence')

    # Only the first line is consulted; the value is wrapped in a set so the
    # caller can merge it into its accumulator with set.update().
    first_fields = rows[0].strip().split('\t')
    return {first_fields[sequence_position]}

def extract_unique_name_fragments(tsv_file, unique_name_fragments_set):
    """Extract the name fragment between "bbkstar_results_" and the next "_A".

    Only the file's basename is inspected. The fragment is recorded in
    ``unique_name_fragments_set`` the first time it is seen.

    Args:
        tsv_file: Path whose basename is expected to contain
            "bbkstar_results_<fragment>_A...".
        unique_name_fragments_set: Set of fragments seen so far; mutated in
            place when a new fragment is found.

    Returns:
        The fragment if it was not yet in the set, otherwise None. None is
        also returned (and the set left untouched) when the basename does not
        contain the prefix or has no "_A" after it.
    """
    prefix = "bbkstar_results_"
    basename = os.path.basename(tsv_file)

    # str.find returns -1 on a miss; without this check the slice below would
    # silently start at a meaningless offset.
    prefix_at = basename.find(prefix)
    if prefix_at == -1:
        return None
    start_index = prefix_at + len(prefix)

    # Search for the terminator only *after* the prefix, so an "_A" occurring
    # earlier in the file name cannot produce an empty or inverted slice.
    end_index = basename.find("_A", start_index)
    if end_index == -1:
        return None

    unique_name_fragment = basename[start_index:end_index]

    if unique_name_fragment in unique_name_fragments_set:
        return None
    unique_name_fragments_set.add(unique_name_fragment)
    return unique_name_fragment


def get_tsv_files_paths(base_dir):
    """Recursively collect the paths of all files ending in '.tsv' under base_dir.

    Args:
        base_dir: Directory at which the os.walk traversal starts.

    Returns:
        A list of paths (each directory joined with the file name), in the
        order os.walk yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(base_dir)
        for name in names
        if name.endswith('.tsv')
    ]

if __name__ == "__main__":
    # Root of the directory tree searched for .tsv files.
    base_directory = "/home/vlad/Desktop/Sucessful_Osprey_February/result"  # Replace this with the actual path to the parent directory
    tsv_files_paths = get_tsv_files_paths(base_directory)

    # Accumulators filled while scanning the files.
    unique_name_fragments = set()
    unique_sequences = set()

    for tsv_file in tsv_files_paths:
        unique_sequences.update(read_sequence_column(tsv_file))
        # Called for its side effect: it adds any new fragment to the set.
        extract_unique_name_fragments(tsv_file, unique_name_fragments)

    # Built once, after the loop, so the name is always bound -- even when no
    # file produced a fragment (previously that case raised NameError at the
    # print below). The element order follows set iteration order.
    list_of_unique_name_fragments = list(unique_name_fragments)

    # Sort the unique sequences by numerical values (ignoring letters).
    # Note: a sequence containing no digits makes int('') raise ValueError.
    sorted_unique_sequences = sorted(unique_sequences, key=lambda x: int(''.join(filter(str.isdigit, x))))

    # Print the resulting lists (kept inside the guard because both names are
    # only bound when the guarded code has run).
    print(list_of_unique_name_fragments)
    print(sorted_unique_sequences)



['6y4o', '2bcx', '6xxf']
['A11=glu', 'A14=glu', 'A15=ala', 'A18=leu', 'A19=phe', 'A27=ile', 'A32=leu', 'A35=val', 'A36=met', 'A38=ser', 'A39=leu', 'A41=gln', 'A47=glu', 'A51=met', 'A52=ile', 'A54=glu', 'A55=val', 'A63=ile', 'A68=phe', 'A71=met', 'A72=met', 'A74=arg', 'A75=lys']


In [21]:
# Builds two tables from the TSV data that can be used to draw a heatmap: merged_df1 (K* score differences) and merged_df2 (the matching mutant labels).

import os
import pandas as pd

def read_sequence_column(tsv_file):
    """Read two sequence labels and a K* score difference from a TSV file.

    Column positions are looked up by name in the file's second line
    (index 1). Values are then read from the first line (index 0) and the
    third line (index 2).

    Args:
        tsv_file: Path to a tab-separated text file.

    Returns:
        A tuple ``(sequence_wt, sequence_mut, k_delta)`` where
        ``sequence_wt`` is the full 'Sequence' field of the first line,
        ``sequence_mut`` is the text between the first and second '=' (or to
        the end) of the third line's 'Sequence' field, and ``k_delta`` is the
        third line's 'K* Score (Log10)' minus the first line's, as a float.

    Raises:
        IndexError: If the file has fewer than three lines, a row is too
            short, or the third line's 'Sequence' field contains no '='.
        ValueError: If a required column name is missing or a score cannot be
            parsed as a float.
    """
    def fields(raw_line):
        # Same tokenisation for every row: trim surrounding whitespace, then split on tabs.
        return raw_line.strip().split('\t')

    with open(tsv_file, 'r') as handle:
        rows = list(handle)

    # Column names are on the second line of the file.
    column_names = fields(rows[1])
    sequence_position = column_names.index('Sequence')

    first_row = fields(rows[0])
    third_row = fields(rows[2])

    score_position = column_names.index('K* Score (Log10)')

    # Difference is (third line) - (first line).
    k_delta = float(third_row[score_position]) - float(first_row[score_position])

    # The first line's label is returned whole; the third line's label keeps
    # only the part after '=' (e.g. "A11=asp" -> "asp").
    sequence_wt = first_row[sequence_position]
    sequence_mut = third_row[sequence_position].split('=')[1]

    return sequence_wt, sequence_mut, k_delta

def get_tsv_files_paths(base_dir):
    """Walk base_dir recursively and return the paths of every '.tsv' file found.

    Behaves the same as the function of this name defined in the In [20]
    cell; running this cell simply rebinds the name.

    Args:
        base_dir: Directory at which the os.walk traversal starts.

    Returns:
        A list of joined directory/file paths in os.walk order.
    """
    collected = []
    for folder, _, names in os.walk(base_dir):
        collected.extend(
            os.path.join(folder, name) for name in names if name.endswith('.tsv')
        )
    return collected

if __name__ == "__main__":
    base_directory = "/home/vlad/Desktop/Sucessful_Osprey_February/result"
    tsv_files_paths = get_tsv_files_paths(base_directory)

    # NOTE: list_of_unique_name_fragments is not defined in this cell; it must
    # already exist in the kernel (the In [20] cell assigns it).

    # Column-oriented accumulators: one 'WT_amino_acids' label column plus one
    # column per name fragment. data_dict1 collects k_delta values,
    # data_dict2 collects the matching mutant labels.
    data_dict1 = {'WT_amino_acids': []}
    data_dict2 = {'WT_amino_acids': []}
    for name_fragment in list_of_unique_name_fragments:
        data_dict1[name_fragment], data_dict2[name_fragment] = [], []

    for tsv_file in tsv_files_paths:
        file_name = os.path.basename(tsv_file)
        fragment_hits = [
            (fragment, fragment in file_name)
            for fragment in list_of_unique_name_fragments
        ]

        # Files whose name contains none of the known fragments are skipped.
        if not any(hit for _, hit in fragment_hits):
            continue

        sequence_wt, sequence_mut, k_delta = read_sequence_column(tsv_file)

        # Each processed file contributes exactly one row to both tables.
        data_dict1['WT_amino_acids'].append(sequence_wt)
        data_dict2['WT_amino_acids'].append(sequence_wt)

        # Fill the column(s) whose fragment occurs in the file name; every
        # other column gets NaN so all columns stay the same length.
        for fragment, hit in fragment_hits:
            data_dict1[fragment].append(k_delta if hit else float('nan'))
            data_dict2[fragment].append(sequence_mut if hit else float('nan'))

    # Build both frames indexed by the WT label and sorted on that index.
    df1 = (
        pd.DataFrame(data_dict1)
        .set_index('WT_amino_acids')
        .sort_values(by='WT_amino_acids', ascending=True)
    )
    df2 = (
        pd.DataFrame(data_dict2)
        .set_index('WT_amino_acids')
        .sort_values(by='WT_amino_acids', ascending=True)
    )

    # Collapse rows sharing a WT label; max() skips NaN, so each column keeps
    # its largest available value per label.
    merged_df1 = df1.groupby('WT_amino_acids').max()
    
def custom_merge(series):
    """Collapse each column of a group into a single comma-separated string.

    Despite the parameter name, this is applied to each group produced by a
    DataFrame groupby, so ``series.apply`` runs the inner function once per
    column.

    Args:
        series: The rows of one group (a DataFrame when used via
            ``DataFrame.groupby(...).apply``).

    Returns:
        One value per column: the unique non-null entries joined with ', '
        in order of first appearance, or the literal string "NaN" when the
        column has no non-null entry.
    """
    def join_present(column):
        if not column.notnull().any():
            return "NaN"
        return ', '.join(column.dropna().unique())

    return series.apply(join_present)

# Collapse rows of df2 that share the same 'WT_amino_acids' index label,
# using custom_merge to combine each group's columns.
merged_df2 = (
    df2
    .groupby('WT_amino_acids')
    .apply(custom_merge)
)

# Show both merged tables, one after the other.
print(merged_df1, merged_df2, sep='\n')


                    6y4o      2bcx      6xxf
WT_amino_acids                              
A11=glu              NaN  0.000000  0.485841
A14=glu              NaN  0.231432  0.267061
A15=ala              NaN  0.000000  1.990904
A18=leu              NaN  0.000000  0.144689
A19=phe         0.000000  0.000000  0.000000
A27=ile         0.206819  0.008724  0.000000
A32=leu         0.000000  0.015981  0.000000
A35=val         0.325599  0.000000  2.673205
A36=met         0.000000  0.000000  0.000000
A38=ser         3.897585  3.471798  4.589735
A39=leu         3.141124  0.000000  0.000000
A41=gln         2.593957  3.167038  0.289900
A47=glu              NaN  0.000000       NaN
A51=met         0.000000  0.000000  0.627191
A52=ile         0.063553  0.161253  0.000000
A54=glu              NaN  0.926277  4.178508
A55=val         0.000000  0.000000  0.000000
A63=ile         0.289148  0.113335  0.000000
A68=phe         0.000000  0.000000  0.000000
A71=met         0.000000  0.000000  0.000000
A72=met   