In [1]:
# This cell reads the per-codon text files generated by SNAP.pl and writes
# an Excel workbook holding the per-codon dn and ds values for each subtype.
import os
import numpy as np
import pandas as pd

# Directory containing the per-codon text files
directory = 'G:\\Projects\\HIV_updated\\snap_outputs\\2021\\codons'

# Workbook that receives one sheet per input text file.
# (Previously built with os.path.join(directory, <absolute path>), which on
# Windows just returns the absolute path and elsewhere yields a wrong path.)
output_filepath = 'G:\\Projects\\HIV_updated\\snap_outputs\\2021\\2021.xlsx'


def parse_comparison(comparison):
    """Return the [syn, non] values of one comparison as an (n_rows, 2) float array.

    Parameters
    ----------
    comparison : str
        The text that follows one 'This is comparison' marker.

    Notes
    -----
    After stripping, the first two lines are skipped: the first is the
    remainder of the marker line, the second is presumably the column-title
    line (TODO confirm against the SNAP.pl output). Every following non-blank
    line is one row; its whitespace-separated fields 6 and 7 are read as
    'syn' and 'non'. Rows with fewer than 8 fields are padded with '0.00',
    so blank syn/non cells count as zero.
    """
    rows = []
    for line in comparison.strip().split('\n')[2:]:
        fields = line.split()
        if not fields:
            # A blank line is not a codon row; padding it would add a
            # phantom all-zero row and shift the codon numbering.
            continue
        # Fill blank trailing cells with zeros (no-op when the row is complete)
        fields += ['0.00'] * (8 - len(fields))
        # Slice per row so rows carrying extra trailing fields stay rectangular
        rows.append(fields[6:8])
    # reshape keeps a well-formed (0, 2) array when there are no rows
    return np.array(rows, dtype=float).reshape(-1, 2)


def average_codon_matrix(input_text, source='<text>'):
    """Average the per-codon [syn, non] values over every comparison in a file.

    Parameters
    ----------
    input_text : str
        Full content of one per-codon text file.
    source : str
        Label (e.g. the file name) used in error messages.

    Returns
    -------
    numpy.ndarray or None
        (n_rows, 2) array of averages rounded to two decimals, or None when
        the text contains no comparison at all.

    Raises
    ------
    ValueError
        If the comparisons do not all have the same number of rows, since
        they are averaged position by position.
    """
    # str.split puts whatever precedes the first marker at index 0; that text
    # is not a comparison, so it is dropped instead of being parsed as one.
    chunks = input_text.split('This is comparison')[1:]
    matrices = [parse_comparison(chunk) for chunk in chunks if chunk.strip()]

    if not matrices:
        return None

    row_counts = {len(matrix) for matrix in matrices}
    if len(row_counts) > 1:
        raise ValueError(
            f'{source}: comparisons have differing numbers of codon rows '
            f'{sorted(row_counts)}; cannot average them position by position'
        )

    # Sum over comparisons, divide by their count, round to two decimals
    return np.round(np.sum(matrices, axis=0) / len(matrices), 2)


def build_codon_table(avg_matrix):
    """Return a DataFrame with columns ['codon', 'syn', 'non'].

    'codon' is the 1-based row position in `avg_matrix`.
    """
    table = pd.DataFrame(avg_matrix, columns=['syn', 'non'])
    table.insert(0, 'codon', np.arange(1, len(table) + 1))
    return table


# In a notebook kernel __name__ is '__main__', so this driver still runs when
# the cell is executed; the guard only keeps the file-system work from running
# when the helpers above are imported (for example by a test).
if __name__ == "__main__":
    # Results for each file, keyed by file name
    results = {}

    # sorted() makes the sheet order reproducible; os.listdir order is unspecified
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(directory, filename)
        with open(filepath, 'r') as file:
            input_text = file.read()

        avg_matrix = average_codon_matrix(input_text, source=filename)
        if avg_matrix is None:
            # Nothing to average; skip rather than fail on an empty sum / 0
            print(f'Skipping {filename}: no comparisons found')
            continue

        results[filename] = build_codon_table(avg_matrix)

    if not results:
        # Fail before the writer is opened, so no empty workbook is left behind
        raise ValueError(f'No usable .txt files found in {directory}')

    # Write each DataFrame to its own sheet, named after the file (no extension)
    with pd.ExcelWriter(output_filepath) as writer:
        for filename, df in results.items():
            sheet_name = os.path.splitext(filename)[0]
            df.to_excel(writer, sheet_name=sheet_name, index=False)


In [2]:
import pandas as pd

# Workbook with one per-codon sheet per subtype; the summary is appended to it
output_filepath = 'G:\\Projects\\HIV_updated\\snap_outputs\\2021\\2021.xlsx'

# Name of the sheet this cell adds to the workbook
summary_sheet = 'Summary'


def dn_ds_ratio(codon_df):
    """Return mean of the 'non' column divided by mean of the 'syn' column.

    Parameters
    ----------
    codon_df : pandas.DataFrame
        Table with numeric 'syn' and 'non' columns.

    Returns
    -------
    float
        mean(non) / mean(syn), or NaN when mean(syn) is zero, because the
        ratio is undefined there (plain division would give inf or NaN
        together with a RuntimeWarning).
    """
    avg_syn = codon_df['syn'].mean()
    avg_non = codon_df['non'].mean()
    if avg_syn == 0:
        return float('nan')
    return avg_non / avg_syn


# In a notebook kernel __name__ is '__main__', so this driver still runs when
# the cell is executed; the guard only keeps the file-system work from running
# when dn_ds_ratio is imported (for example by a test).
if __name__ == "__main__":
    # Sheet name -> non/syn ratio
    non_syn_values = {}

    # Open the workbook once and close it deterministically before appending
    with pd.ExcelFile(output_filepath, engine='openpyxl') as excel_file:
        for sheet_name in excel_file.sheet_names:
            if sheet_name == summary_sheet:
                # Left over from an earlier run of this cell; it has no
                # 'syn'/'non' columns and is rewritten below.
                continue
            # Parse from the already-open workbook instead of re-reading the file
            df = excel_file.parse(sheet_name)
            non_syn_values[sheet_name] = dn_ds_ratio(df)

    # Create a new DataFrame with the sheet names and non/syn values
    summary_df = pd.DataFrame(list(non_syn_values.items()), columns=['Subtypes', 'dn/ds'])

    # Append to the existing workbook; 'replace' keeps the cell re-runnable
    # when the summary sheet already exists (requires pandas >= 1.3).
    with pd.ExcelWriter(output_filepath, mode='a', engine='openpyxl',
                        if_sheet_exists='replace') as writer:
        summary_df.to_excel(writer, sheet_name=summary_sheet, index=False)
