# My Analysis

In [None]:
# For every (column 0, column 1) key in pollutionData.txt keep the record with the
# smallest distance (column 5), considering only distances below 400.
data_dict = {}

with open('pollutionData.txt', 'r') as input_file:
    next(input_file)  # skip the first line (presumably a header row)
    for line in input_file:
        values = line.strip().split('\t')
        key = (values[0], values[1])
        distance = float(values[5])

        # Anything not strictly below the cutoff (including NaN) is never stored.
        if not distance < 400:
            continue

        # Stored distances are always below 400, so beating the current best
        # is the only remaining condition for an existing key.
        best = data_dict.get(key)
        if best is None or distance < best[1]:
            data_dict[key] = (values[3], distance)

# One output line per key: key part 0, key part 1, column 3 value, distance.
with open('smallest_dist.txt', 'w') as output_file:
    for key, value in data_dict.items():
        output_file.write(f"{key[0]}\t{key[1]}\t{value[0]}\t{value[1]}\n")


In [None]:
# Map the pollution in smallest_dist.txt to the sample location in Sunagawa file using the enzyme id in both files
import pandas as pd

sunagawa_data = pd.read_excel('Sunagawa_TableS1.xlsx')
new_data = pd.read_csv('smallest_dist.txt', sep='\t', header=None)

# Lookup table: third-column value -> first-column value of the Sunagawa sheet.
# The first data row is skipped (iloc[1:]) — presumably a sub-header row; confirm.
lookup_keys = sunagawa_data.iloc[1:, 2]
lookup_values = sunagawa_data.iloc[1:, 0]
mapping_dict = {source: target for source, target in zip(lookup_keys, lookup_values)}

# Replace the second column of smallest_dist.txt through the lookup table.
# Entries without a match become NaN and are written out as empty fields.
new_data[1] = new_data[1].map(mapping_dict)

new_data.to_csv('mapped.txt', sep='\t', header=False, index=False)

In [None]:
import pandas as pd

# Keep the first column of the KO profile plus every column whose name contains
# one of the sample names listed in mapped.txt.
file3_path = 'TARA243.KO.profile.release'
df3 = pd.read_csv(file3_path, delimiter='\t')

file2_path = 'mapped.txt'
df2 = pd.read_csv(file2_path, delimiter='\t', header=None, names=['ID', 'Sample', 'Value1', 'Value2'])

df_ko_filtered = df3.iloc[:, 0:1]

# Samples that could not be mapped are stored as empty fields in mapped.txt and
# are read back as NaN; `nan in col` raises TypeError, so drop them and make sure
# every remaining name is a string before the substring test.
sample_names_to_keep = df2['Sample'].dropna().astype(str).unique()
columns_to_keep_ko = [
    col for col in df3.columns
    if any(sample_name in str(col) for sample_name in sample_names_to_keep)
]
df_ko_filtered = pd.concat([df_ko_filtered, df3[columns_to_keep_ko]], axis=1)

output_ko_file_path = 'TARA243_KO_filtered.KO.profile.release'
df_ko_filtered.to_csv(output_ko_file_path, sep='\t', index=False)

In [None]:
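# Calculate the Pearson correlation between each enzyme's abundance across the mapped samples
# (x values, one row of the release file) and the third column of mapped.txt (y values).
# Hypothesis: enzyme abundance and the pollution measurement are correlated.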
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

def calculate_correlation(x_values, y_values):
    """Return the Pearson correlation coefficient of the two sequences.

    Only the coefficient is returned; the p-value computed by `pearsonr`
    is discarded.
    """
    return pearsonr(x_values, y_values)[0]

# `sep=r"\s+"` is the supported spelling of the deprecated `delim_whitespace=True`.
# The second line of the file is skipped (skiprows=[1]).
release_df = pd.read_csv("TARA243.KO.profile.release", sep=r"\s+", skiprows=[1])

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

# Column 1 of mapped.txt names the release-file column of each sample;
# column 2 holds the value each sample is correlated against.
second_values = mapped_data[1].values
y_values = mapped_data[2].values

# Select the sample columns once instead of re-indexing every row by label.
sample_columns = [str(second_value) for second_value in second_values]
abundance = release_df[sample_columns]

# x and y always have one entry per mapped.txt row, so no resizing is needed.
correlation_coefficients = {}
for row_index in range(len(release_df)):
    x_values = abundance.iloc[row_index].tolist()
    correlation_coefficients[row_index] = calculate_correlation(x_values, y_values)

# Sort from highest to lowest coefficient. Rows with constant values yield NaN,
# and NaN breaks a plain comparison sort, so NaN entries are ordered last.
sorted_coefficients = sorted(
    correlation_coefficients.items(),
    key=lambda item: (not np.isnan(item[1]), item[1]),
    reverse=True,
)

# sorted_coefficients is a list of (row_index, correlation_coefficient) tuples.

# First-column values of the release file, used to label each row in the report.
first_column_values = release_df.iloc[:, 0].values

with open("sorted_coefficients.txt", "w") as file:
    for row_index, correlation_coefficient in sorted_coefficients:
        first_value = first_column_values[row_index]
        file.write(f"Row {row_index}: Enzyme = {first_value}, Correlation = {correlation_coefficient}\n")


# T-Distribution

In [None]:
import numpy as np
import scipy.stats as stats

def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    """Convert a correlation coefficient into a t statistic and its upper-tail probability.

    Parameters:
        correlation_coefficient: correlation value r.
        degrees_of_freedom: degrees of freedom of the t distribution.

    Returns:
        (x, probability) where x = r * sqrt(df) / sqrt(1 - r**2) and
        probability = P(T > x) for a t distribution with `degrees_of_freedom`.
    """
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    # The survival function evaluates the upper tail directly; `1 - cdf` rounds to
    # exactly 0 once the tail drops below floating-point resolution.
    probability = stats.t.sf(x, degrees_of_freedom)
    return x, probability

# Degrees of freedom for the t distribution (presumably n - 2 with 41 samples — confirm).
degrees_of_freedom = 39

# Rebuild sorted_coefficients.txt, this time with the t statistic and its
# upper-tail probability appended to every row.
report_lines = []
for row_index, correlation_coefficient in sorted_coefficients:
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
    first_value = first_column_values[row_index]
    report_lines.append(
        f"Row {row_index}: First Value = {first_value}, Correlation Coefficient = {correlation_coefficient}, x = {x}, Probability = {probability}\n"
    )

with open("sorted_coefficients.txt", "w") as file:
    file.writelines(report_lines)


In [None]:
# Walk sorted_coefficients from the top and accumulate each row's t-distribution
# probability for as long as the running total stays at or below 1.
# The accepted rows (minus the last one) are stored in significant_enzymes.txt.

significant_rows = []
total_probability = 0

for row_index, correlation_coefficient in sorted_coefficients:
    first_value = first_column_values[row_index]
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    # Stop at the first row that would push the total above 1 (a NaN probability also stops here).
    running_total = total_probability + probability
    if not running_total <= 1:
        break

    significant_rows.append((row_index, first_value, correlation_coefficient, x, probability))
    total_probability = running_total

# mapped.txt is reloaded here; these values are not used in the rest of this cell.
mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)
second_values = mapped_data[1].values
y_values = mapped_data[2].values

# The last accepted row is discarded as well (no-op when nothing was accepted).
significant_rows = significant_rows[:-1]

with open("significant_enzymes.txt", "w") as file:
    for entry in significant_rows:
        file.write(f"Enzyme = {entry[1]}, Correlation = {entry[2]}\n")

# Mean and Standard Deviation

In [None]:
# Read enzyme names from significant_enzymes.txt.
# Each line looks like "Enzyme = <name>, Correlation = <value>".
enzyme_names = []
with open('significant_enzymes.txt', 'r') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines
        enzyme = line.split('=')[1].split(',')[0].strip()
        enzyme_names.append(enzyme)

# Set membership keeps the scan over the release file linear.
enzyme_name_set = set(enzyme_names)

# Copy the header and every matching row of TARA243.KO.profile.release to updated_releasefile.txt
with open('TARA243.KO.profile.release', 'r') as release_file, open('updated_releasefile.txt', 'w') as updated_file:
    # Copy the first row of the release file
    first_row = release_file.readline()
    updated_file.write(first_row)

    # A row matches when its first whitespace-separated field is a significant enzyme.
    for line in release_file:
        data = line.split()
        if data and data[0] in enzyme_name_set:  # `data` is empty for blank lines
            updated_file.write(line)


In [None]:
# Calculate the mean and standard deviation
import pandas as pd

# Read the significant enzymes file to extract enzyme names
# (third space-separated token of each line, with the trailing comma removed).
enzyme_names = []
with open("significant_enzymes.txt", "r") as file:
    for line in file:
        enzyme = line.split(' ')[2].replace(",", "")
        enzyme_names.append(enzyme)

# Read the release file (`sep=r"\s+"` replaces the deprecated `delim_whitespace=True`).
release_df = pd.read_csv("TARA243.KO.profile.release", sep=r"\s+")

# Rows of the release file for each enzyme. Exact equality is used instead of
# `str.contains`, which matches substrings, interprets the name as a regular
# expression, and fails on missing values in the 'ko' column.
enzyme_rows = {
    enzyme: release_df[release_df['ko'] == enzyme]
    for enzyme in enzyme_names
}

# Mean and standard deviation across all columns after the first, per matching row.
with open("mean&sd.txt", "w") as file:
    for enzyme, rows in enzyme_rows.items():
        abundances = rows.iloc[:, 1:]
        means = abundances.mean(axis=1)
        std_devs = abundances.std(axis=1)
        for mean, std_dev in zip(means, std_devs):
            file.write(f"Enzyme: {enzyme} Mean: {mean} Standard Deviation: {std_dev}\n")


In [None]:
# Normalize abundance data using significant enzyme's mean and sd
import pandas as pd

# Load the per-enzyme mean and standard deviation from mean&sd.txt.
# Line layout: "Enzyme: <name> Mean: <mean> Standard Deviation: <sd>".
mean_sd_values = {}
with open("mean&sd.txt", "r") as file:
    for line in file:
        data = line.split()
        enzyme, mean, std_dev = data[1], float(data[3]), float(data[6])
        mean_sd_values[enzyme] = {'mean': mean, 'std_dev': std_dev}

# Convert every value of each known enzyme row in updated_releasefile.txt to
# (value - mean) / std_dev. Rows whose first field is not a known enzyme are skipped.
normalized_values = []
with open("updated_releasefile.txt", "r") as file:
    for line in file:
        data = line.split()
        enzyme = data[0]

        enzyme_stats = mean_sd_values.get(enzyme)
        if enzyme_stats is None:
            continue

        mean = enzyme_stats['mean']
        std_dev = enzyme_stats['std_dev']
        normalized_row = [str((float(value) - mean) / std_dev) for value in data[1:]]
        normalized_values.append([enzyme] + normalized_row)

# The header line is taken from the original release file.
with open('TARA243.KO.profile.release', 'r') as release_file:
    first_row = release_file.readline()

# Header first, then one tab-separated line per normalized row.
with open('normalized_releasefile.txt', 'w') as normalized_file:
    normalized_file.write(first_row)
    normalized_file.writelines('\t'.join(row) + '\n' for row in normalized_values)


In [None]:
# Remove non-related samples -> 59x28 (author's note on the expected result size)
# Sample names come from the second tab-separated column of mapped.txt.
with open('mapped.txt', 'r') as mapped_file:
    sample_names = [line.split('\t')[1].strip() for line in mapped_file]
wanted_samples = set(sample_names)

# Copy the enzyme column plus the columns of the mapped samples from
# normalized_releasefile.txt into related_samples_normalized.txt.
with open('normalized_releasefile.txt', 'r') as release_file, open('related_samples_normalized.txt', 'w') as sorted_file:
    # Read the first row of the release file
    first_row = release_file.readline()
    header_fields = first_row.rstrip('\n').split('\t')

    # The enzyme identifier is the first field of every data row.
    first_column_index = 0

    # Positions of the header fields that name a mapped sample.
    indices = [i for i, name in enumerate(header_fields) if name.strip() in wanted_samples]

    # Every output line is terminated explicitly; previously a line only ended in
    # a newline when the last input column happened to be selected.
    # NOTE(review): the header is written WITHOUT a leading label field while data
    # rows start with the enzyme name, so the header is one field shorter than the
    # rows. Cells that read this file's descendants drop header field 0 and shift
    # indices by one; confirm the intended layout before changing either side.
    sorted_file.write('\t'.join(header_fields[i] for i in indices) + '\n')

    # Copy corresponding columns for each enzyme row
    for line in release_file:
        data = line.rstrip('\n').split('\t')
        sorted_file.write(data[first_column_index] + '\t' + '\t'.join(data[i] for i in indices) + '\n')

In [None]:
# Sorted list based on the number of positive numbers in each row from highest to lowest
def count_positive(row):
    """Count the tab-separated fields after the first one whose float value is > 0."""
    positives = 0
    for field in row.split('\t')[1:]:
        if float(field) > 0:
            positives += 1
    return positives

# Load the file: the first line is kept aside as the header, the rest are data rows.
with open('related_samples_normalized.txt', 'r') as sorted_file:
    header = sorted_file.readline()
    data_rows = sorted_file.readlines()

# Order rows by their number of positive values, highest first
# (rows with equal counts keep their original relative order).
sorted_rows = list(data_rows)
sorted_rows.sort(key=count_positive, reverse=True)

# Header first, then the reordered rows.
with open('sorted_normalized.txt', 'w') as output_file:
    output_file.write(header)
    output_file.writelines(sorted_rows)

In [None]:
# Keep only the enzymes whose row has at least `n` positive values
n = 10

# Read sorted_normalized.txt and write the qualifying rows to single.txt
with open('sorted_normalized.txt', 'r') as input_file, open('single.txt', 'w') as output_file:
    # The first row is copied through unchanged.
    first_row = input_file.readline()
    output_file.write(first_row)

    for line in input_file:
        values = line.strip().split('\t')

        # Positive values in the row, ignoring the first column.
        count_positives = 0
        for value in values[1:]:
            if float(value) > 0:
                count_positives += 1

        if count_positives < n:
            continue
        output_file.write(line)

# Binomial Distribution and Mann-Whitney U Test for Combo2

In [None]:
from scipy.stats import binom

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    """Return the binomial probability of exactly `count_matches` successes.

    `total_samples` is the number of trials and `count_percentage` the
    per-trial success probability.
    """
    return binom.pmf(count_matches, total_samples, count_percentage)

# Minimum number of samples in which both enzymes of a pair must be positive.
n = 10

# Fraction of positive values per enzyme row in normalized_releasefile.txt.
with open('normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243  # 243: presumably the total number of samples — confirm
        enzyme_percentages[enzyme_name] = percentage

# Read the content of single.txt
with open('single.txt', 'r') as input_file:
    # Header fields after the first one are used as the sample names of data columns 1..N.
    # NOTE(review): this assumes the header of single.txt starts with a label field;
    # verify against the cell that writes related_samples_normalized.txt.
    sample_names = input_file.readline().strip().split('\t')[1:]

    # Open the output files (combo2.txt, Wilcoxon_y-values2.txt and Wilcoxon_y-samples2.txt)
    with open('combo2.txt', 'w') as output_file, open('Wilcoxon_y-values2.txt', 'w') as y_file, open('Wilcoxon_y-samples2.txt', 'w') as samples_file:
        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Pair', 'Values']) + '\n')

        # Read the remaining rows
        lines = input_file.readlines()
        num_rows = len(lines)

        # Parse every row once instead of re-splitting it for each compared column.
        parsed_rows = []
        for line in lines:
            fields = line.strip().split('\t')
            row_values = [float(fields[column]) for column in range(1, len(sample_names) + 1)]
            parsed_rows.append((fields[0], row_values))

        # Iterate through each pair of rows
        for i in range(num_rows - 1):
            sample1, row_i = parsed_rows[i]
            for j in range(i + 1, num_rows):
                sample2, row_j = parsed_rows[j]

                # Samples in which both enzymes are positive. The sample name is
                # recorded at match time; previously names were assigned by position
                # in the match list, which always yielded the first `count_matches`
                # sample names regardless of which columns had matched.
                positive_values = []
                matched_samples = []
                for sample_name, value_i, value_j in zip(sample_names, row_i, row_j):
                    if value_i > 0 and value_j > 0:
                        positive_values.append(f'{value_i}, {value_j}')
                        matched_samples.append(sample_name)
                count_matches = len(positive_values)

                if count_matches >= n:
                    # Expected co-occurrence rate: product of the two enzymes' positive fractions.
                    percentage_sample1 = enzyme_percentages.get(sample1, 0)
                    percentage_sample2 = enzyme_percentages.get(sample2, 0)
                    fraction = percentage_sample1 * percentage_sample2

                    # 41 trials: presumably the number of retained samples — confirm.
                    distribution = calculate_binomial_distribution(count_matches, 41, fraction)

                    # p-value: probability of at least `count_matches` matches out of 41.
                    p_value = sum(calculate_binomial_distribution(m, 41, fraction) for m in range(count_matches, 42))

                    if p_value < 0.01:
                        output_file.write(f'{sample1}\t{sample2}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                        # Values where both enzymes are positive
                        pair_name = f'{sample1}-{sample2}'
                        values_str = ', '.join(positive_values)
                        y_file.write(f'{pair_name}\t{values_str}\n')

                        # Names of the samples where both enzymes are positive
                        samples_file.write(f'{pair_name}\t{", ".join(matched_samples)}\n')


In [None]:
# For each enzyme of each pair, list the positive values found in single.txt that do
# not appear among the pair's shared values in Wilcoxon_y-values2.txt.
# (A later cell writes 'Wilcoxon_x-values2.txt' again in a per-value layout.)
y_values = {}
with open('Wilcoxon_y-values2.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        pair_name, values_str = parts[0], parts[1]
        y_values[pair_name] = list(map(float, values_str.split(', ')))

# Load single.txt: header fields after the first, then every row split on tabs.
with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = []
    for line in sample_file:
        data.append(line.strip().split('\t'))

with open('Wilcoxon_x-values2.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Values']) + '\n')

    for pair_name, y_positive_values in y_values.items():
        # The pair name is "<enzyme1>-<enzyme2>"; each part is a first-column value of single.txt.
        sample_names = pair_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)

            # Positive values of this enzyme's row.
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]

            # Values (compared by value, not by column) absent from the pair's shared list.
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if not unique_values:
                continue
            x_output_file.write(f'{pair_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [None]:
# For every enzyme of every pair, write one line per positive value that is not among
# the pair's shared values, labelled with the header field looked up for that value.
# This overwrites any existing Wilcoxon_x-values2.txt.
y_values = {}
with open('Wilcoxon_y-values2.txt', 'r') as y_file:
    next(y_file)  # Skip the header
    for line in y_file:
        parts = line.strip().split('\t')
        pair_name = parts[0]
        values_str = parts[1]
        # Shared values of the pair, parsed as floats.
        y_values[pair_name] = [float(value) for value in values_str.split(', ')]

# Read single.txt: header fields after the first one, then all rows split on tabs
with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

# Compare positive values and write to Wilcoxon_x-values2.txt
with open('Wilcoxon_x-values2.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Pair', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for pair_name, y_positive_values in y_values.items():
        # The parts of pair_name are matched against the first column of single.txt,
        # i.e. despite the variable names they are enzyme identifiers.
        sample_names = pair_name.split('-')

        for sample_name in sample_names:
            # Find the single.txt row whose first field equals this enzyme identifier
            # (raises StopIteration when there is none).
            sample_row = next(row for row in data if row[0] == sample_name)

            # Extract positive values from the corresponding row in single.txt
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]

            # Positive values not present (by value) in the pair's shared list
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                # One output line per remaining value: pair, header field, enzyme identifier, value
                for value in unique_values:
                    # `enzyme_name` actually holds a header field (a sample name). The lookup
                    # relies on str(float) reproducing the text stored in the file and returns
                    # the first column holding that value.
                    # NOTE(review): the "- 1" shift maps column position 0 to header[-1] (the
                    # last header field); it looks like a compensation for how the header of
                    # single.txt is laid out — confirm.
                    enzyme_name = header[sample_row[1:].index(str(value)) - 1]  # Shift the index by one
                    x_output_file.write(f'{pair_name}\t{enzyme_name}\t{sample_name}\t{value}\n')


In [None]:
# Replace sample names with pollution data from mapped.txt and perform Mann U Whitney Test
# Function to extract and format values from mapped.txt
def extract_values(sample_name, mapped_data):
    """Return column 2 (as float) of the first row whose column 1 equals `sample_name`.

    Raises StopIteration when no row matches.
    """
    for record in mapped_data:
        if record[1] == sample_name:
            return float(record[2])
    raise StopIteration

# Load mapped.txt as a list of tab-split rows
with open('mapped.txt', 'r') as mapped_file:
    mapped_data = []
    for line in mapped_file:
        mapped_data.append(line.strip().split('\t'))

# Wilcoxon_y-samples2.txt: per pair, translate each listed sample name into its mapped.txt value
y_values = {}
with open('Wilcoxon_y-samples2.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        pair_name, values_str = parts[0], parts[1]
        y_values[pair_name] = [
            extract_values(sample_name, mapped_data)
            for sample_name in values_str.split(', ')
        ]

# Wilcoxon_x-values2.txt: collect the mapped.txt values per "<pair> <enzyme>" key
x_values = {}
with open('Wilcoxon_x-values2.txt', 'r') as x_file:
    next(x_file)  # Skip the header
    for line in x_file:
        parts = line.strip().split('\t')
        pair_name, sample_name, enzyme_name = parts[0], parts[1], parts[2]

        value = extract_values(sample_name, mapped_data)
        x_values.setdefault(f'{pair_name} {enzyme_name}', []).append(value)

from scipy.stats import mannwhitneyu

# Two-sided Mann-Whitney U test of each x group against the y group of the same pair
with open('MannU2.txt', 'w') as mann_u_file:
    mann_u_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    for key, x_sample_values in x_values.items():
        # Keys were built as "<pair> <enzyme>"
        pair_name, enzyme_name_x = key.split()

        y_sample_values = y_values.get(pair_name, [])

        result = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')
        p_value = result[1]

        # The third column holds the p-value of the test
        mann_u_file.write(f'{pair_name}\t{enzyme_name_x}\t{p_value}\n')

In [None]:
# Copy to sig_combo2.txt the enzyme pairs of MannU2.txt whose p-value is below 0.1 for
# every enzyme of the pair.
with open('MannU2.txt', 'r') as mann_u_file, open('sig_combo2.txt', 'w') as output_file:
    # Write the header to the output file
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)  # Skip the header

    # Group the rows by enzyme pair. The previous line-by-line pairing consumed the
    # following line without checking that it belonged to the same pair, so rows
    # drifted out of alignment after any failing row or single-row pair.
    rows_by_pair = {}
    for line in mann_u_file:
        values = line.strip().split('\t')
        if len(values) < 3:
            continue  # tolerate blank or truncated lines
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        rows_by_pair.setdefault(enzyme_pair, []).append((enzyme_name, mann_u_value))

    for enzyme_pair, entries in rows_by_pair.items():
        # A pair qualifies only when there is a row for each of its enzymes
        # (the pair name is "<enzyme1>-<enzyme2>") and every p-value is below 0.1.
        expected_entries = len(enzyme_pair.split('-'))
        if len(entries) != expected_entries:
            continue
        if not all(mann_u_value < 0.1 for _, mann_u_value in entries):
            continue

        for enzyme_name, mann_u_value in entries:
            output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')

# Mann-Whitney U Test for combo3

In [None]:
from scipy.stats import binom
from itertools import combinations

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    """Binomial probability mass at `count_matches`.

    Evaluated for `total_samples` trials with success probability
    `count_percentage` per trial.
    """
    probability_mass = binom.pmf(count_matches, total_samples, count_percentage)
    return probability_mass

# Minimum number of samples in which all three enzymes of a trio must be positive.
n = 10

# Fraction of positive values per enzyme row in normalized_releasefile.txt.
with open('normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243  # 243: presumably the total number of samples — confirm
        enzyme_percentages[enzyme_name] = percentage

with open('single.txt', 'r') as input_file:
    # Header fields after the first one are used as the sample names of data columns 1..N.
    # NOTE(review): this assumes the header of single.txt starts with a label field;
    # verify against the cell that writes related_samples_normalized.txt.
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('combo3.txt', 'w') as output_file, open('Wilcoxon_y-values3.txt', 'w') as y_file, open('Wilcoxon_y-samples3.txt', 'w') as samples_file:
        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Trio', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        # Parse every row once instead of re-splitting it for each compared column.
        parsed_rows = []
        for line in lines:
            fields = line.strip().split('\t')
            row_values = [float(fields[column]) for column in range(1, len(sample_names) + 1)]
            parsed_rows.append((fields[0], row_values))

        for i, j, k in combinations(range(num_rows), 3):
            sample1, row_i = parsed_rows[i]
            sample2, row_j = parsed_rows[j]
            sample3, row_k = parsed_rows[k]

            # Samples in which all three enzymes are positive. The sample name is
            # recorded at match time; previously names were assigned by position in
            # the match list, which always yielded the first `count_matches` names.
            positive_values = []
            matched_samples = []
            for sample_name, value_i, value_j, value_k in zip(sample_names, row_i, row_j, row_k):
                if value_i > 0 and value_j > 0 and value_k > 0:
                    positive_values.append(f'{value_i}, {value_j}, {value_k}')
                    matched_samples.append(sample_name)
            count_matches = len(positive_values)

            if count_matches >= n:
                # Expected co-occurrence rate: product of the three positive fractions.
                percentage_sample1 = enzyme_percentages.get(sample1, 0)
                percentage_sample2 = enzyme_percentages.get(sample2, 0)
                percentage_sample3 = enzyme_percentages.get(sample3, 0)

                fraction = percentage_sample1 * percentage_sample2 * percentage_sample3

                # 41 trials: presumably the number of retained samples — confirm.
                distribution = calculate_binomial_distribution(count_matches, 41, fraction)

                # p-value: probability of at least `count_matches` matches out of 41.
                p_value = sum(calculate_binomial_distribution(m, 41, fraction) for m in range(count_matches, 42))

                if p_value < 0.01:
                    output_file.write(f'{sample1}\t{sample2}\t{sample3}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    trio_name = f'{sample1}-{sample2}-{sample3}'
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{trio_name}\t{values_str}\n')

                    # Names of the samples where all three enzymes are positive
                    samples_file.write(f'{trio_name}\t{", ".join(matched_samples)}\n')


In [None]:
# For each enzyme of each trio, list the positive values found in single.txt that do
# not appear among the trio's shared values in Wilcoxon_y-values3.txt.
y_values = {}
with open('Wilcoxon_y-values3.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        # Parse to float: the values are subtracted from a set of floats below, and
        # a set of strings would never remove anything from it.
        y_values[set_name] = {float(value) for value in values_str.split(', ')}

with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

# Compare positive values and write to Wilcoxon_x-values3.txt
with open('Wilcoxon_x-values3.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        # The set name is "<enzyme1>-<enzyme2>-<enzyme3>"; each part is a first-column value of single.txt
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            # Find the row of single.txt whose first field equals this name
            sample_row = next(row for row in data if row[0] == sample_name)

            # Extract positive values from the corresponding row in single.txt
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]

            # Positive values not present (by value) among the trio's shared values
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                # Write results to Wilcoxon_x-values3.txt
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')


In [None]:
# For every enzyme of every trio, write one line per positive value that is not among
# the trio's shared values, labelled with the header field looked up for that value.
# This overwrites any existing Wilcoxon_x-values3.txt.
y_values = {}
with open('Wilcoxon_y-values3.txt', 'r') as y_file:
    next(y_file)  # skip the header line
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        # Shared values of the trio, parsed as floats.
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

# Read single.txt: header fields after the first one, then all rows split on tabs
with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

# Compare positive values and write to Wilcoxon_x-values3.txt
with open('Wilcoxon_x-values3.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        # The parts of set_name are matched against the first column of single.txt,
        # i.e. despite the variable names they are enzyme identifiers.
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            # Find the single.txt row whose first field equals this enzyme identifier
            # (raises StopIteration when there is none).
            sample_row = next(row for row in data if row[0] == sample_name)

            # Extract positive values from the corresponding row in single.txt
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]

            # Positive values not present (by value) in the trio's shared list
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                # One output line per remaining value: set, header field, enzyme identifier, value
                for value in unique_values:
                    # `enzyme_name` actually holds a header field (a sample name). The lookup
                    # relies on str(float) reproducing the text stored in the file and returns
                    # the first column holding that value.
                    # NOTE(review): the "-1" shift maps column position 0 to header[-1] (the
                    # last header field); it looks like a compensation for how the header of
                    # single.txt is laid out — confirm.
                    enzyme_name = header[sample_row[1:].index(str(value)) -1]
                    x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')


In [None]:
# Replace sample names with pollution data from mapped.txt and perform Mann U Whitney Test
# Function to extract and format values from mapped.txt

def extract_values(sample_name, mapped_data):
    """Look up `sample_name` in column 1 of `mapped_data` and return column 2 as a float.

    The first matching row wins; StopIteration is raised when none matches.
    """
    matches = (entry for entry in mapped_data if entry[1] == sample_name)
    first_match = next(matches)
    return float(first_match[2])

# Read mapped.txt
with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('Wilcoxon_y-samples3.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[set_name] = sample_values

x_values = {}
with open('Wilcoxon_x-values3.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{set_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

# Print or use y_values and x_values as needed
print("y_values:", y_values)
print("x_values:", x_values)

from scipy.stats import mannwhitneyu

# Run a two-sided Mann-Whitney U test for every x group against the y group of
# the same enzyme set, and save the p-values to MannU3.txt
with open('MannU3.txt', 'w') as mann_u_file:
    header_line = '\t'.join(['Enzyme Set', 'Enzyme Name', 'Mann U']) + '\n'
    mann_u_file.write(header_line)

    for key in x_values:
        # Keys have the form "<set name> <enzyme name>"
        set_name, enzyme_name_x = key.split()
        x_sample_values = x_values[key]

        # Fall back to an empty list when the set has no y entry
        y_sample_values = y_values[set_name] if set_name in y_values else []

        test_result = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')
        p_value = test_result.pvalue

        # Despite the 'Mann U' column label, the stored number is the p-value
        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')


y_values: {'K03233-K07246-K00015': [-0.8064490859759093, -0.5499242942393597, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.5407378912754199, -0.6857254338650942, -0.4602867824144696, -0.8101427536927925, 2.08549555534568, -0.9081295066892736, 0.1487337543179629, 0.0393742760633358, -0.3651525363457956, -0.5532161661589211, -0.4995644618989992, -0.0692973964235244, -0.0692973964235244, -0.0692973964235244, 0.0045785774709032, 0.7871707331218247, -0.7184944739858129, -0.7184944739858129, 0.300015584641817], 'K03233-K07246-K01525': [-0.8064490859759093, -0.5499242942393597, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.5407378912754199, -0.6857254338650942, -0.4602867824144696, -0.8101427536927925, 2.08549555534568, -0.9081295066892736, 0.1487337543179629, 0.0393742760633358, -0.3651525363457956, -0.5532161661589211, -0.4995644618989992, -0.0692973964235244, -0.0692

In [None]:
# Copy to sig_combo3.txt the enzyme sets for which every member enzyme has a
# p-value below 0.1 in MannU3.txt.
#
# Rows are grouped by their enzyme-set name instead of being consumed in fixed
# positional chunks: a set that contributes fewer (or more) rows than expected
# would otherwise shift every following group and label rows with the wrong set.
with open('MannU3.txt', 'r') as mann_u_file, open('sig_combo3.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    expected_rows = 3      # one row per enzyme of a triplet
    p_value_cutoff = 0.1

    next(mann_u_file, None)  # skip the header (tolerates an empty file)

    # enzyme set -> [(enzyme name, p-value), ...] in file order
    rows_by_set = {}
    for line in mann_u_file:
        if not line.strip():
            continue  # ignore blank lines
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        rows_by_set.setdefault(enzyme_pair, []).append((enzyme_name, mann_u_value))

    for enzyme_pair, rows in rows_by_set.items():
        # A set qualifies only when all of its enzymes were tested and all are
        # below the cutoff (NaN p-values compare False and therefore fail).
        if len(rows) != expected_rows:
            continue
        if all(p_value < p_value_cutoff for _, p_value in rows):
            for enzyme_name, mann_u_value in rows:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')


# Mann-Whitney U Test for Combo4

In [None]:
from scipy.stats import binom
from itertools import combinations

# Function to calculate the binomial distribution
def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    """Binomial probability mass at ``count_matches`` successes.

    Parameters
    ----------
    count_matches : int
        Number of successes at which the pmf is evaluated.
    total_samples : int
        Number of trials.
    count_percentage : float
        Success probability of a single trial.

    Returns
    -------
    float
        ``binom.pmf(count_matches, total_samples, count_percentage)``.
    """
    return binom.pmf(count_matches, total_samples, count_percentage)

# Find quadruplets of enzymes that are all positive in at least `n` sample columns
# of single.txt, score the co-occurrence count against a binomial null model and
# save the significant quadruplets.
# Files written:
#   combo4.txt              - enzymes, null probability, match count, pmf, upper-tail p-value
#   Wilcoxon_y-values4.txt  - the positive values of every co-occurring sample column
#   Wilcoxon_y-samples4.txt - the names of those same sample columns (no header row)
n = 10  # minimum number of co-occurring sample columns for a quadruplet to be tested

# Read the content of normalized_releasefile.txt
with open('normalized_releasefile.txt', 'r') as enzyme_file:
    # First line is the header row; keep its labels without the first field
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    # Fraction of positive entries per row, keyed by the row's first field
    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_count = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_count / 243  # Assuming 243 as the total number of values
        enzyme_percentages[enzyme_name] = percentage

# Read the content of single.txt
with open('single.txt', 'r') as input_file:
    # Column labels of the data rows (first header field dropped)
    sample_names = input_file.readline().strip().split('\t')[1:]
    lines = input_file.readlines()

num_rows = len(lines)

# Split and convert every row once, instead of re-splitting the same text for
# every combination it takes part in.
row_labels = []
row_values = []
for line in lines:
    fields = line.strip().split('\t')
    row_labels.append(fields[0])
    row_values.append([float(fields[col]) for col in range(1, len(sample_names) + 1)])

with open('combo4.txt', 'w') as output_file, open('Wilcoxon_y-values4.txt', 'w') as y_file, open('Wilcoxon_y-samples4.txt', 'w') as samples_file:
    output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Enzyme4', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
    y_file.write('\t'.join(['Enzyme Quadruplet', 'Values']) + '\n')

    # Iterate through each set of four rows
    for combo in combinations(range(num_rows), 4):
        matched_samples = []   # names of the columns where all four rows are positive
        positive_values = []   # "v1, v2, v3, v4" for each of those columns

        for col, sample_name in enumerate(sample_names):
            column_values = [row_values[row][col] for row in combo]
            if all(value > 0 for value in column_values):
                # Record the name of the column that actually matched. Looking the
                # name up by the running match index would always yield the first
                # len(matches) sample names, whichever columns matched.
                # NOTE(review): assumes header field k+1 labels data field k+1 — confirm.
                matched_samples.append(sample_name)
                positive_values.append(', '.join(str(value) for value in column_values))

        count_matches = len(matched_samples)
        if count_matches < n:
            continue

        enzymes = [row_labels[row] for row in combo]

        # Null probability of a joint positive: product of the per-enzyme positive
        # fractions (an enzyme missing from the dictionary contributes 0).
        percentages = [enzyme_percentages.get(enzyme, 0) for enzyme in enzymes]
        fraction = percentages[0] * percentages[1] * percentages[2] * percentages[3]

        # NOTE(review): 41 trials is hard-coded; presumably the number of sample
        # columns in single.txt — confirm.
        distribution = calculate_binomial_distribution(count_matches, 41, fraction)

        # Upper tail: P(X >= count_matches)
        p_value = sum(calculate_binomial_distribution(successes, 41, fraction) for successes in range(count_matches, 42))

        if p_value < 0.01:
            output_file.write('\t'.join(enzymes) + f'\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

            quadruplet_name = '-'.join(enzymes)
            y_file.write(f'{quadruplet_name}\t{", ".join(positive_values)}\n')
            samples_file.write(f'{quadruplet_name}\t{", ".join(matched_samples)}\n')


In [None]:
# For every enzyme of every significant quadruplet, list the positive values of
# that enzyme's row in single.txt that do not occur among the quadruplet's shared
# values in Wilcoxon_y-values4.txt.
# Note: the following cell writes Wilcoxon_x-values4.txt again in a different layout.
y_values = {}
with open('Wilcoxon_y-values4.txt', 'r') as y_file:
    next(y_file)  # Skip the header
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        # Parse to float: the row values below are floats, and a set of floats
        # minus a set of strings would never remove anything.
        y_values[set_name] = {float(value) for value in values_str.split(', ')}

# Read single.txt
with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

# Compare positive values and write to Wilcoxon_x-values4.txt
with open('Wilcoxon_x-values4.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        # The set name is the '-'-joined first fields of the member rows
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            # Find the row of single.txt whose first field is this name
            sample_row = next(row for row in data if row[0] == sample_name)

            # Positive values of that row
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]

            # Values of this row that are not among the set's shared values
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                # Sorted so the output does not depend on set iteration order
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, sorted(unique_values)))}\n')

In [None]:
# For every enzyme of every significant quadruplet, write one row per sample column
# in which the enzyme is positive but whose value is not among the quadruplet's
# shared values (Wilcoxon_y-values4.txt).
# Output columns: enzyme set, sample (column label), enzyme (row label), value.
y_values = {}
with open('Wilcoxon_y-values4.txt', 'r') as y_file:
    next(y_file)  # Skip the header
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        # Stored as a set of floats for direct membership tests below
        y_values[set_name] = {float(value) for value in values_str.split(', ')}

# Read single.txt
with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

# Compare positive values and write to Wilcoxon_x-values4.txt
with open('Wilcoxon_x-values4.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        # The set name is the '-'-joined row labels of its member enzymes
        for enzyme_id in set_name.split('-'):
            # Row of single.txt for this enzyme
            enzyme_row = next(row for row in data if row[0] == enzyme_id)

            # Walk the columns directly so each value keeps its own column label.
            # Searching the row for str(value) and subtracting 1 picked the
            # neighbouring column (the last one for index 0), raised ValueError
            # whenever the file text was not the float's canonical repr, and
            # collapsed repeated values onto the first column.
            # NOTE(review): assumes header field k+1 labels data field k+1 — confirm.
            for sample_label, raw_value in zip(header, enzyme_row[1:]):
                value = float(raw_value)
                if value > 0 and value not in y_positive_values:
                    x_output_file.write(f'{set_name}\t{sample_label}\t{enzyme_id}\t{value}\n')

In [None]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def extract_values(sample_name, mapped_data):
    """Look up ``sample_name`` in column 1 of ``mapped_data`` and return column 2 as a float.

    Only the first matching row is used. ``StopIteration`` is raised when no
    row has ``sample_name`` at index 1.
    """
    hit = next(filter(lambda entry: entry[1] == sample_name, mapped_data))
    return float(hit[2])

# Tab-split rows of mapped.txt, searched by extract_values
with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [raw_line.strip().split('\t') for raw_line in mapped_file.readlines()]

# y group: enzyme set -> mapped values of the samples listed for that set.
# Wilcoxon_y-samples4.txt is read from its first line (no header is skipped).
y_values = {}
with open('Wilcoxon_y-samples4.txt', 'r') as y_file:
    for record in y_file:
        fields = record.strip().split('\t')
        group_name, joined_samples = fields[0], fields[1]
        y_values[group_name] = [
            extract_values(label, mapped_data) for label in joined_samples.split(', ')
        ]

# x group: "<set> <third column>" -> mapped values of the second-column samples
x_values = {}
with open('Wilcoxon_x-values4.txt', 'r') as x_file:
    next(x_file)  # header row
    for record in x_file:
        fields = record.strip().split('\t')
        group_key = f'{fields[0]} {fields[2]}'
        mapped_value = extract_values(fields[1], mapped_data)
        x_values.setdefault(group_key, []).append(mapped_value)

# Show both dictionaries for inspection
print("y_values:", y_values)
print("x_values:", x_values)

# Two-sided Mann-Whitney U test of every x group against the y group of the
# same enzyme set; p-values are saved to MannU4.txt
with open('MannU4.txt', 'w') as mann_u_file:
    column_titles = ['Enzyme Set', 'Enzyme Name', 'Mann U']
    mann_u_file.write('\t'.join(column_titles) + '\n')

    for key in x_values:
        # Keys have the form "<set name> <enzyme name>"
        set_name, enzyme_name_x = key.split()
        x_sample_values = x_values[key]

        # Fall back to an empty list when the set has no y entry
        y_sample_values = y_values[set_name] if set_name in y_values else []

        outcome = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')
        p_value = outcome.pvalue

        # Despite the 'Mann U' column label, the stored number is the p-value
        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')


y_values: {'K03233-K07246-K00015-K01525': [-0.8064490859759093, -0.5499242942393597, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.5407378912754199, -0.6857254338650942, -0.4602867824144696, -0.8101427536927925, 2.08549555534568, -0.9081295066892736, 0.1487337543179629, 0.0393742760633358, -0.3651525363457956, -0.5532161661589211, -0.4995644618989992, -0.0692973964235244, -0.0692973964235244, -0.0692973964235244, 0.0045785774709032, 0.7871707331218247], 'K03233-K07246-K00015-K01743': [-0.8064490859759093, -0.5499242942393597, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.4602867824144696, -0.5407378912754199, -0.6857254338650942, -0.4602867824144696, -0.8101427536927925, 2.08549555534568, -0.9081295066892736, 0.1487337543179629, 0.0393742760633358, -0.3651525363457956, -0.5532161661589211, -0.4995644618989992, -0.0692973964235244, -0.0692973964235244, -0.0692973964235244, 0.0045785774

In [None]:
# Copy to sig_combo4.txt the enzyme sets for which every member enzyme has a
# p-value below 0.1 in MannU4.txt.
#
# Rows are grouped by their enzyme-set name instead of being consumed in fixed
# positional chunks: a set that contributes fewer (or more) rows than expected
# would otherwise shift every following group and label rows with the wrong set.
with open('MannU4.txt', 'r') as mann_u_file, open('sig_combo4.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    expected_rows = 4      # one row per enzyme of a quadruplet
    p_value_cutoff = 0.1

    next(mann_u_file, None)  # skip the header (tolerates an empty file)

    # enzyme set -> [(enzyme name, p-value), ...] in file order
    rows_by_set = {}
    for line in mann_u_file:
        if not line.strip():
            continue  # ignore blank lines
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        rows_by_set.setdefault(enzyme_pair, []).append((enzyme_name, mann_u_value))

    for enzyme_pair, rows in rows_by_set.items():
        # A set qualifies only when all of its enzymes were tested and all are
        # below the cutoff (NaN p-values compare False and therefore fail).
        if len(rows) != expected_rows:
            continue
        if all(p_value < p_value_cutoff for _, p_value in rows):
            for enzyme_name, mann_u_value in rows:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')


# Mann-Whitney U Test for Combo5

In [None]:
from scipy.stats import binom
from itertools import combinations

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    """Evaluate the binomial pmf.

    Returns the probability of exactly ``count_matches`` successes in
    ``total_samples`` trials with per-trial success probability
    ``count_percentage``, as computed by ``scipy.stats.binom.pmf``.
    """
    probability = binom.pmf(count_matches, total_samples, count_percentage)
    return probability

# Find quintuplets of enzymes that are all positive in at least `n` sample columns
# of single.txt, score the co-occurrence count against a binomial null model and
# save the significant quintuplets.
# Files written:
#   combo5.txt              - enzymes, null probability, match count, pmf, upper-tail p-value
#   Wilcoxon_y-values5.txt  - the positive values of every co-occurring sample column
#   Wilcoxon_y-samples5.txt - the names of those same sample columns (no header row)
n = 10  # minimum number of co-occurring sample columns for a quintuplet to be tested

# Read the content of normalized_releasefile.txt
with open('normalized_releasefile.txt', 'r') as enzyme_file:
    # First line is the header row; keep its labels without the first field
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    # Fraction of positive entries per row, keyed by the row's first field
    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_count = sum(1 for value in values[1:] if float(value) > 0)
        # NOTE(review): 243 is a hard-coded divisor; presumably the number of
        # value columns in normalized_releasefile.txt — confirm.
        percentage = positive_count / 243
        enzyme_percentages[enzyme_name] = percentage

# Read the content of single.txt
with open('single.txt', 'r') as input_file:
    # Column labels of the data rows (first header field dropped)
    sample_names = input_file.readline().strip().split('\t')[1:]
    lines = input_file.readlines()

num_rows = len(lines)

# Split and convert every row once, instead of re-splitting the same text for
# every combination it takes part in.
row_labels = []
row_values = []
for line in lines:
    fields = line.strip().split('\t')
    row_labels.append(fields[0])
    row_values.append([float(fields[col]) for col in range(1, len(sample_names) + 1)])

with open('combo5.txt', 'w') as output_file, open('Wilcoxon_y-values5.txt', 'w') as y_file, open('Wilcoxon_y-samples5.txt', 'w') as samples_file:
    output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Enzyme4', 'Enzyme5', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
    y_file.write('\t'.join(['Enzyme Quintuplet', 'Values']) + '\n')

    # Iterate through each set of five rows
    for combo in combinations(range(num_rows), 5):
        matched_samples = []   # names of the columns where all five rows are positive
        positive_values = []   # "v1, v2, v3, v4, v5" for each of those columns

        for col, sample_name in enumerate(sample_names):
            column_values = [row_values[row][col] for row in combo]
            if all(value > 0 for value in column_values):
                # Record the name of the column that actually matched. Looking the
                # name up by the running match index would always yield the first
                # len(matches) sample names, whichever columns matched.
                # NOTE(review): assumes header field k+1 labels data field k+1 — confirm.
                matched_samples.append(sample_name)
                positive_values.append(', '.join(str(value) for value in column_values))

        count_matches = len(matched_samples)
        if count_matches < n:
            continue

        enzymes = [row_labels[row] for row in combo]

        # Null probability of a joint positive: product of the per-enzyme positive
        # fractions (an enzyme missing from the dictionary contributes 0).
        percentages = [enzyme_percentages.get(enzyme, 0) for enzyme in enzymes]
        fraction = percentages[0] * percentages[1] * percentages[2] * percentages[3] * percentages[4]

        # NOTE(review): 41 trials is hard-coded; presumably the number of sample
        # columns in single.txt — confirm.
        distribution = calculate_binomial_distribution(count_matches, 41, fraction)

        # Upper tail: P(X >= count_matches)
        p_value = sum(calculate_binomial_distribution(successes, 41, fraction) for successes in range(count_matches, 42))

        if p_value < 0.01:
            output_file.write('\t'.join(enzymes) + f'\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

            quintuplet_name = '-'.join(enzymes)
            y_file.write(f'{quintuplet_name}\t{", ".join(positive_values)}\n')
            samples_file.write(f'{quintuplet_name}\t{", ".join(matched_samples)}\n')

In [None]:
# For every enzyme of every significant quintuplet, list the positive values of
# that enzyme's row in single.txt that do not occur among the quintuplet's shared
# values in Wilcoxon_y-values5.txt.
# Note: the following cell writes Wilcoxon_x-values5.txt again in a different layout.
y_values = {}
with open('Wilcoxon_y-values5.txt', 'r') as y_file:
    next(y_file)  # Skip the header
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        # Parse to float: the row values below are floats, and a set of floats
        # minus a set of strings would never remove anything.
        y_values[set_name] = {float(value) for value in values_str.split(', ')}

with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('Wilcoxon_x-values5.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        # The set name is the '-'-joined first fields of the member rows
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            # Find the row of single.txt whose first field is this name
            sample_row = next(row for row in data if row[0] == sample_name)

            # Positive values of that row
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]

            # Values of this row that are not among the set's shared values
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                # Sorted so the output does not depend on set iteration order
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, sorted(unique_values)))}\n')

In [None]:
# For every enzyme of every significant quintuplet, write one row per sample column
# in which the enzyme is positive but whose value is not among the quintuplet's
# shared values (Wilcoxon_y-values5.txt).
# Output columns: enzyme set, sample (column label), enzyme (row label), value.
y_values = {}
with open('Wilcoxon_y-values5.txt', 'r') as y_file:
    next(y_file)  # Skip the header
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        # Stored as a set of floats for direct membership tests below
        y_values[set_name] = {float(value) for value in values_str.split(', ')}

# Read single.txt
with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

# Compare positive values and write to Wilcoxon_x-values5.txt
with open('Wilcoxon_x-values5.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        # The set name is the '-'-joined row labels of its member enzymes
        for enzyme_id in set_name.split('-'):
            # Row of single.txt for this enzyme
            enzyme_row = next(row for row in data if row[0] == enzyme_id)

            # Walk the columns directly so each value keeps its own column label.
            # Searching the row for str(value) and subtracting 1 picked the
            # neighbouring column (the last one for index 0), raised ValueError
            # whenever the file text was not the float's canonical repr, and
            # collapsed repeated values onto the first column.
            # NOTE(review): assumes header field k+1 labels data field k+1 — confirm.
            for sample_label, raw_value in zip(header, enzyme_row[1:]):
                value = float(raw_value)
                if value > 0 and value not in y_positive_values:
                    x_output_file.write(f'{set_name}\t{sample_label}\t{enzyme_id}\t{value}\n')

In [None]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def extract_values(sample_name, mapped_data):
    """Return ``float(row[2])`` for the first row of ``mapped_data`` with ``row[1] == sample_name``.

    Rows are scanned in order and the scan stops at the first match.
    Raises ``StopIteration`` when no row matches.
    """
    candidates = (entry[2] for entry in mapped_data if entry[1] == sample_name)
    raw_value = next(candidates)
    return float(raw_value)

# Tab-split rows of mapped.txt, searched by extract_values
with open('mapped.txt', 'r') as mapped_file:
    mapped_data = []
    for raw_line in mapped_file:
        mapped_data.append(raw_line.strip().split('\t'))

# y group: enzyme set -> mapped values of the samples listed for that set.
# Wilcoxon_y-samples5.txt is read from its first line (no header is skipped).
y_values = {}
with open('Wilcoxon_y-samples5.txt', 'r') as y_file:
    for record in y_file:
        fields = record.strip().split('\t')
        group_name, joined_samples = fields[0], fields[1]
        y_values[group_name] = [
            extract_values(label, mapped_data) for label in joined_samples.split(', ')
        ]

# x group: "<set> <third column>" -> mapped values of the second-column samples
x_values = {}
with open('Wilcoxon_x-values5.txt', 'r') as x_file:
    next(x_file)  # header row
    for record in x_file:
        fields = record.strip().split('\t')
        group_key = f'{fields[0]} {fields[2]}'
        mapped_value = extract_values(fields[1], mapped_data)
        x_values.setdefault(group_key, []).append(mapped_value)

with open('MannU5.txt', 'w') as mann_u_file:
    # Column titles
    column_titles = ['Enzyme Set', 'Enzyme Name', 'Mann U']
    mann_u_file.write('\t'.join(column_titles) + '\n')

    # One two-sided Mann-Whitney U test per x group, against the y group of
    # the same enzyme set
    for key in x_values:
        # Keys have the form "<set name> <enzyme name>"
        set_name, enzyme_name_x = key.split()
        x_sample_values = x_values[key]

        # Fall back to an empty list when the set has no y entry
        y_sample_values = y_values[set_name] if set_name in y_values else []

        outcome = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')
        p_value = outcome.pvalue

        # Despite the 'Mann U' column label, the stored number is the p-value
        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')

In [None]:
# Copy to sig_combo5.txt the enzyme sets for which every member enzyme has a
# p-value below 0.1 in MannU5.txt.
#
# Rows are grouped by their enzyme-set name instead of being consumed in fixed
# positional chunks: a set that contributes fewer (or more) rows than expected
# would otherwise shift every following group and label rows with the wrong set.
with open('MannU5.txt', 'r') as mann_u_file, open('sig_combo5.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    expected_rows = 5      # one row per enzyme of a quintuplet
    p_value_cutoff = 0.1

    next(mann_u_file, None)  # skip the header (tolerates an empty file)

    # enzyme set -> [(enzyme name, p-value), ...] in file order
    rows_by_set = {}
    for line in mann_u_file:
        if not line.strip():
            continue  # ignore blank lines
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        rows_by_set.setdefault(enzyme_pair, []).append((enzyme_name, mann_u_value))

    for enzyme_pair, rows in rows_by_set.items():
        # A set qualifies only when all of its enzymes were tested and all are
        # below the cutoff (NaN p-values compare False and therefore fail).
        if len(rows) != expected_rows:
            continue
        if all(p_value < p_value_cutoff for _, p_value in rows):
            for enzyme_name, mann_u_value in rows:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')


# Mann-Whitney U Test for Combo6

In [None]:
from scipy.stats import binom
from itertools import combinations
def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    """Return the binomial probability mass at ``count_matches``.

    Evaluates ``binom.pmf`` for ``count_matches`` successes out of
    ``total_samples`` trials, each succeeding with probability
    ``count_percentage``.
    """
    return binom.pmf(count_matches, total_samples, count_percentage)

# Minimum number of matching columns a combination needs before it is tested
# (compared against count_matches further down in this cell).
n = 10

# Read normalized_releasefile.txt and record, per row, the share of value
# columns that are strictly positive. The denominator is fixed at 243.
with open('normalized_releasefile.txt', 'r') as enzyme_file:
    header_fields = enzyme_file.readline().strip().split('\t')
    enzyme_names = header_fields[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        # First cell is the row label, the remaining cells are numeric values.
        enzyme_name, *abundances = line.strip().split('\t')
        positive_values = len([text for text in abundances if float(text) > 0])
        enzyme_percentages[enzyme_name] = positive_values / 243

# Number of binomial trials passed as `total_samples`; same value (41) the
# cell has always used.
TOTAL_SAMPLES = 41

with open('single.txt', 'r') as input_file:
    # Header row: drop the leading label cell, keep the column (sample) names.
    sample_names = input_file.readline().strip().split('\t')[1:]

    # Open the output files (combo6.txt, Wilcoxon_y-values6.txt and Wilcoxon_y-samples6.txt)
    with open('combo6.txt', 'w') as output_file, open('Wilcoxon_y-values6.txt', 'w') as y_file, open('Wilcoxon_y-samples6.txt', 'w') as samples_file:
        # Write the header rows. No header goes to Wilcoxon_y-samples6.txt:
        # the cell that reads it back does not skip one.
        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Enzyme4', 'Enzyme5', 'Enzyme6', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Set', 'Values']) + '\n')

        # Read the remaining rows and split each of them only once instead of
        # once per column per combination.
        lines = input_file.readlines()
        rows = [line.strip().split('\t') for line in lines]
        num_rows = len(rows)

        # Iterate through every set of six rows
        for combo in combinations(range(num_rows), 6):
            combo_rows = [rows[row_idx] for row_idx in combo]

            # Count of columns where all six rows are positive, the values of
            # those columns, and the names of those columns.
            count_matches = 0
            positive_values = []
            matched_samples = []

            # Compare values in the current set of six rows, column by column
            for a in range(1, len(sample_names) + 1):
                column_values = [float(row[a]) for row in combo_rows]

                if all(value > 0 for value in column_values):
                    count_matches += 1
                    positive_values.append(', '.join(f'{value}' for value in column_values))
                    # Column `a` of a data row belongs to sample_names[a - 1].
                    # Recording the name here keeps it tied to the matching
                    # column; indexing by match position would instead return
                    # the first few header names.
                    matched_samples.append(sample_names[a - 1])

            # Only combinations with at least `n` matching columns are tested
            if count_matches >= n:
                enzyme_set = [row[0] for row in combo_rows]

                # Retrieve the percentage for each enzyme (0 when unknown) and
                # multiply them left to right.
                percentages = [enzyme_percentages.get(enzyme, 0) for enzyme in enzyme_set]
                fraction = percentages[0]
                for percentage in percentages[1:]:
                    fraction *= percentage

                # Probability of exactly `count_matches` matches
                distribution = calculate_binomial_distribution(count_matches, TOTAL_SAMPLES, fraction)

                # Upper tail: probability of `count_matches` or more matches
                p_value = sum(calculate_binomial_distribution(k, TOTAL_SAMPLES, fraction) for k in range(count_matches, TOTAL_SAMPLES + 1))

                if p_value < 0.01:
                    enzymes_tab = '\t'.join(enzyme_set)
                    output_file.write(f'{enzymes_tab}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    # Values of every all-positive column go to Wilcoxon_y-values6.txt
                    set_name = '-'.join(enzyme_set)
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{set_name}\t{values_str}\n')

                    # Names of the same columns go to Wilcoxon_y-samples6.txt
                    samples_str = ', '.join(matched_samples)
                    samples_file.write(f'{set_name}\t{samples_str}\n')

In [None]:
# Take out positive values that are not in Wilcoxon_y-values6.txt and only in single.txt
y_values = {}
with open('Wilcoxon_y-values6.txt', 'r') as y_file:
    next(y_file)  # Skip the header
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        # Parse to float so the set difference below compares numbers with
        # numbers; a set of strings never removes anything from a set of floats.
        y_values[set_name] = {float(value) for value in values_str.split(', ')}

with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('Wilcoxon_x-values6.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        # Split set_name into the row labels it was built from
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            # Find the row of single.txt whose first cell is this label
            # (raises StopIteration when there is none)
            sample_row = next(row for row in data if row[0] == sample_name)

            # Extract positive values from the corresponding row in single.txt
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]

            # Keep the positive values that do not occur among the set's values
            # in Wilcoxon_y-values6.txt
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                # Write results to Wilcoxon_x-values6.txt
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [None]:
# Replace positive x values with sample names
y_values = {}
with open('Wilcoxon_y-values6.txt', 'r') as y_file:
    next(y_file)  # Skip the header
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

# Read single.txt: `header` holds the column names after the label cell,
# `data` holds every following row split on tabs.
with open('single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

# Compare positive values and write to Wilcoxon_x-values6.txt
with open('Wilcoxon_x-values6.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        y_value_set = set(y_positive_values)

        # Split set_name into the row labels it was built from
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            # Find the row of single.txt whose first cell is this label
            # (raises StopIteration when there is none)
            sample_row = next(row for row in data if row[0] == sample_name)

            # For every positive value that is not among the set's values in
            # Wilcoxon_y-values6.txt, remember the first column it occurs in.
            # Tracking the column index directly avoids looking the value up
            # again by its string form, which fails whenever str(float(text))
            # differs from the text in the file (e.g. "5" vs "5.0").
            unique_columns = {}
            for column_idx, text in enumerate(sample_row[1:]):
                value = float(text)
                if value > 0 and value not in y_value_set:
                    unique_columns.setdefault(value, column_idx)

            # Write results to Wilcoxon_x-values6.txt
            for value, column_idx in unique_columns.items():
                # `header` and `sample_row[1:]` are both sliced past the label
                # column, so they share the same index (no extra offset).
                # NOTE(review): this assumes the header line of single.txt
                # starts with a label cell — confirm against the file.
                enzyme_name = header[column_idx]
                # Second field is the column name, third is the row label; the
                # cell that reads this file back relies on that order.
                x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [None]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def extract_values(sample_name, mapped_data):
    """Return the third field of the first row whose second field matches.

    Args:
        sample_name: value compared against ``row[1]`` of every row.
        mapped_data: iterable of indexable rows.

    Returns:
        ``float(row[2])`` of the first matching row.

    Raises:
        StopIteration: if no row matches ``sample_name``.
    """
    for record in mapped_data:
        if record[1] == sample_name:
            return float(record[2])
    raise StopIteration

# Load mapped.txt as a list of tab-split rows
with open('mapped.txt', 'r') as mapped_file:
    mapped_data = []
    for raw_line in mapped_file:
        mapped_data.append(raw_line.strip().split('\t'))

# y group: for each enzyme set, the mapped value of every sample listed in
# Wilcoxon_y-samples6.txt (that file has no header row to skip)
y_values = {}
with open('Wilcoxon_y-samples6.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name, values_str = parts[0], parts[1]
        y_values[set_name] = [
            extract_values(listed_sample, mapped_data)
            for listed_sample in values_str.split(', ')
        ]

# x group: mapped values collected per "<set name> <enzyme name>" key
x_values = {}
with open('Wilcoxon_x-values6.txt', 'r') as x_file:
    next(x_file)  # Skip the header
    for line in x_file:
        parts = line.strip().split('\t')
        set_name, sample_name, enzyme_name = parts[0], parts[1], parts[2]

        value = extract_values(sample_name, mapped_data)
        x_values.setdefault(f'{set_name} {enzyme_name}', []).append(value)

with open('MannU6.txt', 'w') as mann_u_file:
    # header
    header_row = ['Enzyme Set', 'Enzyme Name', 'Mann U']
    mann_u_file.write('\t'.join(header_row) + '\n')

    for key, x_sample_values in x_values.items():
        # The key is split on whitespace back into set name and enzyme name
        set_name, enzyme_name_x = key.split()

        # y group of the same set (empty list when the set is unknown)
        y_sample_values = y_values.get(set_name, [])

        # Two-sided Mann-Whitney U test; the p-value is what gets written to
        # the "Mann U" column
        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write('\t'.join([set_name, enzyme_name_x, f'{p_value}']) + '\n')

In [None]:
# Number of rows one enzyme set must have in MannU6.txt to be considered
SET_SIZE = 6

with open('MannU6.txt', 'r') as mann_u_file, open('sig_combo6.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file, None)  # Skip the header

    # Group the rows by the set name in the first column. Grouping by name
    # rather than by "six consecutive lines" means a set with fewer than
    # SET_SIZE rows cannot shift every following set out of alignment.
    rows_by_set = {}
    for line in mann_u_file:
        if not line.strip():
            continue  # ignore blank lines
        values = line.strip().split('\t')
        rows_by_set.setdefault(values[0], []).append((values[1], float(values[2])))

    # Dicts keep insertion order, so sets are written in file order
    for enzyme_pair, set_rows in rows_by_set.items():
        # A set qualifies only if it has all SET_SIZE rows ...
        if len(set_rows) != SET_SIZE:
            continue

        # ... and every one of them is below the 0.1 cut-off
        if all(mann_u_value < 0.1 for enzyme_name, mann_u_value in set_rows):
            for enzyme_name, mann_u_value in set_rows:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')


## Results


In [None]:
import pandas as pd
from scipy.stats import binom

# Function to calculate the binomial distribution
def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    """Binomial probability of exactly ``count_matches`` successes.

    ``total_samples`` is the number of trials and ``count_percentage`` the
    per-trial success probability handed to ``binom.pmf``.
    """
    probability = binom.pmf(count_matches, total_samples, count_percentage)
    return probability

# Function to count the number of samples in a set of y-samples
def count_samples(y_samples):
    """Count the distinct entries of a ``', '``-separated string.

    An empty string counts as one (empty) entry, because splitting it yields
    a single empty item.
    """
    distinct_entries = set()
    for entry in y_samples.split(', '):
        distinct_entries.add(entry)
    return len(distinct_entries)

# Read the content of significant_enzymes.txt
significant_enzymes_df = pd.read_csv('significant_enzymes.txt', sep=', ', header=None, engine='python')
significant_enzymes_df.columns = ['Enzyme', 'Correlation']

# Read the content of sig_combo2.txt, sig_combo3.txt, sig_combo4.txt, sig_combo5.txt, and sig_combo6.txt
files = ['sig_combo2.txt', 'sig_combo3.txt', 'sig_combo4.txt', 'sig_combo5.txt', 'sig_combo6.txt']

result_columns = ['Enzyme Pair', 'Enzyme Name', 'Mann U', 'Binomial Distribution', 'y-samples', '# of samples']

# Per-set-size lookups so that each combo<N>.txt / Wilcoxon_y-samples<N>.txt
# file is parsed once instead of once per significant row.
combo_rows_by_size = {}
y_samples_by_size = {}

# Rows are collected here and turned into a DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every row.
result_records = []

for file in files:
    with open(file, 'r') as sig_file:
        next(sig_file, None)  # Skip the header

        for line in sig_file:
            values = line.strip().split('\t')
            enzyme_set = values[0]
            enzyme_name = values[1]
            mann_u_value = float(values[2])

            set_members = enzyme_set.split('-')
            set_size = len(set_members)

            # Retrieve binomial distribution from the corresponding combo file
            if set_size not in combo_rows_by_size:
                combo_lookup = {}
                with open(f'combo{set_size}.txt', 'r') as combo_data:
                    next(combo_data, None)  # Skip the header
                    for combo_line in combo_data:
                        combo_values = combo_line.strip().split('\t')
                        # Keyed by the unordered set of the leading enzyme
                        # columns; the first row for a given set wins.
                        combo_lookup.setdefault(frozenset(combo_values[:set_size]), combo_values)
                combo_rows_by_size[set_size] = combo_lookup

            combo_values = combo_rows_by_size[set_size].get(frozenset(set_members))
            # Binomial distribution is the second-to-last column; None when
            # the set is not present in the combo file.
            binomial_distribution = float(combo_values[-2]) if combo_values is not None else None

            # Retrieve y-samples from the corresponding Wilcoxon_y-samples file
            if set_size not in y_samples_by_size:
                y_lookup = {}
                with open(f'Wilcoxon_y-samples{set_size}.txt', 'r') as y_samples_data:
                    for y_samples_line in y_samples_data:
                        y_samples_values = y_samples_line.strip().split('\t')
                        y_lookup.setdefault(y_samples_values[0], y_samples_values[1:])
                y_samples_by_size[set_size] = y_lookup

            # Default to an empty list so a set missing from the file cannot
            # inherit the samples of the previously processed row.
            y_samples = y_samples_by_size[set_size].get(enzyme_set, [])
            y_samples_str = ', '.join(y_samples)

            # Count the number of distinct samples (0 when none were found)
            num_samples = count_samples(y_samples_str) if y_samples else 0

            result_records.append({
                'Enzyme Pair': enzyme_set,
                'Enzyme Name': enzyme_name,
                'Mann U': mann_u_value,
                'Binomial Distribution': binomial_distribution,
                'y-samples': y_samples_str,
                '# of samples': num_samples
            })

# The significant single enzymes come first, followed by the combination rows
results_df = pd.concat(
    [significant_enzymes_df, pd.DataFrame(result_records, columns=result_columns)],
    ignore_index=True,
)

results_df.to_excel('results.xlsx', index=False)

# Species Analysis

In [None]:
import pandas as pd

# Input and output locations
file1_path = 'miTAG.taxonomic.profiles.release.tsv'
file2_path = 'mapped.txt'
output_file_path = 'miTAG_taxonomic_filtered.tsv'

df1 = pd.read_csv(file1_path, delimiter='\t')
df2 = pd.read_csv(file2_path, delimiter='\t', header=None, names=['ID', 'Sample', 'Value1', 'Value2'])

sample_names_to_keep = df2['Sample'].unique()

# Keep a column when any wanted sample name occurs as a substring of its name
columns_to_keep = []
for col in df1.columns:
    for sample_name in sample_names_to_keep:
        if sample_name in col:
            columns_to_keep.append(col)
            break

# First seven columns of the profile table followed by the matched columns
df_filtered = pd.concat([df1.iloc[:, :7], df1[columns_to_keep]], axis=1)

df_filtered.to_csv(output_file_path, sep='\t', index=False)


In [None]:
taxonomic_data = pd.read_csv('miTAG_taxonomic_filtered.tsv', delimiter='\t')

# Everything after the seven leading taxonomy columns, as a 2-D array
x_values = taxonomic_data.iloc[:, 7:].values

correlation_coefficients = {}

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

# Raw string: "\s" is not a valid escape in a normal string literal and makes
# Python emit an invalid-escape warning; the regex itself is unchanged.
release_df = pd.read_csv("TARA243.KO.profile.release", sep=r'\s+', skiprows=[1])


def calculate_correlation(x_values, y_values):
    """Return the Pearson correlation coefficient of the two sequences.

    Only the first element of the ``pearsonr`` result is returned; the
    accompanying p-value is discarded.
    """
    return pearsonr(x_values, y_values)[0]

# The y vector (third column of mapped.txt) is the same for every row, so it
# is read once instead of on every iteration.
# NOTE(review): this assumes the columns of x_values are in the same order as
# the rows of mapped.txt — confirm.
y_values = mapped_data[2].values

# Correlate every row of the taxonomic table with y_values. The loop runs
# over x_values itself: bounding it by the length of another table either
# indexes past the end of x_values or leaves rows unprocessed.
for row_index, x_row_values in enumerate(x_values):
    correlation_coefficient = calculate_correlation(x_row_values, y_values)

    correlation_coefficients[row_index] = correlation_coefficient

# (row_index, coefficient) pairs, largest coefficient first
sorted_coefficients = sorted(correlation_coefficients.items(), key=lambda x: x[1], reverse=True)

# The seven leading columns of the taxonomic table, by position
domain = taxonomic_data.iloc[:, 0].values
phylum = taxonomic_data.iloc[:, 1].values
specie_class = taxonomic_data.iloc[:, 2].values
order = taxonomic_data.iloc[:, 3].values
family = taxonomic_data.iloc[:, 4].values
genus = taxonomic_data.iloc[:, 5].values
specie = taxonomic_data.iloc[:, 6].values

with open("species_coefficient.txt", "w") as file:
    for row_index, correlation_coefficient in sorted_coefficients:
        one = domain[row_index]
        two = phylum[row_index]
        three = specie_class[row_index]
        four = order[row_index]
        five = family[row_index]
        six = genus[row_index]
        seven = specie[row_index]
        file.write(f"{one}, {two}, {three}, {four}, {five}, {six}, {seven}, Correlation = {correlation_coefficient}\n")

In [None]:
import numpy as np
import scipy.stats as stats

def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    """Convert a correlation coefficient to a t statistic and its upper-tail probability.

    Args:
        correlation_coefficient: correlation value r; r = +/-1 makes the
            denominator zero, so the statistic becomes infinite.
        degrees_of_freedom: degrees of freedom of the t distribution.

    Returns:
        Tuple ``(x, probability)`` where ``x = r * sqrt(df) / sqrt(1 - r**2)``
        and ``probability`` is the one-sided probability P(T > x).
    """
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    # Survival function instead of 1 - cdf: for large x the cdf rounds to
    # exactly 1.0 and the subtraction would return 0.0.
    probability = stats.t.sf(x, degrees_of_freedom)
    return x, probability

# presumably the number of samples minus two (41 - 2) — TODO confirm
degrees_of_freedom = 39

# Rewrite species_coefficient.txt, now with the t statistic and its
# probability appended to every line.
with open("species_coefficient.txt", "w") as file:
    for row_index, correlation_coefficient in sorted_coefficients:
        x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        # Seven taxonomy fields of this row, comma-separated
        taxonomy = ", ".join(
            f"{level[row_index]}"
            for level in (domain, phylum, specie_class, order, family, genus, specie)
        )

        file.write(f"{taxonomy}, Correlation = {correlation_coefficient}, x = {x}, Probability = {probability}\n")

In [None]:
significant_rows = []

total_probability = 0

# Walk the rows in sorted order and keep them while the running sum of their
# probabilities stays at or below 1; stop at the first row that would exceed it.
for row_index, correlation_coefficient in sorted_coefficients:
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    # Written as "not (<=)" so that a NaN probability also stops the loop
    if not (total_probability + probability <= 1):
        break

    significant_rows.append((row_index, specie[row_index], correlation_coefficient, x, probability))
    total_probability += probability

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

# The last accepted row is always discarded (no-op on an empty list)
significant_rows = significant_rows[:-1]

taxonomy_levels = (domain, phylum, specie_class, order, family, genus, specie)

with open("significant_species.txt", "w") as file:
    for row_index, seven, correlation_coefficient, x, probability in significant_rows:
        # Seven taxonomy fields of this row, comma-separated
        taxonomy = ", ".join(f"{level[row_index]}" for level in taxonomy_levels)
        file.write(f"{taxonomy}, Correlation = {correlation_coefficient}, p-value = {probability}\n")

In [None]:
# Import necessary modules
import pandas as pd

# Read the data from the file.
# The separator is a regex, so it is written as a raw string ("\," is not a
# valid escape in a normal string literal) and the python engine is requested
# explicitly instead of relying on pandas' warning-emitting fallback.
file_path = 'significant_species.txt'
data = pd.read_csv(file_path, sep=r'\,', header=None, engine='python')

# Assign column names based on the data structure
# NOTE(review): significant_species.txt is written with domain, phylum, class,
# order, family, genus, species in the first seven positions, so these labels
# look shifted by one rank — confirm before relying on them.
data.columns = ['Domain', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'ID', 'Correlation', 'Probability']

# Sort the data by the specified columns
sorted_data = data.sort_values(by=['Kingdom', 'Phylum', 'Class', 'Order', 'Family','ID',])

# Drop correlation and probability columns for the output
sorted_data_output = sorted_data[['Domain', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family','ID']]

# Write the sorted data to a new file (tab-separated, no header row)
output_file_path = 'sorted sig species.txt'
sorted_data_output.to_csv(output_file_path, sep='\t', index=False, header=False)
