# My Analysis

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import csv
import itertools
from scipy.stats import pearsonr, t
from scipy.stats import spearmanr
from scipy.stats import binom

In [2]:
data_dict = {}

with open('pollutionData.txt', 'r') as input_file:
    next(input_file)
    for line in input_file:
        values = line.strip().split('\t')
        key = (values[0], values[1])
        distance = float(values[5])

        if key in data_dict:
            existing_value = data_dict[key]
            if distance < existing_value[1] and distance < 400:
                data_dict[key] = (values[3], distance)
        elif distance < 400:
            data_dict[key] = (values[3], distance)

with open('smallest_dist.txt', 'w') as output_file:
    for key, value in data_dict.items():
        output_file.write(f"{key[0]}\t{key[1]}\t{value[0]}\t{value[1]}\n")

In [3]:
# Map the pollution in smallest_dist.txt to the sample location in Sunagawa file using the enzyme id in both files
sunagawa_data = pd.read_excel('Sunagawa_TableS1.xlsx')
new_data = pd.read_csv('smallest_dist.txt', sep='\t', header=None)

mapping_dict = dict(zip(sunagawa_data.iloc[1:, 2], sunagawa_data.iloc[1:, 0]))
new_data[1] = new_data[1].map(mapping_dict)

new_data.to_csv('mapped.txt', sep='\t', header=False, index=False)

  warn(msg)


In [4]:
file3_path = 'TARA243.KO.profile.release'
df3 = pd.read_csv(file3_path, delimiter='\t')

file2_path = 'mapped.txt'
df2 = pd.read_csv(file2_path, delimiter='\t', header=None, names=['ID', 'Sample', 'Value1', 'Value2'])

df_ko_filtered = df3.iloc[:, 0:1]

sample_names_to_keep = df2['Sample'].unique()
columns_to_keep_ko = [col for col in df3.columns if any(sample_name in col for sample_name in sample_names_to_keep)]
df_ko_filtered = pd.concat([df_ko_filtered, df3[columns_to_keep_ko]], axis=1)

output_ko_file_path = 'TARA243_KO_filtered.KO.profile.release'
df_ko_filtered.to_csv(output_ko_file_path, sep='\t', index=False)

# Calculating Pearson's Correlation

In [5]:
def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = pearsonr(x_values, y_values)
    return correlation_coefficient

release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])
mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values # sampleIDs
y_values = mapped_data[2].values # pollutionValues

correlation_coefficients = {}
x_values_for_all_enzymes = []

# For each row of the enzyme abundance data, get corresponding x_values, calculate correlation coefficient, store
for row_index in range(len(release_df)):
    x_values = [release_df.iloc[row_index][str(second_value)] for second_value in second_values]

    correlation_coefficient = calculate_correlation(x_values, y_values)
    correlation_coefficients[row_index] = correlation_coefficient

    enzyme_name = release_df.iloc[row_index, 0]
    x_values_for_all_enzymes.append([enzyme_name] + x_values)

# descending order sort
sorted_coefficients = sorted(correlation_coefficients.items(), key=lambda x: x[1], reverse=True)

first_column_values = release_df.iloc[:, 0].values

# Save the sorted coefficients
with open("all_pearson_correlations.txt", "w") as file:
    for row_index, correlation_coefficient in sorted_coefficients:
        first_value = first_column_values[row_index]  # Get the corresponding enzyme name
        file.write(f"Row {row_index}: Enzyme = {first_value}, Correlation = {correlation_coefficient}\n")

# Save x_values
column_names = ['Enzyme'] + [f'Pollution_{y_val}' for y_val in y_values]  # Using y_values as sample identifiers
x_values_df = pd.DataFrame(x_values_for_all_enzymes, columns=column_names)
x_values_df.to_csv('pearson_x_y_values.csv', index=False)


  release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])
  correlation_coefficient, _ = pearsonr(x_values, y_values)


##### T-Distribution

In [6]:
def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    probability = 1 - stats.t.cdf(x, degrees_of_freedom)
    return x, probability

degrees_of_freedom = 39

# Calculate and add the t-value distribution to file
with open("all_pearson_correlations.txt", "w") as file:
    for row_index, correlation_coefficient in sorted_coefficients:
        first_value = first_column_values[row_index]

        x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        file.write(f"Row {row_index}: First Value = {first_value}, Correlation Coefficient = {correlation_coefficient}, x = {x}, Probability = {probability}\n")


In [7]:
significant_rows = []
total_probability = 0

for row_index, correlation_coefficient in sorted_coefficients:
    first_value = first_column_values[row_index]
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    if total_probability + probability <= 1:
        significant_rows.append((row_index, first_value, correlation_coefficient, x, probability))
        total_probability += probability
    else:
        break

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

if significant_rows:
    significant_rows = significant_rows[:-1]

with open("sig_pearson_enzymes.txt", "w") as file:
    for row_index, first_value, correlation_coefficient, x, probability in significant_rows:
        file.write(f"Enzyme = {first_value}, Correlation = {correlation_coefficient}\n")
        #, x = {y_values}

#### Correlation calculation & t-dist

In [8]:
def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = pearsonr(x_values, y_values)
    return correlation_coefficient

def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    probability = 1 - stats.t.cdf(x, degrees_of_freedom)
    return x, probability

degrees_of_freedom = 39

release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])
mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

correlation_coefficients = {}
x_values_for_all_enzymes = []

for row_index in range(len(release_df)):
    x_values = [release_df.iloc[row_index][str(second_value)] for second_value in second_values]

    desired_length = len(x_values)
    y_values_resized = np.resize(y_values, desired_length)

    correlation_coefficient = calculate_correlation(x_values, y_values_resized)
    correlation_coefficients[row_index] = correlation_coefficient

    enzyme_name = release_df.iloc[row_index, 0]
    x_values_for_all_enzymes.append([enzyme_name] + x_values)

# Sort in descending order
sorted_coefficients = sorted(correlation_coefficients.items(), key=lambda x: x[1], reverse=True)

first_column_values = release_df.iloc[:, 0].values

# Calculate the t and p-value for the sorted coefficients
significant_rows = []
total_probability = 0

for row_index, correlation_coefficient in sorted_coefficients:
    first_value = first_column_values[row_index]  # Get the corresponding enzyme name
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    if total_probability + probability <= 1:
        significant_rows.append((row_index, first_value, correlation_coefficient, x, probability))
        total_probability += probability
    else:
        break

with open("sig_pearson_enzymes.txt", "w") as file:
    for row_index, first_value, correlation_coefficient, x, probability in significant_rows:
        file.write(f"Enzyme = {first_value}, Correlation = {correlation_coefficient}, t-dist = {x}, p-value = {probability}\n")

  release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])
  correlation_coefficient, _ = pearsonr(x_values, y_values)


In [9]:
with open('all_pearson_correlations.txt', 'r') as input_file:
    with open('all_pearson_correlations.tab', 'w', newline='') as tab_output_file:
        tab_writer = csv.writer(tab_output_file, delimiter='\t')
        tab_writer.writerow(['Enzyme', 'Correlation'])

        with open('all_pearson_correlations.csv', 'w', newline='') as csv_output_file:
            csv_writer = csv.writer(csv_output_file)
            csv_writer.writerow(['Enzyme', 'Correlation'])

            for line in input_file:
                line = line.strip()
                parts = line.split(',')

                if len(parts) == 2:
                    enzyme = parts[0].split('=')[1].strip()
                    correlation = float(parts[1].split('=')[1].strip())
                    tab_writer.writerow([enzyme, correlation])
                    csv_writer.writerow([enzyme, correlation])

with open('sig_pearson_enzymes.txt', 'r') as infile:
    lines = infile.readlines()

    with open('sig_pearson_enzymes.tab', 'w', newline='') as tab_output_file:
        tab_writer = csv.writer(tab_output_file, delimiter='\t')
        tab_writer.writerow(['Enzyme', 'Correlation'])

        with open('sig_pearson_enzymes.csv', 'w', newline='') as csv_output_file:
            csv_writer = csv.writer(csv_output_file)
            csv_writer.writerow(['Enzyme', 'Correlation'])

            for line in lines:
                parts = line.strip().split(', ')
                enzyme = parts[0].split(' = ')[1]
                correlation = parts[1].split(' = ')[1]
                tab_writer.writerow([enzyme, correlation])
                csv_writer.writerow([enzyme, correlation])


## Spearman Correlation Calculation

In [10]:
from scipy.stats import spearmanr  # Import spearmanr instead of pearsonr

def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = spearmanr(x_values, y_values)
    return correlation_coefficient

release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])
mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

correlation_coefficients = {}
x_values_for_all_enzymes = []

for row_index in range(len(release_df)):
    x_values = [release_df.iloc[row_index][str(second_value)] for second_value in second_values]

    desired_length = len(x_values)
    y_values_resized = np.resize(y_values, desired_length)

    correlation_coefficient = calculate_correlation(x_values, y_values_resized)
    correlation_coefficients[row_index] = correlation_coefficient

    enzyme_name = release_df.iloc[row_index, 0]
    x_values_for_all_enzymes.append([enzyme_name] + x_values)

sorted_coefficients = sorted(correlation_coefficients.items(), key=lambda x: x[1], reverse=True)

first_column_values = release_df.iloc[:, 0].values

with open("all_spearman_correlations.txt", "w") as file:
    for row_index, correlation_coefficient in sorted_coefficients:
        first_value = first_column_values[row_index]
        file.write(f"Row {row_index}: Enzyme = {first_value}, Spearman Correlation = {correlation_coefficient}\n")

column_names = ['Enzyme'] + [f'Pollution_{y_val}' for y_val in y_values]
x_values_df = pd.DataFrame(x_values_for_all_enzymes, columns=column_names)
x_values_df.to_csv('spearman_x_y_values.csv', index=False)


  release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])
  correlation_coefficient, _ = spearmanr(x_values, y_values)


In [11]:
import numpy as np
import scipy.stats as stats

def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    probability = 1 - stats.t.cdf(x, degrees_of_freedom)
    return x, probability

degrees_of_freedom = 39

with open("all_spearman_correlations.txt", "w") as file:
    for row_index, correlation_coefficient in sorted_coefficients:
        first_value = first_column_values[row_index]  # Get the corresponding first value

        x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        file.write(f"Row {row_index}: First Value = {first_value}, Correlation Coefficient = {correlation_coefficient}, x = {x}, Probability = {probability}\n")

In [12]:
significant_rows = []
total_probability = 0

for row_index, correlation_coefficient in sorted_coefficients:
    first_value = first_column_values[row_index]

    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    if total_probability + probability <= 1:
        significant_rows.append((row_index, first_value, correlation_coefficient, x, probability))
        total_probability += probability
    else:
        break

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

if significant_rows:
    significant_rows = significant_rows[:-1]

with open("sig_spearman_enzymes.txt", "w") as file:
    for row_index, first_value, correlation_coefficient, x, probability in significant_rows:
        file.write(f"Enzyme = {first_value}, Correlation = {correlation_coefficient}\n")
        #, x = {y_values}

In [13]:
import numpy as np
import scipy.stats as stats
import pandas as pd
from scipy.stats import spearmanr

def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = spearmanr(x_values, y_values)
    return correlation_coefficient

def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    probability = 1 - stats.t.cdf(x, degrees_of_freedom)
    return x, probability

degrees_of_freedom = 39

release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

correlation_coefficients = {}
x_values_for_all_enzymes = []

for row_index in range(len(release_df)):
    x_values = [release_df.iloc[row_index][str(second_value)] for second_value in second_values]

    desired_length = len(x_values)
    y_values_resized = np.resize(y_values, desired_length)

    correlation_coefficient = calculate_correlation(x_values, y_values_resized)
    correlation_coefficients[row_index] = correlation_coefficient

    enzyme_name = release_df.iloc[row_index, 0]
    x_values_for_all_enzymes.append([enzyme_name] + x_values)

sorted_coefficients = sorted(correlation_coefficients.items(), key=lambda x: x[1], reverse=True)

first_column_values = release_df.iloc[:, 0].values

significant_rows = []
total_probability = 0

for row_index, correlation_coefficient in sorted_coefficients:
    first_value = first_column_values[row_index]
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    if total_probability + probability <= 1:
        significant_rows.append((row_index, first_value, correlation_coefficient, x, probability))
        total_probability += probability
    else:
        break

with open("sig_spearman_enzymes.txt", "w") as file:
    for row_index, first_value, correlation_coefficient, x, probability in significant_rows:
        file.write(f"Enzyme = {first_value}, Correlation = {correlation_coefficient}, t-dist = {x}, p-value = {probability}\n")


  release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])
  correlation_coefficient, _ = spearmanr(x_values, y_values)


In [14]:
import csv

with open('all_spearman_correlations.txt', 'r') as input_file:
    with open('all_spearman_correlations.tab', 'w', newline='') as tab_output_file:
        tab_writer = csv.writer(tab_output_file, delimiter='\t')
        tab_writer.writerow(['Enzyme', 'Correlation'])

        with open('all_spearman_correlations.csv', 'w', newline='') as csv_output_file:
            csv_writer = csv.writer(csv_output_file)
            csv_writer.writerow(['Enzyme', 'Correlation'])

            for line in input_file:
                line = line.strip()
                parts = line.split(',')

                if len(parts) == 2:
                    enzyme = parts[0].split('=')[1].strip()
                    correlation = float(parts[1].split('=')[1].strip())
                    tab_writer.writerow([enzyme, correlation])
                    csv_writer.writerow([enzyme, correlation])

with open('sig_spearman_enzymes.txt', 'r') as infile:
    lines = infile.readlines()

    with open('sig_spearman_enzymes.tab', 'w', newline='') as tab_output_file:
        tab_writer = csv.writer(tab_output_file, delimiter='\t')
        tab_writer.writerow(['Enzyme', 'Correlation'])

        with open('sig_spearman_enzymes.csv', 'w', newline='') as csv_output_file:
            csv_writer = csv.writer(csv_output_file)
            csv_writer.writerow(['Enzyme', 'Correlation'])

            for line in lines:
                parts = line.strip().split(', ')
                enzyme = parts[0].split(' = ')[1]
                correlation = parts[1].split(' = ')[1]
                tab_writer.writerow([enzyme, correlation])
                csv_writer.writerow([enzyme, correlation])

### Check for overlap in Pearson and Spearman correlations

In [15]:
pearson_data = {}
with open('sig_pearson_enzymes.csv', 'r') as pearson_file:
    reader = csv.reader(pearson_file)
    next(reader)
    for row in reader:
        enzyme = row[0]
        correlation = float(row[1])
        pearson_data[enzyme] = correlation

# Find overlapping enzymes
overlap_data = []
with open('sig_spearman_enzymes.csv', 'r') as spearman_file:
    reader = csv.reader(spearman_file)
    next(reader)
    for row in reader:
        enzyme = row[0]
        spearman_correlation = float(row[1])

        if enzyme in pearson_data:
            pearson_correlation = pearson_data[enzyme]
            overlap_data.append([enzyme, pearson_correlation, spearman_correlation])

with open('enzyme_overlap.csv', 'w', newline='') as overlap_file:
    writer = csv.writer(overlap_file)
    writer.writerow(['Enzyme', 'Pearson_Correlation', 'Spearman_Correlation'])
    for row in overlap_data:
        writer.writerow(row)

# Combination Analysis using Pearson Correlations

### Mean and Standard Deviation



In [16]:
enzyme_names = []
with open('sig_pearson_enzymes.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        enzyme = line.split('=')[1].split(',')[0].strip()
        enzyme_names.append(enzyme)

with open('TARA243.KO.profile.release', 'r') as release_file, open('updated_releasefile.txt', 'w') as updated_file:
    first_row = release_file.readline()
    updated_file.write(first_row)

    for line in release_file:
        data = line.split()
        if data[0] in enzyme_names:
            updated_file.write(line)

In [17]:
# Calculate the mean and standard deviation
import pandas as pd

# get enzyme names from significant enzymes file
enzyme_names = []
with open("sig_pearson_enzymes.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        enzyme = line.split(' ')[2].replace(",", "")
        enzyme_names.append(enzyme)

release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True)

enzyme_rows = {enzyme: [] for enzyme in enzyme_names}

for enzyme in enzyme_names:
    enzyme_rows[enzyme] = release_df[release_df['ko'].str.contains(enzyme)]

with open("p_mean&sd.txt", "w") as file:
    for enzyme, rows in enzyme_rows.items():
        means = rows.iloc[:, 1:].mean(axis=1)
        std_devs = rows.iloc[:, 1:].std(axis=1)
        for index, (mean, std_dev) in enumerate(zip(means, std_devs)):
            file.write(f"Enzyme: {enzyme} Mean: {mean} Standard Deviation: {std_dev}\n")

  release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True)


In [18]:
# Normalize abundance data using significant enzyme's mean and sd
import pandas as pd

mean_sd_values = {}
with open("p_mean&sd.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        data = line.split()
        enzyme = data[1]
        mean = float(data[3])
        std_dev = float(data[6])
        mean_sd_values[enzyme] = {'mean': mean, 'std_dev': std_dev}

normalized_values = []
with open("updated_releasefile.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        data = line.split()
        enzyme = data[0]  # enzyme name should be the first column value

        if enzyme in mean_sd_values:
            mean = mean_sd_values[enzyme]['mean']
            std_dev = mean_sd_values[enzyme]['std_dev']

            # Normalize each value in the row using its corresponding mean and std dev
            normalized_row = [f"{(float(value) - mean) / std_dev}" for value in data[1:]]
            normalized_values.append([enzyme] + normalized_row)

with open('TARA243.KO.profile.release', 'r') as release_file:
    first_row = release_file.readline()

with open('p_normalized_releasefile.txt', 'w') as normalized_file:
    normalized_file.write(first_row)
    for row in normalized_values:
        normalized_file.write('\t'.join(row) + '\n')

In [19]:
#Remove non-related samples

sample_names = []
with open('mapped.txt', 'r') as mapped_file:
    lines = mapped_file.readlines()
    for line in lines:
        sample_name = line.split('\t')[1].strip()
        sample_names.append(sample_name)

with open('p_normalized_releasefile.txt', 'r') as release_file, open('p_related_samples_normalized.txt', 'w') as sorted_file:
    first_row = release_file.readline()

    first_column_index = 0
    indices = [i for i, name in enumerate(first_row.split('\t')) if name.strip() in sample_names]

    sorted_file.write('\t'.join(first_row.split('\t')[i] for i in indices))

    for line in release_file:
        data = line.split('\t')
        sorted_file.write(data[first_column_index] + '\t' + '\t'.join(data[i] for i in indices))

In [20]:
# sort based on the number of positive numbers in each row from highest to lowest
def count_positive(row):
    return sum(1 for value in row.split('\t')[1:] if float(value) > 0)

data_rows = []
with open('p_related_samples_normalized.txt', 'r') as sorted_file:
    header = sorted_file.readline()

    for line in sorted_file:
        data_rows.append(line)

sorted_rows = sorted(data_rows, key=count_positive, reverse=True)

with open('p_sorted_normalized.txt', 'w') as output_file:
    output_file.write(header)
    for row in sorted_rows:
        output_file.write(row)

In [21]:
# find enzymes that have at least n positive values in their row
n = 10

In [22]:
with open('p_sorted_normalized.txt', 'r') as input_file, open('p_single.txt', 'w') as output_file:
    first_row = input_file.readline()
    output_file.write(first_row)

    for line in input_file:
        values = line.strip().split('\t')
        count_positives = sum(float(value) > 0 for value in values[1:])

        if count_positives >= n:
            output_file.write(line)

### Combination of 2

In [23]:
from scipy.stats import binom

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

# Finding combos enzymes that have at least n matching positive values
n = 10

with open('p_normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243
        enzyme_percentages[enzyme_name] = percentage

with open('p_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('p_combo2.txt', 'w') as output_file, open('p_Wilcoxon_y-values2.txt', 'w') as y_file, open('p_Wilcoxon_y-samples2.txt', 'w') as samples_file:
        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Pair', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for i in range(num_rows - 1):
            for j in range(i + 1, num_rows):
                count_matches = 0
                positive_values = []

                for k in range(1, len(sample_names) + 1):
                    value_i = float(lines[i].strip().split('\t')[k])
                    value_j = float(lines[j].strip().split('\t')[k])

                    if value_i > 0 and value_j > 0:
                        count_matches += 1
                        positive_values.append(f'{value_i}, {value_j}')

                if count_matches >= n:
                    sample1 = lines[i].strip().split('\t')[0]
                    sample2 = lines[j].strip().split('\t')[0]

                    # Get the percentage for each enzyme from the dictionary
                    percentage_sample1 = enzyme_percentages.get(sample1, 0)
                    percentage_sample2 = enzyme_percentages.get(sample2, 0)

                    # Calculate the fraction based on the percentage of positive values
                    fraction = percentage_sample1 * percentage_sample2

                    # Calculate the distribution
                    distribution = calculate_binomial_distribution(count_matches, 41, fraction)

                    # Calculate the p-value (sum of binomial probabilities from k to 41)
                    p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                    if p_value < 0.01:
                        output_file.write(f'{sample1}\t{sample2}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                        pair_name = f'{sample1}-{sample2}'
                        values_str = ', '.join(positive_values)
                        y_file.write(f'{pair_name}\t{values_str}\n')

                        # Replace each positive value with its corresponding sample name
                        for idx, value in enumerate(positive_values):
                            sample_name = sample_names[idx]
                            values_str = values_str.replace(value, sample_name)

                        samples_file.write(f'{pair_name}\t{values_str}\n')

In [24]:
# Take out positive values that are not in Wilcoxon_y-values2.txt and only in single.txt
y_values = {}
with open('p_Wilcoxon_y-values2.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        pair_name = parts[0]
        values_str = parts[1]
        y_values[pair_name] = [float(value) for value in values_str.split(', ')]

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values2.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Values']) + '\n')

    for pair_name, y_positive_values in y_values.items():
        sample_names = pair_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                x_output_file.write(f'{pair_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [25]:
# Replace positive x values with sample names
y_values = {}
with open('p_Wilcoxon_y-values2.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        pair_name = parts[0]
        values_str = parts[1]
        y_values[pair_name] = [float(value) for value in values_str.split(', ')]

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values2.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Pair', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for pair_name, y_positive_values in y_values.items():
        sample_names = pair_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) - 1]
                    x_output_file.write(f'{pair_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [26]:
# Replace sample names with pollution data from mapped.txt and perform Mann U Whitney Test

def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('p_Wilcoxon_y-samples2.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        pair_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[pair_name] = sample_values

x_values = {}
with open('p_Wilcoxon_x-values2.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        pair_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{pair_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

from scipy.stats import mannwhitneyu

with open('p_MannU2.txt', 'w') as mann_u_file:
    mann_u_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')
    for key, x_sample_values in x_values.items():
        pair_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(pair_name, [])

        # Mann-Whitney U test
        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{pair_name}\t{enzyme_name_x}\t{p_value}\n')

In [27]:
# Open MannU2.txt and copy the enzyme pairs with p-values less than 0.1 for both enzymes to sig_combo2.txt
with open('p_MannU2.txt', 'r') as mann_u_file, open('p_sig_combo2.txt', 'w') as output_file:

    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])

        if mann_u_value < 0.1:
            next_values = next(mann_u_file, None)
            if next_values is not None:
                next_values = next_values.strip().split('\t')
                next_enzyme_name = next_values[1]
                next_mann_u_value = float(next_values[2])

                if next_mann_u_value < 0.1:
                    output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                    output_file.write('\t'.join([enzyme_pair, next_enzyme_name, str(next_mann_u_value)]) + '\n')

### Combo3

In [28]:
from scipy.stats import binom
from itertools import combinations

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

with open('p_normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243
        enzyme_percentages[enzyme_name] = percentage

with open('p_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('p_combo3.txt', 'w') as output_file, open('p_Wilcoxon_y-values3.txt', 'w') as y_file, open('p_Wilcoxon_y-samples3.txt', 'w') as samples_file:
        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Trio', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for combo in combinations(range(num_rows), 3):
            i, j, k = combo

            count_matches = 0
            positive_values = []

            for m in range(1, len(sample_names) + 1):
                value_i = float(lines[i].strip().split('\t')[m])
                value_j = float(lines[j].strip().split('\t')[m])
                value_k = float(lines[k].strip().split('\t')[m])

                if all(value > 0 for value in [value_i, value_j, value_k]):
                    count_matches += 1
                    positive_values.append(f'{value_i}, {value_j}, {value_k}')

            if count_matches >= n:
                sample1 = lines[i].strip().split('\t')[0]
                sample2 = lines[j].strip().split('\t')[0]
                sample3 = lines[k].strip().split('\t')[0]

                percentage_sample1 = enzyme_percentages.get(sample1, 0)
                percentage_sample2 = enzyme_percentages.get(sample2, 0)
                percentage_sample3 = enzyme_percentages.get(sample3, 0)

                fraction = percentage_sample1 * percentage_sample2 * percentage_sample3

                distribution = calculate_binomial_distribution(count_matches, 41, fraction)

                p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                if p_value < 0.01:
                    output_file.write(f'{sample1}\t{sample2}\t{sample3}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    trio_name = f'{sample1}-{sample2}-{sample3}'
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{trio_name}\t{values_str}\n')

                    for idx, value in enumerate(positive_values):
                        sample_name = sample_names[idx]
                        values_str = values_str.replace(value, sample_name)

                    samples_file.write(f'{trio_name}\t{values_str}\n')


In [29]:
y_values = {}
with open('p_Wilcoxon_y-values3.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = set(values_str.split(', '))

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values3.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [30]:
y_values = {}
with open('p_Wilcoxon_y-values3.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values3.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) -1]
                    x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [31]:
def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('p_Wilcoxon_y-samples3.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[set_name] = sample_values

x_values = {}
with open('p_Wilcoxon_x-values3.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{set_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

from scipy.stats import mannwhitneyu

with open('p_MannU3.txt', 'w') as mann_u_file:
    mann_u_file.write('\t'.join(['Enzyme Set', 'Enzyme Name', 'Mann U']) + '\n')

    for key, x_sample_values in x_values.items():
        set_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(set_name, [])

        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')

In [32]:
with open('p_MannU3.txt', 'r') as mann_u_file, open('p_sig_combo3.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        next_values_2 = next(mann_u_file, None)
        next_values_3 = next(mann_u_file, None)

        if next_values_2 is not None and next_values_3 is not None:
            next_values_2 = next_values_2.strip().split('\t')
            next_values_3 = next_values_3.strip().split('\t')

            next_enzyme_name_2 = next_values_2[1]
            next_mann_u_value_2 = float(next_values_2[2])

            next_enzyme_name_3 = next_values_3[1]
            next_mann_u_value_3 = float(next_values_3[2])

            if next_mann_u_value_2 < 0.1 and next_mann_u_value_3 < 0.1 and mann_u_value < 0.1:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_2, str(next_mann_u_value_2)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_3, str(next_mann_u_value_3)]) + '\n')


### Combo4

In [33]:
from scipy.stats import binom
from itertools import combinations

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

with open('p_normalized_releasefile.txt', 'r') as enzyme_file:

    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243  # Assuming 243 as the total number of values
        enzyme_percentages[enzyme_name] = percentage

with open('p_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('p_combo4.txt', 'w') as output_file, open('p_Wilcoxon_y-values4.txt', 'w') as y_file, open('p_Wilcoxon_y-samples4.txt', 'w') as samples_file:
        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Enzyme4', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Quadruplet', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for combo in combinations(range(num_rows), 4):
            i, j, k, l = combo

            count_matches = 0
            positive_values = []

            for m in range(1, len(sample_names) + 1):
                value_i = float(lines[i].strip().split('\t')[m])
                value_j = float(lines[j].strip().split('\t')[m])
                value_k = float(lines[k].strip().split('\t')[m])
                value_l = float(lines[l].strip().split('\t')[m])

                if all(value > 0 for value in [value_i, value_j, value_k, value_l]):
                    count_matches += 1
                    positive_values.append(f'{value_i}, {value_j}, {value_k}, {value_l}')

            if count_matches >= n:
                sample1 = lines[i].strip().split('\t')[0]
                sample2 = lines[j].strip().split('\t')[0]
                sample3 = lines[k].strip().split('\t')[0]
                sample4 = lines[l].strip().split('\t')[0]

                percentage_sample1 = enzyme_percentages.get(sample1, 0)
                percentage_sample2 = enzyme_percentages.get(sample2, 0)
                percentage_sample3 = enzyme_percentages.get(sample3, 0)
                percentage_sample4 = enzyme_percentages.get(sample4, 0)

                # Calculate the fraction based on the percentage of positive values
                fraction = percentage_sample1 * percentage_sample2 * percentage_sample3 * percentage_sample4

                # Calculate the distribution
                distribution = calculate_binomial_distribution(count_matches, 41, fraction)

                p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                if p_value < 0.01:
                    output_file.write(f'{sample1}\t{sample2}\t{sample3}\t{sample4}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    quadruplet_name = f'{sample1}-{sample2}-{sample3}-{sample4}'
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{quadruplet_name}\t{values_str}\n')

                    for idx, value in enumerate(positive_values):
                        sample_name = sample_names[idx]
                        values_str = values_str.replace(value, sample_name)

                    samples_file.write(f'{quadruplet_name}\t{values_str}\n')

In [34]:
y_values = {}
with open('p_Wilcoxon_y-values4.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = set(values_str.split(', '))

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values4.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [35]:
y_values = {}
with open('p_Wilcoxon_y-values4.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values4.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) - 1]
                    x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [36]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('p_Wilcoxon_y-samples4.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[set_name] = sample_values

x_values = {}
with open('p_Wilcoxon_x-values4.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{set_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

with open('p_MannU4.txt', 'w') as mann_u_file:
    mann_u_file.write('\t'.join(['Enzyme Set', 'Enzyme Name', 'Mann U']) + '\n')

    for key, x_sample_values in x_values.items():
        set_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(set_name, [])

        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')

In [37]:
with open('p_MannU4.txt', 'r') as mann_u_file, open('p_sig_combo4.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        next_values_2 = next(mann_u_file, None)
        next_values_3 = next(mann_u_file, None)
        next_values_4 = next(mann_u_file, None)

        if next_values_2 is not None and next_values_3 is not None and next_values_4 is not None:
            next_values_2 = next_values_2.strip().split('\t')
            next_values_3 = next_values_3.strip().split('\t')
            next_values_4 = next_values_4.strip().split('\t')

            next_enzyme_name_2 = next_values_2[1]
            next_mann_u_value_2 = float(next_values_2[2])

            next_enzyme_name_3 = next_values_3[1]
            next_mann_u_value_3 = float(next_values_3[2])

            next_enzyme_name_4 = next_values_4[1]
            next_mann_u_value_4 = float(next_values_4[2])

            if next_mann_u_value_2 < 0.1 and next_mann_u_value_3 < 0.1 and mann_u_value < 0.1 and next_mann_u_value_4 < 0.1:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_2, str(next_mann_u_value_2)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_3, str(next_mann_u_value_3)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_4, str(next_mann_u_value_4)]) + '\n')

### Combo5

In [38]:
from scipy.stats import binom
from itertools import combinations

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

with open('p_normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243
        enzyme_percentages[enzyme_name] = percentage

with open('p_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('p_combo5.txt', 'w') as output_file, open('p_Wilcoxon_y-values5.txt', 'w') as y_file, open('p_Wilcoxon_y-samples5.txt', 'w') as samples_file:

        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Enzyme4', 'Enzyme5', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Quintuplet', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for combo in combinations(range(num_rows), 5):
            i, j, k, l, m = combo

            count_matches = 0
            positive_values = []

            for a in range(1, len(sample_names) + 1):
                value_i = float(lines[i].strip().split('\t')[a])
                value_j = float(lines[j].strip().split('\t')[a])
                value_k = float(lines[k].strip().split('\t')[a])
                value_l = float(lines[l].strip().split('\t')[a])
                value_m = float(lines[m].strip().split('\t')[a])

                if all(value > 0 for value in [value_i, value_j, value_k, value_l, value_m]):
                    count_matches += 1
                    positive_values.append(f'{value_i}, {value_j}, {value_k}, {value_l}, {value_m}')

            if count_matches >= n:
                sample1 = lines[i].strip().split('\t')[0]
                sample2 = lines[j].strip().split('\t')[0]
                sample3 = lines[k].strip().split('\t')[0]
                sample4 = lines[l].strip().split('\t')[0]
                sample5 = lines[m].strip().split('\t')[0]

                percentage_sample1 = enzyme_percentages.get(sample1, 0)
                percentage_sample2 = enzyme_percentages.get(sample2, 0)
                percentage_sample3 = enzyme_percentages.get(sample3, 0)
                percentage_sample4 = enzyme_percentages.get(sample4, 0)
                percentage_sample5 = enzyme_percentages.get(sample5, 0)

                fraction = percentage_sample1 * percentage_sample2 * percentage_sample3 * percentage_sample4 * percentage_sample5
                distribution = calculate_binomial_distribution(count_matches, 41, fraction)
                p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                if p_value < 0.01:
                    output_file.write(f'{sample1}\t{sample2}\t{sample3}\t{sample4}\t{sample5}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    quintuplet_name = f'{sample1}-{sample2}-{sample3}-{sample4}-{sample5}'
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{quintuplet_name}\t{values_str}\n')

                    for idx, value in enumerate(positive_values):
                        sample_name = sample_names[idx]
                        values_str = values_str.replace(value, sample_name)

                    samples_file.write(f'{quintuplet_name}\t{values_str}\n')

In [39]:
y_values = {}
with open('p_Wilcoxon_y-values5.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = set(values_str.split(', '))

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values5.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [40]:
y_values = {}
with open('p_Wilcoxon_y-values5.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values5.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)

            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) - 1]
                    x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [41]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('p_Wilcoxon_y-samples5.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[set_name] = sample_values

x_values = {}
with open('p_Wilcoxon_x-values5.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{set_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

with open('p_MannU5.txt', 'w') as mann_u_file:

    mann_u_file.write('\t'.join(['Enzyme Set', 'Enzyme Name', 'Mann U']) + '\n')

    for key, x_sample_values in x_values.items():
        set_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(set_name, [])

        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')

In [42]:
with open('p_MannU5.txt', 'r') as mann_u_file, open('p_sig_combo5.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        next_values_2 = next(mann_u_file, None)
        next_values_3 = next(mann_u_file, None)
        next_values_4 = next(mann_u_file, None)
        next_values_5 = next(mann_u_file, None)

        if next_values_2 is not None and next_values_3 is not None and next_values_4 is not None and next_values_5 is not None:
            next_values_2 = next_values_2.strip().split('\t')
            next_values_3 = next_values_3.strip().split('\t')
            next_values_4 = next_values_4.strip().split('\t')
            next_values_5 = next_values_5.strip().split('\t')

            next_enzyme_name_2 = next_values_2[1]
            next_mann_u_value_2 = float(next_values_2[2])

            next_enzyme_name_3 = next_values_3[1]
            next_mann_u_value_3 = float(next_values_3[2])

            next_enzyme_name_4 = next_values_4[1]
            next_mann_u_value_4 = float(next_values_4[2])

            next_enzyme_name_5 = next_values_5[1]
            next_mann_u_value_5 = float(next_values_5[2])

            if next_mann_u_value_2 < 0.1 and next_mann_u_value_3 < 0.1 and mann_u_value < 0.1 and next_mann_u_value_4 < 0.1 and next_mann_u_value_5 < 0.1:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_2, str(next_mann_u_value_2)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_3, str(next_mann_u_value_3)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_4, str(next_mann_u_value_4)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_5, str(next_mann_u_value_5)]) + '\n')


### Combo6

In [None]:
from scipy.stats import binom
from itertools import combinations
def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

with open('p_normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243
        enzyme_percentages[enzyme_name] = percentage

with open('p_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('p_combo6.txt', 'w') as output_file, open('p_Wilcoxon_y-values6.txt', 'w') as y_file, open('p_Wilcoxon_y-samples6.txt', 'w') as samples_file:

        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Enzyme4', 'Enzyme5', 'Enzyme6', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Set', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for combo in combinations(range(num_rows), 6):
            i, j, k, l, m, o = combo

            count_matches = 0
            positive_values = []

            for a in range(1, len(sample_names) + 1):
                value_i = float(lines[i].strip().split('\t')[a])
                value_j = float(lines[j].strip().split('\t')[a])
                value_k = float(lines[k].strip().split('\t')[a])
                value_l = float(lines[l].strip().split('\t')[a])
                value_m = float(lines[m].strip().split('\t')[a])
                value_o = float(lines[o].strip().split('\t')[a])

                if all(value > 0 for value in [value_i, value_j, value_k, value_l, value_m, value_o]):
                    count_matches += 1
                    positive_values.append(f'{value_i}, {value_j}, {value_k}, {value_l}, {value_m}, {value_o}')

            if count_matches >= n:
                sample1 = lines[i].strip().split('\t')[0]
                sample2 = lines[j].strip().split('\t')[0]
                sample3 = lines[k].strip().split('\t')[0]
                sample4 = lines[l].strip().split('\t')[0]
                sample5 = lines[m].strip().split('\t')[0]
                sample6 = lines[o].strip().split('\t')[0]

                percentage_sample1 = enzyme_percentages.get(sample1, 0)
                percentage_sample2 = enzyme_percentages.get(sample2, 0)
                percentage_sample3 = enzyme_percentages.get(sample3, 0)
                percentage_sample4 = enzyme_percentages.get(sample4, 0)
                percentage_sample5 = enzyme_percentages.get(sample5, 0)
                percentage_sample6 = enzyme_percentages.get(sample6, 0)

                fraction = percentage_sample1 * percentage_sample2 * percentage_sample3 * percentage_sample4 * percentage_sample5 * percentage_sample6

                distribution = calculate_binomial_distribution(count_matches, 41, fraction)

                p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                if p_value < 0.01:
                    output_file.write(f'{sample1}\t{sample2}\t{sample3}\t{sample4}\t{sample5}\t{sample6}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    set_name = f'{sample1}-{sample2}-{sample3}-{sample4}-{sample5}-{sample6}'
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{set_name}\t{values_str}\n')

                    for idx, value in enumerate(positive_values):
                        sample_name = sample_names[idx]
                        values_str = values_str.replace(value, sample_name)

                    samples_file.write(f'{set_name}\t{values_str}\n')

In [None]:
y_values = {}
with open('p_Wilcoxon_y-values6.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = set(values_str.split(', '))

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values6.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [None]:
y_values = {}
with open('p_Wilcoxon_y-values6.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

with open('p_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('p_Wilcoxon_x-values6.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) - 1]
                    x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [None]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('p_Wilcoxon_y-samples6.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[set_name] = sample_values

x_values = {}
with open('p_Wilcoxon_x-values6.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{set_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

with open('p_MannU6.txt', 'w') as mann_u_file:

    mann_u_file.write('\t'.join(['Enzyme Set', 'Enzyme Name', 'Mann U']) + '\n')

    for key, x_sample_values in x_values.items():

        set_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(set_name, [])
        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')

In [None]:
with open('p_MannU6.txt', 'r') as mann_u_file, open('p_sig_combo6.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        next_values_2 = next(mann_u_file, None)
        next_values_3 = next(mann_u_file, None)
        next_values_4 = next(mann_u_file, None)
        next_values_5 = next(mann_u_file, None)
        next_values_6 = next(mann_u_file, None)

        if next_values_2 is not None and next_values_3 is not None and next_values_4 is not None and next_values_5 is not None and next_values_6 is not None:
            next_values_2 = next_values_2.strip().split('\t')
            next_values_3 = next_values_3.strip().split('\t')
            next_values_4 = next_values_4.strip().split('\t')
            next_values_5 = next_values_5.strip().split('\t')
            next_values_6 = next_values_6.strip().split('\t')

            next_enzyme_name_2 = next_values_2[1]
            next_mann_u_value_2 = float(next_values_2[2])

            next_enzyme_name_3 = next_values_3[1]
            next_mann_u_value_3 = float(next_values_3[2])

            next_enzyme_name_4 = next_values_4[1]
            next_mann_u_value_4 = float(next_values_4[2])

            next_enzyme_name_5 = next_values_5[1]
            next_mann_u_value_5 = float(next_values_5[2])

            next_enzyme_name_6 = next_values_6[1]
            next_mann_u_value_6 = float(next_values_6[2])

            if next_mann_u_value_2 < 0.1 and next_mann_u_value_3 < 0.1 and mann_u_value < 0.1 and next_mann_u_value_4 < 0.1 and next_mann_u_value_5 < 0.1 and next_mann_u_value_6 < 0.1:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_2, str(next_mann_u_value_2)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_3, str(next_mann_u_value_3)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_4, str(next_mann_u_value_4)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_5, str(next_mann_u_value_5)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_6, str(next_mann_u_value_6)]) + '\n')


# Combination Analysis using Spearman Correlation

### Mean and Standard Deviation

In [None]:
enzyme_names = []
with open('sig_spearman_enzymes.txt', 'r') as file:
    lines = file.readlines()
    for line in lines:
        enzyme = line.split('=')[1].split(',')[0].strip()
        enzyme_names.append(enzyme)

with open('TARA243.KO.profile.release', 'r') as release_file, open('updated_releasefile.txt', 'w') as updated_file:
    first_row = release_file.readline()
    updated_file.write(first_row)

    for line in release_file:
        data = line.split()
        if data[0] in enzyme_names:
            updated_file.write(line)

In [None]:
enzyme_names = []
with open("sig_spearman_enzymes.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        enzyme = line.split(' ')[2].replace(",", "")
        enzyme_names.append(enzyme)

release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True)

enzyme_rows = {enzyme: [] for enzyme in enzyme_names}

for enzyme in enzyme_names:
    enzyme_rows[enzyme] = release_df[release_df['ko'].str.contains(enzyme)]

with open("s_mean&sd.txt", "w") as file:
    for enzyme, rows in enzyme_rows.items():
        means = rows.iloc[:, 1:].mean(axis=1)
        std_devs = rows.iloc[:, 1:].std(axis=1)
        for index, (mean, std_dev) in enumerate(zip(means, std_devs)):
            file.write(f"Enzyme: {enzyme} Mean: {mean} Standard Deviation: {std_dev}\n")

In [None]:
mean_sd_values = {}
with open("s_mean&sd.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        data = line.split()
        enzyme = data[1]
        mean = float(data[3])
        std_dev = float(data[6])
        mean_sd_values[enzyme] = {'mean': mean, 'std_dev': std_dev}

normalized_values = []
with open("updated_releasefile.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        data = line.split()
        enzyme = data[0]

        if enzyme in mean_sd_values:
            mean = mean_sd_values[enzyme]['mean']
            std_dev = mean_sd_values[enzyme]['std_dev']
            normalized_row = [f"{(float(value) - mean) / std_dev}" for value in data[1:]]
            normalized_values.append([enzyme] + normalized_row)

with open('TARA243.KO.profile.release', 'r') as release_file:
    first_row = release_file.readline()

with open('s_normalized_releasefile.txt', 'w') as normalized_file:
    normalized_file.write(first_row)

    for row in normalized_values:
        normalized_file.write('\t'.join(row) + '\n')

In [None]:
sample_names = []
with open('mapped.txt', 'r') as mapped_file:
    lines = mapped_file.readlines()
    for line in lines:
        sample_name = line.split('\t')[1].strip()
        sample_names.append(sample_name)

with open('s_normalized_releasefile.txt', 'r') as release_file, open('s_related_samples_normalized.txt', 'w') as sorted_file:

    first_row = release_file.readline()
    first_column_index = 0
    indices = [i for i, name in enumerate(first_row.split('\t')) if name.strip() in sample_names]
    sorted_file.write('\t'.join(first_row.split('\t')[i] for i in indices))

    for line in release_file:
        data = line.split('\t')
        sorted_file.write(data[first_column_index] + '\t' + '\t'.join(data[i] for i in indices))

In [None]:
def count_positive(row):
    return sum(1 for value in row.split('\t')[1:] if float(value) > 0)

data_rows = []
with open('s_related_samples_normalized.txt', 'r') as sorted_file:

    header = sorted_file.readline()
    for line in sorted_file:
        data_rows.append(line)

sorted_rows = sorted(data_rows, key=count_positive, reverse=True)

with open('s_sorted_normalized.txt', 'w') as output_file:

    output_file.write(header)

    for row in sorted_rows:
        output_file.write(row)

In [None]:
# Finding enzymes that have at least n positive values in their row
n = 10

with open('s_sorted_normalized.txt', 'r') as input_file, open('s_single.txt', 'w') as output_file:
    first_row = input_file.readline()
    output_file.write(first_row)

    for line in input_file:
        values = line.strip().split('\t')
        count_positives = sum(float(value) > 0 for value in values[1:])

        if count_positives >= n:
            output_file.write(line)

### Combo2

In [None]:
from scipy.stats import binom

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

n = 10

with open('s_normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243
        enzyme_percentages[enzyme_name] = percentage

with open('s_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('s_combo2.txt', 'w') as output_file, open('s_Wilcoxon_y-values2.txt', 'w') as y_file, open('s_Wilcoxon_y-samples2.txt', 'w') as samples_file:
        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Pair', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for i in range(num_rows - 1):
            for j in range(i + 1, num_rows):
                count_matches = 0
                positive_values = []

                for k in range(1, len(sample_names) + 1):
                    value_i = float(lines[i].strip().split('\t')[k])
                    value_j = float(lines[j].strip().split('\t')[k])

                    if value_i > 0 and value_j > 0:
                        count_matches += 1
                        positive_values.append(f'{value_i}, {value_j}')

                if count_matches >= n:
                    sample1 = lines[i].strip().split('\t')[0]
                    sample2 = lines[j].strip().split('\t')[0]

                    percentage_sample1 = enzyme_percentages.get(sample1, 0)
                    percentage_sample2 = enzyme_percentages.get(sample2, 0)

                    fraction = percentage_sample1 * percentage_sample2
                    distribution = calculate_binomial_distribution(count_matches, 41, fraction)
                    p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                    if p_value < 0.01:
                        output_file.write(f'{sample1}\t{sample2}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                        pair_name = f'{sample1}-{sample2}'
                        values_str = ', '.join(positive_values)
                        y_file.write(f'{pair_name}\t{values_str}\n')

                        for idx, value in enumerate(positive_values):
                            sample_name = sample_names[idx]
                            values_str = values_str.replace(value, sample_name)

                        samples_file.write(f'{pair_name}\t{values_str}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values2.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        pair_name = parts[0]
        values_str = parts[1]
        y_values[pair_name] = [float(value) for value in values_str.split(', ')]

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values2.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Values']) + '\n')

    for pair_name, y_positive_values in y_values.items():
        sample_names = pair_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                x_output_file.write(f'{pair_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values2.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        pair_name = parts[0]
        values_str = parts[1]
        y_values[pair_name] = [float(value) for value in values_str.split(', ')]

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values2.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Pair', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for pair_name, y_positive_values in y_values.items():
        sample_names = pair_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) - 1]  # Shift the index by one
                    x_output_file.write(f'{pair_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [None]:
def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('s_Wilcoxon_y-samples2.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        pair_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[pair_name] = sample_values

x_values = {}
with open('s_Wilcoxon_x-values2.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        pair_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{pair_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

from scipy.stats import mannwhitneyu

with open('s_MannU2.txt', 'w') as mann_u_file:
    mann_u_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')
    for key, x_sample_values in x_values.items():
        pair_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(pair_name, [])

        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{pair_name}\t{enzyme_name_x}\t{p_value}\n')

In [None]:
with open('s_MannU2.txt', 'r') as mann_u_file, open('s_sig_combo2.txt', 'w') as output_file:

    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])

        if mann_u_value < 0.1:
            next_values = next(mann_u_file, None)
            if next_values is not None:
                next_values = next_values.strip().split('\t')
                next_enzyme_name = next_values[1]
                next_mann_u_value = float(next_values[2])

                if next_mann_u_value < 0.1:
                    output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                    output_file.write('\t'.join([enzyme_pair, next_enzyme_name, str(next_mann_u_value)]) + '\n')

### Combo3

In [None]:
from scipy.stats import binom
from itertools import combinations

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

with open('s_normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243
        enzyme_percentages[enzyme_name] = percentage

with open('s_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('s_combo3.txt', 'w') as output_file, open('s_Wilcoxon_y-values3.txt', 'w') as y_file, open('s_Wilcoxon_y-samples3.txt', 'w') as samples_file:
        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Trio', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for combo in combinations(range(num_rows), 3):
            i, j, k = combo

            count_matches = 0
            positive_values = []

            for m in range(1, len(sample_names) + 1):
                value_i = float(lines[i].strip().split('\t')[m])
                value_j = float(lines[j].strip().split('\t')[m])
                value_k = float(lines[k].strip().split('\t')[m])

                if all(value > 0 for value in [value_i, value_j, value_k]):
                    count_matches += 1
                    positive_values.append(f'{value_i}, {value_j}, {value_k}')

            if count_matches >= n:
                sample1 = lines[i].strip().split('\t')[0]
                sample2 = lines[j].strip().split('\t')[0]
                sample3 = lines[k].strip().split('\t')[0]

                percentage_sample1 = enzyme_percentages.get(sample1, 0)
                percentage_sample2 = enzyme_percentages.get(sample2, 0)
                percentage_sample3 = enzyme_percentages.get(sample3, 0)

                fraction = percentage_sample1 * percentage_sample2 * percentage_sample3

                distribution = calculate_binomial_distribution(count_matches, 41, fraction)

                p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                if p_value < 0.01:
                    output_file.write(f'{sample1}\t{sample2}\t{sample3}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    trio_name = f'{sample1}-{sample2}-{sample3}'
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{trio_name}\t{values_str}\n')

                    for idx, value in enumerate(positive_values):
                        sample_name = sample_names[idx]
                        values_str = values_str.replace(value, sample_name)

                    samples_file.write(f'{trio_name}\t{values_str}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values3.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = set(values_str.split(', '))

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values3.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values3.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values3.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) -1]
                    x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')


In [None]:
def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('s_Wilcoxon_y-samples3.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[set_name] = sample_values

x_values = {}
with open('s_Wilcoxon_x-values3.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{set_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

from scipy.stats import mannwhitneyu

with open('s_MannU3.txt', 'w') as mann_u_file:
    mann_u_file.write('\t'.join(['Enzyme Set', 'Enzyme Name', 'Mann U']) + '\n')

    for key, x_sample_values in x_values.items():
        set_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(set_name, [])

        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')

In [None]:
with open('s_MannU3.txt', 'r') as mann_u_file, open('s_sig_combo3.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        next_values_2 = next(mann_u_file, None)
        next_values_3 = next(mann_u_file, None)

        if next_values_2 is not None and next_values_3 is not None:
            next_values_2 = next_values_2.strip().split('\t')
            next_values_3 = next_values_3.strip().split('\t')

            next_enzyme_name_2 = next_values_2[1]
            next_mann_u_value_2 = float(next_values_2[2])

            next_enzyme_name_3 = next_values_3[1]
            next_mann_u_value_3 = float(next_values_3[2])

            if next_mann_u_value_2 < 0.1 and next_mann_u_value_3 < 0.1 and mann_u_value < 0.1:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_2, str(next_mann_u_value_2)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_3, str(next_mann_u_value_3)]) + '\n')


### Combo4

In [None]:
from scipy.stats import binom
from itertools import combinations

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

with open('s_normalized_releasefile.txt', 'r') as enzyme_file:

    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243  # Assuming 243 as the total number of values
        enzyme_percentages[enzyme_name] = percentage

with open('s_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('s_combo4.txt', 'w') as output_file, open('s_Wilcoxon_y-values4.txt', 'w') as y_file, open('s_Wilcoxon_y-samples4.txt', 'w') as samples_file:
        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Enzyme4', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Quadruplet', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for combo in combinations(range(num_rows), 4):
            i, j, k, l = combo
            count_matches = 0
            positive_values = []

            for m in range(1, len(sample_names) + 1):
                value_i = float(lines[i].strip().split('\t')[m])
                value_j = float(lines[j].strip().split('\t')[m])
                value_k = float(lines[k].strip().split('\t')[m])
                value_l = float(lines[l].strip().split('\t')[m])

                if all(value > 0 for value in [value_i, value_j, value_k, value_l]):
                    count_matches += 1
                    positive_values.append(f'{value_i}, {value_j}, {value_k}, {value_l}')

            if count_matches >= n:
                sample1 = lines[i].strip().split('\t')[0]
                sample2 = lines[j].strip().split('\t')[0]
                sample3 = lines[k].strip().split('\t')[0]
                sample4 = lines[l].strip().split('\t')[0]

                percentage_sample1 = enzyme_percentages.get(sample1, 0)
                percentage_sample2 = enzyme_percentages.get(sample2, 0)
                percentage_sample3 = enzyme_percentages.get(sample3, 0)
                percentage_sample4 = enzyme_percentages.get(sample4, 0)

                fraction = percentage_sample1 * percentage_sample2 * percentage_sample3 * percentage_sample4
                distribution = calculate_binomial_distribution(count_matches, 41, fraction)
                p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                if p_value < 0.01:
                    output_file.write(f'{sample1}\t{sample2}\t{sample3}\t{sample4}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    quadruplet_name = f'{sample1}-{sample2}-{sample3}-{sample4}'
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{quadruplet_name}\t{values_str}\n')

                    for idx, value in enumerate(positive_values):
                        sample_name = sample_names[idx]
                        values_str = values_str.replace(value, sample_name)

                    samples_file.write(f'{quadruplet_name}\t{values_str}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values4.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = set(values_str.split(', '))

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values4.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values4.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values4.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) - 1]
                    x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [None]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('s_Wilcoxon_y-samples4.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[set_name] = sample_values

x_values = {}
with open('s_Wilcoxon_x-values4.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{set_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

with open('s_MannU4.txt', 'w') as mann_u_file:

    mann_u_file.write('\t'.join(['Enzyme Set', 'Enzyme Name', 'Mann U']) + '\n')

    for key, x_sample_values in x_values.items():
        set_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(set_name, [])

        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')


In [None]:
with open('s_MannU4.txt', 'r') as mann_u_file, open('s_sig_combo4.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        next_values_2 = next(mann_u_file, None)
        next_values_3 = next(mann_u_file, None)
        next_values_4 = next(mann_u_file, None)

        if next_values_2 is not None and next_values_3 is not None and next_values_4 is not None:
            next_values_2 = next_values_2.strip().split('\t')
            next_values_3 = next_values_3.strip().split('\t')
            next_values_4 = next_values_4.strip().split('\t')

            next_enzyme_name_2 = next_values_2[1]
            next_mann_u_value_2 = float(next_values_2[2])

            next_enzyme_name_3 = next_values_3[1]
            next_mann_u_value_3 = float(next_values_3[2])

            next_enzyme_name_4 = next_values_4[1]
            next_mann_u_value_4 = float(next_values_4[2])

            if next_mann_u_value_2 < 0.1 and next_mann_u_value_3 < 0.1 and mann_u_value < 0.1 and next_mann_u_value_4 < 0.1:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_2, str(next_mann_u_value_2)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_3, str(next_mann_u_value_3)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_4, str(next_mann_u_value_4)]) + '\n')

### Combo5

In [None]:
from scipy.stats import binom
from itertools import combinations

def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

with open('s_normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243
        enzyme_percentages[enzyme_name] = percentage

with open('s_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('s_combo5.txt', 'w') as output_file, open('s_Wilcoxon_y-values5.txt', 'w') as y_file, open('s_Wilcoxon_y-samples5.txt', 'w') as samples_file:

        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Enzyme4', 'Enzyme5', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Quintuplet', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for combo in combinations(range(num_rows), 5):
            i, j, k, l, m = combo
            count_matches = 0
            positive_values = []

            for a in range(1, len(sample_names) + 1):
                value_i = float(lines[i].strip().split('\t')[a])
                value_j = float(lines[j].strip().split('\t')[a])
                value_k = float(lines[k].strip().split('\t')[a])
                value_l = float(lines[l].strip().split('\t')[a])
                value_m = float(lines[m].strip().split('\t')[a])

                if all(value > 0 for value in [value_i, value_j, value_k, value_l, value_m]):
                    count_matches += 1
                    positive_values.append(f'{value_i}, {value_j}, {value_k}, {value_l}, {value_m}')

            if count_matches >= n:
                sample1 = lines[i].strip().split('\t')[0]
                sample2 = lines[j].strip().split('\t')[0]
                sample3 = lines[k].strip().split('\t')[0]
                sample4 = lines[l].strip().split('\t')[0]
                sample5 = lines[m].strip().split('\t')[0]

                percentage_sample1 = enzyme_percentages.get(sample1, 0)
                percentage_sample2 = enzyme_percentages.get(sample2, 0)
                percentage_sample3 = enzyme_percentages.get(sample3, 0)
                percentage_sample4 = enzyme_percentages.get(sample4, 0)
                percentage_sample5 = enzyme_percentages.get(sample5, 0)

                fraction = percentage_sample1 * percentage_sample2 * percentage_sample3 * percentage_sample4 * percentage_sample5
                distribution = calculate_binomial_distribution(count_matches, 41, fraction)
                p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                if p_value < 0.01:
                    output_file.write(f'{sample1}\t{sample2}\t{sample3}\t{sample4}\t{sample5}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    quintuplet_name = f'{sample1}-{sample2}-{sample3}-{sample4}-{sample5}'
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{quintuplet_name}\t{values_str}\n')

                    for idx, value in enumerate(positive_values):
                        sample_name = sample_names[idx]
                        values_str = values_str.replace(value, sample_name)

                    samples_file.write(f'{quintuplet_name}\t{values_str}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values5.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = set(values_str.split(', '))

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values5.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values5.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values5.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)

            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) - 1]
                    x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [None]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('s_Wilcoxon_y-samples5.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[set_name] = sample_values

x_values = {}
with open('s_Wilcoxon_x-values5.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{set_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

with open('s_MannU5.txt', 'w') as mann_u_file:

    mann_u_file.write('\t'.join(['Enzyme Set', 'Enzyme Name', 'Mann U']) + '\n')

    for key, x_sample_values in x_values.items():
        set_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(set_name, [])

        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')

In [None]:
with open('s_MannU5.txt', 'r') as mann_u_file, open('s_sig_combo5.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        next_values_2 = next(mann_u_file, None)
        next_values_3 = next(mann_u_file, None)
        next_values_4 = next(mann_u_file, None)
        next_values_5 = next(mann_u_file, None)

        if next_values_2 is not None and next_values_3 is not None and next_values_4 is not None and next_values_5 is not None:
            next_values_2 = next_values_2.strip().split('\t')
            next_values_3 = next_values_3.strip().split('\t')
            next_values_4 = next_values_4.strip().split('\t')
            next_values_5 = next_values_5.strip().split('\t')

            next_enzyme_name_2 = next_values_2[1]
            next_mann_u_value_2 = float(next_values_2[2])

            next_enzyme_name_3 = next_values_3[1]
            next_mann_u_value_3 = float(next_values_3[2])

            next_enzyme_name_4 = next_values_4[1]
            next_mann_u_value_4 = float(next_values_4[2])

            next_enzyme_name_5 = next_values_5[1]
            next_mann_u_value_5 = float(next_values_5[2])

            if next_mann_u_value_2 < 0.1 and next_mann_u_value_3 < 0.1 and mann_u_value < 0.1 and next_mann_u_value_4 < 0.1 and next_mann_u_value_5 < 0.1:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_2, str(next_mann_u_value_2)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_3, str(next_mann_u_value_3)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_4, str(next_mann_u_value_4)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_5, str(next_mann_u_value_5)]) + '\n')


### Combo6

In [None]:
from scipy.stats import binom
from itertools import combinations
def calculate_binomial_distribution(count_matches, total_samples, count_percentage):
    k = count_matches
    distribution = binom.pmf(k, total_samples, count_percentage)
    return distribution

with open('s_normalized_releasefile.txt', 'r') as enzyme_file:
    enzyme_names = enzyme_file.readline().strip().split('\t')[1:]

    enzyme_percentages = {}
    for line in enzyme_file:
        values = line.strip().split('\t')
        enzyme_name = values[0]
        positive_values = sum(1 for value in values[1:] if float(value) > 0)
        percentage = positive_values / 243
        enzyme_percentages[enzyme_name] = percentage

with open('s_single.txt', 'r') as input_file:
    sample_names = input_file.readline().strip().split('\t')[1:]

    with open('s_combo6.txt', 'w') as output_file, open('s_Wilcoxon_y-values6.txt', 'w') as y_file, open('s_Wilcoxon_y-samples6.txt', 'w') as samples_file:

        output_file.write('\t'.join(['Enzyme1', 'Enzyme2', 'Enzyme3', 'Enzyme4', 'Enzyme5', 'Enzyme6', 'Count Percentage', 'Matches', 'Binomial Distribution', 'p_value']) + '\n')
        y_file.write('\t'.join(['Enzyme Set', 'Values']) + '\n')

        lines = input_file.readlines()
        num_rows = len(lines)

        for combo in combinations(range(num_rows), 6):
            i, j, k, l, m, o = combo
            count_matches = 0
            positive_values = []

            for a in range(1, len(sample_names) + 1):
                value_i = float(lines[i].strip().split('\t')[a])
                value_j = float(lines[j].strip().split('\t')[a])
                value_k = float(lines[k].strip().split('\t')[a])
                value_l = float(lines[l].strip().split('\t')[a])
                value_m = float(lines[m].strip().split('\t')[a])
                value_o = float(lines[o].strip().split('\t')[a])

                if all(value > 0 for value in [value_i, value_j, value_k, value_l, value_m, value_o]):
                    count_matches += 1
                    positive_values.append(f'{value_i}, {value_j}, {value_k}, {value_l}, {value_m}, {value_o}')

            if count_matches >= n:
                sample1 = lines[i].strip().split('\t')[0]
                sample2 = lines[j].strip().split('\t')[0]
                sample3 = lines[k].strip().split('\t')[0]
                sample4 = lines[l].strip().split('\t')[0]
                sample5 = lines[m].strip().split('\t')[0]
                sample6 = lines[o].strip().split('\t')[0]

                percentage_sample1 = enzyme_percentages.get(sample1, 0)
                percentage_sample2 = enzyme_percentages.get(sample2, 0)
                percentage_sample3 = enzyme_percentages.get(sample3, 0)
                percentage_sample4 = enzyme_percentages.get(sample4, 0)
                percentage_sample5 = enzyme_percentages.get(sample5, 0)
                percentage_sample6 = enzyme_percentages.get(sample6, 0)

                fraction = percentage_sample1 * percentage_sample2 * percentage_sample3 * percentage_sample4 * percentage_sample5 * percentage_sample6
                distribution = calculate_binomial_distribution(count_matches, 41, fraction)
                p_value = sum(calculate_binomial_distribution(k, 41, fraction) for k in range(count_matches, 42))

                if p_value < 0.01:
                    output_file.write(f'{sample1}\t{sample2}\t{sample3}\t{sample4}\t{sample5}\t{sample6}\t{fraction}\t{count_matches}\t{distribution}\t{p_value}\n')

                    set_name = f'{sample1}-{sample2}-{sample3}-{sample4}-{sample5}-{sample6}'
                    values_str = ', '.join(positive_values)
                    y_file.write(f'{set_name}\t{values_str}\n')

                    for idx, value in enumerate(positive_values):
                        sample_name = sample_names[idx]
                        values_str = values_str.replace(value, sample_name)

                    samples_file.write(f'{set_name}\t{values_str}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values6.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = set(values_str.split(', '))

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values6.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - y_positive_values

            if unique_values:
                x_output_file.write(f'{set_name}\t{sample_name}\t{" ".join(map(str, unique_values))}\n')

In [None]:
y_values = {}
with open('s_Wilcoxon_y-values6.txt', 'r') as y_file:
    next(y_file)
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        y_values[set_name] = [float(value) for value in values_str.split(', ')]

with open('s_single.txt', 'r') as sample_file:
    header = sample_file.readline().strip().split('\t')[1:]
    data = [line.strip().split('\t') for line in sample_file]

with open('s_Wilcoxon_x-values6.txt', 'w') as x_output_file:
    x_output_file.write('\t'.join(['Enzyme Set', 'Sample Name', 'Enzyme Name', 'Values']) + '\n')

    for set_name, y_positive_values in y_values.items():
        sample_names = set_name.split('-')

        for sample_name in sample_names:
            sample_row = next(row for row in data if row[0] == sample_name)
            sample_positive_values = [float(value) for value in sample_row[1:] if float(value) > 0]
            unique_values = set(sample_positive_values) - set(y_positive_values)

            if unique_values:
                for value in unique_values:
                    enzyme_name = header[sample_row[1:].index(str(value)) - 1]
                    x_output_file.write(f'{set_name}\t{enzyme_name}\t{sample_name}\t{value}\n')

In [None]:
from itertools import combinations
from scipy.stats import mannwhitneyu

def extract_values(sample_name, mapped_data):
    row = next(row for row in mapped_data if row[1] == sample_name)
    return float(row[2])

with open('mapped.txt', 'r') as mapped_file:
    mapped_data = [line.strip().split('\t') for line in mapped_file]

y_values = {}
with open('s_Wilcoxon_y-samples6.txt', 'r') as y_file:
    for line in y_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        values_str = parts[1]
        sample_values = []

        for sample_name in values_str.split(', '):
            value = extract_values(sample_name, mapped_data)
            sample_values.append(value)

        y_values[set_name] = sample_values

x_values = {}
with open('s_Wilcoxon_x-values6.txt', 'r') as x_file:
    next(x_file)
    for line in x_file:
        parts = line.strip().split('\t')
        set_name = parts[0]
        sample_name = parts[1]
        enzyme_name = parts[2]

        value = extract_values(sample_name, mapped_data)

        key = f'{set_name} {enzyme_name}'
        if key in x_values:
            x_values[key].append(value)
        else:
            x_values[key] = [value]

with open('s_MannU6.txt', 'w') as mann_u_file:

    mann_u_file.write('\t'.join(['Enzyme Set', 'Enzyme Name', 'Mann U']) + '\n')

    for key, x_sample_values in x_values.items():

        set_name, enzyme_name_x = key.split()
        y_sample_values = y_values.get(set_name, [])

        stat, p_value = mannwhitneyu(x_sample_values, y_sample_values, alternative='two-sided')

        mann_u_file.write(f'{set_name}\t{enzyme_name_x}\t{p_value}\n')

In [None]:
with open('s_MannU6.txt', 'r') as mann_u_file, open('s_sig_combo6.txt', 'w') as output_file:
    output_file.write('\t'.join(['Enzyme Pair', 'Enzyme Name', 'Mann U']) + '\n')

    next(mann_u_file)
    for line in mann_u_file:
        values = line.strip().split('\t')
        enzyme_pair = values[0]
        enzyme_name = values[1]
        mann_u_value = float(values[2])
        next_values_2 = next(mann_u_file, None)
        next_values_3 = next(mann_u_file, None)
        next_values_4 = next(mann_u_file, None)
        next_values_5 = next(mann_u_file, None)
        next_values_6 = next(mann_u_file, None)

        if next_values_2 is not None and next_values_3 is not None and next_values_4 is not None and next_values_5 is not None and next_values_6 is not None:
            next_values_2 = next_values_2.strip().split('\t')
            next_values_3 = next_values_3.strip().split('\t')
            next_values_4 = next_values_4.strip().split('\t')
            next_values_5 = next_values_5.strip().split('\t')
            next_values_6 = next_values_6.strip().split('\t')

            next_enzyme_name_2 = next_values_2[1]
            next_mann_u_value_2 = float(next_values_2[2])

            next_enzyme_name_3 = next_values_3[1]
            next_mann_u_value_3 = float(next_values_3[2])

            next_enzyme_name_4 = next_values_4[1]
            next_mann_u_value_4 = float(next_values_4[2])

            next_enzyme_name_5 = next_values_5[1]
            next_mann_u_value_5 = float(next_values_5[2])

            next_enzyme_name_6 = next_values_6[1]
            next_mann_u_value_6 = float(next_values_6[2])

            if next_mann_u_value_2 < 0.1 and next_mann_u_value_3 < 0.1 and mann_u_value < 0.1 and next_mann_u_value_4 < 0.1 and next_mann_u_value_5 < 0.1 and next_mann_u_value_6 < 0.1:
                output_file.write('\t'.join([enzyme_pair, enzyme_name, str(mann_u_value)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_2, str(next_mann_u_value_2)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_3, str(next_mann_u_value_3)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_4, str(next_mann_u_value_4)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_5, str(next_mann_u_value_5)]) + '\n')
                output_file.write('\t'.join([enzyme_pair, next_enzyme_name_6, str(next_mann_u_value_6)]) + '\n')


## Combination Results


In [None]:
def count_samples(y_samples):
    return len(set(y_samples.split(', ')))

def read_significant_enzymes(file_path):
    enzymes = []
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split(', ')
            enzyme_name = parts[0].split(' = ')[1]
            correlation = float(parts[1].split(' = ')[1])
            enzymes.append((enzyme_name, correlation))
    return pd.DataFrame(enzymes, columns=['Enzyme', 'Correlation'])

significant_enzymes_df = read_significant_enzymes('sig_pearson_enzymes.txt')

files = ['p_sig_combo2.txt', 'p_sig_combo3.txt', 'p_sig_combo4.txt', 'p_sig_combo5.txt', 'p_sig_combo6.txt']

results_df = pd.DataFrame(columns=['Enzyme Pair', 'Enzyme Name', 'Mann U', 'Binomial Distribution', 'y-samples', '# of samples'])

results_df = pd.concat([significant_enzymes_df, results_df], ignore_index=True)

for file in files:
    with open(file, 'r') as sig_file:
        next(sig_file)

        for line in sig_file:
            values = line.strip().split('\t')
            enzyme_set = values[0]
            enzyme_name = values[1]
            mann_u_value = float(values[2])

            combo_file = f'p_combo{len(enzyme_set.split("-"))}.txt'
            binomial_distribution = None

            with open(combo_file, 'r') as combo_data:
                next(combo_data)
                for combo_line in combo_data:
                    combo_values = combo_line.strip().split('\t')

                    combo_enzymes = set(combo_values[:len(enzyme_set.split("-"))])
                    if set(enzyme_set.split("-")) == combo_enzymes:
                        binomial_distribution = float(combo_values[-2])
                        break

            y_samples_file = f'p_Wilcoxon_y-samples{len(enzyme_set.split("-"))}.txt'
            y_samples = []
            with open(y_samples_file, 'r') as y_samples_data:
                for y_samples_line in y_samples_data:
                    y_samples_values = y_samples_line.strip().split('\t')
                    if y_samples_values[0] == enzyme_set:
                        y_samples = y_samples_values[1:]
                        break

            num_samples = count_samples(', '.join(y_samples))

            new_row = pd.DataFrame([{
                'Enzyme Pair': enzyme_set,
                'Enzyme Name': enzyme_name,
                'Mann U': mann_u_value,
                'Binomial Distribution': binomial_distribution,
                'y-samples': ', '.join(y_samples),
                '# of samples': num_samples
            }])

            results_df = pd.concat([results_df, new_row], ignore_index=True)

results_df.to_excel('pearson_combinations.xlsx', index=False)

In [None]:
def count_samples(y_samples):
    return len(set(y_samples.split(', ')))

significant_enzymes_df = pd.read_csv('sig_spearman_enzymes.txt', sep=', ', header=None, engine='python')
significant_enzymes_df.columns = ['Enzyme', 'Correlation', 't-dist', 'p-value']

files = ['s_sig_combo2.txt', 's_sig_combo3.txt', 's_sig_combo4.txt', 's_sig_combo5.txt', 's_sig_combo6.txt']
results_df = pd.DataFrame(columns=['Enzyme Pair', 'Enzyme Name', 'Mann U', 'Binomial Distribution', 'y-samples', '# of samples'])
results_df = pd.concat([significant_enzymes_df, results_df], ignore_index=True)

for file in files:
    with open(file, 'r') as sig_file:
        next(sig_file)

        for line in sig_file:
            values = line.strip().split('\t')
            enzyme_set = values[0]
            enzyme_name = values[1]
            mann_u_value = float(values[2])

            combo_file = f's_combo{len(enzyme_set.split("-"))}.txt'
            binomial_distribution = None

            with open(combo_file, 'r') as combo_data:
                next(combo_data)
                for combo_line in combo_data:
                    combo_values = combo_line.strip().split('\t')
                    combo_enzymes = set(combo_values[:len(enzyme_set.split("-"))])
                    if set(enzyme_set.split("-")) == combo_enzymes:
                        binomial_distribution = float(combo_values[-2])
                        break

            y_samples_file = f's_Wilcoxon_y-samples{len(enzyme_set.split("-"))}.txt'
            with open(y_samples_file, 'r') as y_samples_data:
                for y_samples_line in y_samples_data:
                    y_samples_values = y_samples_line.strip().split('\t')
                    if y_samples_values[0] == enzyme_set:
                        y_samples = y_samples_values[1:]
                        break

            num_samples = count_samples(', '.join(y_samples))

            new_row = pd.DataFrame([{
                'Enzyme Pair': enzyme_set,
                'Enzyme Name': enzyme_name,
                'Mann U': mann_u_value,
                'Binomial Distribution': binomial_distribution,
                'y-samples': ', '.join(y_samples),
                '# of samples': num_samples
            }])

            results_df = pd.concat([results_df, new_row], ignore_index=True)

results_df.to_excel('spearman_combinations.xlsx', index=False)

#Species Analysis

In [None]:
import pandas as pd

file1_path = 'miTAG.taxonomic.profiles.release.tsv'
df1 = pd.read_csv(file1_path, delimiter='\t')

file2_path = 'mapped.txt'
df2 = pd.read_csv(file2_path, delimiter='\t', header=None, names=['ID', 'Sample', 'Value1', 'Value2'])

df_filtered = df1.iloc[:, :7]

sample_names_to_keep = df2['Sample'].unique()
columns_to_keep = [col for col in df1.columns if any(sample_name in col for sample_name in sample_names_to_keep)]
df_filtered = pd.concat([df_filtered, df1[columns_to_keep]], axis=1)

output_file_path = 'miTAG_taxonomic_filtered.tsv'
df_filtered.to_csv(output_file_path, sep='\t', index=False)

#### Using pearson correlation

In [None]:
taxonomic_data = pd.read_csv('miTAG_taxonomic_filtered.tsv', delimiter='\t')
x_values = taxonomic_data.iloc[:, 7:].values

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

release_df = pd.read_csv("TARA243.KO.profile.release", sep='\s+', skiprows=[1])

# Calculate t-value and probability distribution
def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    probability = 1 - stats.t.cdf(x, degrees_of_freedom)
    return x, probability

# Calculate Spearman correlation
def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = pearsonr(x_values, y_values)
    return correlation_coefficient

degrees_of_freedom = 39

correlation_coefficients = {}

for row_index in range(len(release_df)):
    y_values = mapped_data[2].values
    x_row_values = x_values[row_index]
    correlation_coefficient = calculate_correlation(x_row_values, y_values)
    correlation_coefficients[row_index] = correlation_coefficient

sorted_coefficients = sorted(correlation_coefficients.items(), key=lambda x: x[1], reverse=True)

domain = taxonomic_data.iloc[:, 0].values
phylum = taxonomic_data.iloc[:, 1].values
specie_class = taxonomic_data.iloc[:, 2].values
order = taxonomic_data.iloc[:, 3].values
family = taxonomic_data.iloc[:, 4].values
genus = taxonomic_data.iloc[:, 5].values
specie = taxonomic_data.iloc[:, 6].values

with open("p_species_correlation.tab", "w") as file:
    file.write("Domain\tPhylum\tSpecies Class\tOrder\tFamily\tGenus\tSpecies\tCorrelation\tx\tProbability\n")

    for row_index, correlation_coefficient in sorted_coefficients:
        one = domain[row_index]
        two = phylum[row_index]
        three = specie_class[row_index]
        four = order[row_index]
        five = family[row_index]
        six = genus[row_index]
        seven = specie[row_index]

        x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        file.write(f"{one}\t{two}\t{three}\t{four}\t{five}\t{six}\t{seven}\t{correlation_coefficient}\t{x}\t{probability}\n")

In [None]:
# Benjamini–Hochberg method
significant_rows = []

total_probability = 0

for row_index, correlation_coefficient in sorted_coefficients:
    one = domain[row_index]
    two = phylum[row_index]
    three = specie_class[row_index]
    four = order[row_index]
    five = family[row_index]
    six = genus[row_index]
    seven = specie[row_index]
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    if total_probability + probability <= 1:
        significant_rows.append((row_index, seven, correlation_coefficient, x, probability))
        total_probability += probability
    else:
        break

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

if significant_rows:
    significant_rows = significant_rows[:-1]

with open("p_significant_species.tab", "w") as file:
    for row_index, seven, correlation_coefficient, x, probability in significant_rows:
        one = domain[row_index]
        two = phylum[row_index]
        three = specie_class[row_index]
        four = order[row_index]
        five = family[row_index]
        six = genus[row_index]
        seven = specie[row_index]
        file.write(f"{one}\t{two}\t{three}\t{four}\t{five}\t{six}\t{seven}\t{correlation_coefficient}\t{probability}\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

def plot_line_of_best_fit(x, y, color, label):
    slope, intercept = np.polyfit(x, y, 1)
    plt.plot(x, slope * np.array(x) + intercept, color=color, linestyle='--', label=f'Fit for {label}')

colors = ['blue', 'green', 'red', 'purple', 'orange']

markers = ['o', 's', '^', 'v', 'D']

plt.figure(figsize=(10, 6))

for i in range(min(5, len(sorted_coefficients))):
    row_index, correlation_coefficient = sorted_coefficients[i]
    one = domain[row_index]
    two = phylum[row_index]
    three = specie_class[row_index]
    four = order[row_index]
    five = family[row_index]
    six = genus[row_index]
    seven = specie[row_index]

    x_row_values = x_values[row_index]
    marker = markers[i]
    color = colors[i]

    sns.scatterplot(x=x_row_values, y=y_values, label=f'Species: {seven}\nCorrelation: {correlation_coefficient}',
                    color=color, marker=marker, alpha=0.7, s=25)

    plot_line_of_best_fit(x_row_values, y_values, color, label=seven)

plt.xlabel('Species Abundance')
plt.ylabel('Pollution Degree')
plt.title('Top 5 Species Pearson Correlations')
plt.legend()
plt.show()

#### Using Spearman Correlation

In [None]:
taxonomic_data = pd.read_csv('miTAG_taxonomic_filtered.tsv', delimiter='\t')
x_values = taxonomic_data.iloc[:, 7:].values

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

release_df = pd.read_csv("TARA243.KO.profile.release", sep='\s+', skiprows=[1])

def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    probability = 1 - stats.t.cdf(x, degrees_of_freedom)
    return x, probability

def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = spearmanr(x_values, y_values)
    return correlation_coefficient

degrees_of_freedom = 39

correlation_coefficients = {}

for row_index in range(len(release_df)):
    y_values = mapped_data[2].values
    x_row_values = x_values[row_index]
    correlation_coefficient = calculate_correlation(x_row_values, y_values)
    correlation_coefficients[row_index] = correlation_coefficient

sorted_coefficients = sorted(correlation_coefficients.items(), key=lambda x: x[1], reverse=True)

domain = taxonomic_data.iloc[:, 0].values
phylum = taxonomic_data.iloc[:, 1].values
specie_class = taxonomic_data.iloc[:, 2].values
order = taxonomic_data.iloc[:, 3].values
family = taxonomic_data.iloc[:, 4].values
genus = taxonomic_data.iloc[:, 5].values
specie = taxonomic_data.iloc[:, 6].values

with open("s_species_correlation.tab", "w") as file:
    file.write("Domain\tPhylum\tSpecies Class\tOrder\tFamily\tGenus\tSpecies\tCorrelation\tx\tProbability\n")

    for row_index, correlation_coefficient in sorted_coefficients:
        one = domain[row_index]
        two = phylum[row_index]
        three = specie_class[row_index]
        four = order[row_index]
        five = family[row_index]
        six = genus[row_index]
        seven = specie[row_index]

        x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        file.write(f"{one}\t{two}\t{three}\t{four}\t{five}\t{six}\t{seven}\t{correlation_coefficient}\t{x}\t{probability}\n")

In [None]:
# Benjamini–Hochberg method
significant_rows = []

total_probability = 0

for row_index, correlation_coefficient in sorted_coefficients:
    one = domain[row_index]
    two = phylum[row_index]
    three = specie_class[row_index]
    four = order[row_index]
    five = family[row_index]
    six = genus[row_index]
    seven = specie[row_index]
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    if total_probability + probability <= 1:
        significant_rows.append((row_index, seven, correlation_coefficient, x, probability))
        total_probability += probability
    else:
        break

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

if significant_rows:
    significant_rows = significant_rows[:-1]

with open("s_significant_species.tab", "w") as file:
    for row_index, seven, correlation_coefficient, x, probability in significant_rows:
        one = domain[row_index]
        two = phylum[row_index]
        three = specie_class[row_index]
        four = order[row_index]
        five = family[row_index]
        six = genus[row_index]
        seven = specie[row_index]
        file.write(f"{one}\t{two}\t{three}\t{four}\t{five}\t{six}\t{seven}\t{correlation_coefficient}\t{probability}\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

def plot_line_of_best_fit(x, y, color, label):
    slope, intercept = np.polyfit(x, y, 1)
    plt.plot(x, slope * np.array(x) + intercept, color=color, linestyle='--', label=f'Fit for {label}')

colors = ['blue', 'green', 'red', 'purple', 'orange']

markers = ['o', 's', '^', 'v', 'D']

plt.figure(figsize=(10, 6))

for i in range(min(5, len(sorted_coefficients))):
    row_index, correlation_coefficient = sorted_coefficients[i]
    one = domain[row_index]
    two = phylum[row_index]
    three = specie_class[row_index]
    four = order[row_index]
    five = family[row_index]
    six = genus[row_index]
    seven = specie[row_index]

    x_row_values = x_values[row_index]
    marker = markers[i]
    color = colors[i]

    sns.scatterplot(x=x_row_values, y=y_values, label=f'Species: {seven}\nCorrelation: {correlation_coefficient}',
                    color=color, marker=marker, alpha=0.7, s=25)

    plot_line_of_best_fit(x_row_values, y_values, color, label=seven)

plt.xlabel('Species Abundance')
plt.ylabel('Pollution Degree')
plt.title('Top 5 Species Spearman Correlations')
plt.legend()
plt.show()

# Cross-validation (5)

In [None]:
import pandas as pd
import numpy as np
import itertools
from scipy.stats import pearsonr, t

def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = pearsonr(x_values, y_values)
    return correlation_coefficient

def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    probability = 1 - t.cdf(x, degrees_of_freedom)
    return x, probability

df = pd.read_csv("mapped.txt", header=None, delim_whitespace=True)

# Randomly shuffle the samples (41 rows in total)
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the 41 rows into 5 groups of 8, 8, 8, 8, and 9
rows_per_group = [8, 8, 8, 8, 9]  # Sizes of the groups
splits = []
start_idx = 0
for count in rows_per_group:
    splits.append(df_shuffled.iloc[start_idx:start_idx + count])
    start_idx += count

# Generate all combinations of 4 groups out of the 5 groups
group_combinations = list(itertools.combinations(range(5), 4))  # Generate combinations of 4 out of 5 groups

for combo_index, combo in enumerate(group_combinations):
    # Combine selected groups
    combined_rows = pd.concat([splits[i] for i in combo], ignore_index=True)
    group_file_name = f"cross-combo{combo_index + 1}.tab"
    combined_rows.to_csv(group_file_name, sep='\t', index=False, header=False)

    # Enzyme abundance data
    release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])

    sample_names = combined_rows[1].values
    y_values = combined_rows[2].values

    tara_dict = release_df.set_index('ko').to_dict('index')
    x_values_for_all_enzymes = []

    for enzyme, enzyme_data in tara_dict.items():
        x_values = []
        for sample in sample_names:
            if sample in enzyme_data:
                x_values.append(enzyme_data[sample])
            else:
                x_values.append(np.nan)

        x_values_for_all_enzymes.append([enzyme] + x_values)

    correlation_results = []
    for enzyme_data in x_values_for_all_enzymes:
        enzyme_name = enzyme_data[0]
        x_values = enzyme_data[1:]

        correlation_coefficient = calculate_correlation(x_values, y_values)
        correlation_results.append((enzyme_name, correlation_coefficient))

    correlation_results.sort(key=lambda x: x[1], reverse=True)

    with open(f"cross-combo{combo_index + 1}_all_correlations.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient in correlation_results:
            degrees_of_freedom = len(combined_rows) - 2  # Adjust degrees of freedom based on current group size
            t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

    degrees_of_freedom = len(combined_rows) - 2  # Adjust degrees of freedom based on current group size
    significant_rows = []
    total_probability = 0

    # Check the significance of each enzyme
    for enzyme_name, correlation_coefficient in correlation_results:
        t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        # Add the current result if the total probability does not exceed 1
        if total_probability + p_value <= 1:
            significant_rows.append((enzyme_name, correlation_coefficient, t_value, p_value))
            total_probability += p_value
        else:
            break

    with open(f"cross_sig_combo{combo_index + 1}.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient, t_value, p_value in significant_rows:
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

In [None]:
# Finding shared enzymes between cross combos and sig_pearson_enzymes
import pandas as pd

def read_sig_pearson_file(file_name):
    enzyme_data = {}
    with open(file_name, "r") as file:
        for line in file:
            if line.strip():  # Skip empty lines
                parts = line.strip().split(", ")
                enzyme_name = parts[0].split(" = ")[1]
                correlation = float(parts[1].split(" = ")[1])
                enzyme_data[enzyme_name] = correlation
    return enzyme_data

def read_cross_sig_file(file_name):
    enzyme_data = {}
    with open(file_name, "r") as file:
        next(file)
        for line in file:
            if line.strip():
                parts = line.strip().split("\t")
                enzyme_name = parts[0]
                correlation = float(parts[1])
                enzyme_data[enzyme_name] = correlation
    return enzyme_data

sig_enzymes = read_sig_pearson_file("sig_pearson_enzymes.txt")

group_files = [
    "cross_sig_combo1.tab",
    "cross_sig_combo2.tab",
    "cross_sig_combo3.tab",
    "cross_sig_combo4.tab",
    "cross_sig_combo5.tab"
]

shared_enzyme_data = {}

for i, group_file in enumerate(group_files, start=1):
    group_enzymes = read_cross_sig_file(group_file)
    for enzyme_name in group_enzymes:
        if enzyme_name in sig_enzymes:  # Only consider enzymes that are shared between sig_pearson and the current group
            if enzyme_name not in shared_enzyme_data:
                shared_enzyme_data[enzyme_name] = {"Enzyme Name": enzyme_name}

            shared_enzyme_data[enzyme_name][f"Group{i} Correlation"] = group_enzymes[enzyme_name]

final_data = []
for enzyme_name, group_data in shared_enzyme_data.items():
    row = [enzyme_name] + [group_data.get(f"Group{i} Correlation", "na") for i in range(1, 6)]
    final_data.append(row)

df = pd.DataFrame(final_data, columns=["Enzyme Name"] + [f"Group{i} Correlation" for i in range(1, 6)])

total_enzymes_in_sig = len(sig_enzymes)
for i in range(1, 6):
    group_column = f"Group{i} Correlation"
    common_enzymes_in_group = df[group_column].apply(lambda x: x != "na").sum()
    percentage_common = (common_enzymes_in_group / total_enzymes_in_sig) * 100 if total_enzymes_in_sig else 0
    df[f"Percentage Common in Group{i}"] = percentage_common

# Calculate how many groups share each enzyme
df["Shared Count"] = df[["Group1 Correlation", "Group2 Correlation", "Group3 Correlation", "Group4 Correlation", "Group5 Correlation"]].apply(lambda x: (x != "na").sum(), axis=1)
df_sorted = df.sort_values(by="Shared Count", ascending=False)
df_sorted = df_sorted.drop(columns=["Shared Count"])

for group in ["Group1 Correlation", "Group2 Correlation", "Group3 Correlation", "Group4 Correlation", "Group5 Correlation"]:
    df_sorted[group] = df_sorted[group].apply(lambda x: f"{x:.2f}" if x != "na" else "na")

for i in range(1, 6):
    df_sorted[f"Percentage Common in Group{i}"] = df_sorted[f"Percentage Common in Group{i}"].apply(lambda x: f"{x:.2f}%")

df_sorted.to_csv("cross_validation_results.tab", sep="\t", index=False)

In [None]:
def read_sig_pearson_file(file_name):
    enzyme_data = {}
    with open(file_name, "r") as file:
        for line in file:
            if line.strip():
                parts = line.strip().split(", ")
                enzyme_name = parts[0].split(" = ")[1]
                correlation = float(parts[1].split(" = ")[1])
                enzyme_data[enzyme_name] = correlation
    return enzyme_data

def read_cross_sig_file(file_name):
    enzyme_data = {}
    with open(file_name, "r") as file:
        next(file)
        for line in file:
            if line.strip():
                parts = line.strip().split("\t")
                enzyme_name = parts[0]
                correlation = float(parts[1])
                enzyme_data[enzyme_name] = correlation
    return enzyme_data

sig_enzymes = read_sig_pearson_file("sig_pearson_enzymes.txt")

group_files = [
    "cross_sig_combo1.tab",
    "cross_sig_combo2.tab",
    "cross_sig_combo3.tab",
    "cross_sig_combo4.tab",
    "cross_sig_combo5.tab"
]

shared_enzyme_data = {}

for i, group_file in enumerate(group_files, start=1):
    group_enzymes = read_cross_sig_file(group_file)
    for enzyme_name in group_enzymes:
        if enzyme_name in sig_enzymes:
            if enzyme_name not in shared_enzyme_data:
                shared_enzyme_data[enzyme_name] = {"Enzyme Name": enzyme_name}

            shared_enzyme_data[enzyme_name][f"Group{i} Correlation"] = group_enzymes[enzyme_name]

final_data = []
for enzyme_name, group_data in shared_enzyme_data.items():
    row = [enzyme_name] + [group_data.get(f"Group{i} Correlation", "na") for i in range(1, 6)]
    final_data.append(row)

df = pd.DataFrame(final_data, columns=["Enzyme Name"] + [f"Group{i} Correlation" for i in range(1, 6)])

for i in range(1, 6):
    group_column = f"Group{i} Correlation"
    common_enzymes_in_group = df[group_column].apply(lambda x: x != "na").sum()

    total_enzymes_in_sig = len(sig_enzymes)
    total_enzymes_in_group = len(df)

    denominator = min(total_enzymes_in_sig, total_enzymes_in_group)
    percentage_common = (common_enzymes_in_group / denominator) * 100 if denominator else 0

    df[f"Percentage Common in Group{i}"] = percentage_common

df["Shared Count"] = df[["Group1 Correlation", "Group2 Correlation", "Group3 Correlation", "Group4 Correlation", "Group5 Correlation"]].apply(lambda x: (x != "na").sum(), axis=1)
df_sorted = df.sort_values(by="Shared Count", ascending=False)
df_sorted = df_sorted.drop(columns=["Shared Count"])

for group in ["Group1 Correlation", "Group2 Correlation", "Group3 Correlation", "Group4 Correlation", "Group5 Correlation"]:
    df_sorted[group] = df_sorted[group].apply(lambda x: f"{x:.2f}" if x != "na" else "na")

for i in range(1, 6):
    df_sorted[f"Percentage Common in Group{i}"] = df_sorted[f"Percentage Common in Group{i}"].apply(lambda x: f"{x:.2f}%")

df_sorted.to_csv("cross_validation_results.tab", sep="\t", index=False)

# Cross-validation (10)

In [None]:
import pandas as pd
import numpy as np
import itertools
from scipy.stats import pearsonr, t

def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = pearsonr(x_values, y_values)
    return correlation_coefficient

def calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom):
    x = correlation_coefficient * np.sqrt(degrees_of_freedom) / np.sqrt(1 - correlation_coefficient**2)
    probability = 1 - t.cdf(x, degrees_of_freedom)
    return x, probability

df = pd.read_csv("mapped.txt", header=None, delim_whitespace=True)
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

rows_per_group = [4, 4, 4, 4, 4, 4, 4, 4, 4, 5]
splits = []
start_idx = 0
for count in rows_per_group:
    splits.append(df_shuffled.iloc[start_idx:start_idx + count])
    start_idx += count

group_combinations = list(itertools.combinations(range(10), 9))  # Generate combinations of 9 out of 10 groups

for combo_index, combo in enumerate(group_combinations):
    combined_rows = pd.concat([splits[i] for i in combo], ignore_index=True)

    group_file_name = f"cross-combo{combo_index + 1}.tab"
    combined_rows.to_csv(group_file_name, sep='\t', index=False, header=False)

    release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])
    sample_names = combined_rows[1].values
    y_values = combined_rows[2].values

    tara_dict = release_df.set_index('ko').to_dict('index')

    x_values_for_all_enzymes = []

    for enzyme, enzyme_data in tara_dict.items():
        x_values = []

        for sample in sample_names:
            if sample in enzyme_data:
                x_values.append(enzyme_data[sample])
            else:
                x_values.append(np.nan)

        x_values_for_all_enzymes.append([enzyme] + x_values)

    correlation_results = []
    for enzyme_data in x_values_for_all_enzymes:
        enzyme_name = enzyme_data[0]
        x_values = enzyme_data[1:]

        correlation_coefficient = calculate_correlation(x_values, y_values)
        correlation_results.append((enzyme_name, correlation_coefficient))

    correlation_results.sort(key=lambda x: x[1], reverse=True)

    with open(f"cross-combo{combo_index + 1}_all_correlations.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient in correlation_results:
            degrees_of_freedom = len(combined_rows) - 2
            t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

    degrees_of_freedom = len(combined_rows) - 2
    significant_rows = []
    total_probability = 0

    for enzyme_name, correlation_coefficient in correlation_results:
        t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        if total_probability + p_value <= 1:
            significant_rows.append((enzyme_name, correlation_coefficient, t_value, p_value))
            total_probability += p_value
        else:
            break

    with open(f"cross_sig_combo{combo_index + 1}.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient, t_value, p_value in significant_rows:
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

In [None]:
def read_sig_pearson_file(file_name):
    enzyme_data = {}
    with open(file_name, "r") as file:
        for line in file:
            if line.strip():
                parts = line.strip().split(", ")
                enzyme_name = parts[0].split(" = ")[1]
                correlation = float(parts[1].split(" = ")[1])
                enzyme_data[enzyme_name] = correlation
    return enzyme_data

def read_cross_sig_file(file_name):
    enzyme_data = {}
    with open(file_name, "r") as file:
        next(file)
        for line in file:
            if line.strip():
                parts = line.strip().split("\t")
                enzyme_name = parts[0]
                correlation = float(parts[1])
                enzyme_data[enzyme_name] = correlation
    return enzyme_data

sig_enzymes = read_sig_pearson_file("sig_pearson_enzymes.txt")

group_files = [
    "cross_sig_combo1.tab",
    "cross_sig_combo2.tab",
    "cross_sig_combo3.tab",
    "cross_sig_combo4.tab",
    "cross_sig_combo5.tab",
    "cross_sig_combo6.tab",
    "cross_sig_combo7.tab",
    "cross_sig_combo8.tab",
    "cross_sig_combo9.tab",
    "cross_sig_combo10.tab"
]

shared_enzyme_data = {}

for i, group_file in enumerate(group_files, start=1):
    group_enzymes = read_cross_sig_file(group_file)
    for enzyme_name in group_enzymes:
        if enzyme_name in sig_enzymes:
            if enzyme_name not in shared_enzyme_data:
                shared_enzyme_data[enzyme_name] = {"Enzyme Name": enzyme_name}

            shared_enzyme_data[enzyme_name][f"Group{i} Correlation"] = group_enzymes[enzyme_name]

final_data = []
for enzyme_name, group_data in shared_enzyme_data.items():
    row = [enzyme_name] + [group_data.get(f"Group{i} Correlation", "na") for i in range(1, 11)]  # Update to 10 groups
    final_data.append(row)

df = pd.DataFrame(final_data, columns=["Enzyme Name"] + [f"Group{i} Correlation" for i in range(1, 11)])  # Update to 10 groups

total_enzymes_in_sig = len(sig_enzymes)
for i in range(1, 11):
    group_column = f"Group{i} Correlation"
    common_enzymes_in_group = df[group_column].apply(lambda x: x != "na").sum()
    percentage_common = (common_enzymes_in_group / total_enzymes_in_sig) * 100 if total_enzymes_in_sig else 0
    df[f"Percentage Common in Group{i}"] = percentage_common

df["Shared Count"] = df[["Group1 Correlation", "Group2 Correlation", "Group3 Correlation", "Group4 Correlation", "Group5 Correlation",
                         "Group6 Correlation", "Group7 Correlation", "Group8 Correlation", "Group9 Correlation", "Group10 Correlation"]].apply(
    lambda x: (x != "na").sum(), axis=1)

df_sorted = df.sort_values(by="Shared Count", ascending=False)
df_sorted = df_sorted.drop(columns=["Shared Count"])

for group in [f"Group{i} Correlation" for i in range(1, 11)]:  # Update to 10 groups
    df_sorted[group] = df_sorted[group].apply(lambda x: f"{x:.2f}" if x != "na" else "na")

for i in range(1, 11):
    df_sorted[f"Percentage Common in Group{i}"] = df_sorted[f"Percentage Common in Group{i}"].apply(lambda x: f"{x:.2f}%")

df_sorted.to_csv("10cross_validation_results.tab", sep="\t", index=False)

In [None]:
sig_enzymes = read_sig_pearson_file("sig_pearson_enzymes.txt")

group_files = [
    "cross_sig_combo1.tab",
    "cross_sig_combo2.tab",
    "cross_sig_combo3.tab",
    "cross_sig_combo4.tab",
    "cross_sig_combo5.tab",
    "cross_sig_combo6.tab",
    "cross_sig_combo7.tab",
    "cross_sig_combo8.tab",
    "cross_sig_combo9.tab",
    "cross_sig_combo10.tab"
]

shared_enzyme_data = {}

for i, group_file in enumerate(group_files, start=1):
    group_enzymes = read_cross_sig_file(group_file)
    for enzyme_name in group_enzymes:
        if enzyme_name in sig_enzymes:
            if enzyme_name not in shared_enzyme_data:
                shared_enzyme_data[enzyme_name] = {"Enzyme Name": enzyme_name}

            shared_enzyme_data[enzyme_name][f"Group{i} Correlation"] = group_enzymes[enzyme_name]

final_data = []
for enzyme_name, group_data in shared_enzyme_data.items():
    row = [enzyme_name] + [group_data.get(f"Group{i} Correlation", "na") for i in range(1, 11)]  # Update for 10 groups
    final_data.append(row)

df = pd.DataFrame(final_data, columns=["Enzyme Name"] + [f"Group{i} Correlation" for i in range(1, 11)])  # Updated for 10 groups

percentages = {}
for i in range(1, 11):
    group_column = f"Group{i} Correlation"
    common_enzymes_in_group = df[group_column].apply(lambda x: x != "na").sum()

    total_enzymes_in_sig = len(sig_enzymes)
    total_enzymes_in_group = len(df)

    denominator = min(total_enzymes_in_sig, total_enzymes_in_group)

    percentage_common = (common_enzymes_in_group / denominator) * 100 if denominator else 0
    percentages[f"Group{i}"] = percentage_common

percentages_df = pd.DataFrame(list(percentages.items()), columns=["Group", "Percentage"])
percentages_df["Percentage"] = percentages_df["Percentage"].apply(lambda x: f"{x:.5f}%")
percentages_df.to_csv("10_cross_validation_percentages.tab", sep="\t", index=False)

In [None]:
def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = pearsonr(x_values, y_values)
    return correlation_coefficient

df = pd.read_csv("mapped.txt", header=None, delim_whitespace=True)

row_order = [
    38, 28, 33, 9, 0, 25, 14, 39, 23, 37, 15, 20, 11, 6, 32, 29, 21, 7, 36, 18,
    2, 10, 22, 13, 5, 40, 3, 8, 4, 24, 35, 16, 19, 27, 31, 17, 26, 34, 30, 12, 1
]

df_ordered = df.iloc[row_order].reset_index(drop=True)

rows_per_group = [4, 4, 4, 4, 4, 4, 4, 4, 4, 5]
splits = []
start_idx = 0
for count in rows_per_group:
    splits.append(df_ordered.iloc[start_idx:start_idx + count])
    start_idx += count

group_combinations = list(itertools.combinations(range(10), 9))

for combo_index, combo in enumerate(group_combinations):
    combined_rows = pd.concat([splits[i] for i in combo], ignore_index=True)

    group_file_name = f"cross-combo{combo_index + 1}.tab"
    combined_rows.to_csv(group_file_name, sep='\t', index=False, header=False)

    release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])

    sample_names = combined_rows[1].values
    y_values = combined_rows[2].values

    tara_dict = release_df.set_index('ko').to_dict('index')

    x_values_for_all_enzymes = []

    for enzyme, enzyme_data in tara_dict.items():
        x_values = []

        for sample in sample_names:
            if sample in enzyme_data:
                x_values.append(enzyme_data[sample])
            else:
                x_values.append(np.nan)

        x_values_for_all_enzymes.append([enzyme] + x_values)

    correlation_results = []
    for enzyme_data in x_values_for_all_enzymes:
        enzyme_name = enzyme_data[0]
        x_values = enzyme_data[1:]

        correlation_coefficient = calculate_correlation(x_values, y_values)
        correlation_results.append((enzyme_name, correlation_coefficient))

    correlation_results.sort(key=lambda x: x[1], reverse=True)

    with open(f"cross-combo{combo_index + 1}_all_correlations.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient in correlation_results:
            degrees_of_freedom = len(combined_rows) - 2
            t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

    degrees_of_freedom = len(combined_rows) - 2
    significant_rows = []
    total_probability = 0

    for enzyme_name, correlation_coefficient in correlation_results:
        t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        if total_probability + p_value <= 1:
            significant_rows.append((enzyme_name, correlation_coefficient, t_value, p_value))
            total_probability += p_value
        else:
            break

    with open(f"1_10_cross_sig_combo{combo_index + 1}.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient, t_value, p_value in significant_rows:
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

In [None]:
sig_enzymes = read_sig_pearson_file("sig_pearson_enzymes.txt")

group_files = [
    "1_10_cross_sig_combo1.tab",
    "1_10_cross_sig_combo2.tab",
    "1_10_cross_sig_combo3.tab",
    "1_10_cross_sig_combo4.tab",
    "1_10_cross_sig_combo5.tab",
    "1_10_cross_sig_combo6.tab",
    "1_10_cross_sig_combo7.tab",
    "1_10_cross_sig_combo8.tab",
    "1_10_cross_sig_combo9.tab",
    "1_10_cross_sig_combo10.tab"
]

shared_enzyme_data = {}

for i, group_file in enumerate(group_files, start=1):
    group_enzymes = read_cross_sig_file(group_file)
    for enzyme_name in group_enzymes:
        if enzyme_name in sig_enzymes:
            if enzyme_name not in shared_enzyme_data:
                shared_enzyme_data[enzyme_name] = {"Enzyme Name": enzyme_name}

            shared_enzyme_data[enzyme_name][f"Group{i} Correlation"] = group_enzymes[enzyme_name]

final_data = []
for enzyme_name, group_data in shared_enzyme_data.items():
    row = [enzyme_name] + [group_data.get(f"Group{i} Correlation", "na") for i in range(1, 11)]  # Update for 10 groups
    final_data.append(row)

df = pd.DataFrame(final_data, columns=["Enzyme Name"] + [f"Group{i} Correlation" for i in range(1, 11)])  # Updated for 10 groups

percentages = {}
for i in range(1, 11):
    group_column = f"Group{i} Correlation"

    common_enzymes_in_group = df[group_column].apply(lambda x: x != "na").sum()
    total_enzymes_in_sig = len(sig_enzymes)
    total_enzymes_in_group = len(df)

    denominator = min(total_enzymes_in_sig, total_enzymes_in_group)
    percentage_common = (common_enzymes_in_group / denominator) * 100 if denominator else 0
    percentages[f"Group{i}"] = percentage_common

percentages_df = pd.DataFrame(list(percentages.items()), columns=["Group", "Percentage"])
percentages_df["Percentage"] = percentages_df["Percentage"].apply(lambda x: f"{x:.5f}%")
percentages_df.to_csv("1_10_cross_validation_percentages.tab", sep="\t", index=False)

In [None]:
df = pd.read_csv("mapped.txt", header=None, delim_whitespace=True)

row_order = [
    26, 6, 19, 23, 15, 18, 1, 0, 12, 25, 34, 30, 35, 16, 29, 10, 2, 24, 13, 28,
    32, 31, 36, 7, 8, 40, 11, 9, 17, 22, 3, 4, 27, 20, 14, 39, 5, 38, 21, 37, 33
]

df_ordered = df.iloc[row_order].reset_index(drop=True)

rows_per_group = [4, 4, 4, 4, 4, 4, 4, 4, 4, 5]
splits = []
start_idx = 0
for count in rows_per_group:
    splits.append(df_ordered.iloc[start_idx:start_idx + count])
    start_idx += count

group_combinations = list(itertools.combinations(range(10), 9))

for combo_index, combo in enumerate(group_combinations):
    combined_rows = pd.concat([splits[i] for i in combo], ignore_index=True)

    group_file_name = f"cross-combo{combo_index + 1}.tab"
    combined_rows.to_csv(group_file_name, sep='\t', index=False, header=False)

    release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])

    sample_names = combined_rows[1].values
    y_values = combined_rows[2].values

    tara_dict = release_df.set_index('ko').to_dict('index')

    x_values_for_all_enzymes = []

    for enzyme, enzyme_data in tara_dict.items():
        x_values = []

        for sample in sample_names:
            if sample in enzyme_data:
                x_values.append(enzyme_data[sample])
            else:
                x_values.append(np.nan)

        x_values_for_all_enzymes.append([enzyme] + x_values)

    correlation_results = []
    for enzyme_data in x_values_for_all_enzymes:
        enzyme_name = enzyme_data[0]
        x_values = enzyme_data[1:]

        correlation_coefficient = calculate_correlation(x_values, y_values)
        correlation_results.append((enzyme_name, correlation_coefficient))

    correlation_results.sort(key=lambda x: x[1], reverse=True)

    with open(f"cross-combo{combo_index + 1}_all_correlations.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient in correlation_results:
            degrees_of_freedom = len(combined_rows) - 2
            t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

    degrees_of_freedom = len(combined_rows) - 2
    significant_rows = []
    total_probability = 0

    for enzyme_name, correlation_coefficient in correlation_results:
        t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        if total_probability + p_value <= 1:
            significant_rows.append((enzyme_name, correlation_coefficient, t_value, p_value))
            total_probability += p_value
        else:
            break

    with open(f"2_10_cross_sig_combo{combo_index + 1}.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient, t_value, p_value in significant_rows:
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

In [None]:
sig_enzymes = read_sig_pearson_file("sig_pearson_enzymes.txt")

group_files = [
    "2_10_cross_sig_combo1.tab",
    "2_10_cross_sig_combo2.tab",
    "2_10_cross_sig_combo3.tab",
    "2_10_cross_sig_combo4.tab",
    "2_10_cross_sig_combo5.tab",
    "2_10_cross_sig_combo6.tab",
    "2_10_cross_sig_combo7.tab",
    "2_10_cross_sig_combo8.tab",
    "2_10_cross_sig_combo9.tab",
    "2_10_cross_sig_combo10.tab"
]

shared_enzyme_data = {}

for i, group_file in enumerate(group_files, start=1):
    group_enzymes = read_cross_sig_file(group_file)
    for enzyme_name in group_enzymes:
        if enzyme_name in sig_enzymes:
            if enzyme_name not in shared_enzyme_data:
                shared_enzyme_data[enzyme_name] = {"Enzyme Name": enzyme_name}

            shared_enzyme_data[enzyme_name][f"Group{i} Correlation"] = group_enzymes[enzyme_name]

final_data = []
for enzyme_name, group_data in shared_enzyme_data.items():
    row = [enzyme_name] + [group_data.get(f"Group{i} Correlation", "na") for i in range(1, 11)]  # Update for 10 groups
    final_data.append(row)

df = pd.DataFrame(final_data, columns=["Enzyme Name"] + [f"Group{i} Correlation" for i in range(1, 11)])  # Updated for 10 groups

percentages = {}
for i in range(1, 11):
    group_column = f"Group{i} Correlation"

    common_enzymes_in_group = df[group_column].apply(lambda x: x != "na").sum()

    total_enzymes_in_sig = len(sig_enzymes)
    total_enzymes_in_group = len(df)

    denominator = min(total_enzymes_in_sig, total_enzymes_in_group)

    percentage_common = (common_enzymes_in_group / denominator) * 100 if denominator else 0
    percentages[f"Group{i}"] = percentage_common

percentages_df = pd.DataFrame(list(percentages.items()), columns=["Group", "Percentage"])
percentages_df["Percentage"] = percentages_df["Percentage"].apply(lambda x: f"{x:.5f}%")
percentages_df.to_csv("2_10_cross_validation_percentages.tab", sep="\t", index=False)

In [None]:
df = pd.read_csv("mapped.txt", header=None, delim_whitespace=True)

row_order = [
    37, 15, 14, 1, 0, 2, 8, 16, 19, 30, 4, 34, 26, 11, 20, 7, 3, 33, 29, 28,
    36, 32, 9, 13, 18, 22, 25, 31, 17, 5, 27, 40, 23, 38, 10, 6, 24, 39, 21, 35, 12
]

df_ordered = df.iloc[row_order].reset_index(drop=True)

rows_per_group = [4, 4, 4, 4, 4, 4, 4, 4, 4, 5]
splits = []
start_idx = 0
for count in rows_per_group:
    splits.append(df_ordered.iloc[start_idx:start_idx + count])
    start_idx += count

group_combinations = list(itertools.combinations(range(10), 9))

for combo_index, combo in enumerate(group_combinations):
    combined_rows = pd.concat([splits[i] for i in combo], ignore_index=True)
    group_file_name = f"cross-combo{combo_index + 1}.tab"
    combined_rows.to_csv(group_file_name, sep='\t', index=False, header=False)
    release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])

    sample_names = combined_rows[1].values
    y_values = combined_rows[2].values

    tara_dict = release_df.set_index('ko').to_dict('index')

    x_values_for_all_enzymes = []

    for enzyme, enzyme_data in tara_dict.items():
        x_values = []

        for sample in sample_names:
            if sample in enzyme_data:
                x_values.append(enzyme_data[sample])
            else:
                x_values.append(np.nan)

        x_values_for_all_enzymes.append([enzyme] + x_values)

    correlation_results = []
    for enzyme_data in x_values_for_all_enzymes:
        enzyme_name = enzyme_data[0]
        x_values = enzyme_data[1:]
        correlation_coefficient = calculate_correlation(x_values, y_values)
        correlation_results.append((enzyme_name, correlation_coefficient))

    correlation_results.sort(key=lambda x: x[1], reverse=True)

    with open(f"cross-combo{combo_index + 1}_all_correlations.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient in correlation_results:
            degrees_of_freedom = len(combined_rows) - 2
            t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

    degrees_of_freedom = len(combined_rows) - 2
    significant_rows = []
    total_probability = 0

    for enzyme_name, correlation_coefficient in correlation_results:
        t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

        if total_probability + p_value <= 1:
            significant_rows.append((enzyme_name, correlation_coefficient, t_value, p_value))
            total_probability += p_value
        else:
            break

    with open(f"3_10_cross_sig_combo{combo_index + 1}.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient, t_value, p_value in significant_rows:
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

In [None]:
sig_enzymes = read_sig_pearson_file("sig_pearson_enzymes.txt")

group_files = [
    "3_10_cross_sig_combo1.tab",
    "3_10_cross_sig_combo2.tab",
    "3_10_cross_sig_combo3.tab",
    "3_10_cross_sig_combo4.tab",
    "3_10_cross_sig_combo5.tab",
    "3_10_cross_sig_combo6.tab",
    "3_10_cross_sig_combo7.tab",
    "3_10_cross_sig_combo8.tab",
    "3_10_cross_sig_combo9.tab",
    "3_10_cross_sig_combo10.tab"
]

shared_enzyme_data = {}

for i, group_file in enumerate(group_files, start=1):
    group_enzymes = read_cross_sig_file(group_file)
    for enzyme_name in group_enzymes:
        if enzyme_name in sig_enzymes:
            if enzyme_name not in shared_enzyme_data:
                shared_enzyme_data[enzyme_name] = {"Enzyme Name": enzyme_name}

            shared_enzyme_data[enzyme_name][f"Group{i} Correlation"] = group_enzymes[enzyme_name]

final_data = []
for enzyme_name, group_data in shared_enzyme_data.items():
    row = [enzyme_name] + [group_data.get(f"Group{i} Correlation", "na") for i in range(1, 11)]  # Update for 10 groups
    final_data.append(row)

df = pd.DataFrame(final_data, columns=["Enzyme Name"] + [f"Group{i} Correlation" for i in range(1, 11)])  # Updated for 10 groups

percentages = {}
for i in range(1, 11):
    group_column = f"Group{i} Correlation"

    common_enzymes_in_group = df[group_column].apply(lambda x: x != "na").sum()

    total_enzymes_in_sig = len(sig_enzymes)
    total_enzymes_in_group = len(df)

    denominator = min(total_enzymes_in_sig, total_enzymes_in_group)

    percentage_common = (common_enzymes_in_group / denominator) * 100 if denominator else 0
    percentages[f"Group{i}"] = percentage_common

percentages_df = pd.DataFrame(list(percentages.items()), columns=["Group", "Percentage"])
percentages_df["Percentage"] = percentages_df["Percentage"].apply(lambda x: f"{x:.5f}%")
percentages_df.to_csv("3_10_cross_validation_percentages.tab", sep="\t", index=False)

In [None]:
def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = pearsonr(x_values, y_values)
    return correlation_coefficient

df = pd.read_csv("mapped.txt", header=None, delim_whitespace=True)

row_order = [
    6, 35, 39, 8, 26, 14, 5, 3, 12, 37, 24, 40, 30,
    13, 4, 29, 19, 28, 9, 7, 0, 27, 11, 15, 18, 22, 1, 17, 20,
    32, 33, 36, 23, 34, 16, 31, 10, 2, 38, 25, 21
 ]

df_ordered = df.iloc[row_order].reset_index(drop=True)

rows_per_group = [4, 4, 4, 4, 4, 4, 4, 4, 4, 5]
splits = []
start_idx = 0
for count in rows_per_group:
    splits.append(df_ordered.iloc[start_idx:start_idx + count])
    start_idx += count

group_combinations = list(itertools.combinations(range(10), 9))

for combo_index, combo in enumerate(group_combinations):
    combined_rows = pd.concat([splits[i] for i in combo], ignore_index=True)

    group_file_name = f"cross-combo{combo_index + 1}.tab"
    combined_rows.to_csv(group_file_name, sep='\t', index=False, header=False)

    release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])

    sample_names = combined_rows[1].values
    y_values = combined_rows[2].values

    tara_dict = release_df.set_index('ko').to_dict('index')

    x_values_for_all_enzymes = []

    for enzyme, enzyme_data in tara_dict.items():
        x_values = []

        for sample in sample_names:
            if sample in enzyme_data:
                x_values.append(enzyme_data[sample])
            else:
                x_values.append(np.nan)

        x_values_for_all_enzymes.append([enzyme] + x_values)

    correlation_results = []
    for enzyme_data in x_values_for_all_enzymes:
        enzyme_name = enzyme_data[0]
        x_values = enzyme_data[1:]

        correlation_coefficient = calculate_correlation(x_values, y_values)
        correlation_results.append((enzyme_name, correlation_coefficient))

    correlation_results.sort(key=lambda x: x[1], reverse=True)

    with open(f"cross-combo{combo_index + 1}_all_correlations.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient in correlation_results:
            degrees_of_freedom = len(combined_rows) - 2
            t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

    degrees_of_freedom = len(combined_rows) - 2
    significant_rows = []
    total_probability = 0

    for enzyme_name, correlation_coefficient in correlation_results:
        t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
        if total_probability + p_value <= 1:
            significant_rows.append((enzyme_name, correlation_coefficient, t_value, p_value))
            total_probability += p_value
        else:
            break

    with open(f"4_10_cross_sig_combo{combo_index + 1}.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient, t_value, p_value in significant_rows:
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

In [None]:
sig_enzymes = read_sig_pearson_file("sig_pearson_enzymes.txt")

group_files = [
    "4_10_cross_sig_combo1.tab",
    "4_10_cross_sig_combo2.tab",
    "4_10_cross_sig_combo3.tab",
    "4_10_cross_sig_combo4.tab",
    "4_10_cross_sig_combo5.tab",
    "4_10_cross_sig_combo6.tab",
    "4_10_cross_sig_combo7.tab",
    "4_10_cross_sig_combo8.tab",
    "4_10_cross_sig_combo9.tab",
    "4_10_cross_sig_combo10.tab"
]

shared_enzyme_data = {}

for i, group_file in enumerate(group_files, start=1):
    group_enzymes = read_cross_sig_file(group_file)
    for enzyme_name in group_enzymes:
        if enzyme_name in sig_enzymes:
            if enzyme_name not in shared_enzyme_data:
                shared_enzyme_data[enzyme_name] = {"Enzyme Name": enzyme_name}

            shared_enzyme_data[enzyme_name][f"Group{i} Correlation"] = group_enzymes[enzyme_name]

final_data = []
for enzyme_name, group_data in shared_enzyme_data.items():
    row = [enzyme_name] + [group_data.get(f"Group{i} Correlation", "na") for i in range(1, 11)]
    final_data.append(row)

df = pd.DataFrame(final_data, columns=["Enzyme Name"] + [f"Group{i} Correlation" for i in range(1, 11)])

percentages = {}
for i in range(1, 11):
    group_column = f"Group{i} Correlation"

    common_enzymes_in_group = df[group_column].apply(lambda x: x != "na").sum()

    total_enzymes_in_sig = len(sig_enzymes)
    total_enzymes_in_group = len(df)

    denominator = min(total_enzymes_in_sig, total_enzymes_in_group)

    percentage_common = (common_enzymes_in_group / denominator) * 100 if denominator else 0
    percentages[f"Group{i}"] = percentage_common

percentages_df = pd.DataFrame(list(percentages.items()), columns=["Group", "Percentage"])
percentages_df["Percentage"] = percentages_df["Percentage"].apply(lambda x: f"{x:.5f}%")
percentages_df.to_csv("4_10_cross_validation_percentages.tab", sep="\t", index=False)

In [None]:
df = pd.read_csv("mapped.txt", header=None, delim_whitespace=True)

row_order = [
    17, 32, 31, 7, 6, 26, 8, 19, 28, 10, 1, 33, 35, 2,
    14, 5, 34, 39, 22, 4, 13, 21, 38, 18, 16, 0, 9, 12, 15,
    27, 23, 36, 3, 37, 20, 11, 40, 25, 24, 30, 29
 ]

df_ordered = df.iloc[row_order].reset_index(drop=True)

rows_per_group = [4, 4, 4, 4, 4, 4, 4, 4, 4, 5]
splits = []
start_idx = 0
for count in rows_per_group:
    splits.append(df_ordered.iloc[start_idx:start_idx + count])
    start_idx += count

group_combinations = list(itertools.combinations(range(10), 9))

for combo_index, combo in enumerate(group_combinations):
    combined_rows = pd.concat([splits[i] for i in combo], ignore_index=True)

    group_file_name = f"cross-combo{combo_index + 1}.tab"
    combined_rows.to_csv(group_file_name, sep='\t', index=False, header=False)

    release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])

    sample_names = combined_rows[1].values
    y_values = combined_rows[2].values

    tara_dict = release_df.set_index('ko').to_dict('index')

    x_values_for_all_enzymes = []

    for enzyme, enzyme_data in tara_dict.items():
        x_values = []

        for sample in sample_names:
            if sample in enzyme_data:
                x_values.append(enzyme_data[sample])
            else:
                x_values.append(np.nan)

        x_values_for_all_enzymes.append([enzyme] + x_values)

    correlation_results = []
    for enzyme_data in x_values_for_all_enzymes:
        enzyme_name = enzyme_data[0]
        x_values = enzyme_data[1:]

        correlation_coefficient = calculate_correlation(x_values, y_values)
        correlation_results.append((enzyme_name, correlation_coefficient))

    correlation_results.sort(key=lambda x: x[1], reverse=True)

    with open(f"cross-combo{combo_index + 1}_all_correlations.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient in correlation_results:
            degrees_of_freedom = len(combined_rows) - 2
            t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

    degrees_of_freedom = len(combined_rows) - 2
    significant_rows = []
    total_probability = 0

    for enzyme_name, correlation_coefficient in correlation_results:
        t_value, p_value = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)
        if total_probability + p_value <= 1:
            significant_rows.append((enzyme_name, correlation_coefficient, t_value, p_value))
            total_probability += p_value
        else:
            break

    with open(f"5_10_cross_sig_combo{combo_index + 1}.tab", "w") as file:
        file.write("Enzyme\tCorrelation\tT-dist\tP-value\n")
        for enzyme_name, correlation_coefficient, t_value, p_value in significant_rows:
            file.write(f"{enzyme_name}\t{correlation_coefficient}\t{t_value}\t{p_value}\n")

In [None]:
sig_enzymes = read_sig_pearson_file("sig_pearson_enzymes.txt")

group_files = [
    "5_10_cross_sig_combo1.tab",
    "5_10_cross_sig_combo2.tab",
    "5_10_cross_sig_combo3.tab",
    "5_10_cross_sig_combo4.tab",
    "5_10_cross_sig_combo5.tab",
    "5_10_cross_sig_combo6.tab",
    "5_10_cross_sig_combo7.tab",
    "5_10_cross_sig_combo8.tab",
    "5_10_cross_sig_combo9.tab",
    "5_10_cross_sig_combo10.tab"
]

shared_enzyme_data = {}

for i, group_file in enumerate(group_files, start=1):
    group_enzymes = read_cross_sig_file(group_file)
    for enzyme_name in group_enzymes:
        if enzyme_name in sig_enzymes:
            if enzyme_name not in shared_enzyme_data:
                shared_enzyme_data[enzyme_name] = {"Enzyme Name": enzyme_name}

            shared_enzyme_data[enzyme_name][f"Group{i} Correlation"] = group_enzymes[enzyme_name]

final_data = []
for enzyme_name, group_data in shared_enzyme_data.items():
    row = [enzyme_name] + [group_data.get(f"Group{i} Correlation", "na") for i in range(1, 11)]
    final_data.append(row)

df = pd.DataFrame(final_data, columns=["Enzyme Name"] + [f"Group{i} Correlation" for i in range(1, 11)])

percentages = {}
for i in range(1, 11):
    group_column = f"Group{i} Correlation"

    common_enzymes_in_group = df[group_column].apply(lambda x: x != "na").sum()

    total_enzymes_in_sig = len(sig_enzymes)
    total_enzymes_in_group = len(df)

    denominator = min(total_enzymes_in_sig, total_enzymes_in_group)

    percentage_common = (common_enzymes_in_group / denominator) * 100 if denominator else 0
    percentages[f"Group{i}"] = percentage_common

percentages_df = pd.DataFrame(list(percentages.items()), columns=["Group", "Percentage"])
percentages_df["Percentage"] = percentages_df["Percentage"].apply(lambda x: f"{x:.5f}%")
percentages_df.to_csv("5_10_cross_validation_percentages.tab", sep="\t", index=False)

# Confidence Interval

In [None]:
import numpy as np
import scipy.stats as stats
import pandas as pd
from scipy.stats import pearsonr

def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = pearsonr(x_values, y_values)
    return correlation_coefficient

release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])

mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

correlation_coefficients = {}
x_values_for_all_enzymes = []

for row_index in range(len(release_df)):
    x_values = [release_df.iloc[row_index][str(second_value)] for second_value in second_values]

    desired_length = len(x_values)
    y_values_resized = np.resize(y_values, desired_length)

    correlation_coefficient = calculate_correlation(x_values, y_values_resized)
    correlation_coefficients[row_index] = correlation_coefficient

    enzyme_name = release_df.iloc[row_index, 0]
    x_values_for_all_enzymes.append([enzyme_name] + x_values)

sorted_coefficients = sorted(correlation_coefficients.items(), key=lambda x: x[1], reverse=True)

first_column_values = release_df.iloc[:, 0].values

significant_rows = []
total_probability = 0

for row_index, correlation_coefficient in sorted_coefficients:
    first_value = first_column_values[row_index]  # Get the corresponding enzyme name
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    if total_probability + probability <= 1:
        significant_rows.append((row_index, first_value, correlation_coefficient, x, probability))
        total_probability += probability
    else:
        break

with open("sig_pearson_enzymes.txt", "w") as file:
    for row_index, first_value, correlation_coefficient, x, probability in significant_rows:
        file.write(f"Enzyme = {first_value}, Correlation = {correlation_coefficient}, t-dist = {x}, p-value = {probability}\n")

In [None]:
def fisher_z_transformation(correlation_coefficient):
    return 0.5 * np.log((1 + correlation_coefficient) / (1 - correlation_coefficient))

def calculate_confidence_interval(correlation_coefficient, degrees_of_freedom, confidence_level=0.95):
    z = fisher_z_transformation(correlation_coefficient)

    n = degrees_of_freedom + 2
    se = 1 / np.sqrt(n - 3)

    alpha = 1 - confidence_level
    z_alpha = stats.norm.ppf(1 - alpha / 2)

    z_lower = z - z_alpha * se
    z_upper = z + z_alpha * se

    r_lower = (np.exp(2 * z_lower) - 1) / (np.exp(2 * z_lower) + 1)
    r_upper = (np.exp(2 * z_upper) - 1) / (np.exp(2 * z_upper) + 1)

    return r_lower, r_upper

significant_rows = []
with open("sig_pearson_enzymes.txt", "r") as file:
    for line in file:
        parts = line.strip().split(", ")
        enzyme_name = parts[0].split(" = ")[1]
        correlation = float(parts[1].split(" = ")[1])
        t_value = float(parts[2].split(" = ")[1].split()[0])
        p_value = float(parts[3].split(" = ")[1])

        significant_rows.append((enzyme_name, correlation, t_value, p_value))

confidence_intervals = []
for enzyme_name, correlation, t_value, p_value in significant_rows:
    df = 39
    r_lower, r_upper = calculate_confidence_interval(correlation, df)
    confidence_intervals.append((enzyme_name, correlation, r_lower, r_upper, t_value, p_value))

with open("confidence_intervals_pearson.tab", "w") as file:
    file.write("Enzyme\tCorrelation\t95% CI Lower\t95% CI Upper\tt-dist (x value)\tp-value\n")
    for enzyme_name, correlation, r_lower, r_upper, t_value, p_value in confidence_intervals:
        file.write(f"{enzyme_name}\t{correlation}\t{r_lower}\t{r_upper}\t{t_value}\t{p_value}\n")

### Confidence Interval (Spearman)

In [None]:
# Function to calculate correlation
def calculate_correlation(x_values, y_values):
    correlation_coefficient, _ = spearmanr(x_values, y_values)
    return correlation_coefficient

release_df = pd.read_csv("TARA243.KO.profile.release", delim_whitespace=True, skiprows=[1])
mapped_data = pd.read_csv('mapped.txt', sep='\t', header=None)

second_values = mapped_data[1].values
y_values = mapped_data[2].values

correlation_coefficients = {}
x_values_for_all_enzymes = []

for row_index in range(len(release_df)):
    x_values = [release_df.iloc[row_index][str(second_value)] for second_value in second_values]

    desired_length = len(x_values)
    y_values_resized = np.resize(y_values, desired_length)

    correlation_coefficient = calculate_correlation(x_values, y_values_resized)
    correlation_coefficients[row_index] = correlation_coefficient

    enzyme_name = release_df.iloc[row_index, 0]
    x_values_for_all_enzymes.append([enzyme_name] + x_values)

sorted_coefficients = sorted(correlation_coefficients.items(), key=lambda x: x[1], reverse=True)

first_column_values = release_df.iloc[:, 0].values

significant_rows = []
total_probability = 0

for row_index, correlation_coefficient in sorted_coefficients:
    first_value = first_column_values[row_index]
    x, probability = calculate_t_value_distribution(correlation_coefficient, degrees_of_freedom)

    if total_probability + probability <= 1:
        significant_rows.append((row_index, first_value, correlation_coefficient, x, probability))
        total_probability += probability
    else:
        break

with open("sig_spearman_enzymes.txt", "w") as file:
    for row_index, first_value, correlation_coefficient, x, probability in significant_rows:
        file.write(f"Enzyme = {first_value}, Correlation = {correlation_coefficient}, t-dist = {x}, p-value = {probability}\n")

In [None]:
import numpy as np
import scipy.stats as stats

def fisher_z_transformation(correlation_coefficient):
    return 0.5 * np.log((1 + correlation_coefficient) / (1 - correlation_coefficient))

def calculate_confidence_interval(correlation_coefficient, degrees_of_freedom, confidence_level=0.95):
    z = fisher_z_transformation(correlation_coefficient)

    n = degrees_of_freedom + 2  # Sample size (degrees of freedom + 2)
    se = 1 / np.sqrt(n - 3)

    alpha = 1 - confidence_level
    z_alpha = stats.norm.ppf(1 - alpha / 2)  # Z value for confidence level (e.g., 1.96 for 95% CI)

    z_lower = z - z_alpha * se
    z_upper = z + z_alpha * se

    r_lower = (np.exp(2 * z_lower) - 1) / (np.exp(2 * z_lower) + 1)
    r_upper = (np.exp(2 * z_upper) - 1) / (np.exp(2 * z_upper) + 1)

    return r_lower, r_upper

significant_rows = []
with open("sig_spearman_enzymes.txt", "r") as file:
    for line in file:
        parts = line.strip().split(", ")
        enzyme_name = parts[0].split(" = ")[1]
        correlation = float(parts[1].split(" = ")[1])
        t_value = float(parts[2].split(" = ")[1].split()[0])
        p_value = float(parts[3].split(" = ")[1])

        significant_rows.append((enzyme_name, correlation, t_value, p_value))

confidence_intervals = []
for enzyme_name, correlation, t_value, p_value in significant_rows:
    df = 39
    r_lower, r_upper = calculate_confidence_interval(correlation, df)
    confidence_intervals.append((enzyme_name, correlation, r_lower, r_upper, t_value, p_value))

with open("confidence_intervals_spearman.tab", "w") as file:
    file.write("Enzyme\tCorrelation\t95% CI Lower\t95% CI Upper\tt-dist (x value)\tp-value\n")
    for enzyme_name, correlation, r_lower, r_upper, t_value, p_value in confidence_intervals:
        file.write(f"{enzyme_name}\t{correlation}\t{r_lower}\t{r_upper}\t{t_value}\t{p_value}\n")