In [1]:

import os.path
from pathlib import Path

from utils.util import *


KeyboardInterrupt



In [None]:
# Registry of genotype sources to analyse: display name -> input file path and plot color.
# It is handed to init_analysis_args together with output_dir.
# NOTE(review): these paths carry no "../" prefix, unlike the paths used by the loading cells
# of this notebook — confirm which working directory init_analysis_args resolves them against.
models_to_test = {
    "Real": {
        "path": "resource/10K_SNP_1000G_real.hapt",
        "color": "gray"
    },
    "F1 WITH PENALTY GS-AC-GAN": {
        "path": "fake_genotypes_sequences/new_sequences/f1_score_test/genotypes.hapt",
        "color": "red"
    },
    "SUB F1 WITH PENALTY GS-AC-GAN": {
        "path": "fake_genotypes_sequences/new_sequences/f1_score_sub_pop/genotypes.hapt",
        "color": "blue"
    },
}

In [None]:
# Directory the plotting helpers in this notebook save their figures into; created if missing.
output_dir = "population_analysis"
Path(output_dir).mkdir(parents=True, exist_ok=True)
# Base figure width used when sizing the allele-frequency subplot grid.
figwi = 12

In [None]:
# Derive per-model lookups from the registry. init_analysis_args is not defined in this notebook
# (presumably supplied by the utils.util star import) — TODO confirm the exact shape of its outputs.
model_name_to_input_file, model_name_to_color, color_palette = init_analysis_args(output_dir, models_to_test)

In [None]:
# Load the real reference data. load_real_data and both *_PATH constants are not defined in this
# notebook (presumably supplied by the utils.util star import); the paths are resolved one
# directory above the notebook's working directory.
real_data_1000_genome = load_real_data(hapt_genotypes_path=f"../{REAL_10K_SNP_1000G_PATH}",
                                       extra_data_path=f"../{REAL_EXTRA_DATA_PATH}")
# Bare expression: rendered as the cell output.
real_data_1000_genome

In [None]:
# Labels of the genotype (SNP) columns of the real frame, as selected by get_relevant_columns.
# NOTE(review): the selection rule lives in get_relevant_columns, which is not shown here.
genotypes = get_relevant_columns(input_df=real_data_1000_genome, input_columns=[])
# Genotype columns followed by the two population label columns.
relevant_columns = genotypes + ['Superpopulation code', 'Population code']
print(f"Number of SNPs: {len(genotypes)}")

In [None]:
# Build two labelled copies of the real genotype matrix — one tagged with the superpopulation
# code, one with the population code. Both carry is_real == 1 and a fresh 0..n-1 row index.
pop = real_data_1000_genome['Superpopulation code']
sub_pop = real_data_1000_genome['Population code']

# Explicit copy: this frame is relabelled and extended below, and doing that on a bare column
# selection of real_data_1000_genome triggers pandas' SettingWithCopyWarning.
real_data_1000_genome_genotypes = real_data_1000_genome[genotypes].copy()

# NOTE: `genotypes` is rebound here from the original column labels to positional integer labels
# (0..n-1) so real and synthetic frames share column names. Because of this rebinding the cell is
# not idempotent: re-running it requires re-running the cell that first defines `genotypes`.
genotypes = list(range(real_data_1000_genome_genotypes.shape[1]))
real_data_1000_genome_genotypes.columns = genotypes

real_data_1000_genome_genotypes['is_real'] = 1

# Label assignment happens before reset_index so it aligns on the original row index.
real_data_1000_genome_genotypes_by_pop = real_data_1000_genome_genotypes.copy()
real_data_1000_genome_genotypes_by_pop['Superpopulation code'] = pop
real_data_1000_genome_genotypes_by_pop = real_data_1000_genome_genotypes_by_pop.reset_index(drop=True)

real_data_1000_genome_genotypes_by_sub_pop = real_data_1000_genome_genotypes.copy()
real_data_1000_genome_genotypes_by_sub_pop['Population code'] = sub_pop
real_data_1000_genome_genotypes_by_sub_pop = real_data_1000_genome_genotypes_by_sub_pop.reset_index(drop=True)

real_data_1000_genome.head()

In [None]:
def prepare_synthetic_data(input_file_path, target_column):
    """
    Read a space-separated, header-less synthetic genotype file into a labelled frame.

    The first field of every line is the population label; a 'Fake_' marker inside it is removed.
    The remaining fields become columns named 0..n-1, and the cleaned label is appended as the
    last column.

    Args:
    input_file_path (str): Path of the file to read.
    target_column (str): Name given to the appended label column.

    Returns:
    pd.DataFrame: Genotype columns 0..n-1 followed by target_column.
    """
    raw = pd.read_csv(input_file_path, sep=' ', header=None)
    labels = raw[0].str.replace('Fake_', "")

    genotype_frame = raw.drop(columns=[0])
    genotype_frame.columns = list(range(genotype_frame.shape[1]))
    genotype_frame[target_column] = labels
    return genotype_frame


def prepare_old_synthetic_data(input_file_path):
    """
    Read a space-separated, header-less synthetic genotype file that has two leading non-genotype fields.

    The first two fields of every line are discarded and the remaining ones are renamed 0..n-1.
    No population label column is produced.

    Args:
    input_file_path (str): Path of the file to read.

    Returns:
    pd.DataFrame: Genotype columns named 0..n-1.
    """
    genotype_frame = pd.read_csv(input_file_path, sep=' ', header=None).drop(columns=[0, 1])
    genotype_frame.columns = list(range(genotype_frame.shape[1]))
    return genotype_frame


# NOTE(review): the first assignment below is overwritten immediately by the
# prepare_old_synthetic_data call, so synthetic_pop_results ends up holding the sequences from the
# preview_sequences file (reported as "Old" in the duplication analysis), which have no
# 'Superpopulation code' column. Confirm whether the first load is still needed.
synthetic_pop_results = prepare_synthetic_data(
    '../fake_genotypes_sequences/new_sequences/full_pop/10001_genotypes.hapt', 'Superpopulation code')
synthetic_pop_results = prepare_old_synthetic_data(
    '../fake_genotypes_sequences/preview_sequences/10K_SNP_RBM_AG_1050epochs.hapt')
# Synthetic sequences labelled per population (reported as "New" in the duplication analysis).
synthetic_sub_pop_results = prepare_synthetic_data(
    '../fake_genotypes_sequences/new_sequences/full_sub_pop/10001_genotypes.hapt', 'Population code')

In [None]:
# Stack the real rows on top of the synthetic rows. The synthetic frame has no 'is_real' column,
# so the concat leaves NaN there; filling with 0 marks those rows as fake.
real_with_fake_by_pop = pd.concat([real_data_1000_genome_genotypes_by_pop, synthetic_pop_results])
real_with_fake_by_pop['is_real'] = real_with_fake_by_pop['is_real'].fillna(0)
# Bare expression: rendered as the cell output.
real_with_fake_by_pop

In [None]:
# Same stacking as for the superpopulation frame, here with the per-population labelled frames:
# real rows first, synthetic rows after, and missing 'is_real' values filled with 0 (fake).
real_with_fake_by_sub_pop = pd.concat([real_data_1000_genome_genotypes_by_sub_pop, synthetic_sub_pop_results])
real_with_fake_by_sub_pop['is_real'] = real_with_fake_by_sub_pop['is_real'].fillna(0)
# Bare expression: rendered as the cell output.
real_with_fake_by_sub_pop

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


def plot_pca_real_vs_fake(df, population_col, is_real_col, n_components):
    """
    Plot, per population, real versus fake samples projected onto principal components.

    Up to six distinct populations are drawn at random. For each one a PCA is fitted on the real
    samples only, real and fake samples are projected onto it, and three panels are drawn
    (PC1 vs PC2, PC3 vs PC4, PC5 vs PC6) with real samples in white and fake samples in red.
    The figure is saved as 'pca_by_sub_pop.jpg' inside the notebook-level ``output_dir`` and shown.

    Args:
    df (pd.DataFrame): The dataset containing the samples. Every column other than
        population_col and is_real_col is used as a PCA feature.
    population_col (str): The name of the column containing the population code.
    is_real_col (str): The name of the column containing the real/fake indicator (1 real, 0 fake).
    n_components (int): Currently ignored: the three panels need PC1..PC6, so the PCA is always
        fitted with six components. Kept so existing calls stay valid.

    Raises:
    ValueError: If no population has at least six real samples and at least one fake sample.
    """
    n_pcs = 6  # three (x, y) panels -> PC1..PC6
    max_populations = 6
    ext = 1  # margin added around the data range on both axes

    # Only populations that can actually be plotted are eligible: the PCA needs at least n_pcs
    # real samples to fit, and a panel without fake samples would compare nothing.
    real_rows = (df[is_real_col] == 1).to_numpy()
    fake_rows = (df[is_real_col] == 0).to_numpy()
    real_counts = df.loc[real_rows, population_col].value_counts()
    fake_counts = df.loc[fake_rows, population_col].value_counts()
    candidates = [name for name, count in real_counts.items()
                  if count >= n_pcs and fake_counts.get(name, 0) > 0]
    if not candidates:
        raise ValueError(
            f"no population in '{population_col}' has at least {n_pcs} real samples and one fake sample")

    # Sample from the distinct populations; sampling from the raw column could return the same
    # population several times and waste panels on duplicates.
    populations = random.sample(candidates, min(max_populations, len(candidates)))

    # squeeze=False keeps axs two-dimensional even when a single population is plotted.
    fig, axs = plt.subplots(nrows=3, ncols=len(populations),
                            figsize=(len(populations) * 16, 3 * 22),
                            constrained_layout=True, squeeze=False)

    # Select features by name instead of assuming the two label columns are the last two.
    feature_cols = [col for col in df.columns if col not in (population_col, is_real_col)]

    for pop_index, pop in enumerate(populations):
        # Get real and fake samples for population
        pop_df = df[df[population_col] == pop].reset_index(drop=True)
        real_samples = pop_df.loc[(pop_df[is_real_col] == 1).to_numpy(), feature_cols].values
        fake_samples = pop_df.loc[(pop_df[is_real_col] == 0).to_numpy(), feature_cols].values

        # Fit PCA on real samples only, then project real followed by fake samples.
        pca = PCA(n_components=n_pcs)
        pca.fit(real_samples)
        pcs = pca.transform(np.concatenate([real_samples, fake_samples]))

        # Rows of pcs follow the concatenation order above: real samples first, fake samples after.
        is_real_point = np.arange(len(pcs)) < len(real_samples)
        is_fake_point = ~is_real_point

        for i, pcx in enumerate([0, 2, 4]):
            # Axis limits cover real and fake samples together so both clouds are fully visible.
            xlim = (np.min(pcs[:, pcx]) - ext, np.max(pcs[:, pcx]) + ext)
            ylim = (np.min(pcs[:, pcx + 1]) - ext, np.max(pcs[:, pcx + 1]) + ext)

            ax = axs[i, pop_index]
            ax.scatter(pcs[is_real_point, pcx], pcs[is_real_point, pcx + 1], alpha=.5, s=500, color='white',
                       edgecolor='white', linewidths=2)
            ax.scatter(pcs[is_fake_point, pcx], pcs[is_fake_point, pcx + 1], alpha=.5, color='red', s=500,
                       edgecolor='red', linewidths=2)
            ax.set_xlim(xlim)
            ax.set_ylim(ylim)
            ax.set_xlabel("PC{}\n".format(pcx + 1), fontsize=65, fontweight='bold')
            ax.set_ylabel("\nPC{}".format(pcx + 2), fontsize=65, fontweight='bold')
            if i == 0:
                ax.set_title(f"\n{pop}\n", fontsize=80, fontweight='bold')
            ax.set_facecolor('black')

    plt.savefig(os.path.join(output_dir, 'pca_by_sub_pop.jpg'))
    plt.show()

In [None]:
# NOTE(review): the last argument (n_components=2) has no effect — plot_pca_real_vs_fake always
# fits its PCA with six components.
plot_pca_real_vs_fake(real_with_fake_by_sub_pop, 'Population code', 'is_real', 2)


In [None]:
def get_number_of_duplicate(df):
    """
    Return the percentage of rows in df that are identical to at least one other row.

    Every member of a duplicated group is counted (keep=False), so two identical rows
    contribute two rows to the numerator.

    Args:
    df (pd.DataFrame): The rows to inspect; all columns take part in the comparison.

    Returns:
    float: Percentage in [0, 100], rounded to two decimals. 0.0 for an empty frame.
    """
    total_rows = len(df)
    if total_rows == 0:
        # Nothing to compare; avoids a ZeroDivisionError.
        return 0.0

    duplicated_rows = int(df.duplicated(keep=False).sum())
    return round(duplicated_rows / float(total_rows) * 100, 2)

In [None]:
# Measure how often rows collide when only a random subset of the genotype columns is kept.
# For every subset fraction, 200 random column subsets are drawn; for each subset the
# duplicate-row percentage is recorded for the real frame, both synthetic frames and both
# real+synthetic frames, always in the order listed in frames_by_type.
frames_by_type = {
    "Real": real_data_1000_genome_genotypes_by_pop,
    "Real & Old": real_with_fake_by_pop,
    "Old": synthetic_pop_results,
    "Real & New": real_with_fake_by_sub_pop,
    "New": synthetic_sub_pop_results,
}

rows = []
for weight in [0.01, 0.02, 0.03, 0.1, 0.5]:
    sub_size = int(len(genotypes) * weight)
    print(f"start calculating {sub_size} duplications")
    for i in range(200):
        # One column subset per iteration, shared by all five frames.
        sub_list = random.sample(genotypes, sub_size)
        for type_name, frame in frames_by_type.items():
            rows.append({
                "batch_size": sub_size,
                "Type": type_name,
                "percentage_of_duplications": get_number_of_duplicate(frame[sub_list]),
            })
        if (i + 1) % 50 == 0:
            print(f"finished calculating {sub_size} duplications for {i+1} sub lists")
    print(f"finished calculating {sub_size} duplications")

duplicates_sub_genomics = pd.DataFrame(rows)

In [None]:
import matplotlib.pyplot as plt
import numpy as np


def plot_comparison(df):
    """
    Draw a grouped bar chart of the mean duplicate-row percentage per Type and batch size.

    Each batch size gets one group of bars centred on its x tick, with one bar per Type.
    A bar is placed according to its batch size, so a Type that lacks some batch size leaves
    a gap instead of shifting its remaining bars under the wrong tick.

    Args:
    df (pd.DataFrame): One row per measurement, with the columns 'Type', 'batch_size' and
        'percentage_of_duplications'.

    Raises:
    ValueError: If df contains no measurements.
    """
    # Mean percentage for every (Type, batch_size) pair.
    grouped_df = df.groupby(['Type', 'batch_size'])['percentage_of_duplications'].mean().reset_index()
    if grouped_df.empty:
        raise ValueError("plot_comparison needs at least one measurement to plot")

    fig, ax = plt.subplots(figsize=(18, 8))

    types = np.unique(grouped_df['Type'])
    batch_sizes = np.unique(grouped_df['batch_size'])

    # x slot (0, 1, 2, ...) of every batch size; ticks are drawn at these slots.
    slot_by_batch_size = {batch_size: slot for slot, batch_size in enumerate(batch_sizes)}

    # A group occupies 80% of the distance between ticks, leaving a gap between groups.
    group_width = 0.8
    bar_width = group_width / len(types)

    for i, t in enumerate(types):
        subset = grouped_df[grouped_df['Type'] == t]

        # Centre of bar i inside the group centred on each slot.
        slots = subset['batch_size'].map(slot_by_batch_size).to_numpy()
        centers = slots - group_width / 2 + (i + 0.5) * bar_width

        bars = ax.bar(centers, subset['percentage_of_duplications'], width=bar_width, label=t)

        # Add the numbers on top of each bar
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, height, str(round(height, 2)),
                    ha='center', va='bottom')

    ax.set_ylabel('Mean Number of Duplications Percentage')
    ax.set_xlabel('Batch Size')
    ax.set_title('Mean Number of Duplications Percentage by Type and Batch Size')
    ax.legend()

    # One tick per batch size, at the centre of its group.
    ax.set_xticks(np.arange(len(batch_sizes)))
    ax.set_xticklabels(batch_sizes)

    # Rotate the x-axis tick labels for better visibility
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()


In [None]:
plot_comparison(duplicates_sub_genomics)
# Same aggregation plot_comparison performs internally, repeated here so the table can be saved.
grouped_df = duplicates_sub_genomics.groupby(['Type', 'batch_size'])['percentage_of_duplications'].mean().reset_index()
# NOTE(review): written to the current working directory rather than output_dir — confirm intended.
grouped_df.to_csv("grouped_df.csv")

In [None]:
def print_frequency_compression(current_df, target_column, title, genotype_columns=None):
    """
    Tabulate, per population and per real/fake group, the percentage of each genotype value.

    Populations whose label is missing (NaN) or contains a comma are skipped. For every remaining
    population two rows are produced (fake, then real); each row maps every distinct value found
    in the genotype columns to its share of all genotype cells, in percent rounded to 3 decimals.
    A group without samples still yields a row, with no value columns filled.

    Args:
    current_df (pd.DataFrame): Samples with the genotype columns, target_column and 'is_real'.
    target_column (str): Name of the column holding the population label.
    title (str): Unused; kept so existing calls stay valid.
    genotype_columns (list, optional): Labels of the genotype columns. Defaults to the
        notebook-level ``genotypes`` list.

    Returns:
    pd.DataFrame: One row per (population, type) with the value-percentage columns plus 'Pop' and
        'Type', sorted by the percentage of value 0 when that column exists.
    """
    if genotype_columns is None:
        genotype_columns = genotypes

    rows = []
    # dropna(): a NaN label is a float, and the comma test below would raise on it.
    for pop in current_df[target_column].dropna().unique():
        if "," in str(pop):
            continue
        pop_df = current_df[current_df[target_column] == pop]
        for is_real in [0, 1]:
            group_mask = (pop_df["is_real"] == is_real).to_numpy()
            values = pop_df.loc[group_mask, genotype_columns].to_numpy()
            percentages = {}
            if values.size:
                uniques, counts = np.unique(values, return_counts=True)
                # values.size == number of samples * number of genotype columns
                shares = 100 * counts / values.size
                percentages = {
                    (key.item() if hasattr(key, "item") else key): round(share, 3)
                    for key, share in zip(uniques, shares)
                }
            percentages["Pop"] = pop
            percentages["Type"] = "Real" if is_real == 1 else "Fake"
            rows.append(percentages)

    result = pd.DataFrame(rows)
    # Sorting by column 0 is only possible when the value 0 was observed at least once.
    return result.sort_values(0) if 0 in result.columns else result

In [None]:
# Per-superpopulation genotype value percentages for real and fake samples.
allel_freq_df = print_frequency_compression(real_with_fake_by_pop, target_column='Superpopulation code', title="title")

fig, ax = plt.subplots(figsize=(16, 10))

# Reference line: share of the value 1 among all real genotype cells, in percent.
# NOTE(review): round(values) rounds to a whole percent, which is coarser than the 3-decimal
# values plotted as bars — confirm this precision is intended.
real_genotypes = real_with_fake_by_sub_pop[real_with_fake_by_sub_pop["is_real"] == 1][genotypes]
uniques, counts = np.unique(real_genotypes, return_counts=True)
tmp_percentages = dict(zip(uniques, 100 * counts / (len(real_genotypes) * len(genotypes))))
tmp_percentages = {key: round(values) for key, values in tmp_percentages.items()}
ax.axhline(y=tmp_percentages[1], color='black', linestyle='--', linewidth=2)

# group the dataframe by 'Pop' and 'Type' columns, and take the mean of the '1' column
grouped = allel_freq_df.groupby(['Pop', 'Type'])[1].mean()

# plot the grouped data as a bar plot, one bar color per 'Type' category
grouped.unstack().plot(kind='bar', ax=ax, color=['red', 'blue'], width=0.6)

# set the title and axis labels
ax.set_title('Bar plot of Pop vs. 1, colored by Type')
ax.set_xlabel('Pop')
ax.set_ylabel('1')

# NOTE: no plt.show() call follows; the cell relies on the notebook rendering the figure.

# show the plot

In [None]:
def init_fake_genotypes(model_name: str):
    """
    Load the genotype file registered for model_name, indexed by its population label.

    The file is read as space-separated without a header. Its first field, with the 'Fake_'
    marker removed, becomes the index; the remaining fields are named after the notebook-level
    ``genotypes`` list.

    Args:
    model_name (str): Key into the notebook-level ``model_name_to_input_file`` mapping.

    Returns:
    pd.DataFrame: Genotype columns labelled by ``genotypes``, indexed by the cleaned label.
    """
    source_path = model_name_to_input_file[model_name]
    raw = pd.read_csv(source_path, sep=' ', header=None)
    raw[0] = raw[0].str.replace('Fake_', '')

    # Promoting column 0 to the index also removes it from the columns.
    fake_genotypes = raw.set_index(0)
    fake_genotypes.columns = genotypes
    return fake_genotypes

In [None]:
def split_by_target_column(input_df: pd.DataFrame, target_column, min_size=20, excluded_populations=None):
    """
    Split a combined real+fake frame into per-population real frames and per-population fake frames.

    A population is kept only when it has strictly more than min_size rows (real and fake
    together) and is not listed in excluded_populations. input_df itself is left unmodified.

    Args:
    input_df (pd.DataFrame): Samples with target_column and an 'is_real' column (1 real, 0 fake).
    target_column (str): Name of the column holding the population label.
    min_size (int): A population needs more than this many rows to be kept. Defaults to 20.
    excluded_populations (iterable, optional): Population labels to skip. Defaults to the
        notebook's original exclusion list.

    Returns:
    tuple[dict, dict]: (real_split, fake_split). Both map a population label to the matching
        rows, indexed by target_column.
    """
    if excluded_populations is None:
        excluded_populations = ('GBR', 'CLM', 'PJL', 'ACB', 'ESN', 'ITU',
                                'LWK', 'GIH', 'PUR', 'GWD', 'MSL')
    excluded = set(excluded_populations)

    # Non-inplace set_index: the caller's frame keeps its target column.
    indexed = input_df.set_index(target_column)

    real_split = {}
    fake_split = {}
    for pop_name in input_df[target_column].unique():
        if pop_name in excluded:
            continue
        pop_df = indexed[indexed.index == pop_name]
        if len(pop_df) <= min_size:
            continue
        # is_real == 1 marks real samples, 0 marks fake ones.
        real_split[pop_name] = pop_df[pop_df["is_real"] == 1]
        fake_split[pop_name] = pop_df[pop_df["is_real"] == 0]
    return real_split, fake_split

In [None]:
# Split on 'Superpopulation code'. A copy is passed so the combined frame itself is never handed
# to the helper.
# NOTE(review): the *_by_sub_pop names receive the superpopulation split — the names look
# swapped with the *_by_super_pop variables.
real_split_by_sub_pop, fake_split_by_sub_pop = split_by_target_column(input_df=real_with_fake_by_pop.copy(),
                                                                      target_column="Superpopulation code")

In [None]:
# Split on 'Population code'. A copy is passed so the combined frame itself is never handed to
# the helper.
# NOTE(review): the *_by_super_pop names receive the per-population split — the names look
# swapped with the *_by_sub_pop variables.
real_split_by_super_pop, fake_split_by_super_pop = split_by_target_column(input_df=real_with_fake_by_sub_pop.copy(),
                                                                          target_column="Population code")

In [None]:
# Allele sums and allele frequencies for each of the four splits. build_allele_frequency is not
# defined in this notebook (presumably supplied by the utils.util star import); its third return
# value is discarded here.
fake_sum_alleles_by_sub_pop, fake_allele_frequency_by_sub_pop, _ = build_allele_frequency(fake_split_by_sub_pop)
fake_sum_alleles_by_super_pop, fake_allele_frequency_by_super_pop, _ = build_allele_frequency(fake_split_by_super_pop)
real_sum_alleles_by_sub_pop, real_allele_frequency_by_sub_pop, _ = build_allele_frequency(real_split_by_sub_pop)
real_sum_alleles_by_super_pop, real_allele_frequency_by_super_pop, _ = build_allele_frequency(real_split_by_super_pop)

In [None]:
def plot_allele_frequency_fake_vs_real(real_input, fake_input, color, output_file_name):
    """
    Draw one real-versus-fake allele-frequency panel per entry of fake_input and save the figure.

    Panels are laid out three per row. Each panel is drawn by the external ``plotreg`` helper
    with the real values on x and the fake values on y. The figure is saved inside the
    notebook-level ``output_dir``.

    Args:
    real_input (dict): Real allele frequencies, keyed like fake_input.
    fake_input (dict): Fake allele frequencies per key; one panel is drawn per key.
    color: Color forwarded to plotreg.
    output_file_name (str): File name of the saved figure, relative to output_dir.
    """
    n_cols = 3
    n_rows = int(np.ceil(len(fake_input) / n_cols))

    # Height scales with the number of rows so the panels keep their proportions.
    plt.figure(figsize=(figwi, figwi * n_rows / n_cols))

    for panel_number, (model_name, fake_allele_frequency) in enumerate(fake_input.items(), start=1):
        panel_ax = plt.subplot(n_rows, n_cols, panel_number)
        plotreg(
            x=real_input[model_name],
            y=fake_allele_frequency,
            keys=['Real', "Fake"],
            statname="Allele frequency",
            col=color,
            ax=panel_ax,
        )
        plt.title(f'Allele Frequencies Fake vs Real - {model_name}')

    plt.suptitle('Allele Frequencies vs Real \n\n')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, output_file_name))

In [None]:
# Despite the *_by_super_pop names, these inputs derive from the 'Population code' split, which
# matches the sub_pop file name used for the saved figure.
plot_allele_frequency_fake_vs_real(real_input=real_allele_frequency_by_super_pop,
                                   fake_input=fake_allele_frequency_by_super_pop,
                                   color="blue", output_file_name="allele_frequency_sub_pop.jpg")

In [None]:
# Despite the *_by_sub_pop names, these inputs derive from the 'Superpopulation code' split,
# which matches the pop file name used for the saved figure.
plot_allele_frequency_fake_vs_real(real_input=real_allele_frequency_by_sub_pop,
                                   fake_input=fake_allele_frequency_by_sub_pop,
                                   color="red", output_file_name="allele_frequency_pop.jpg")