In [1]:
import itertools
import os.path
import pickle
import random
import warnings
from pathlib import Path

from scipy.stats import binned_statistic
from scipy.stats import sem


from utils.util import *


KeyboardInterrupt



In [None]:
# Genotype sources compared in this notebook, keyed by display name.
# Every entry has a "path" (genotype file) and a "color" entry; the whole
# mapping is handed to init_analysis_args further down.
models_to_test = {
    "Real": dict(
        path="resource/10K_SNP_1000G_real.hapt",
        color="gray",
    ),
    "F1 WITH PENALTY GS-AC-GAN": dict(
        path="fake_genotypes_sequences/new_sequences/f1_score_test/genotypes.hapt",
        color="red",
    ),
    "SUB F1 WITH PENALTY GS-AC-GAN": dict(
        path="fake_genotypes_sequences/new_sequences/f1_score_sub_pop/genotypes.hapt",
        color="blue",
    ),
}

In [None]:
# Figure width used by the plotting helper when it sizes its figure.
figwi = 12

# Directory that receives the saved figures; created up front if it is missing.
output_dir = "population_analysis"
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Derive the per-model lookups from the model registry.
# NOTE(review): the exact contents of the three results are defined by
# init_analysis_args in utils.util - confirm there.
(
    model_name_to_input_file,
    model_name_to_color,
    color_palette,
) = init_analysis_args(output_dir, models_to_test)

In [None]:
# Load the real reference data; both resource paths are resolved one
# directory above the notebook's working directory.
real_data_1000_genome = load_real_data(
    hapt_genotypes_path=f"../{REAL_10K_SNP_1000G_PATH}",
    extra_data_path=f"../{REAL_EXTRA_DATA_PATH}",
)
real_data_1000_genome

In [None]:
# Column identifiers of the genotype (SNP) columns in the real data frame.
# NOTE(review): relies on get_relevant_columns returning a list, since it is
# later concatenated with `+ [...]` - confirm in utils.util.
genotypes_ids = get_relevant_columns(
    input_df=real_data_1000_genome,
    input_columns=[],
)
print(f"Number of SNPs: {len(genotypes_ids)}")

In [None]:
def init_real_genotypes(target_columns):
    """Return the real genotype columns indexed by one label column.

    Reads the notebook-level ``real_data_1000_genome`` frame and the
    ``genotypes_ids`` column list.

    :param target_columns: name of the single column whose values become the
        row index (for example ``"Population code"``).
    :return: a new DataFrame containing only the ``genotypes_ids`` columns,
        with the values of ``target_columns`` as its index.
    """
    selected = real_data_1000_genome[genotypes_ids + [target_columns]]
    # Passing the column name moves that column into the index and removes it
    # from the columns in a single step.
    return selected.set_index(target_columns)

In [None]:
# Real genotypes indexed by population code and by superpopulation code.
real_data_1000_genome_by_sub_pop = init_real_genotypes(
    "Population code"
)
real_data_1000_genome_by_super_pop = init_real_genotypes(
    "Superpopulation code"
)

In [None]:
def init_fake_genotypes(model_name: str):
    """Load the generated genotypes of one model, indexed by their labels.

    The file registered for ``model_name`` in ``model_name_to_input_file`` is
    read as a space-separated table without a header row. Its first column
    holds the row labels; the text ``'Fake_'`` is removed from them before
    they become the index.

    :param model_name: key into ``model_name_to_input_file``.
    :return: DataFrame whose columns are renamed to ``genotypes_ids`` and
        whose index holds the cleaned labels.
    """
    raw_table = pd.read_csv(model_name_to_input_file[model_name], sep=' ', header=None)
    labels = raw_table[0].str.replace('Fake_', '')
    # Remove the label column from the data and use the cleaned labels as index.
    genotypes = raw_table.drop(columns=[0]).set_index(labels)
    genotypes.columns = genotypes_ids
    return genotypes

In [None]:
# Generated genotypes from the model registered under this name in
# models_to_test.
super_pop_df = init_fake_genotypes("F1 WITH PENALTY GS-AC-GAN")
super_pop_df

In [None]:
# Generated genotypes from the model registered under this name in
# models_to_test.
sub_pop_df = init_fake_genotypes("SUB F1 WITH PENALTY GS-AC-GAN")
sub_pop_df

In [None]:
def split_by_target_column(input_fake_df: pd.DataFrame, input_real_df: pd.DataFrame):
    """Split the real and fake frames into per-label sub-frames.

    Only labels that occur in the index of BOTH frames are kept, so the two
    returned dictionaries always share the same keys.

    :param input_fake_df: generated genotypes, indexed by label.
    :param input_real_df: real genotypes, indexed by label.
    :return: ``(real_split, fake_split)`` - two dicts mapping each shared label
        to the rows of the real / fake frame carrying that label. Keys follow
        the order in which labels first appear in ``input_real_df.index``.
    """
    real_split = {}
    fake_split = {}
    # Index.unique() keeps order of first appearance. Iterating a set of
    # strings instead depends on hash randomisation, which reordered the
    # dictionaries (and the plot panels built from them) between kernel runs.
    for pop_name in input_real_df.index.unique():
        # Filter the fake frame once and reuse it for both the test and the value.
        fake_rows = input_fake_df[input_fake_df.index == pop_name]
        if len(fake_rows) > 0:
            real_split[pop_name] = input_real_df[input_real_df.index == pop_name]
            fake_split[pop_name] = fake_rows
    return real_split, fake_split

In [None]:
# Per-population split of the real data and of the sub-population model output.
real_split_by_sub_pop, fake_split_by_sub_pop = split_by_target_column(
    input_real_df=real_data_1000_genome_by_sub_pop,
    input_fake_df=sub_pop_df,
)

In [None]:
# Per-superpopulation split of the real data and of the super-population model output.
real_split_by_super_pop, fake_split_by_super_pop = split_by_target_column(
    input_real_df=real_data_1000_genome_by_super_pop,
    input_fake_df=super_pop_df,
)

In [None]:
# Allele sums and frequencies for every split; the third value returned by
# build_allele_frequency is not used in this notebook.
fake_sum_alleles_by_sub_pop, fake_allele_frequency_by_sub_pop, _ = build_allele_frequency(
    fake_split_by_sub_pop
)
fake_sum_alleles_by_super_pop, fake_allele_frequency_by_super_pop, _ = build_allele_frequency(
    fake_split_by_super_pop
)
real_sum_alleles_by_sub_pop, real_allele_frequency_by_sub_pop, _ = build_allele_frequency(
    real_split_by_sub_pop
)
real_sum_alleles_by_super_pop, real_allele_frequency_by_super_pop, _ = build_allele_frequency(
    real_split_by_super_pop
)

In [None]:
def plot_allele_frequency_fake_vs_real(real_input, fake_input, color, output_file_name):
    """Plot real-vs-fake allele frequencies, one panel per group, and save the figure.

    Panels are laid out in a grid three columns wide; the number of rows grows
    with the number of entries in ``fake_input``.

    :param real_input: mapping from group name to the real allele frequencies;
        must contain every key of ``fake_input``.
    :param fake_input: mapping from group name to the fake allele frequencies.
    :param color: forwarded to ``plotreg`` as its ``col`` argument.
    :param output_file_name: file name of the saved figure inside ``output_dir``.
    """
    n_cols = 3
    n_rows = int(np.ceil(len(fake_input) / n_cols))
    # Figure height scales with the row count relative to the fixed width.
    plt.figure(figsize=(figwi, figwi * n_rows / n_cols))
    for panel_number, group_name in enumerate(fake_input, start=1):
        panel_ax = plt.subplot(n_rows, n_cols, panel_number)
        plotreg(x=real_input[group_name], y=fake_input[group_name],
                keys=['Real', "Fake"], statname="Allele frequency",
                col=color, ax=panel_ax)
        plt.title(f'Allele Frequencies Fake vs Real - {group_name}')
    plt.suptitle('Allele Frequencies vs Real \n\n')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, output_file_name))

In [None]:
# Superpopulation-level comparison, saved under output_dir.
plot_allele_frequency_fake_vs_real(
    real_input=real_allele_frequency_by_super_pop,
    fake_input=fake_allele_frequency_by_super_pop,
    color="blue",
    output_file_name="allele_frequency_super_pop.jpg",
)

In [None]:
# Population-level comparison, saved under output_dir.
plot_allele_frequency_fake_vs_real(
    real_input=real_allele_frequency_by_sub_pop,
    fake_input=fake_allele_frequency_by_sub_pop,
    color="red",
    output_file_name="allele_frequency_sub_pop.jpg",
)