In [1]:
from pathlib import Path
from utils.util import *
import plotly.express as px
# Every figure saved by this notebook is written under this directory.
output_dir = "data_analysis"
Path(output_dir).mkdir(exist_ok=True, parents=True)

# Load the dataset from the two project-configured paths, resolved one level above
# the notebook. Later cells index the result by column and pass it to seaborn as `data`.
real_data_1000_genome = load_real_data(
    hapt_genotypes_path=f"../{REAL_10K_SNP_1000G_PATH}",
    extra_data_path=f"../{REAL_EXTRA_DATA_PATH}",
)
real_data_1000_genome


KeyboardInterrupt



In [None]:
# Notebook-wide seaborn theme: white grid background, 20x10 default figure size.
sns.set(
    style="whitegrid",
    rc={"figure.figsize": (20, 10)},
)


def plot_number_of_samples(target_column_name: str, output_path: str):
    """Draw a count plot of samples per category, save it, and show it.

    Reads the notebook-level ``real_data_1000_genome`` table.

    Args:
        target_column_name: Column whose distinct values become the bars;
            also used as the x-axis label and in the title.
        output_path: File path the figure is saved to before it is shown.
    """
    ax = sns.countplot(data=real_data_1000_genome, x=target_column_name)
    ax.set_xlabel(target_column_name)
    ax.set_ylabel("Number of Samples")
    ax.set_title("Number of Samples by " + target_column_name)

    # Write each bar's count just above its top edge (10 points up from the bar top).
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center_x = bar.get_x() + bar.get_width() / 2.
        ax.annotate(f"{bar_height:.0f}", (bar_center_x, bar_height),
                    ha='center', va='center', xytext=(0, 10), textcoords='offset points')

    # Save before show so the file is written from the fully drawn figure.
    plt.savefig(output_path)
    plt.show()

In [None]:
# Sample counts per superpopulation, saved under the figures directory.
plot_number_of_samples(
    target_column_name="Superpopulation code",
    output_path=os.path.join(output_dir, "super_population_distribution.jpg"),
)

In [None]:
# Sample counts per population, saved under the figures directory.
plot_number_of_samples(
    target_column_name="Population code",
    output_path=os.path.join(output_dir, "sub_population_distribution.jpg"),
)

In [None]:
# Column labels treated as genotype (SNP) columns by the rest of the notebook.
# NOTE(review): presumably get_relevant_columns drops the metadata columns — confirm in utils.util.
genotypes_ids = get_relevant_columns(
    input_df=real_data_1000_genome,
    input_columns=[],
)
print(f"Number of SNPs: {len(genotypes_ids)}")

In [None]:
def print_frequency(current_df, target_column, title):
    """Plot, per group, the percentage of each distinct genotype value as a bar chart.

    Rows of ``current_df`` are grouped by the distinct values of
    ``target_column``. For every group, all cells of the notebook-level
    ``genotypes_ids`` columns are pooled, and the share of each distinct
    value among them is computed as a percentage rounded to two decimals.
    The result is drawn as a grouped bar chart with one bar cluster per group.

    Args:
        current_df: Table holding ``target_column`` and the ``genotypes_ids`` columns.
        target_column: Label of the column whose distinct values define the groups.
        title: Title drawn above the chart.

    Returns:
        None. The chart is drawn on a new matplotlib figure.
    """
    rows = []
    for pop in current_df[target_column].unique():
        group_df = current_df[current_df[target_column] == pop]
        genotypes = group_df[genotypes_ids].to_numpy()
        uniques, counts = np.unique(genotypes, return_counts=True)
        # genotypes.size is rows x columns, i.e. the total number of pooled cells.
        shares = 100 * counts / genotypes.size
        percentages = {value: round(share, 2) for value, share in zip(uniques, shares)}
        percentages["pop"] = pop
        rows.append(percentages)

    # A value absent from a group has no key in that group's row and becomes NaN here.
    frequencies = pd.DataFrame(rows).set_index("pop")
    ax = frequencies.plot.bar(title=title)
    ax.set_ylabel("Frequency (%)")


Allele Frequency

In [None]:
# Group by column 0.
# NOTE(review): presumably column 0 holds one label shared by all rows, giving overall frequencies — confirm.
print_frequency(
    current_df=real_data_1000_genome,
    target_column=0,
    title="allele frequencies all",
)

In [None]:
# One bar cluster per superpopulation.
print_frequency(
    current_df=real_data_1000_genome,
    target_column="Superpopulation code",
    title="allele frequencies by Superpopulation code",
)

In [None]:
# One bar cluster per population.
print_frequency(
    current_df=real_data_1000_genome,
    target_column="Population code",
    title="allele frequencies by Population code",
)

PCA

In [None]:
# Project the genotype columns onto two components for the 2-D scatter plots below.
n_components = 2
# Fix the seed so the projection is reproducible across runs: for large inputs the
# "auto" solver may select a randomized SVD, whose output depends on the random state.
# NOTE(review): assumes PCA is sklearn.decomposition.PCA (it arrives via the star import) — confirm.
pca = PCA(n_components=n_components, random_state=0)
components = pca.fit_transform(real_data_1000_genome[genotypes_ids])

In [None]:
# Samples projected onto the two PCA components, colored by superpopulation.
fig = px.scatter(components, x=0, y=1, color=real_data_1000_genome["Superpopulation code"],
                 title="PCA By Super Population")
# Columns 0 and 1 of `components` are the first and second principal components;
# name the axes accordingly instead of leaving the bare column indices as labels.
fig.update_layout(xaxis_title="PC1", yaxis_title="PC2")
fig.show()

In [None]:
# Samples projected onto the two PCA components, colored by population.
fig = px.scatter(components, x=0, y=1, color=real_data_1000_genome["Population code"],
                 title="PCA By Population")
# Columns 0 and 1 of `components` are the first and second principal components;
# name the axes accordingly instead of leaving the bare column indices as labels.
fig.update_layout(xaxis_title="PC1", yaxis_title="PC2")
fig.show()