In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

In [31]:
# Tab-separated table with one row per (SampleId, VariantId) pair.
# NOTE(review): the rendered preview shows an "Unnamed: 0" column, which suggests
# the file carries a saved index column - consider index_col=0; confirm first.
expansion_file = "../data/metrics_file.tsv"
df = pd.read_table(expansion_file)  # read_table defaults to a tab delimiter
df.head()

Unnamed: 0,SampleId,VariantId,Genotype,AlleleDepth
0,HW0174518310.final-gatk.cram,ATXN1,30/30,22.94/22.59
1,HW0174518310.final-gatk.cram,ATXN2,22/22,32.15/6.23
2,HW0174518310.final-gatk.cram,ATXN3,20/24,12.52/14.28
3,HW0174518264.final-gatk.cram,ATXN1,29/30,15.21/17.23
4,HW0174518264.final-gatk.cram,ATXN2,22/32,18.95/13.23


In [32]:
def get_zygosity(row):
    """Label a genotype row as homozygous or heterozygous.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys "Allele_I" and "Allele_II"
        (for example a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        "HOM" when the two allele values compare equal, "HET" otherwise.
    """
    alleles_match = row["Allele_I"] == row["Allele_II"]
    return "HOM" if alleles_match else "HET"

# Split each "a/b" genotype string into one column per allele.
# The resulting values are still strings at this point.
df[["Allele_I", "Allele_II"]] = df["Genotype"].str.split("/", expand=True)
# Keep only the part of the sample file name that precedes the first ".".
df["SampleId"] = df["SampleId"].str.split(".").str[0]
df["Zygosity"] = df.apply(get_zygosity, axis=1)

output_file = "../data/repeats_stat_file.xlsx"
df.to_excel(output_file, index=False)

# The preview must be the cell's final expression: a notebook only renders the
# value of the last expression, so a mid-cell df.head() is silently discarded.
df.head()

In [33]:

def plot_histogram(df, gene):
    """Save side-by-side allele histograms (HOM and HET) for one gene.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single gene. Must provide the columns 'Allele_I' and
        'Allele_II' (values convertible to int) and 'Zygosity' (rows labelled
        'HOM' or 'HET' are plotted). The frame is not modified.
    gene : str
        Gene / variant identifier, used in the panel titles and in the
        output file name.

    Returns
    -------
    None
        The figure is written to ``../data/{gene}_repeats.png`` and closed.
        Nothing is written when ``df`` has no rows.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``astype(int)`` when an allele value is not
        convertible to an integer.
    """
    # Cast into local Series instead of writing back through df.loc: callers
    # pass a filtered slice of a larger frame, and assigning into it mutates
    # the caller's object and triggers SettingWithCopyWarning.
    allele_1 = df['Allele_I'].astype(int)
    allele_2 = df['Allele_II'].astype(int)

    # Without rows min()/max() are NaN and np.arange cannot build the bin edges.
    if allele_1.empty:
        return

    min_val = min(allele_1.min(), allele_2.min())
    max_val = max(allele_1.max(), allele_2.max())
    # Unit-width bins centred on each integer value; both panels share them.
    bins = np.arange(min_val - 0.5, max_val + 1.5)

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    try:
        for ax, zygosity in zip(axs, ['HOM', 'HET']):
            selected = df['Zygosity'] == zygosity
            ax.hist(
                [allele_1[selected], allele_2[selected]],
                bins=bins,
                label=['Allele_I', 'Allele_II'],
            )
            ax.xaxis.set_major_locator(MaxNLocator(integer=True))
            ax.yaxis.set_major_locator(MaxNLocator(integer=True))
            # NOTE(review): confirm the allele values are nucleotide lengths and
            # not repeat-unit counts before relying on the "(nt)" unit.
            ax.set_xlabel('Length (nt)')
            ax.set_ylabel('Frequency')
            ax.set_title(f'{gene} ({zygosity})')
            ax.legend(loc='upper right')

        fig.tight_layout()
        fig.savefig(f'../data/{gene}_repeats.png')
    finally:
        # Close this specific figure even if saving fails, so a loop over many
        # genes does not accumulate open figures.
        plt.close(fig)
# Produce one saved figure per distinct VariantId, in order of first appearance.
for gene in df["VariantId"].unique():
    gene_df = df.loc[df["VariantId"] == gene]
    plot_histogram(gene_df, gene)