# Virus genome data set filtering

In [60]:
import pandas as pd
from matplotlib import pyplot as plt

In [61]:
def histograms(stats):
    """Show side-by-side, log-scaled histograms of ``N_share`` and ``Length``.

    Args:
        stats: pandas DataFrame that provides the columns ``N_share`` and
            ``Length``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Single source of truth for the plotted columns: it drives the column
    # selection, the number of panels and the x-axis labels, so the labels
    # cannot drift away from the data that is actually drawn.
    features = ['N_share', 'Length']

    _, axes = plt.subplots(1, len(features), figsize=(10, 5))

    # One panel per column; log=True is forwarded to the histogram call so
    # the counts are shown on a logarithmic axis.
    stats[features].hist(bins=50, log=True, ax=axes)

    # Only the leftmost panel carries the y-axis label.
    axes[0].set_ylabel('Number of sequences')

    axes[0].set_xlim(0, 1)    # N_share panel: fixed range 0..1
    axes[1].set_xlim(left=0)  # Length panel: start at 0, upper limit left automatic

    # Label each x-axis with its column name.
    for ax, feature in zip(axes, features):
        ax.set_xlabel(feature)
        ax.set_title('')  # Remove the title above each plot

    plt.tight_layout()
    plt.show()

# Data ingestion

In [None]:
# Virus whose per-sequence statistics table is loaded.
virus = 'LCMV'
fname_stats = f'/Users/nils.petersen/dev/virus_dataset_curation/data/db2.0/stats/{virus}.stats.tsv'

# The statistics file is tab-separated.
stats = pd.read_csv(
    fname_stats,
    sep='\t',
)

# Bare expression so the notebook renders the loaded table.
stats

# Unmapped

In [None]:
# Rows whose FilteringStatus is 'Unmapped', followed by their histograms.
is_unmapped = stats['FilteringStatus'].eq('Unmapped')
unmapped = stats.loc[is_unmapped]
histograms(unmapped)

In [None]:
# The 20 longest unmapped sequences, reduced to the columns of interest.
columns = ['Sequence', 'Length', 'N_share']
unmapped_by_length = unmapped.sort_values(by="Length", ascending=False)
unmapped_by_length[columns].head(20)


# Too many N

In [None]:
# Rows whose FilteringStatus is 'TooManyN', followed by their histograms.
has_too_many_N = stats['FilteringStatus'].eq('TooManyN')
too_many_N = stats.loc[has_too_many_N]
histograms(too_many_N)

# Too short

In [66]:
def histograms_relative_length(stats):
    """Show log-scaled histograms of N_share, Length and RelativeLength.

    ``stats`` is indexed with these three column names and one panel is
    drawn per column. The figure is displayed with ``plt.show()``; nothing
    is returned.
    """
    panel_columns = ['N_share', 'Length', 'RelativeLength']

    # One row of panels, one panel per column.
    _, panels = plt.subplots(1, len(panel_columns), figsize=(12, 4))

    stats[panel_columns].hist(bins=50, log=True, ax=panels)

    first_panel, *remaining_panels = panels

    # The leftmost panel carries the y-axis label and a fixed 0..1 x-range.
    first_panel.set_ylabel('Number of sequences')
    first_panel.set_xlim(0, 1)

    # The other panels start at 0 and keep their automatic upper limit.
    for panel in remaining_panels:
        panel.set_xlim(left=0)

    # Use the column name as x-axis label and blank out the panel title.
    for panel, column_name in zip(panels, panel_columns):
        panel.set_xlabel(column_name)
        panel.set_title('')

    plt.tight_layout()
    plt.show()

In [None]:
# Rows whose FilteringStatus is 'TooShort', plotted including RelativeLength.
is_too_short = stats['FilteringStatus'].eq('TooShort')
too_short = stats.loc[is_too_short]
histograms_relative_length(too_short)

In [None]:
# The 20 longest sequences flagged as too short, reduced to the columns of interest.
columns = ['Sequence', 'Length', 'N_share']
too_short_by_length = too_short.sort_values(by="Length", ascending=False)
too_short_by_length[columns].head(20)

# Ok

In [None]:
# Rows whose FilteringStatus is 'Ok', plotted including RelativeLength.
is_ok = stats['FilteringStatus'].eq('Ok')
ok = stats.loc[is_ok]
histograms_relative_length(ok)

In [None]:
# Number of sequences per filtering outcome, one line each.
print(
    f'OK: {len(ok)}',
    f'Too many N {len(too_many_N)}',
    f'Too short {len(too_short)}',
    f'Unmapped {len(unmapped)}',
    sep='\n',
)