In [1]:
import os
import pandas as pd



# Directory that holds the filter log CSV files.
path = 'logs'

# os.listdir returns entries in arbitrary order, so sort the result to keep the
# listing (and every later loop over `files`) reproducible across runs.
files = sorted(
    os.path.join(path, f) for f in os.listdir(path) if f.endswith('.csv')
)
print(files)

['logs\\ABL_DROME_filter_log.csv', 'logs\\ACHA4_MOUSE_filter_log.csv', 'logs\\ANR17_HUMAN_filter_log.csv', 'logs\\CA2D3_MOUSE_filter_log.csv', 'logs\\CACB2_RABIT_filter_log.csv', 'logs\\CSKI1_MOUSE_filter_log.csv', 'logs\\DGLA_HUMAN_filter_log.csv', 'logs\\DOP1_HUMAN_filter_log.csv', 'logs\\GRIA2_filter_log.csv', 'logs\\IQEC1_HUMAN_filter_log.csv', 'logs\\K0513_MOUSE_filter_log.csv', 'logs\\KCNAS_DROME_filter_log.csv', 'logs\\MTUS2_HUMAN_filter_log.csv', 'logs\\PCLO_CHICK_filter_log.csv', 'logs\\PCLO_filter_log.csv', 'logs\\RIMS2_RAT_filter_log.csv', 'logs\\ROBO2_HUMAN_filter_log.csv', 'logs\\RUSC2_MOUSE_filter_log.csv', 'logs\\SCN1_HETBL_filter_log.csv', 'logs\\TRIM2_BOVIN_filter_log.csv', 'logs\\TWK7_CAEEL_filter_log.csv']


In [2]:
def get_contribution_per_step(file):
    """
    Sum the row-to-row changes of 'Working', 'Picked' and 'X count' per method.

    The CSV at `file` is read with pandas and must contain the columns
    'Method', 'Working', 'Picked' and 'X count'. For every row the change
    relative to the previous row is computed. For rows whose method is one of
    the "merge" methods listed below, the change of the immediately following
    row is added to that row and the following row's change is reset to zero,
    so the following step's effect is attributed to the merge method.

    Parameters:
    - file: Path (or anything `pd.read_csv` accepts) of the log CSV.

    Returns:
    - dict mapping each method name (in order of first appearance) to a tuple
      (working change, picked change, X count change).
    """
    df = pd.read_csv(file)

    # Methods whose row absorbs the change recorded on the next row.
    merge_methods = {
        'Pick Must Have Assignments',
        'Merge Lonely Sequences',
        'Assign Concensous for Isolated',
    }

    # Output column -> source metric column; order defines the tuple order.
    metric_columns = {
        'cumulative_working': 'Working',
        'cumulative_picked': 'Picked',
        'cumulative_x_count': 'X count',
    }
    cumulative_columns = list(metric_columns)

    # Row-to-row change of each metric (the first row has no predecessor -> NaN,
    # which the sums below skip).
    for cumulative_column, source_column in metric_columns.items():
        df[cumulative_column] = df[source_column].diff()

    # Stop one row before the end: the last row has no following row to absorb,
    # and indexing i + 1 there would raise a KeyError.
    for i in range(1, len(df) - 1):
        if df.loc[i, 'Method'] not in merge_methods:
            continue
        for column in cumulative_columns:
            df.loc[i, column] += df.loc[i + 1, column]
            df.loc[i + 1, column] = 0

    # Total change per method, keeping the order in which methods first appear.
    method_to_change = {}
    for method in df['Method'].unique():
        method_df = df[df['Method'] == method]
        method_to_change[method] = tuple(
            method_df[column].sum() for column in cumulative_columns
        )

    return method_to_change

def get_initial(file):
    """
    Read a filter log CSV and return its first-row 'Working' and 'X count' values.

    Parameters:
    - file: Path (or anything `pd.read_csv` accepts) of the log CSV.

    Returns:
    - tuple (initial working, initial X count) taken from the row labelled 0.
    """
    log = pd.read_csv(file)
    return log.at[0, 'Working'], log.at[0, 'X count']



In [3]:


# Report the first-row value of each tracked metric for every log file.
for file in files:
    log_df = pd.read_csv(file)
    working, picked, x_count = (
        log_df[column].iloc[0] for column in ('Working', 'Picked', 'X count')
    )
    print(f'Working: {working}, Picked: {picked}, X count: {x_count} in {file}')


Working: 421238, Picked: 0, X count: 2875125 in logs\ABL_DROME_filter_log.csv
Working: 603862, Picked: 0, X count: 3881367 in logs\ACHA4_MOUSE_filter_log.csv
Working: 242745, Picked: 0, X count: 1709832 in logs\ANR17_HUMAN_filter_log.csv
Working: 316931, Picked: 0, X count: 1148215 in logs\CA2D3_MOUSE_filter_log.csv
Working: 379584, Picked: 0, X count: 1742568 in logs\CACB2_RABIT_filter_log.csv
Working: 759863, Picked: 0, X count: 9300973 in logs\CSKI1_MOUSE_filter_log.csv
Working: 309182, Picked: 0, X count: 2693960 in logs\DGLA_HUMAN_filter_log.csv
Working: 483561, Picked: 0, X count: 3453449 in logs\DOP1_HUMAN_filter_log.csv
Working: 52953, Picked: 0, X count: 960482 in logs\GRIA2_filter_log.csv
Working: 536053, Picked: 0, X count: 4487167 in logs\IQEC1_HUMAN_filter_log.csv
Working: 490892, Picked: 0, X count: 1748602 in logs\K0513_MOUSE_filter_log.csv
Working: 250919, Picked: 0, X count: 1374529 in logs\KCNAS_DROME_filter_log.csv
Working: 588768, Picked: 0, X count: 3854901 in logs

In [4]:
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt


def create_pie_charts(method_to_change, thresholds=(5, 5, 5), show_legend=True, font_size=10, legend_font_size=12, custom_names=None, initial_working=None, initial_x_count=None, save_path=None):
    """
    Creates three pie charts from the method_to_change dictionary and arranges them side by side.

    Panels, left to right: Working (tuple index 0), X count (tuple index 2) and
    Picked (tuple index 1). Working and X count changes are negated before
    plotting, so decreases are drawn as positive slices.

    Parameters:
    - method_to_change (dict): Dictionary with methods as keys and tuples
      (working change, picked change, X count change) as values.
    - thresholds (sequence of floats): One percentage threshold per panel (in panel
      order); slices below the threshold get no percentage label.
    - show_legend (bool): Whether to show the legend or not.
    - font_size (int): Font size for the pie chart labels.
    - legend_font_size (int): Font size for the legend text.
    - custom_names (dict): Dictionary to map original method names to custom names for the legend.
    - initial_working (float): If given and larger than the summed Working slices,
      the difference is added to the first panel as an "Uncovered" slice.
    - initial_x_count (float): If given and larger than the summed X count slices,
      the difference is added to the second panel as a "Leftover" slice.
    - save_path (str): If given, the figure is saved there; otherwise it is shown.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # One colour per method, shared by all panels so a method looks the same everywhere.
    labels = list(method_to_change.keys())
    colors = list(mcolors.TABLEAU_COLORS.keys())
    color_map = {label: colors[i % len(colors)] for i, label in enumerate(labels)}
    # "Uncovered" and "Leftover" deliberately share one colour.
    remainder_color = colors[(len(labels) + 1) % len(colors)]
    color_map['Uncovered'] = remainder_color
    color_map['Leftover'] = remainder_color

    # Tuple index -> (initial total, label of the slice that fills the gap to that total).
    remainders = {
        0: (initial_working, 'Uncovered'),
        2: (initial_x_count, 'Leftover'),
    }

    for i, method_n in enumerate([0, 2, 1]):
        # Working / X count shrink over time, so flip the sign to get positive slices.
        if method_n in (0, 2):
            sizes = [-method_to_change[method][method_n] for method in labels]
        else:
            sizes = [method_to_change[method][method_n] for method in labels]

        total_size = sum(sizes)

        # Drop zero-sized slices. Built as two plain lists (rather than zip(*...))
        # so a panel whose slices are all zero does not raise on unpacking.
        non_zero = [(size, label) for size, label in zip(sizes, labels) if size != 0]
        filtered_sizes = [size for size, _ in non_zero]
        filtered_labels = [label for _, label in non_zero]

        # Fill the gap up to the initial total, when one was supplied for this panel.
        if method_n in remainders:
            initial_value, remainder_label = remainders[method_n]
            if initial_value is not None and total_size < initial_value:
                filtered_sizes.append(initial_value - total_size)
                filtered_labels.append(remainder_label)

        if not filtered_sizes:
            # Nothing to draw for this metric: leave the panel blank.
            axes[i].axis('off')
            continue

        # Create pie chart with consistent colors
        pie_colors = [color_map[label] for label in filtered_labels]
        threshold = thresholds[i]

        axes[i].pie(
            filtered_sizes, labels=None, autopct=lambda p: f'{p:.1f}%' if p >= threshold else '', startangle=140, colors=pie_colors,
            textprops={'fontsize': font_size}, pctdistance=0.8
        )

        axes[i].axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

    # Optionally add a common legend
    if show_legend:
        # Keep the insertion order of method_to_change so the legend order is
        # the same on every run (iterating a set of strings is not).
        legend_labels = [label for label in labels if label != 'Initial'] + ['Uncovered', 'Leftover']

        # Apply custom names to labels if provided; otherwise lower-case every word but the first.
        if custom_names:
            formatted_labels = [custom_names.get(label, label) for label in legend_labels]
        else:
            formatted_labels = [
                ' '.join(word.lower() if idx > 0 else word for idx, word in enumerate(label.split()))
                for label in legend_labels
            ]

        handles = [plt.Line2D([0], [0], color=color_map[label], marker='o', linestyle='') for label in legend_labels]
        # NOTE(review): y=1.3 appears to anchor the legend above the figure area;
        # confirm it is not clipped in files written via save_path.
        fig.legend(handles, formatted_labels, title=None, loc="center right", bbox_to_anchor=(0.95, 1.3), ncol=2, prop={'size': legend_font_size})

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path)
    else:
        plt.show()
    plt.close(fig)

In [5]:
# Legend text for each method name.
# The keys must match the method names used in the logs exactly (including
# their spelling, e.g. 'Concensous'), so only the display values are edited.
custom_names = {
    'Pick Must Have Assignments': 'Adding sequence assignments that cover multiple reads',
    'Merge Lonely Sequences': 'Merging reads which agree with only one other read',
    'Apply Local Concensous': 'Assigning local consensus',
    'Assign Concensous for Isolated': 'Assigning consensus for isolated reads',
    'Update Knowns': 'Adding non-degenerate reads',
    'Remove Less Specific': 'Removing less-specific reads',
    'Others': 'Other'
}


In [6]:
# Plot the GRIA2 and PCLO logs individually and print their per-method changes.
# Paths are built with os.path.join so the cell also runs on platforms where
# '\\' is not the path separator.
for log_name in ('GRIA2', 'PCLO'):
    log_file = os.path.join('logs', f'{log_name}_filter_log.csv')
    method_to_change = get_contribution_per_step(log_file)
    initial_working, initial_x_count = get_initial(log_file)
    print(method_to_change)
    create_pie_charts(method_to_change, thresholds=[3, 3, 3], show_legend=False, custom_names=custom_names, font_size=22, legend_font_size=20, initial_working=initial_working, initial_x_count=initial_x_count, save_path=f'{log_name}_filter.svg')

{'Initial': (0.0, 0.0, 0.0), 'Update Knowns': (-4627.0, 160.0, -85136.0), 'Remove Less Specific': (-17878.0, 0.0, -355964.0), 'Pick Must Have Assignments': (-712.0, 165.0, -5960.0), 'Merge Lonely Sequences': (-2193.0, 101.0, -61654.0), 'Assign Concensous for Isolated': (-6745.0, 6745.0, -83775.0), 'Apply Local Concensous': (0.0, 0.0, -33590.0)}
{'Initial': (0.0, 0.0, 0.0), 'Update Knowns': (-4506.0, 162.0, -94443.0), 'Remove Less Specific': (-13276.0, 0.0, -304336.0), 'Pick Must Have Assignments': (-204.0, 67.0, -1584.0), 'Merge Lonely Sequences': (-4295.0, 90.0, -145139.0), 'Assign Concensous for Isolated': (-24868.0, 24868.0, -360260.0), 'Apply Local Concensous': (0.0, 0.0, -69315.0)}


In [7]:
# Every log except the GRIA2 and PCLO ones. Matching on the file name only
# keeps the exclusion independent of the platform's path separator.
separately_plotted = {'GRIA2_filter_log.csv', 'PCLO_filter_log.csv'}
short_reads = [f for f in files if os.path.basename(f) not in separately_plotted]
def sorting_key(x):
    """
    Sort key for log paths named '<name>_<species>_...': returns (species, name).

    Only the file name is parsed, and both '\\' and '/' are accepted as
    directory separators, so the key does not depend on the platform or on
    underscores in the directory part of the path.
    """
    file_name = x.replace('\\', '/').rsplit('/', 1)[-1]
    parts = file_name.split('_')
    name = parts[0]
    species = parts[1]
    return (species, name)

# Order the logs by (species, name) so the numbered output files follow that order.
sorted_short_reads = sorted(short_reads, key=sorting_key)

# The figures are written to plots/<i>.svg; make sure the folder exists first,
# since saving into a missing directory fails.
os.makedirs('plots', exist_ok=True)

for i, file in enumerate(sorted_short_reads):
    method_to_change = get_contribution_per_step(file)
    initial_working, initial_x_count = get_initial(file)
    print(file)
    # Only the first figure (i == 0) is drawn with the legend.
    create_pie_charts(method_to_change, thresholds=[5,5, 5], show_legend=(i==0), save_path=f'plots/{i}.svg', custom_names=custom_names, font_size=22, legend_font_size=18, initial_working=initial_working, initial_x_count=initial_x_count)

logs\TRIM2_BOVIN_filter_log.csv
logs\TWK7_CAEEL_filter_log.csv
logs\PCLO_CHICK_filter_log.csv
logs\ABL_DROME_filter_log.csv
logs\KCNAS_DROME_filter_log.csv
logs\SCN1_HETBL_filter_log.csv
logs\ANR17_HUMAN_filter_log.csv
logs\DGLA_HUMAN_filter_log.csv
logs\DOP1_HUMAN_filter_log.csv
logs\IQEC1_HUMAN_filter_log.csv
logs\MTUS2_HUMAN_filter_log.csv
logs\ROBO2_HUMAN_filter_log.csv
logs\ACHA4_MOUSE_filter_log.csv
logs\CA2D3_MOUSE_filter_log.csv
logs\CSKI1_MOUSE_filter_log.csv
logs\K0513_MOUSE_filter_log.csv
logs\RUSC2_MOUSE_filter_log.csv
logs\CACB2_RABIT_filter_log.csv
logs\RIMS2_RAT_filter_log.csv
