In [None]:
import sys
sys.path.append('../../')
from SynTemp.SynUtils.utils import load_database, load_from_pickle
# Raw reaction database: loading is disabled here, and none of the cells shown below reference `original_data`.
#original_data = load_database('../../Data/Temp/USPTO_50K_reactions.json.gz')
# Template collection; the following cells only use its first element (`templates[0]`).
# NOTE(review): load_from_pickle presumably unpickles the archive - only point it at trusted files.
templates = load_from_pickle('../../Data/Temp/templates.pkl.gz')
# Records plotted as the "Database" panels further down.
data_cluster = load_from_pickle('../../Data/Temp/data_cluster.pkl.gz')

In [None]:
# First entry of the template collection; later cells iterate over it as a sequence of dict records.
temp_0 = templates[0]

In [None]:
# Display the first record of temp_0 for inspection.
temp_0[0]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def create_pie_chart(data, column, ax=None, title=None, color_pallet='pastel'):
    """
    Draw a donut-style pie chart showing how often each value of one column occurs.

    Percentages are printed inside the slices; the category names appear only in a
    legend placed next to the chart.

    Note: this function switches matplotlib to LaTeX text rendering with a serif
    font through ``plt.rc``. That is a global setting and stays active after the call.

    Parameters:
        data (list of dict): Records to summarise.
        column (str): Key whose value frequencies are plotted.
        ax (matplotlib.axes.Axes, optional): Axis to draw on; a new figure is created when omitted.
        title (str, optional): Chart title (may contain LaTeX). Falls back to 'Pie Chart of <column>'.
        color_pallet (str, optional): Name of the seaborn palette used for the slices.

    Returns:
        matplotlib.axes.Axes: The axis holding the chart.
    """
    # LaTeX text rendering + serif font (global rc change)
    plt.rc('text', usetex=True)
    plt.rc('font', family='serif')

    # Share of every distinct value, expressed in percent
    frame = pd.DataFrame(data)
    shares = frame[column].value_counts(normalize=True) * 100
    n_slices = len(shares)

    # One palette colour per slice
    palette = sns.color_palette(color_pallet, n_slices)

    if ax is None:
        _, ax = plt.subplots()

    wedges, _labels, pct_texts = ax.pie(
        shares,
        explode=[0.05] * n_slices,
        colors=palette,
        autopct='%1.1f%%',
        pctdistance=0.85,
        startangle=90,
    )

    # A white disc over the centre turns the pie into a donut
    ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))

    # Keep the pie circular
    ax.axis('equal')

    # Legend carries the category names only (no percentages)
    ax.legend(
        wedges,
        [format(name) for name in shares.index],
        title=column,
        loc="upper right",
        bbox_to_anchor=(0.6, 0.1, 0.6, 1),
        prop={'size': 16},
        title_fontsize=16,
    )

    # An explicit title is drawn smaller than the generic fallback title
    heading, heading_size = (title, 24) if title else (f'Pie Chart of {column}', 32)
    ax.set_title(heading, fontsize=heading_size)

    # Make the in-slice percentage labels black and larger
    for pct_text in pct_texts:
        pct_text.set_color('black')
        pct_text.set_fontsize(18)

    return ax


# Quick single-panel check: pie chart of 'Reaction Type' for the records in temp_0
fig, ax = plt.subplots()
create_pie_chart(temp_0, 'Reaction Type', ax=ax, title='A')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Relies on create_pie_chart from the previous cell and on temp_0 / data_cluster loaded at the top.

# Layout: top row = 'Reaction Type', bottom row = 'Topo Type';
# left column = template library (temp_0), right column = database (data_cluster).

# Initialize a 2x2 subplot grid
fig, axs = plt.subplots(2, 2, figsize=(16, 10))  # Adjust size as needed

# A (top-left): reaction types in the template library
create_pie_chart(temp_0, 'Reaction Type', ax=axs[0, 0], title='A. Template library', color_pallet='pastel')

# B (top-right): reaction types in the database
create_pie_chart(data_cluster, 'Reaction Type', ax=axs[0, 1], title= 'B. Database', color_pallet='pastel')

# C (bottom-left): topology types in the template library
create_pie_chart(temp_0, 'Topo Type', ax=axs[1, 0], title='C. Template library', color_pallet='coolwarm')

# D (bottom-right): topology types in the database
create_pie_chart(data_cluster, 'Topo Type', ax=axs[1, 1], title='D. Database', color_pallet='coolwarm')



plt.tight_layout()
plt.show()


In [None]:
# Display the first record again (same expression as the earlier inspection cell) before filtering by 'Reaction Type'.
temp_0[0]

In [None]:
# Split the template records by their 'Reaction Type' label.
# NOTE(review): the name `complex` hides Python's built-in `complex` type for the rest of the session.
element = [record for record in temp_0 if record['Reaction Type'] == 'Elementary']
complex = [record for record in temp_0 if record['Reaction Type'] == 'Complicated']

In [None]:
def count_column_values(data, column):
    """
    Tally how often each value occurs in one column of a list of dictionaries.

    When every entry of the column is a list, the lists are flattened first, so
    each list element is counted on its own.

    NOTE(review): a second function with the same name is defined further down in
    this cell; once the cell has run, that later definition is the one in effect.

    Parameters:
        data (list of dict): Records to process.
        column (str): Key whose values are counted.

    Returns:
        dict: Maps each distinct value to its number of occurrences.
    """
    series = pd.DataFrame(data)[column]

    # Only an object column whose entries are all lists gets flattened
    every_entry_is_list = series.dtype == object and series.apply(lambda x: isinstance(x, list)).all()
    if not every_entry_is_list:
        return series.value_counts().to_dict()

    # One row per list element, then count
    return series.explode().value_counts().to_dict()
    
import pandas as pd

def count_column_values(data, column):
    """
    Count the occurrences of each unique value in the specified column from a list of dictionaries.

    List values are treated as single entities: each list is converted to its string
    form (e.g. [4, 6] -> '[4, 6]') so that identical list configurations are counted
    together. Non-list values are counted as they are, and missing values are
    ignored (default behaviour of ``value_counts``).

    Parameters:
        data (list of dict): The data to process. An empty list yields an empty result.
        column (str): The column to count values from.

    Returns:
        dict: A dictionary with keys as unique values (strings if lists) and values as the count of occurrences.

    Raises:
        KeyError: If ``data`` is non-empty but has no such column.
    """
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data)

    # No records at all: there is nothing to count and no column to look up
    if len(df) == 0:
        return {}

    values = df[column]

    if values.dtype == object:
        # Lists are unhashable and would make value_counts() fail. Convert each list
        # individually so a column that mixes lists with other values (e.g. None)
        # is still countable.
        values = values.apply(lambda x: str(x) if isinstance(x, list) else x)

    # Count occurrences of each unique value
    return values.value_counts().to_dict()

# 'Rings' tallies for the template records, split into elementary and complicated reactions.
element_count = count_column_values(element, 'Rings')
complex_count = count_column_values(complex, 'Rings')

In [None]:
# Same 'Reaction Type' split as for the templates, applied to the database records.
element_all = [record for record in data_cluster if record['Reaction Type'] == 'Elementary']
complex_all = [record for record in data_cluster if record['Reaction Type'] == 'Complicated']

In [None]:
# 'Rings' tallies for the database records, split into elementary and complicated reactions.
element_count_all = count_column_values(element_all, 'Rings')
complex_count_all = count_column_values(complex_all, 'Rings')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def plot_rules_distribution(
    rules, rule_type='single',
    ax=None, title=None, refinement=False,
    threshold = 1, remove=True, color_pallet = 'pastel'):
    """
    Plots the distribution of rules as a bar chart of percentages.

    With ``refinement`` enabled, every entry whose share is below ``threshold`` percent
    is taken out of the chart. Those entries are either dropped (``remove=True``) or
    merged into one extra bar labelled 'Under <threshold>%' (``remove=False``).

    Note: the function enables LaTeX text rendering and the seaborn 'whitegrid' style
    globally; both stay active after the call.

    Parameters:
        rules (dict): Dictionary mapping each rule / ring type to its count.
        rule_type (str): Used only in the default title ('single' or 'complex').
        ax (matplotlib.axes.Axes, optional): Matplotlib axis object to plot on. A new
            figure is created, and shown, when omitted.
        title (str): Optional title.
        refinement (bool): If True, entries below ``threshold`` percent are filtered out.
        threshold (float): Percentage below which an entry counts as small. Defaults to 1.
        remove (bool): If True (default) the small entries are left out entirely; if
            False they are shown as one aggregated bar.
        color_pallet (str): Seaborn palette name for the bars.
    """
    # Calculate total counts for the rules
    total_rules = sum(rules.values())

    def as_percentage(count):
        # An empty or all-zero tally would otherwise divide by zero
        return count / total_rules * 100 if total_rules else 0.0

    shares = {key: as_percentage(count) for key, count in rules.items()}

    if refinement:
        kept = {key: pct for key, pct in shares.items() if pct >= threshold}
        small_value_aggregate = sum(pct for pct in shares.values() if pct < threshold)
        # Add the aggregated bar only when it is wanted and non-empty. Removing "the
        # last entry" afterwards would delete a real category whenever nothing fell
        # below the threshold.
        if small_value_aggregate > 0 and not remove:
            # '%' starts a comment in LaTeX, so it has to be escaped while usetex is on
            kept[rf'Under {threshold:g}\%'] = small_value_aggregate
        shares = kept

    types_of_rings = list(shares.keys())
    percentages = list(shares.values())

    # Set style
    sns.set(style="whitegrid")

    # Enable LaTeX rendering in matplotlib
    plt.rc('text', usetex=True)
    plt.rc('text.latex', preamble=r'\usepackage{amsmath}')  # Ensure amsmath is loaded

    # Create figure and axis if not provided, and remember that we did so
    created_figure = ax is None
    if created_figure:
        _, ax = plt.subplots(figsize=(10, 6), dpi=120)

    # Plot the data
    sns.barplot(ax=ax, x=types_of_rings, y=percentages, palette=color_pallet)
    if title:
        ax.set_title(title, fontsize=16)
    else:
        ax.set_title(f'Distribution of {rule_type.capitalize()} Rule Types of Rings', fontsize=16)
    ax.set_xlabel('Type of Ring', fontsize=14)
    ax.set_ylabel(r'Percentage (\%)', fontsize=14)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

    # Add text labels above the bars (escaped percent sign, see above)
    for index, value in enumerate(percentages):
        ax.text(index, value + 0.5, rf'{value:.1f}\%', ha='center', va='bottom', fontsize=12)

    # Only show the plot when the figure was created here; callers that pass an
    # axis decide themselves when to render.
    if created_figure:
        plt.show()



In [None]:
import matplotlib.pyplot as plt

# 2x2 grid of 'Rings' distributions: top row = elementary reactions, bottom row =
# complicated reactions; left column = templates, right column = database.
fig, axs = plt.subplots(2, 2, figsize=(16, 10))  # Adjust size as needed


plot_rules_distribution(element_count, ax=axs[0, 0], title='A. Elementary reactions in templates', color_pallet='pastel')
plot_rules_distribution(element_count_all, ax=axs[0, 1], title='B. Elementary reactions in database', color_pallet='pastel')
plot_rules_distribution(complex_count, ax=axs[1, 0], title='C. Complicated reactions in templates', refinement=True, color_pallet='coolwarm')
# Fourth panel is 'D' (it previously repeated the 'C.' label of the third panel)
plot_rules_distribution(complex_count_all, ax=axs[1, 1], title='D. Complicated reactions in database', refinement=True, color_pallet='coolwarm', threshold=0.1)


plt.tight_layout()
plt.show()


In [None]:
# Show only a short preview; displaying the whole collection floods the cell output.
temp_0[:5]

In [None]:
# Create a pie chart for 'Reaction Step'
fig, ax = plt.subplots()
create_pie_chart(temp_0, 'Reaction Step', ax=ax, title='A')
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_heatmap(data, title='Heatmap of Test Counts by Topo Type and Reaction Step', color_palette='coolwarm', ax=None):
    """
    Plots a heatmap of the share of records (in percent of all records) for every
    combination of 'Topo Type' (rows) and 'Reaction Step' (columns).

    NOTE(review): plot_heatmap is defined again in later cells of this notebook;
    whichever definition ran last is the one in effect.

    Parameters:
        data (list of dict): Data to be visualized. Each record needs the keys 'Topo Type' and 'Reaction Step'.
        title (str, optional): Title for the heatmap. Defaults to a generic title if none provided.
        color_palette (str, optional): Color palette for the heatmap. Defaults to 'coolwarm'.
        ax (matplotlib.axes.Axes, optional): Matplotlib axis object to plot on. If none, a new figure is created and shown.
    """
    # Convert input data to DataFrame; the helper column holds one unit per record
    df = pd.DataFrame(data)
    df['Test'] = 1
    total_records = len(df)

    # Custom aggregation: records in the cell as a percentage of all records
    def custom_agg(series):
        return (series.sum() / total_records) * 100

    # Create pivot table for heatmap using the custom aggregation function
    pivot_table = df.pivot_table(index='Topo Type', columns='Reaction Step', values='Test', aggfunc=custom_agg)

    # Create a figure only when no axis was passed in, and remember that we did.
    # Testing the truthiness of `ax` afterwards can never detect this case.
    created_figure = ax is None
    if created_figure:
        _, ax = plt.subplots(figsize=(10, 8))

    # The escaped percent sign is only correct under LaTeX rendering; without it the
    # backslash would be printed literally.
    percent_label = r'Percentage (\%)' if plt.rcParams['text.usetex'] else 'Percentage (%)'

    # Plot heatmap on the provided or created axis
    sns.heatmap(pivot_table, annot=True, cmap=color_palette, fmt=".1f", ax=ax, cbar_kws={'label': percent_label})
    ax.set_title(title)
    ax.set_ylabel('Topo Type')
    ax.set_xlabel('Reaction Step')

    if created_figure:
        plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_heatmap(data, title='Heatmap of Test Counts by Topo Type and Reaction Step', color_palette='coolwarm', 
                 title_fontsize=16, label_fontsize=14, annot_fontsize=12, cbar_label_fontsize=12, 
                 legend_fontsize=10, xtick_fontsize=10, ytick_fontsize=10, ax=None):
    """
    Plots a heatmap of the share of records (in percent of all records) for every
    combination of 'Topo Type' (rows) and 'Reaction Step' (columns), with configurable font sizes.

    NOTE(review): plot_heatmap is also defined in other cells of this notebook;
    whichever definition ran last is the one in effect.

    Parameters:
        data (list of dict): Data to be visualized. Each record needs the keys 'Topo Type' and 'Reaction Step'.
        title (str, optional): Title for the heatmap. Defaults to a generic title if none provided.
        color_palette (str, optional): Color palette for the heatmap. Defaults to 'coolwarm'.
        title_fontsize (int, optional): Font size for the title. Defaults to 16.
        label_fontsize (int, optional): Font size for the axis labels. Defaults to 14.
        annot_fontsize (int, optional): Font size for the annotations. Defaults to 12.
        cbar_label_fontsize (int, optional): Font size for the color bar label. Defaults to 12.
        legend_fontsize (int, optional): Font size for the legend (only drawn if the axis has labelled artists). Defaults to 10.
        xtick_fontsize (int, optional): Font size for the x-tick labels. Defaults to 10.
        ytick_fontsize (int, optional): Font size for the y-tick labels. Defaults to 10.
        ax (matplotlib.axes.Axes, optional): Matplotlib axis object to plot on. If none, a new figure is created and shown.
    """
    # Convert input data to DataFrame; the helper column holds one unit per record
    df = pd.DataFrame(data)
    df['Test'] = 1
    total_records = len(df)

    # Custom aggregation: records in the cell as a percentage of all records
    def custom_agg(series):
        return (series.sum() / total_records) * 100

    # Create pivot table for heatmap using the custom aggregation function
    pivot_table = df.pivot_table(index='Topo Type', columns='Reaction Step', values='Test', aggfunc=custom_agg)

    # Create a figure only when no axis was passed in, and remember that we did.
    # Testing the truthiness of `ax` afterwards can never detect this case.
    created_figure = ax is None
    if created_figure:
        _, ax = plt.subplots(figsize=(10, 8))

    # The escaped percent sign is only correct under LaTeX rendering; without it the
    # backslash would be printed literally.
    percent_label = r'Percentage (\%)' if plt.rcParams['text.usetex'] else 'Percentage (%)'

    # Plot heatmap on the provided or created axis
    heatmap = sns.heatmap(pivot_table, annot=True, cmap=color_palette, fmt=".1f", ax=ax, 
                          cbar_kws={'label': percent_label})
    
    # Customize the title and axis labels font size
    ax.set_title(title, fontsize=title_fontsize)
    ax.set_ylabel('Topo Type', fontsize=label_fontsize)
    ax.set_xlabel('Reaction Step', fontsize=label_fontsize)
    
    # Customize the font size of the annotations
    for text in heatmap.texts:
        text.set_fontsize(annot_fontsize)
    
    # Customize the font size of the color bar label (the color bar axis was just
    # added, so it is the last axis of the figure)
    heatmap.figure.axes[-1].yaxis.label.set_size(cbar_label_fontsize)

    # Set font size for x-tick and y-tick labels
    ax.tick_params(axis='x', labelsize=xtick_fontsize)
    ax.tick_params(axis='y', labelsize=ytick_fontsize)

    # Create a legend with specified font size (skipped when nothing is labelled)
    handles, labels = ax.get_legend_handles_labels()
    if handles:
        ax.legend(handles, labels, title='Legend', loc='upper right', bbox_to_anchor=(1.05, 1), fontsize=legend_fontsize)

    if created_figure:
        plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_heatmap(data, title='Heatmap of Test Counts by Topo Type and Reaction Step', color_palette='coolwarm', 
                 title_fontsize=24, label_fontsize=20, annot_fontsize=18, cbar_label_fontsize=18, 
                 legend_fontsize=24, xtick_fontsize=18, ytick_fontsize=18, ax=None):
    """
    Plots a heatmap of the share of records (in percent of all records) for every
    combination of 'Topo Type' (rows) and 'Reaction Step' (columns), with configurable font sizes.

    Parameters:
        data (list of dict): Data to be visualized. Each record needs the keys 'Topo Type' and 'Reaction Step'.
        title (str, optional): Title for the heatmap. Defaults to a generic title if none provided.
        color_palette (str, optional): Color palette for the heatmap. Defaults to 'coolwarm'.
        title_fontsize (int, optional): Font size for the title. Defaults to 24.
        label_fontsize (int, optional): Font size for the axis labels. Defaults to 20.
        annot_fontsize (int, optional): Font size for the annotations. Defaults to 18.
        cbar_label_fontsize (int, optional): Font size for the color bar label. Defaults to 18.
        legend_fontsize (int, optional): Font size for the legend (only drawn if the axis has labelled artists). Defaults to 24.
        xtick_fontsize (int, optional): Font size for the x-tick labels. Defaults to 18.
        ytick_fontsize (int, optional): Font size for the y-tick labels. Defaults to 18.
        ax (matplotlib.axes.Axes, optional): Matplotlib axis object to plot on. If none, a new figure is created and shown.
    """
    # Convert input data to DataFrame; the helper column holds one unit per record
    df = pd.DataFrame(data)
    df['Test'] = 1
    total_records = len(df)

    # Custom aggregation: records in the cell as a percentage of all records
    def custom_agg(series):
        return (series.sum() / total_records) * 100

    # Create pivot table for heatmap using the custom aggregation function
    pivot_table = df.pivot_table(index='Topo Type', columns='Reaction Step', values='Test', aggfunc=custom_agg)

    # Create a figure only when no axis was passed in, and remember that we did.
    # Testing the truthiness of `ax` afterwards can never detect this case.
    created_figure = ax is None
    if created_figure:
        _, ax = plt.subplots(figsize=(10, 8))

    # The escaped percent sign is only correct under LaTeX rendering; without it the
    # backslash would be printed literally.
    percent_label = r'Percentage (\%)' if plt.rcParams['text.usetex'] else 'Percentage (%)'

    # Plot heatmap on the provided or created axis
    heatmap = sns.heatmap(pivot_table, annot=True, cmap=color_palette, fmt=".1f", ax=ax, 
                          cbar_kws={'label': percent_label})
    
    # Customize the title and axis labels font size
    ax.set_title(title, fontsize=title_fontsize)
    ax.set_ylabel('Topo Type', fontsize=label_fontsize)
    ax.set_xlabel('Reaction Step', fontsize=label_fontsize)
    
    # Customize the font size of the annotations
    for text in heatmap.texts:
        text.set_fontsize(annot_fontsize)
    
    # Set font size for x-tick and y-tick labels
    ax.tick_params(axis='x', labelsize=xtick_fontsize)
    ax.tick_params(axis='y', labelsize=ytick_fontsize)

    # Customize the font size of the color bar label (the color bar axis was just
    # added, so it is the last axis of the figure)
    heatmap.figure.axes[-1].yaxis.label.set_size(cbar_label_fontsize)

    # Create a legend with specified font size (skipped when nothing is labelled)
    handles, labels = ax.get_legend_handles_labels()
    if handles:
        ax.legend(handles, labels, title='Legend', loc='upper right', bbox_to_anchor=(1.05, 1), fontsize=legend_fontsize)

    if created_figure:
        plt.show()

# Stand-alone heatmap for the template records (no axis passed, default title)
plot_heatmap(temp_0)

# Stand-alone heatmap for the database records
plot_heatmap(data_cluster)

In [None]:
import matplotlib.pyplot as plt
# Side-by-side comparison: templates on the left axis, database on the right axis
fig, axs = plt.subplots(1, 2, figsize=(20, 10))  # Adjust size as needed

plot_heatmap(temp_0, title='Templates', color_palette='coolwarm', ax=axs[0])
plot_heatmap(data_cluster, title='Database', color_palette='coolwarm', ax=axs[1])




plt.tight_layout()
plt.show()


In [None]:
# One unit per record, so that summing 'Value' counts the records in each group
df = pd.DataFrame(temp_0)
df['Value'] = 1

# Create a pivot table for clustering
pivot_table = df.pivot_table(index=['Reaction Type', 'Topo Type', 'Reaction Step'], values='Value', aggfunc='sum')

# Reset the index to flatten the DataFrame
pivot_table = pivot_table.reset_index()

# Convert hierarchical columns to strings and create a combined identifier
pivot_table['Combined'] = pivot_table[['Reaction Type', 'Topo Type', 'Reaction Step']].astype(str).agg(' > '.join, axis=1)

# Set the 'Combined' column as the index
pivot_table = pivot_table.set_index('Combined')

# Drop the hierarchical columns as they are now part of the index
pivot_table = pivot_table.drop(columns=['Reaction Type', 'Topo Type', 'Reaction Step'])

# Debug: Print the pivot table to check its contents
print("Pivot Table:\n", pivot_table)

# Check if the pivot table is empty
if pivot_table.empty:
    raise ValueError("Pivot table is empty. Check your data and aggregation logic.")

# Perform hierarchical clustering and plot the heatmap.
# The matrix has a single 'Value' column, so there is nothing to cluster across
# columns; a linkage over one observation fails. Cluster the rows only, and only
# when there are at least two of them.
sns.clustermap(pivot_table, method='average', metric='euclidean', cmap='coolwarm', annot=True, figsize=(12, 8),
               cbar_kws={'label': 'Value'}, row_cluster=len(pivot_table) > 1, col_cluster=False)

# Add the title to the overall plot, not just the clustermap
plt.suptitle('Hierarchical Clustering Heatmap of Values', y=1.02, fontsize=16)
plt.show()

In [None]:
# Perform hierarchical clustering and plot the heatmap.
# `pivot_table` (built in the previous cell) has a single 'Value' column, so column
# clustering is switched off; a linkage over one observation fails. Rows are
# clustered only when there are at least two of them.
clustermap = sns.clustermap(pivot_table, method='average', metric='euclidean', cmap='coolwarm', annot=True, figsize=(12, 8),
                            cbar_kws={'label': 'Value'}, row_cluster=len(pivot_table) > 1, col_cluster=False)

# Add the title to the overall plot, not just the clustermap
plt.suptitle('Hierarchical Clustering Heatmap of Values', y=1.02, fontsize=16)
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Example data for testing
# NOTE(review): this assignment replaces the `temp_0` loaded from the template file.
# Any cell run after this one (including re-runs of the cells above) sees the toy
# records below. Consider a separate name if the real data is still needed.
temp_0 = [
    {'Reaction Type': 'Elementary', 'Topo Type': 'Single Cyclic', 'Reaction Step': 1, 'Rings': 4},
    {'Reaction Type': 'Elementary', 'Topo Type': 'Single Cyclic', 'Reaction Step': 2, 'Rings': 3},
    {'Reaction Type': 'Complex', 'Topo Type': 'Multiple Cyclic', 'Reaction Step': 1, 'Rings': 4},
    {'Reaction Type': 'Complex', 'Topo Type': 'Multiple Cyclic', 'Reaction Step': 2, 'Rings': 3},
    {'Reaction Type': 'Complicated', 'Topo Type': 'Combinatorial Cyclic', 'Reaction Step': 2, 'Rings': 3},
    {'Reaction Type': 'Complicated', 'Topo Type': 'Combinatorial Cyclic', 'Reaction Step': 3, 'Rings': 2},
    {'Reaction Type': 'Complicated', 'Topo Type': 'Complex Cyclic', 'Reaction Step': 2, 'Rings': 4},
    {'Reaction Type': 'Elementary', 'Topo Type': 'Acyclic', 'Reaction Step': 1, 'Rings': 1},
]

# Convert to DataFrame; one unit per record so that summing 'Value' counts records
df = pd.DataFrame(temp_0)
df['Value'] = 1

# Create a pivot table for clustering
pivot_table = df.pivot_table(index=['Reaction Type', 'Topo Type'], values='Value', aggfunc='sum')

# Reset the index to flatten the DataFrame
pivot_table = pivot_table.reset_index()

# Convert hierarchical columns to strings and create a combined identifier
pivot_table['Combined'] = pivot_table[['Reaction Type', 'Topo Type']].astype(str).agg(' > '.join, axis=1)

# Set the 'Combined' column as the index
pivot_table = pivot_table.set_index('Combined')

# Drop the hierarchical columns as they are now part of the index
pivot_table = pivot_table.drop(columns=['Reaction Type', 'Topo Type'])

# Debug: Print the pivot table to check its contents
print("Pivot Table:\n", pivot_table)

# Check if the pivot table is empty or contains only zeros
if pivot_table.empty or (pivot_table.sum().sum() == 0):
    raise ValueError("Pivot table is empty or contains only zeros. Check your data and aggregation logic.")

# Perform hierarchical clustering and plot the heatmap.
# The matrix has a single 'Value' column, so column clustering is switched off
# (a linkage over one observation fails); rows are clustered only when there are
# at least two of them.
try:
    clustermap = sns.clustermap(pivot_table, method='average', metric='euclidean', cmap='coolwarm', annot=True, figsize=(12, 8),
                                cbar_kws={'label': 'Value'}, row_cluster=len(pivot_table) > 1, col_cluster=False)

    # Add the title to the overall plot, not just the clustermap
    plt.suptitle('Hierarchical Clustering Heatmap of Values', y=1.02, fontsize=16)
    plt.show()
except ValueError as e:
    # Best-effort cell: report clustering problems instead of aborting the notebook
    print("An error occurred during clustering: ", e)


## Rule Composition