In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset into the module-level frame `df`, which
# analyze_logbb_distribution() below reads directly (it takes no arguments).
# The file name is a relative path, so it is resolved against the working directory.
df = pd.read_csv('b3db_kc_split_20240502.csv')  # Assuming the dataset is in CSV format

# 4.1.1 Distribution of logBB values
def analyze_logbb_distribution():
    """Print summary statistics for logBB and save a histogram of its values.

    Reads the module-level ``df``. Rows whose ``logBB`` is missing are dropped
    before any statistic is computed. The histogram is written to
    ``logBB_distribution.png`` and the figure is closed afterwards.
    """
    measured = df['logBB'].dropna()

    # Gather the statistics first so the report section is pure formatting.
    n_measured = len(measured)
    mean_value = measured.mean()
    median_value = measured.median()
    std_value = measured.std()
    lowest = measured.min()
    highest = measured.max()

    print("4.1.1 Distribution of logBB values")
    print(f"Number of compounds with logBB values: {n_measured}")
    print(f"Mean logBB: {mean_value:.2f}")
    print(f"Median logBB: {median_value:.2f}")
    print(f"Standard deviation of logBB: {std_value:.2f}")
    print(f"Minimum logBB: {lowest:.2f}")
    print(f"Maximum logBB: {highest:.2f}")

    # Histogram of the non-missing values: 30 bins with outlined bars.
    plt.figure(figsize=(10, 6))
    plt.hist(measured, bins=30, edgecolor='black')
    plt.xlabel('logBB')
    plt.ylabel('Frequency')
    plt.title('Distribution of logBB Values')
    plt.savefig('logBB_distribution.png')
    plt.close()



# Entry point: run the logBB summary when this code executes as the main module.
if __name__ == "__main__":
    analyze_logbb_distribution()


4.1.1 Distribution of logBB values
Number of compounds with logBB values: 942
Mean logBB: -0.07
Median logBB: -0.01
Standard deviation of logBB: 0.75
Minimum logBB: -2.69
Maximum logBB: 1.70


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset into the module-level frame `df`; explore_dataset() and
# analyze_bbb_classifications() below both read it as a global.
df = pd.read_csv('b3db_kc_split_20240502.csv')

def explore_dataset():
    """Print a structural overview of the module-level ``df``.

    Output, in order: the ``DataFrame.info()`` summary, the list of column
    names, and the first rows returned by ``DataFrame.head()``.
    """
    print("Dataset Information:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() appended a stray "None" line to the output.
    df.info()

    print("\nColumn names:")
    print(df.columns.tolist())

    print("\nFirst few rows of the dataset:")
    print(df.head())

def analyze_bbb_classifications(column='BBB_classification'):
    """Report BBB+/BBB- class counts and percentages from the module-level ``df``.

    Parameters:
    column (str): Name of the classification column to analyze. Defaults to
        'BBB_classification', the name that used to be hard-coded, so existing
        zero-argument calls behave as before.

    When ``column`` is absent, the function instead lists every column whose
    name contains "bbb" or "class" (case-insensitive) together with its value
    counts, or reports that no such column exists.
    """
    # Check if the requested column exists
    if column in df.columns:
        bbb_classifications = df[column]
        # Normalized counts are computed once. Using .get() means a class that
        # never occurs is reported as 0.00% instead of raising KeyError.
        class_shares = bbb_classifications.value_counts(normalize=True)

        print("\n4.1.2 Analysis of BBB+/BBB- classifications")
        print(bbb_classifications.value_counts())
        print(f"Percentage of BBB+: {100 * class_shares.get('BBB+', 0.0):.2f}%")
        print(f"Percentage of BBB-: {100 * class_shares.get('BBB-', 0.0):.2f}%")
    else:
        print(f"\nThe '{column}' column is not present in the dataset.")
        print("Available columns that might contain BBB classification:")

        # Look for columns that might contain BBB classification
        potential_columns = [col for col in df.columns if 'bbb' in col.lower() or 'class' in col.lower()]

        if potential_columns:
            print(potential_columns)

            # Display value counts for potential columns (missing values included)
            for col in potential_columns:
                print(f"\nValue counts for '{col}':")
                print(df[col].value_counts(dropna=False))
        else:
            print("No columns found that might contain BBB classification.")

# Entry point: print the dataset overview first, then the BBB class report.
if __name__ == "__main__":
    explore_dataset()
    analyze_bbb_classifications()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3311 entries, 0 to 3310
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   inchi            3311 non-null   object 
 1   names            3289 non-null   object 
 2   iupac_name       3226 non-null   object 
 3   smiles           3311 non-null   object 
 4   cid              3227 non-null   float64
 5   record_ids       3311 non-null   object 
 6   logBB            942 non-null    float64
 7   logBB_group      942 non-null    object 
 8   BBB+/BBB-        3311 non-null   object 
 9   BBB+/BBB-_group  3311 non-null   object 
 10  split            3311 non-null   object 
dtypes: float64(2), object(9)
memory usage: 284.7+ KB
None

Column names:
['inchi', 'names', 'iupac_name', 'smiles', 'cid', 'record_ids', 'logBB', 'logBB_group', 'BBB+/BBB-', 'BBB+/BBB-_group', 'split']

First few rows of the dataset:
                                       

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset into the module-level frame `df` read by the plotting functions below
df = pd.read_csv('b3db_kc_split_20240502.csv')

# No custom color palettes are defined in this cell; the plots below pass no color arguments.


def create_bbb_permeability_plots():
    """Save a pie chart and a bar plot of the 'BBB+/BBB-' label counts.

    Reads the module-level ``df``. Both panels share one 10x5 figure that is
    written to ``bbb_permeability_distribution.png`` and then closed.
    """
    label_counts = df['BBB+/BBB-'].value_counts()
    labels = label_counts.index
    sizes = label_counts.values

    plt.figure(figsize=(10, 5))

    # Left panel: share of each label
    plt.subplot(1, 2, 1)
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    plt.title('BBB Permeability Distribution (Pie Chart)')

    # Right panel: absolute counts
    plt.subplot(1, 2, 2)
    sns.barplot(x=labels, y=sizes)
    plt.title('BBB Permeability Distribution (Bar Plot)')
    plt.xlabel('BBB Permeability')
    plt.ylabel('Count')

    plt.tight_layout()
    plt.savefig('bbb_permeability_distribution.png')
    plt.close()

def create_bbb_grouping_plots():
    """Save a pie chart and a bar plot of the 'BBB+/BBB-_group' counts.

    Reads the module-level ``df``. Both panels share one 10x5 figure that is
    written to ``bbb_grouping_distribution.png`` and then closed.
    """
    per_group = df['BBB+/BBB-_group'].value_counts()
    group_labels = per_group.index
    group_sizes = per_group.values

    plt.figure(figsize=(10, 5))

    # Left panel: share of each group
    plt.subplot(1, 2, 1)
    plt.pie(group_sizes, labels=group_labels, autopct='%1.1f%%', startangle=90)
    plt.title('BBB+/BBB- Grouping Distribution (Pie Chart)')

    # Right panel: absolute counts
    plt.subplot(1, 2, 2)
    sns.barplot(x=group_labels, y=group_sizes)
    plt.title('BBB+/BBB- Grouping Distribution (Bar Plot)')
    plt.xlabel('BBB+/BBB- Group')
    plt.ylabel('Count')

    plt.tight_layout()
    plt.savefig('bbb_grouping_distribution.png')
    plt.close()

def create_combined_plot():
    """Save a stacked bar plot of 'BBB+/BBB-_group' counts within each 'BBB+/BBB-' label.

    Reads the module-level ``df`` and writes ``bbb_combined_distribution.png``.
    """
    # Rows: BBB+/BBB- label; columns: group; cells: number of compounds.
    combined_data = df.groupby(['BBB+/BBB-', 'BBB+/BBB-_group']).size().unstack()

    # DataFrame.plot() opens its own figure unless it is handed an Axes.
    # Calling plt.figure() first therefore left an empty 10x6 figure open
    # (never saved, never closed) while the real plot used the default size.
    # Drawing on an explicit Axes keeps everything on the one sized figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    combined_data.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title('BBB Permeability and Grouping Distribution')
    ax.set_xlabel('BBB Permeability')
    ax.set_ylabel('Count')
    ax.legend(title='BBB+/BBB- Group')
    fig.tight_layout()
    fig.savefig('bbb_combined_distribution.png')
    plt.close(fig)

# Entry point: each call below writes one PNG file; the message is printed after all three ran.
if __name__ == "__main__":
    create_bbb_permeability_plots()
    create_bbb_grouping_plots()
    create_combined_plot()
    print("Visualizations have been saved as PNG files.")

Visualizations have been saved as PNG files.


<Figure size 1000x600 with 0 Axes>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

# Load the dataset into the module-level frame `df` read by the plotting functions below
df = pd.read_csv('b3db_kc_split_20240502.csv')

# Define custom color palettes.
# The plotting functions below hand these lists to matplotlib/seaborn/pandas
# as-is, so colors are assigned by position in the counted data, not by label name.
bbb_colors = ['#CD7F32', '#6F8FAF']  # bronze, muted blue
group_colors = ['#900C3F', '#FFC300', '#4F7942']  # deep crimson, golden yellow, fern green


def create_3d_bar_plot():
    """Save a 3D bar chart of compound counts per (BBB label, group) pair.

    Reads the module-level ``df``. The x axis carries the 'BBB+/BBB-' labels,
    the y axis the 'BBB+/BBB-_group' values, and bar height is the count.
    Bars are colored by BBB label. Output: ``bbb_3d_distribution.png``.
    """
    # Rows: BBB label; columns: group. fill_value=0 keeps a label/group pair
    # that never occurs as a zero-height bar instead of a NaN height.
    combined_data = df.groupby(['BBB+/BBB-', 'BBB+/BBB-_group']).size().unstack(fill_value=0)

    # Prepare data for 3D plotting
    bbb_categories = combined_data.index
    group_categories = combined_data.columns

    # indexing='ij' makes the flattened grid walk the table row by row (BBB
    # label outermost, group innermost). That is the same order as
    # combined_data.values.flatten() and np.repeat(bbb_categories, ...) below.
    # With the default 'xy' ordering x varied fastest, so heights and colors
    # were attached to the wrong (x, y) positions.
    x_pos, y_pos = np.meshgrid(
        np.arange(len(bbb_categories)),
        np.arange(len(group_categories)),
        indexing='ij',
    )
    x_pos = x_pos.flatten()
    y_pos = y_pos.flatten()
    z_pos = np.zeros_like(x_pos)

    dx = 0.75 * np.ones_like(z_pos)
    dy = dx.copy()
    dz = combined_data.values.flatten()

    # Set up colors: one color per BBB label, repeated for each of its group bars
    colors = ['#FF9999', '#66B2FF']  # Light red for BBB+, light blue for BBB-
    color_mapping = {cat: color for cat, color in zip(bbb_categories, colors)}
    bar_colors = [color_mapping[cat] for cat in np.repeat(bbb_categories, len(group_categories))]

    # Create 3D plot
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')

    ax.bar3d(x_pos, y_pos, z_pos, dx, dy, dz, color=bar_colors, alpha=0.8)

    # Customize the plot
    ax.set_xticks(np.arange(len(bbb_categories)))
    ax.set_yticks(np.arange(len(group_categories)))
    ax.set_xticklabels(bbb_categories)
    ax.set_yticklabels(group_categories)
    ax.set_xlabel('BBB Permeability')
    ax.set_ylabel('BBB+/BBB- Group')
    ax.set_zlabel('Count')
    ax.set_title('3D View of BBB Permeability and Grouping Distribution')

    # Adjust the viewing angle
    ax.view_init(elev=20, azim=45)

    plt.tight_layout()
    plt.savefig('bbb_3d_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_bbb_permeability_plots():
    """Save a pie chart and a bar plot of the 'BBB+/BBB-' label counts using ``bbb_colors``.

    Reads the module-level ``df`` and ``bbb_colors``. Colors are applied in
    the order of the value counts. Output: ``bbb_permeability_distribution.png``.
    """
    # BBB Permeability Distribution
    bbb_counts = df['BBB+/BBB-'].value_counts()

    # Pie Chart with custom colors
    plt.figure(figsize=(10, 5))
    plt.subplot(121)
    plt.pie(bbb_counts.values, labels=bbb_counts.index, autopct='%1.1f%%', startangle=90, colors=bbb_colors)
    plt.title('BBB Permeability Distribution (Pie Chart)')

    # Bar Plot
    plt.subplot(122)
    # seaborn deprecated `palette` without `hue`. Mapping hue to the x
    # variable and hiding the legend gives the same per-bar coloring
    # (the `legend` argument needs seaborn >= 0.13).
    sns.barplot(x=bbb_counts.index, y=bbb_counts.values, hue=bbb_counts.index,
                palette=bbb_colors, legend=False)
    plt.title('BBB Permeability Distribution (Bar Plot)')
    plt.ylabel('Count')
    plt.xlabel('BBB Permeability')

    plt.tight_layout()
    plt.savefig('bbb_permeability_distribution.png')
    plt.close()

def create_bbb_grouping_plots():
    """Save a pie chart and a bar plot of the 'BBB+/BBB-_group' counts using ``group_colors``.

    Reads the module-level ``df`` and ``group_colors``. Colors are applied in
    the order of the value counts. Output: ``bbb_grouping_distribution.png``.
    """
    # BBB+/BBB- Grouping Distribution
    group_counts = df['BBB+/BBB-_group'].value_counts()

    # Pie Chart with custom colors
    plt.figure(figsize=(10, 5))
    plt.subplot(121)
    plt.pie(group_counts.values, labels=group_counts.index, autopct='%1.1f%%', startangle=90, colors=group_colors)
    plt.title('BBB+/BBB- Grouping Distribution (Pie Chart)')

    # Bar Plot
    plt.subplot(122)
    # seaborn deprecated `palette` without `hue`. Mapping hue to the x
    # variable and hiding the legend gives the same per-bar coloring
    # (the `legend` argument needs seaborn >= 0.13).
    sns.barplot(x=group_counts.index, y=group_counts.values, hue=group_counts.index,
                palette=group_colors, legend=False)
    plt.title('BBB+/BBB- Grouping Distribution (Bar Plot)')
    plt.ylabel('Count')
    plt.xlabel('BBB+/BBB- Group')

    plt.tight_layout()
    plt.savefig('bbb_grouping_distribution.png')
    plt.close()

def create_combined_plot():
    """Save a stacked bar plot of group counts within each 'BBB+/BBB-' label using ``group_colors``.

    Reads the module-level ``df`` and ``group_colors`` (one color per group
    column, in column order). Output: ``bbb_combined_distribution.png``.
    """
    # Rows: BBB+/BBB- label; columns: group; cells: number of compounds.
    combined_data = df.groupby(['BBB+/BBB-', 'BBB+/BBB-_group']).size().unstack()

    # DataFrame.plot() opens its own figure unless it is handed an Axes.
    # Calling plt.figure() first therefore left an empty 10x6 figure open
    # (never saved, never closed) while the real plot used the default size.
    # Drawing on an explicit Axes keeps everything on the one sized figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    combined_data.plot(kind='bar', stacked=True, color=group_colors, ax=ax)
    ax.set_title('BBB Permeability and Grouping Distribution')
    ax.set_xlabel('BBB Permeability')
    ax.set_ylabel('Count')
    ax.legend(title='BBB+/BBB- Group')
    fig.tight_layout()
    fig.savefig('bbb_combined_distribution.png')
    plt.close(fig)

# Entry point: each call below writes one PNG file; the two messages are
# printed only after all four plotting functions have returned.
if __name__ == "__main__":
    create_bbb_permeability_plots()
    create_bbb_grouping_plots()
    create_combined_plot()
    create_3d_bar_plot()
    print("Visualizations with custom colors have been saved as PNG files.")
    print("3D visualization has been saved as 'bbb_3d_distribution.png'.")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bbb_counts.index, y=bbb_counts.values, palette=bbb_colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=group_counts.index, y=group_counts.values, palette=group_colors)
  plt.tight_layout()


Visualizations with custom colors have been saved as PNG files.
3D visualization has been saved as 'bbb_3d_distribution.png'.


<Figure size 1000x600 with 0 Axes>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Load the dataset into the module-level frame `df` read by create_combined_3d_plot()
df = pd.read_csv('b3db_kc_split_20240502.csv')

# Define custom color palette.
# create_combined_3d_plot() picks a color by the position of each group column,
# so the list needs at least as many entries as there are groups.
group_colors = ['#900C3F', '#FFC300', '#4F7942']  # deep crimson, golden yellow, fern green

def create_combined_3d_plot():
    """Save a 3D bar chart of compound counts per (BBB label, group) pair, colored by group.

    Reads the module-level ``df`` and ``group_colors``. The x axis carries the
    'BBB+/BBB-' labels, the y axis the 'BBB+/BBB-_group' values, and bar
    height is the count. Output: ``bbb_combined_distribution_3d.png``.
    """
    # Rows: BBB label; columns: group. fill_value=0 keeps a label/group pair
    # that never occurs as a zero-height bar instead of a NaN height.
    combined_data = df.groupby(['BBB+/BBB-', 'BBB+/BBB-_group']).size().unstack(fill_value=0)

    # Create 3D Bar Plot
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')

    # Default meshgrid ordering: after flattening, x (BBB label) varies
    # fastest and y (group) slowest, i.e. bars are laid out group by group.
    x_pos = np.arange(len(combined_data.index))
    y_pos = np.arange(len(combined_data.columns))
    x_pos, y_pos = np.meshgrid(x_pos, y_pos)
    x_pos = x_pos.flatten()
    y_pos = y_pos.flatten()
    z_pos = np.zeros_like(x_pos)

    dx = 0.5 * np.ones_like(z_pos)
    dy = dx.copy()
    # Transpose before flattening so heights are listed group by group too.
    # Flattening the table directly listed them label by label, which put
    # counts on the wrong (x, y) positions.
    dz = combined_data.values.T.flatten()

    # Color mapping: one color per group, repeated for each BBB label bar
    colors = []
    for group_idx, _ in enumerate(combined_data.columns):
        colors.extend([group_colors[group_idx]] * len(combined_data.index))

    ax.bar3d(x_pos, y_pos, z_pos, dx, dy, dz, color=colors, alpha=0.8)

    # Customize the plot
    ax.set_xticks(np.arange(len(combined_data.index)))
    ax.set_yticks(np.arange(len(combined_data.columns)))
    ax.set_xticklabels(combined_data.index)
    ax.set_yticklabels(combined_data.columns)
    ax.set_xlabel('BBB Permeability')
    ax.set_ylabel('BBB+/BBB- Group')
    ax.set_zlabel('Count')
    ax.set_title('3D View: BBB Permeability and Grouping Distribution')

    # Adjust the viewing angle
    ax.view_init(elev=20, azim=45)

    plt.tight_layout()
    plt.savefig('bbb_combined_distribution_3d.png', dpi=300, bbox_inches='tight')
    plt.close()

# Entry point: write the 3D plot, then confirm on stdout.
if __name__ == "__main__":
    create_combined_3d_plot()
    print("3D visualization of combined distribution has been saved as a PNG file.")

  plt.tight_layout()


3D visualization of combined distribution has been saved as a PNG file.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

# Load the dataset into the module-level frame `df` read by create_logbb_distribution_plot()
df = pd.read_csv('b3db_kc_split_20240502.csv')

def create_logbb_distribution_plot():
    """Save a color-coded histogram of logBB with a colorbar and a statistics box.

    Reads the module-level ``df``; rows with a missing ``logBB`` are ignored.
    Each bar is colored by the logBB value at its bin center. Output:
    ``logbb_distribution_gradient.png``.
    """
    # Filter out rows with NaN logBB values
    logbb_data = df[df['logBB'].notna()]
    logbb_values = logbb_data['logBB']

    # Create histogram data
    hist, bin_edges = np.histogram(logbb_values, bins=30)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    # Create custom colormap
    colors = ['#FFA07A', '#98FB98', '#87CEFA']  # Light salmon to light green to light sky blue
    n_bins = len(colors)
    # NOTE(review): N=n_bins limits the colormap to three discrete levels, so
    # bars get one of three flat colors rather than a smooth gradient.
    # Confirm this is intended.
    cmap = LinearSegmentedColormap.from_list('custom', colors, N=n_bins)

    # Normalize the data for colormapping
    norm = plt.Normalize(logbb_values.min(), logbb_values.max())

    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 6))

    bars = ax.bar(bin_centers, hist, width=np.diff(bin_edges), align='center', alpha=0.8)

    # Color the bars according to their logBB value
    for bar, logbb in zip(bars, bin_centers):
        bar.set_facecolor(cmap(norm(logbb)))
        bar.set_edgecolor('none')

    # Customize the plot
    ax.set_xlabel('logBB Value')
    ax.set_ylabel('Frequency')
    ax.set_title('Distribution of logBB Values')

    # Add colorbar. The mappable is not attached to any Axes, so the Axes to
    # take space from must be named explicitly. Without `ax=`, matplotlib
    # warns and newer releases raise an error.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=ax)
    cbar.set_label('logBB Value')

    # Add statistics to the plot (top-right corner, in axes coordinates)
    stats_text = f"Mean: {logbb_values.mean():.2f}\n"
    stats_text += f"Median: {logbb_values.median():.2f}\n"
    stats_text += f"Std Dev: {logbb_values.std():.2f}"
    ax.text(0.95, 0.95, stats_text, transform=ax.transAxes, verticalalignment='top',
            horizontalalignment='right', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    fig.tight_layout()
    fig.savefig('logbb_distribution_gradient.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

# Entry point: write the histogram, then confirm on stdout.
if __name__ == "__main__":
    create_logbb_distribution_plot()
    print("LogBB distribution visualization with color gradient has been saved as a PNG file.")

  cbar = plt.colorbar(sm)


LogBB distribution visualization with color gradient has been saved as a PNG file.


In [None]:
!pip install pandas numpy pubchempy scikit-learn seaborn matplotlib
import pandas as pd
import numpy as np
from pubchempy import get_compounds
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt

def load_b3db_data(file_path):
    """
    Read the B3DB CSV file and print an overview of its contents.

    The overview lists, in order: the column index, the first rows, the
    column dtypes, and the ``describe()`` summary statistics.

    Parameters:
    file_path (str): Path to the B3DB CSV file

    Returns:
    pandas.DataFrame: Loaded dataset
    """
    dataset = pd.read_csv(file_path)

    # Each section is a heading plus a deferred view of the frame; the view is
    # only evaluated right before it is printed.
    sections = (
        ("Columns in the CSV file:", lambda: dataset.columns),
        ("\nFirst few rows of the data:", lambda: dataset.head()),
        ("\nData types of columns:", lambda: dataset.dtypes),
        ("\nSummary statistics:", lambda: dataset.describe()),
    )
    for heading, build_view in sections:
        print(heading)
        print(build_view())

    return dataset

def calculate_descriptors(compound_ids, id_type='name'):
    """
    Calculate molecular descriptors using PubChemPy.

    Each identifier is looked up with ``get_compounds`` and the first hit is
    used. An identifier whose lookup fails or returns no hit still produces a
    row, with every descriptor set to missing, so the result stays aligned
    row-for-row with ``compound_ids``.

    Parameters:
    compound_ids (list): List of compound identifiers
    id_type (str): Type of identifier ('name', 'smiles', 'cid', etc.)

    Returns:
    pandas.DataFrame: Calculated descriptors, one row per identifier, with
        columns MW, XLogP, TPSA, HBD, HBA, RotatableBonds (present even when
        ``compound_ids`` is empty)
    """
    # Output column -> attribute read from the compound object.
    attribute_by_column = {
        'MW': 'molecular_weight',
        'XLogP': 'xlogp',
        'TPSA': 'tpsa',
        'HBD': 'h_bond_donor_count',
        'HBA': 'h_bond_acceptor_count',
        'RotatableBonds': 'rotatable_bond_count',
    }

    descriptors = []
    for cid in compound_ids:
        try:
            compound = get_compounds(cid, id_type)[0]
            row = {column: getattr(compound, attribute)
                   for column, attribute in attribute_by_column.items()}
        except Exception:
            # Best effort: a failed or empty lookup yields an all-missing row.
            # Catching Exception rather than using a bare `except:` lets
            # KeyboardInterrupt/SystemExit stop a long batch of lookups.
            row = dict.fromkeys(attribute_by_column)
        descriptors.append(row)

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(descriptors, columns=list(attribute_by_column))

def profile_data(df):
    """
    Perform data profiling on the calculated descriptors.

    Parameters:
    df (pandas.DataFrame): DataFrame containing molecular descriptors

    Returns:
    dict: Dictionary containing profiling results under the keys
        'descriptive_stats', 'missing_values', 'data_types' and
        'correlation_matrix'. The correlation matrix covers the numeric and
        boolean columns only.
    """
    # DataFrame.corr() raises on string columns in recent pandas releases, so
    # restrict the correlation to columns that can be correlated.
    correlatable = df.select_dtypes(include=['number', 'bool'])

    profile = {
        'descriptive_stats': df.describe(),
        'missing_values': df.isnull().sum(),
        'data_types': df.dtypes,
        'correlation_matrix': correlatable.corr()
    }
    return profile

def plot_descriptor_distribution(df, descriptor_name):
    """
    Save a histogram (with KDE overlay) of one descriptor column.

    Missing values are dropped before plotting. The figure is written to
    "<descriptor_name>_distribution.png" and then closed.

    Parameters:
    df (pandas.DataFrame): DataFrame containing molecular descriptors
    descriptor_name (str): Name of the descriptor to plot
    """
    plt.figure(figsize=(10, 6))

    observed = df[descriptor_name].dropna()
    sns.histplot(observed, kde=True)

    # Labels are set after histplot so they replace whatever it chose.
    plt.xlabel(descriptor_name)
    plt.ylabel("Frequency")
    plt.title(f"Distribution of {descriptor_name}")

    output_path = f"{descriptor_name}_distribution.png"
    plt.savefig(output_path)
    plt.close()

def plot_correlation_heatmap(corr_matrix):
    """
    Save the correlation matrix as an unannotated 'coolwarm' heatmap.

    The 20x16 figure is written to "correlation_heatmap.png" and then closed.

    Parameters:
    corr_matrix (pandas.DataFrame): Correlation matrix of descriptors
    """
    heatmap_options = {'cmap': 'coolwarm', 'annot': False}

    plt.figure(figsize=(20, 16))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title("Correlation Heatmap of Molecular Descriptors")
    plt.savefig("correlation_heatmap.png")
    plt.close()

def main():
    """
    Run the descriptor pipeline end to end.

    Steps: load the B3DB CSV, pick an identifier column, look every compound
    up through calculate_descriptors, impute missing descriptor values with
    the column median, profile the numeric data, and save CSV files and plots
    to the current working directory.

    Returns None. Prints a message and stops early when no identifier column
    is found.
    """
    # Load data
    b3db_data = load_b3db_data("b3db_kc_split_20240502.csv")

    # Candidate identifier columns in priority order, each paired with the
    # PubChem lookup namespace handed to get_compounds. The namespace used to
    # be derived from the column label (e.g. 'pubchem_cid', 'canonical_smiles'),
    # which does not give a valid lookup namespace for most labels.
    possible_columns = [
        ('SMILES', 'smiles'),
        ('Name', 'name'),
        ('PubChem CID', 'cid'),
        ('CID', 'cid'),
        ('InChI', 'inchi'),
        ('IUPAC Name', 'name'),  # NOTE(review): assumes IUPAC names resolve through name search - confirm
        ('Canonical SMILES', 'smiles'),
    ]

    # Match column labels ignoring case and treating '_' like ' ', so that
    # e.g. a 'smiles' or 'iupac_name' column is recognized.
    columns_by_key = {}
    for col in b3db_data.columns:
        columns_by_key.setdefault(str(col).lower().replace('_', ' ').strip(), col)

    id_column = None
    id_type = None

    for label, namespace in possible_columns:
        matched = columns_by_key.get(label.lower())
        if matched is not None:
            id_column = matched
            id_type = namespace
            break

    if id_column is None:
        print("Could not find a suitable column for compound identification.")
        print("Available columns are:", b3db_data.columns)
        return

    print(f"Using '{id_column}' for compound identification")

    # Calculate descriptors (one lookup per row of the dataset)
    descriptors_df = calculate_descriptors(b3db_data[id_column], id_type)

    # Combine original data with calculated descriptors
    full_data = pd.concat([b3db_data, descriptors_df], axis=1)

    # Handle missing values.
    # Median imputation only works on numeric data, and the combined frame
    # also holds text columns, so only the descriptor columns are imputed.
    # That also leaves the experimental columns as measured.
    descriptor_columns = list(descriptors_df.columns)
    full_data[descriptor_columns] = full_data[descriptor_columns].apply(pd.to_numeric, errors='coerce')
    # SimpleImputer drops columns that are entirely missing, which would
    # break the column assignment below, so those are left untouched.
    imputable = [col for col in descriptor_columns if full_data[col].notna().any()]

    full_data_imputed = full_data.copy()
    if imputable:
        imputer = SimpleImputer(strategy='median')
        full_data_imputed[imputable] = imputer.fit_transform(full_data[imputable])

    # Profile data (numeric columns only: correlations are undefined for text)
    profile_results = profile_data(full_data_imputed.select_dtypes(include='number'))

    # Save profiling results
    profile_results['descriptive_stats'].to_csv("descriptive_stats.csv")
    profile_results['missing_values'].to_csv("missing_values.csv")
    profile_results['data_types'].to_csv("data_types.csv")
    profile_results['correlation_matrix'].to_csv("correlation_matrix.csv")

    # Visualizations
    for descriptor in descriptors_df.columns:
        plot_descriptor_distribution(full_data_imputed, descriptor)

    plot_correlation_heatmap(profile_results['correlation_matrix'])

    # Chemical space analysis
    if 'MW' in full_data_imputed.columns and 'XLogP' in full_data_imputed.columns:
        mw = full_data_imputed['MW']
        logp = full_data_imputed['XLogP']
        plt.figure(figsize=(10, 8))
        plt.scatter(mw, logp, alpha=0.5)
        plt.xlabel("Molecular Weight")
        plt.ylabel("XLogP")
        plt.title("Chemical Space: Molecular Weight vs XLogP")
        plt.savefig("chemical_space.png")
        plt.close()
    else:
        print("Could not generate chemical space plot due to missing MW or XLogP data")

    print("Data profiling and analysis complete. Results saved to CSV files and plots.")

# Entry point: run the whole descriptor pipeline when this code executes as the main module.
if __name__ == "__main__":
    main()

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13820 sha256=6aad764343e1c3e24725b0373c3648a722c6689e24091312955c68a7cb56d997
  Stored in directory: /root/.cache/pip/wheels/90/7c/45/18a0671e3c3316966ef7ed9ad2b3f3300a7e41d3421a44e799
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4
Columns in the CSV file:
Index(['inchi', 'names', 'iupac_name', 'smiles', 'cid', 'record_ids', 'logBB',
       'logBB_group', 'BBB+/BBB-', 'BBB+/BBB-_group', 'split'],
      dtype='object')

First few rows of the data:
                                               inchi             names  \
0  InChI=1S/C10H10BrNO2/c1-10(2)7-5-6(11)3-4-8(7)...         Brofoxine   
1  InChI=1S/C10H10Cl2N2O/c11-