In [None]:
# Install rdkit and seaborn into the environment of the running kernel (%pip targets the kernel).
# NOTE(review): versions are unpinned, so a later re-run may pull different releases — consider pinning.
%pip install rdkit
%pip install seaborn

In [None]:
import pandas as pd

# Read the three lookup tables first; later cells reuse these names.
compound_structures = pd.read_csv("dataset/compound_structures.csv")  # compound structures with SMILES
drug_mechanism = pd.read_csv("dataset/drug_mechanism.csv")  # drug mechanism data
target_info = pd.read_csv("dataset/targets.csv")  # target information

# Start from the filtered activity data and left-join each lookup table in turn:
# structures and mechanisms are matched on molregno, targets on tid.
# NOTE(review): the following cell re-reads filtered_activities.csv into
# `filtered_data`, so the joined frame built here is only used for the preview below.
filtered_data = (
    pd.read_csv("dataset/filtered_activities.csv")
    .merge(compound_structures, on="molregno", how="left")
    .merge(drug_mechanism, on="molregno", how="left")
    .merge(target_info, on="tid", how="left")
)

filtered_data.head()


In [5]:
# Re-read the raw activity table, parsing the file in one pass (low_memory=False)
# and forcing the standard_type column to be read as strings.
filtered_data = pd.read_csv(
    "dataset/filtered_activities.csv",
    dtype={"standard_type": "str"},
    low_memory=False,
)

In [None]:
# Collect the distinct molregno identifiers present in each table.
unique_molregno_filtered = set(filtered_data['molregno'].unique())
unique_molregno_structures = set(compound_structures['molregno'].unique())
unique_molregno_mechanism = set(drug_mechanism['molregno'].unique())

# Report how many identifiers the activity data shares with each lookup table.
shared_with_structures = unique_molregno_filtered.intersection(unique_molregno_structures)
shared_with_mechanism = unique_molregno_filtered.intersection(unique_molregno_mechanism)
print("Common molregno between filtered_data and compound_structures:", len(shared_with_structures))
print("Common molregno between filtered_data and drug_mechanism:", len(shared_with_mechanism))

In [7]:
# Compounds that occur in both the activity data and the mechanism table.
common_molregno_mechanism = unique_molregno_filtered.intersection(unique_molregno_mechanism)

# Restrict all three tables to that shared set of molregno values.
filtered_data = filtered_data.loc[filtered_data['molregno'].isin(common_molregno_mechanism)]
compound_structures = compound_structures.loc[compound_structures['molregno'].isin(common_molregno_mechanism)]
drug_mechanism = drug_mechanism.loc[drug_mechanism['molregno'].isin(common_molregno_mechanism)]


In [None]:
# Count missing entries per column and show only the columns that have any.
missing_data = filtered_data.isna().sum()
columns_with_gaps = missing_data.loc[missing_data > 0]
print(columns_with_gaps)

In [9]:
# Build the analysis table: activity rows enriched with SMILES and mechanism details.
smiles_columns = compound_structures[['molregno', 'canonical_smiles']]
mechanism_columns = drug_mechanism[['molregno', 'mechanism_of_action', 'tid']]

# Both joins are left joins on molregno, so every activity row is kept.
# NOTE(review): any column name that already exists in filtered_data (e.g.
# mechanism_of_action or tid) will come out with _x/_y suffixes — confirm against
# the CSV schema.
merged_data = (
    filtered_data
    .merge(smiles_columns, on='molregno', how='left')
    .merge(mechanism_columns, on='molregno', how='left')
)


In [None]:
# Preview the first rows of the merged table.
merged_data.head()

In [11]:
# Persist the merged table to final_dataset.csv.
# index=False: the frame only carries the positional index produced by the merges;
# writing it would add a meaningless unnamed first column that comes back as
# "Unnamed: 0" when the file is loaded again with pd.read_csv.
merged_data.to_csv("final_dataset.csv", index=False)

In [None]:
# Show the column labels of the merged table; later cells reference suffixed names
# such as 'mechanism_of_action_x' / 'mechanism_of_action_y'.
merged_data.columns

In [None]:
# Descriptive statistics for the merged table
summary_stats = merged_data.describe()
print(summary_stats)

# Per-column count of missing entries, limited to columns that have any
missing_data = merged_data.isna().sum()
columns_with_gaps = missing_data.loc[missing_data > 0]
print("Missing values:\n", columns_with_gaps)

In [None]:
# Distribution of bioactivity values
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (50 bins) with a KDE overlay of the non-missing standard_value entries.
plt.figure(figsize=(5, 5))
sns.histplot(merged_data['standard_value'].dropna(), bins=50, kde=True)
plt.title('Distribution of Standard Bioactivity Values')
# The x-axis carries the plotted standard_value, not a "type"; label it accordingly.
plt.xlabel('Standard Value')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Mechanism of action analysis: bar chart of the ten most frequent mechanisms.
moa_column = merged_data['mechanism_of_action_y']
# value_counts() sorts by frequency, so the first ten index labels are the top ten.
top_moa_labels = moa_column.value_counts().index[:10]

plt.figure(figsize=(10, 6))
sns.countplot(y=moa_column, order=top_moa_labels)
plt.title('Top 10 Mechanisms of Action')
plt.xlabel('Count')
plt.ylabel('Mechanism of Action')
plt.show()



In [None]:
# Relationship between bioactivity and mechanism of action:
# one box of standard_value per mechanism_of_action_y category.
plt.figure(figsize=(12, 8))
sns.boxplot(data=merged_data, x='mechanism_of_action_y', y='standard_value')
plt.title('Bioactivity by Mechanism of Action')
# Axis label is for the reader: drop the "_y" merge suffix so it matches the other
# mechanism plots in this notebook.
plt.xlabel('Mechanism of Action')
plt.ylabel('Standard Value')
# Rotate the category labels so long mechanism names do not overlap.
plt.xticks(rotation=90)
plt.show()


In [None]:
# Frequency of biological targets: bar chart of the ten most frequent pref_name values.
target_names = merged_data['pref_name']
# value_counts() sorts by frequency, so the first ten index labels are the top ten.
top_target_labels = target_names.value_counts().index[:10]

plt.figure(figsize=(10, 6))
sns.countplot(y=target_names, order=top_target_labels)
plt.title('Top 10 Biological Targets')
plt.xlabel('Count')
plt.ylabel('Target')
plt.show()


In [None]:
# Pairwise scatter/distribution grid for the three activity measurement columns,
# using only rows where all three are present.
numerical_cols = ['standard_value', 'value', 'pchembl_value']
complete_numeric_rows = merged_data[numerical_cols].dropna()
sns.pairplot(complete_numeric_rows)
plt.show()


In [None]:
# Top 10 mechanisms of action, broken down by organism.
# Keep only rows where both the organism and the mechanism are present.
organism_mechanism_data = merged_data[['organism', 'mechanism_of_action_x']].dropna()

# value_counts() sorts by frequency, so the first ten index labels are the top ten.
top_mechanisms = organism_mechanism_data['mechanism_of_action_x'].value_counts().index[:10]
# Use a dedicated name for the plotting subset: rebinding `filtered_data` here would
# replace the activity table that earlier cells read from, so re-running those cells
# afterwards would operate on this two-column frame instead.
top_mechanism_rows = organism_mechanism_data[
    organism_mechanism_data['mechanism_of_action_x'].isin(top_mechanisms)
]

plt.figure(figsize=(12, 8))
# order= keeps the bars sorted by overall frequency, like the other top-10 plots.
sns.countplot(
    data=top_mechanism_rows,
    y='mechanism_of_action_x',
    hue='organism',
    order=top_mechanisms,
)
plt.title('Top 10 Mechanisms of Action by Organism')
plt.xlabel('Count')
plt.ylabel('Mechanism of Action')
# Place the legend outside the axes so it does not cover the bars.
plt.legend(title='Organism', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()