In [None]:
from Bio import Entrez
import time
import pandas as pd

In [None]:
# NCBI E-utilities identify the caller through this contact address.
# NOTE(review): this looks like a placeholder value — replace it with a real
# address before running the search.
Entrez.email = "e-mail@email.com"

# PubMed query: novelty phrases AND a therapeutic context, published 2020-2025.
query = '(("novel small molecule" OR "novel peptide" OR "new natural compound") AND (drug discovery OR therapeutic OR treatment)) AND (2020:2025[dp])'

# Run the search; retmax caps how many PMIDs come back in "IdList".
handle = Entrez.esearch(db="pubmed", term=query, retmax=1450)
try:
    record = Entrez.read(handle)
finally:
    # Release the underlying HTTP response even if parsing fails.
    handle.close()
id_list = record["IdList"]

# Fetch details for all IDs
def fetch_details(id_list):
    """Fetch the MEDLINE-format text for a list of PubMed IDs in one request.

    Args:
        id_list: iterable of PMID strings; they are comma-joined into a single
            ``efetch`` call.

    Returns:
        Whatever ``handle.read()`` yields for the text-mode response.
    """
    ids = ",".join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="text")
    try:
        return handle.read()
    finally:
        # Always release the response, including when read() raises.
        handle.close()

# To download as XML (better structured)
def fetch_abstracts(id_list):
    """Download title, abstract and article date for each PubMed ID.

    IDs are requested as XML in batches of 100 with a 0.5 s pause after each
    batch. Articles that lack an abstract or an ``ArticleDate`` entry are
    skipped.

    Args:
        id_list: sequence of PMID strings (sliced per batch).

    Returns:
        A list of dicts with the keys "PMID", "Title", "Abstract" and "Date"
        (formatted ``Year-Month-Day``).
    """
    abstracts = []
    batch_size = 100  # Download in batches to avoid server overload
    for start in range(0, len(id_list), batch_size):
        id_batch = id_list[start:start + batch_size]
        handle = Entrez.efetch(db="pubmed", id=",".join(id_batch), rettype="abstract", retmode="xml")
        try:
            records = Entrez.read(handle)
        finally:
            handle.close()
        for article in records['PubmedArticle']:
            try:
                citation = article['MedlineCitation']
                article_info = citation['Article']
                title = article_info['ArticleTitle']
                # Structured abstracts carry one AbstractText element per
                # section; keep all of them rather than only the first.
                sections = [str(part) for part in article_info['Abstract']['AbstractText']]
                if not sections:
                    continue
                pmid = citation['PMID']
                date = article_info['ArticleDate'][0]
                abstracts.append({
                    "PMID": pmid,
                    "Title": title,
                    "Abstract": " ".join(sections),
                    "Date": f"{date['Year']}-{date['Month']}-{date['Day']}"
                })
            except (KeyError, IndexError):
                # Record is missing one of the required fields: skip it, but
                # let unrelated errors (and Ctrl-C) propagate.
                continue
        time.sleep(0.5)  # To be gentle to PubMed servers
    return abstracts

# Pull every abstract found by the search and persist the table as CSV
data = fetch_abstracts(id_list)
df = pd.DataFrame(data)
df.to_csv(
    "pubmed_novel_molecules.csv",
    index=False,
)

print("✅ Abstracts collected and saved!")

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Download the NLTK resources used below (a no-op when already installed)
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer data from 'punkt_tab' rather
# than the pickled 'punkt' package, so word_tokenize needs this one as well.
nltk.download('punkt_tab')

# Load the abstracts from the CSV file
df = pd.read_csv("pubmed_novel_molecules.csv")

# Step 1: Clean the text (lowercase, remove punctuation, etc.)
def clean_text(text):
    """Lowercase ``text`` and drop every character that is neither a word
    character nor whitespace.

    Args:
        text: the raw abstract. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned string, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    return re.sub(r'[^\w\s]', '', text.lower())

# Element-wise cleaning of the raw abstract column
df['Cleaned_abstract'] = df['Abstract'].map(clean_text)

# Step 2: Tokenize the cleaned text
def tokenize_text(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# One token list per cleaned abstract
df['Tokenized_abstract'] = df['Cleaned_abstract'].apply(tokenize_text)

# Step 3: Drop English stop words (held in a set for fast membership tests)
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

df['Processed_abstract'] = df['Tokenized_abstract'].apply(remove_stopwords)

# Persist every intermediate column alongside the original data
df.to_csv("processed_pubmed_abstracts.csv", index=False)

print("✅ Text Preprocessing Complete! Saved to 'processed_pubmed_abstracts.csv'")

In [None]:
import spacy
import scispacy

In [None]:
# Load SciSpaCy's biomedical NER model into the module-level `nlp` pipeline
# that extract_entities() calls.
# NOTE(review): presumably the model package must be installed separately
# before spacy.load can find it — confirm in the environment setup.
nlp = spacy.load("en_ner_bionlp13cg_md")

# Function to extract entities (like molecules)
def extract_entities(text):
    """Return the text of every chemical or gene/protein entity found in ``text``.

    The text is run through the module-level ``nlp`` pipeline and entities are
    kept when their label is one of the wanted types. The BioNLP13CG label set
    names chemicals ``SIMPLE_CHEMICAL``; ``CHEMICAL`` is kept as well so the
    filter still works if a model using that label is loaded instead.

    Args:
        text: the abstract text. Non-string or blank input yields no entities.

    Returns:
        A list of entity strings in document order (duplicates retained).
    """
    if not isinstance(text, str) or not text.strip():
        return []
    wanted_labels = {"SIMPLE_CHEMICAL", "CHEMICAL", "GENE_OR_GENE_PRODUCT"}
    return [ent.text for ent in nlp(text).ents if ent.label_ in wanted_labels]
import ast

# Load the pre-processed abstracts written by the text-cleaning step
df = pd.read_csv("processed_pubmed_abstracts.csv")


def _processed_cell_to_text(cell):
    """Rebuild plain text from a 'Processed_abstract' CSV cell.

    The column was saved as a list of tokens, so it reads back from CSV as the
    repr of that list ("['novel', 'peptide', ...]"). Feeding that repr to the
    NER model would pollute entities with brackets, quotes and commas, so the
    list is parsed and the tokens are joined with single spaces. Cells that are
    not a list repr are returned unchanged; non-string cells become "".
    """
    if not isinstance(cell, str):
        return ""
    try:
        tokens = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell
    if isinstance(tokens, (list, tuple)):
        return " ".join(str(tok) for tok in tokens)
    return cell


# Apply NER to the reconstructed text of each abstract
# NOTE(review): NER models are usually run on the original sentence text; the
# raw 'Abstract' column may give better recall — confirm before switching,
# since later cells expect lower-cased entity names.
df['Molecules'] = df['Processed_abstract'].apply(_processed_cell_to_text).apply(extract_entities)

df.to_csv("extracted_molecules_biomedical.csv", index=False)

print("✅ Biomedical Molecules Extraction Complete!")

In [None]:
from chembl_webresource_client.new_client import new_client

In [None]:
import ast
import time

In [None]:
# Read the NER output produced by the SciSpaCy step
df = pd.read_csv("extracted_molecules_biomedical.csv")

# The 'Molecules' column is stored as the text of a Python list; turn each
# cell back into a real list
df['Molecules'] = df['Molecules'].map(ast.literal_eval)

# ChEMBL molecule endpoint used for the look-ups
molecule = new_client.molecule

# Memo of lower-cased entity name -> look-up result, to avoid repeat API calls
checked_entities = dict()

# Function to check if entity is in ChEMBL
def is_known_drug(entity_name):
    """Report whether a ChEMBL molecule search for ``entity_name`` returns any hit.

    Successful look-ups are memoised in ``checked_entities`` under the
    lower-cased name, and each one is followed by a 0.3 s pause to stay clear
    of rate limits.

    Args:
        entity_name: the entity string to search for.

    Returns:
        True when the search yields at least one result, otherwise False.
        False is also returned when the look-up itself fails; that outcome is
        reported and deliberately NOT cached, so a transient network error
        cannot permanently label the entity as unknown.
    """
    entity_name_lower = entity_name.lower()
    if entity_name_lower in checked_entities:
        return checked_entities[entity_name_lower]
    try:
        results = molecule.search(entity_name)
        known = len(results) > 0
    except Exception as exc:  # not a bare except: Ctrl-C must still interrupt the run
        print(f"ChEMBL lookup failed for {entity_name!r}: {exc}")
        return False
    checked_entities[entity_name_lower] = known
    time.sleep(0.3)  # Sleep to avoid rate limiting
    return known

# For each abstract keep only the entities that the ChEMBL search does not find
df['Novel_Molecules'] = df['Molecules'].apply(
    lambda entities: [name for name in entities if not is_known_drug(name)]
)

# Retain the rows that still list at least one such entity
has_novel = df['Novel_Molecules'].map(len) > 0
filtered_df = df[has_novel]

# Write the surviving rows to disk
filtered_df.to_csv("filtered_novel_molecules.csv", index=False)


In [None]:
# Read the rows that passed the ChEMBL novelty filter
df = pd.read_csv("filtered_novel_molecules.csv")

# Substrings whose presence marks a favourable therapeutic context (extend as needed)
positive_keywords = [
    "effective", "inhibits", "inhibitor", "reduces",
    "suppresses", "treatment", "therapeutic", "antagonist",
]
# Substrings used to tag an abstract with a disease (extend as needed)
disease_keywords = [
    "cancer", "diabetes", "infection", "tumor",
    "alzheimer", "parkinson", "arthritis", "leukemia",
]

# Function to extract positive context
def _parse_list_cell(value):
    """Return ``value`` as a list if it is a list/tuple or the repr of one, else None."""
    if isinstance(value, (list, tuple)):
        return list(value)
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return None
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
    return None


def has_positive_context(abstract, molecule_list, keywords=None):
    """Tell whether an abstract mentions a listed molecule and a positive keyword.

    Both inputs may come straight from a CSV, where list columns are stored as
    the repr of a Python list. Such strings are parsed back into lists; without
    this, iterating ``molecule_list`` would walk over single characters and
    nearly always match.

    Args:
        abstract: abstract text, a token list, or the repr of a token list.
        molecule_list: list of molecule names, or the repr of such a list.
        keywords: substrings that mark a positive context; defaults to the
            module-level ``positive_keywords``.

    Returns:
        True when at least one keyword and at least one molecule name occur
        (case-insensitively, as substrings) in the abstract, else False.
    """
    if keywords is None:
        keywords = positive_keywords

    molecules = _parse_list_cell(molecule_list)
    if molecules is None:
        # A bare name counts as one molecule; NaN/None count as none.
        molecules = [molecule_list] if isinstance(molecule_list, str) else []

    # Search the raw cell text and, when the cell holds a token list, the
    # space-joined tokens too, so multi-word names match either serialisation.
    haystacks = []
    if isinstance(abstract, str):
        haystacks.append(abstract.lower())
    tokens = _parse_list_cell(abstract)
    if tokens is not None:
        haystacks.append(" ".join(str(tok) for tok in tokens).lower())

    # The keyword test does not depend on the molecule, so evaluate it once.
    if not any(kw in hay for kw in keywords for hay in haystacks):
        return False
    return any(str(mol).lower() in hay for mol in molecules for hay in haystacks)

# Function to extract disease mention
def extract_disease(abstract, keywords=None):
    """List the disease keywords that occur in ``abstract``.

    Matching is a case-insensitive substring test. The result follows the
    order of ``keywords`` with duplicates removed, so it is identical from run
    to run (a ``set`` round-trip would make the order depend on string hashing).

    Args:
        abstract: the text to scan; non-string values (e.g. NaN) yield [].
        keywords: lower-case substrings to look for; defaults to the
            module-level ``disease_keywords``.

    Returns:
        A list of the matching keywords.
    """
    if keywords is None:
        keywords = disease_keywords
    if not isinstance(abstract, str):
        return []
    abstract_lower = abstract.lower()
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(d for d in keywords if d in abstract_lower))

# Derive the two context columns from the processed abstract
def _positive_effect_for_row(row):
    """Evaluate has_positive_context for a single DataFrame row."""
    return has_positive_context(row['Processed_abstract'], row['Novel_Molecules'])

df['Has_Positive_Effect'] = df.apply(_positive_effect_for_row, axis=1)
df['Associated_Diseases'] = df['Processed_abstract'].map(extract_disease)

# Write the enriched table to disk
df.to_csv("context_extracted_novel_molecules.csv", index=False)

In [None]:
# Load the context-annotated table
df = pd.read_csv('context_extracted_novel_molecules.csv')


def _parse_if_string(cell):
    """Turn the text form of a list back into a list; pass other values through."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


df['Associated_Diseases'] = df['Associated_Diseases'].apply(_parse_if_string)

# Keep only the rows whose disease list has at least one entry
keep_mask = df['Associated_Diseases'].apply(lambda diseases: len(diseases) > 0)
df_cleaned = df[keep_mask]

# Write the reduced table to its own CSV
output_path = 'cleaned_novel_molecules.csv'
df_cleaned.to_csv(output_path, index=False)

print(f"Cleaned file saved as {output_path}")

In [None]:
import pandas as pd
from collections import Counter
import ast

# Load the cleaned CSV file
file_path = 'cleaned_novel_molecules.csv'
df = pd.read_csv(file_path)

# Convert the string representation of lists into actual lists using ast.literal_eval
df['Associated_Diseases'] = df['Associated_Diseases'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Flatten the per-row disease lists in a single pass. (sum(lists, []) copies
# the growing accumulator on every addition, which is quadratic.)
all_diseases = [name for diseases in df['Associated_Diseases'] for name in diseases]
disease_counts = Counter(all_diseases)

# Display the count of each unique disease
print("Disease counts:")
for disease, count in disease_counts.items():
    print(f"{disease}: {count}")

# Perform some basic statistical analysis
print("\nBasic Statistics:")
# Count the number of rows with a single disease vs. multiple diseases
diseases_per_row = df['Associated_Diseases'].apply(len)
single_disease_count = (diseases_per_row == 1).sum()
multiple_diseases_count = (diseases_per_row > 1).sum()

# Output statistics
print(f"Rows with a single disease: {single_disease_count}")
print(f"Rows with multiple diseases: {multiple_diseases_count}")
print(f"Total number of rows: {len(df)}")

In [None]:
import matplotlib.pyplot as plt

# Materialise the Counter views once: matplotlib expects array-like inputs,
# and dict views are not sequences (plt.pie in particular converts its input
# to a numeric array).
disease_names = list(disease_counts.keys())
disease_totals = list(disease_counts.values())

# Bar Chart: Visualizing the counts of each disease
plt.figure(figsize=(10,6))
plt.bar(disease_names, disease_totals, color='skyblue')
plt.title("Disease Distribution (Bar Chart)", fontsize=14)
plt.xlabel("Diseases", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.xticks(rotation=45)
plt.show()

# Pie Chart: Visualizing the disease distribution
plt.figure(figsize=(8,8))
plt.pie(disease_totals, labels=disease_names, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired.colors)
plt.title("Disease Distribution (Pie Chart)", fontsize=14)
plt.show()

In [None]:
# Build the disease x effect contingency table that the plot below needs.
# It is computed here from the cleaned CSV so the cell does not depend on a
# variable left over from an earlier kernel session.
effect_source_df = pd.read_csv('cleaned_novel_molecules.csv')
effect_source_df['Associated_Diseases'] = effect_source_df['Associated_Diseases'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
# One row per (abstract, disease) pair, then count abstracts per disease/effect.
effect_exploded_df = effect_source_df.explode('Associated_Diseases')
disease_effect_counts = pd.crosstab(
    effect_exploded_df['Associated_Diseases'],
    effect_exploded_df['Has_Positive_Effect'].astype(bool),
).reindex(columns=[False, True], fill_value=0)  # guarantee both columns exist

# Reset index to convert the row labels into a column for plotting
disease_effect_df = disease_effect_counts.reset_index()

# Plotting
plt.figure(figsize=(10,6))
bar_width = 0.4
x = range(len(disease_effect_df))

# Bar for False (No Positive Effect)
plt.bar(
    [i - bar_width/2 for i in x],
    disease_effect_df[False],
    width=bar_width,
    label='No Positive Effect',
    color='salmon'
)

# Bar for True (Positive Effect)
plt.bar(
    [i + bar_width/2 for i in x],
    disease_effect_df[True],
    width=bar_width,
    label='Positive Effect',
    color='mediumseagreen'
)

# Customizing the plot
plt.xticks(x, disease_effect_df['Associated_Diseases'], rotation=45)
plt.xlabel('Disease')
plt.ylabel('Count')
plt.title('Positive vs. No Positive Effect by Disease')
plt.legend()
plt.tight_layout()

# Save the plot
plt.savefig('positive_effect_by_disease.png', dpi=300)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import ast
from collections import Counter
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv("cleaned_novel_molecules.csv")

# Convert stringified lists to actual lists
# NOTE(review): this uses 'Molecules' (every extracted entity), not
# 'Novel_Molecules' — confirm that is the intended column.
df['Molecules'] = df['Molecules'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])

# Separate data into positive and non-positive effect groups
positive_df = df[df['Has_Positive_Effect'] == True]
negative_df = df[df['Has_Positive_Effect'] == False]

# Count molecule occurrences in each group (single-pass flatten; sum(..., [])
# would re-copy the accumulated list for every row)
positive_molecules = [name for mols in positive_df['Molecules'] for name in mols]
negative_molecules = [name for mols in negative_df['Molecules'] for name in mols]

positive_counts = Counter(positive_molecules)
negative_counts = Counter(negative_molecules)

# Create DataFrame for comparison. Sorting the names makes the row order, and
# therefore the tie-breaking in the top-10 cut below, identical on every run
# instead of depending on set iteration order.
all_molecules = sorted(set(positive_counts) | set(negative_counts))
comparison_df = pd.DataFrame({
    'Molecule': all_molecules,
    'Positive_Effect': [positive_counts.get(m, 0) for m in all_molecules],
    'No_Positive_Effect': [negative_counts.get(m, 0) for m in all_molecules]
})

# Sort by most positively associated molecules; ties are broken by name
comparison_df = comparison_df.sort_values(
    by=['Positive_Effect', 'Molecule'], ascending=[False, True]
).head(10)

# Plot
plt.figure(figsize=(12,6))
bar_width = 0.4
x = range(len(comparison_df))

plt.bar([i - bar_width/2 for i in x], comparison_df['No_Positive_Effect'], width=bar_width, label='No Positive Effect', color='gray')
plt.bar([i + bar_width/2 for i in x], comparison_df['Positive_Effect'], width=bar_width, label='Positive Effect', color='green')

plt.xticks(x, comparison_df['Molecule'], rotation=45)
plt.xlabel('Molecule')
plt.ylabel('Count')
plt.title('Top Molecules vs. Positive Effect')
plt.legend()
plt.tight_layout()

# Save the figure
plt.savefig('molecule_positive_effect_correlation.png', dpi=300)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import ast
from collections import defaultdict, Counter

# Load your dataset
df = pd.read_csv("cleaned_novel_molecules.csv")

# Convert stringified lists into actual Python lists
df['Molecules'] = df['Molecules'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])
df['Associated_Diseases'] = df['Associated_Diseases'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])

# Initialize a dictionary to collect molecule counts per disease
disease_molecule_map = defaultdict(list)

# Populate the map: every molecule of a row is credited to each of its diseases
for _, row in df.iterrows():
    for disease in row['Associated_Diseases']:
        disease_molecule_map[disease].extend(row['Molecules'])

# Create a dictionary of Counters for each disease
disease_molecule_counts = {disease: Counter(molecules) for disease, molecules in disease_molecule_map.items()}

# Example: Print top 5 molecules for each disease.
# The loop variable is deliberately not called `molecule`: that name is bound
# to the ChEMBL client earlier in the notebook and is_known_drug() relies on it.
for disease, counter in disease_molecule_counts.items():
    print(f"\nTop molecules associated with '{disease}':")
    for molecule_name, count in counter.most_common(5):
        print(f"  {molecule_name}: {count}")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter  # Assuming you are using Counter for disease_molecule_counts
import ast
from collections import defaultdict, Counter

# Load your dataset
df = pd.read_csv("cleaned_novel_molecules.csv")

# Convert stringified lists into actual Python lists
df['Molecules'] = df['Molecules'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])
df['Associated_Diseases'] = df['Associated_Diseases'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])

# Initialize a dictionary to collect molecule counts per disease
disease_molecule_map = defaultdict(list)

# Populate the map
for _, row in df.iterrows():
    for disease in row['Associated_Diseases']:
        disease_molecule_map[disease].extend(row['Molecules'])

# Create a dictionary of Counters for each disease
disease_molecule_counts = {disease: Counter(molecules) for disease, molecules in disease_molecule_map.items()}

# Bar colours, defined once outside the loop; only the first len(molecules)
# entries are used per figure.
color_list = ['#A8DADC',  # soft teal
              '#F4A261',  # warm orange
              '#E76F51',  # coral
              '#2A9D8F',  # emerald green
              '#264653',  # deep navy
              '#C44E52',  # brick red
              '#8172B3',  # muted purple
              '#CCB974']  # khaki gold

# Loop through each disease and plot top 5 molecules
for disease, counter in disease_molecule_counts.items():
    top_molecules = counter.most_common(5)
    if not top_molecules:
        # Nothing to draw, and zip(*[]) could not be unpacked into two names.
        continue
    molecules, counts = zip(*top_molecules)

    fig = plt.figure(figsize=(8, 5))
    plt.bar(molecules, counts, color=color_list[:len(molecules)])
    plt.title(f"Top 5 Molecules Associated with {disease.capitalize()}")
    plt.xlabel("Molecule")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Save each figure
    plt.savefig(f"top_molecules_{disease}.png")
    plt.show()
    # Release the figure so a long disease list does not accumulate open figures.
    plt.close(fig)

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import ast

# Load data
df = pd.read_csv("cleaned_novel_molecules.csv")

# Convert stringified lists into actual Python lists
df['Molecules'] = df['Molecules'].apply(ast.literal_eval)

# Select top N molecules you want to use as features
# NOTE(review): some entries (e.g. 'crc', 'ra') may be disease abbreviations
# rather than molecules — confirm the list against the NER output.
top_molecules = ['stat3', 'crc', 'csf1r', 'srebp1', 'grp78', 'p53', 'nrf2', 'ra', 'tau', 'sirt2']

# Create one binary indicator column per molecule.
# The loop variable is not named `molecule` because that name holds the ChEMBL
# client earlier in the notebook; rebinding it would break is_known_drug().
for feature_name in top_molecules:
    df[feature_name] = df['Molecules'].apply(lambda mols, name=feature_name: int(name in mols))

# Define features (X) and label (y)
X = df[top_molecules]
y = df['Has_Positive_Effect']

# Train-test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Optionally: show coefficients
coef_df = pd.DataFrame({'Molecule': top_molecules, 'Coefficient': model.coef_[0]})
print("\nMolecule Influence (Coefficients):")
print(coef_df.sort_values(by='Coefficient', ascending=False))