In [4]:
import pandas as pd
df=pd.read_csv("data/Diseases_Symptoms.csv")
df.head()

Unnamed: 0,Code,Name,Symptoms,Treatments
0,1,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o...","Antidepressant medications, Cognitive Behavior..."
1,2,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue","Voice Rest, Speech Therapy, Surgical Removal"
2,3,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck...","Growth hormone therapy, Estrogen replacement t..."
3,4,Cryptorchidism,"Absence or undescended testicle(s), empty scro...",Observation and monitoring (in cases of mild o...
4,5,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala...","Supportive Measures, Gastric Decontamination, ..."


In [5]:
# Examine the data structure
print("Dataset shape:", df.shape)
print("\nColumn names and types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())
print("\nSample symptoms and treatments:")
print("Symptoms example:", df['Symptoms'].iloc[0])
print("Treatments example:", df['Treatments'].iloc[0])

Dataset shape: (400, 4)

Column names and types:
Code           int64
Name          object
Symptoms      object
Treatments    object
dtype: object

Missing values:
Code          0
Name          0
Symptoms      0
Treatments    1
dtype: int64

Sample symptoms and treatments:
Symptoms example: Palpitations, Sweating, Trembling, Shortness of breath, Fear of losing control, Dizziness
Treatments example: Antidepressant medications, Cognitive Behavioral Therapy, Relaxation Techniques


In [6]:
# Install required packages if not already installed
import re
import numpy as np

# Data cleaning and preprocessing functions
def clean_text(text):
    """Clean and standardize text data"""
    if pd.isna(text):
        return ""
    
    # Convert to string and strip whitespace
    text = str(text).strip()
    
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    
    # Remove leading/trailing commas
    text = text.strip(',').strip()
    
    return text

def parse_comma_separated_values(text):
    """Parse comma-separated values and clean them"""
    if pd.isna(text) or text == "":
        return []
    
    # Split by comma and clean each item
    items = [clean_text(item.strip()) for item in str(text).split(',')]
    # Remove empty items
    items = [item for item in items if item]
    
    return items

def standardize_disease_name(name):
    """Standardize disease names"""
    name = clean_text(name)
    # Convert to title case for consistency
    name = name.title()
    return name

# Apply initial cleaning
print("Cleaning data...")
df_clean = df.copy()

# Clean disease names
df_clean['Name'] = df_clean['Name'].apply(standardize_disease_name)

# Clean symptoms and treatments
df_clean['Symptoms'] = df_clean['Symptoms'].apply(clean_text)
df_clean['Treatments'] = df_clean['Treatments'].apply(clean_text)

print("Data cleaned!")
print("Remaining missing treatments:", df_clean['Treatments'].isna().sum())

Cleaning data...
Data cleaned!
Remaining missing treatments: 0


In [7]:
# Create separate datasets for Neo4j nodes and relationships

# 1. DISEASES nodes
diseases_df = df_clean[['Code', 'Name']].copy()
diseases_df.columns = ['disease_id', 'disease_name']
diseases_df['node_type'] = 'DISEASE'

print("Diseases dataset:")
print(diseases_df.head())
print(f"Total diseases: {len(diseases_df)}")

# 2. SYMPTOMS nodes - extract unique symptoms
symptoms_list = []
for idx, row in df_clean.iterrows():
    symptoms = parse_comma_separated_values(row['Symptoms'])
    for symptom in symptoms:
        if symptom:  # Only non-empty symptoms
            symptoms_list.append({
                'symptom_name': symptom.strip().title(),
                'disease_id': row['Code'],
                'disease_name': row['Name']
            })

# Create unique symptoms dataframe
symptoms_df = pd.DataFrame(symptoms_list)
unique_symptoms = symptoms_df[['symptom_name']].drop_duplicates().reset_index(drop=True)
unique_symptoms['symptom_id'] = range(1, len(unique_symptoms) + 1)
unique_symptoms['node_type'] = 'SYMPTOM'

print(f"\nTotal unique symptoms: {len(unique_symptoms)}")
print("Sample symptoms:")
print(unique_symptoms.head(10))

Diseases dataset:
   disease_id                 disease_name node_type
0           1               Panic Disorder   DISEASE
1           2             Vocal Cord Polyp   DISEASE
2           3              Turner Syndrome   DISEASE
3           4               Cryptorchidism   DISEASE
4           5  Ethylene Glycol Poisoning-1   DISEASE
Total diseases: 400

Total unique symptoms: 939
Sample symptoms:
             symptom_name  symptom_id node_type
0            Palpitations           1   SYMPTOM
1                Sweating           2   SYMPTOM
2               Trembling           3   SYMPTOM
3     Shortness Of Breath           4   SYMPTOM
4  Fear Of Losing Control           5   SYMPTOM
5               Dizziness           6   SYMPTOM
6              Hoarseness           7   SYMPTOM
7           Vocal Changes           8   SYMPTOM
8           Vocal Fatigue           9   SYMPTOM
9           Short Stature          10   SYMPTOM


In [8]:
# 3. TREATMENTS nodes - extract unique treatments
treatments_list = []
for idx, row in df_clean.iterrows():
    treatments = parse_comma_separated_values(row['Treatments'])
    for treatment in treatments:
        if treatment:  # Only non-empty treatments
            treatments_list.append({
                'treatment_name': treatment.strip().title(),
                'disease_id': row['Code'],
                'disease_name': row['Name']
            })

# Create unique treatments dataframe
treatments_df = pd.DataFrame(treatments_list)
unique_treatments = treatments_df[['treatment_name']].drop_duplicates().reset_index(drop=True)
unique_treatments['treatment_id'] = range(1, len(unique_treatments) + 1)
unique_treatments['node_type'] = 'TREATMENT'

print(f"Total unique treatments: {len(unique_treatments)}")
print("Sample treatments:")
print(unique_treatments.head(10))

Total unique treatments: 1292
Sample treatments:
                                      treatment_name  treatment_id  node_type
0                         Antidepressant Medications             1  TREATMENT
1                       Cognitive Behavioral Therapy             2  TREATMENT
2                              Relaxation Techniques             3  TREATMENT
3                                         Voice Rest             4  TREATMENT
4                                     Speech Therapy             5  TREATMENT
5                                   Surgical Removal             6  TREATMENT
6                             Growth Hormone Therapy             7  TREATMENT
7                       Estrogen Replacement Therapy             8  TREATMENT
8                      Cardiac And Renal Evaluations             9  TREATMENT
9  Observation And Monitoring (In Cases Of Mild O...            10  TREATMENT


In [9]:
# 4. Create RELATIONSHIPS datasets

# DISEASE-HAS_SYMPTOM-SYMPTOM relationships
disease_symptom_rels = []
for idx, row in df_clean.iterrows():
    symptoms = parse_comma_separated_values(row['Symptoms'])
    for symptom in symptoms:
        if symptom:
            symptom_clean = symptom.strip().title()
            # Find symptom_id
            symptom_id = unique_symptoms[unique_symptoms['symptom_name'] == symptom_clean]['symptom_id'].values
            if len(symptom_id) > 0:
                disease_symptom_rels.append({
                    'disease_id': row['Code'],
                    'symptom_id': symptom_id[0],
                    'relationship': 'HAS_SYMPTOM'
                })

disease_symptom_df = pd.DataFrame(disease_symptom_rels)

# DISEASE-TREATED_BY-TREATMENT relationships
disease_treatment_rels = []
for idx, row in df_clean.iterrows():
    treatments = parse_comma_separated_values(row['Treatments'])
    for treatment in treatments:
        if treatment:
            treatment_clean = treatment.strip().title()
            # Find treatment_id
            treatment_id = unique_treatments[unique_treatments['treatment_name'] == treatment_clean]['treatment_id'].values
            if len(treatment_id) > 0:
                disease_treatment_rels.append({
                    'disease_id': row['Code'],
                    'treatment_id': treatment_id[0],
                    'relationship': 'TREATED_BY'
                })

disease_treatment_df = pd.DataFrame(disease_treatment_rels)

print(f"Disease-Symptom relationships: {len(disease_symptom_df)}")
print(f"Disease-Treatment relationships: {len(disease_treatment_df)}")

# Display sample relationships
print("\nSample Disease-Symptom relationships:")
print(disease_symptom_df.head())
print("\nSample Disease-Treatment relationships:")
print(disease_treatment_df.head())

Disease-Symptom relationships: 1682
Disease-Treatment relationships: 2000

Sample Disease-Symptom relationships:
   disease_id  symptom_id relationship
0           1           1  HAS_SYMPTOM
1           1           2  HAS_SYMPTOM
2           1           3  HAS_SYMPTOM
3           1           4  HAS_SYMPTOM
4           1           5  HAS_SYMPTOM

Sample Disease-Treatment relationships:
   disease_id  treatment_id relationship
0           1             1   TREATED_BY
1           1             2   TREATED_BY
2           1             3   TREATED_BY
3           2             4   TREATED_BY
4           2             5   TREATED_BY


In [10]:
# 5. Data quality checks and validation
print("=== DATA QUALITY CHECKS ===")

# Check for duplicates
print(f"Duplicate diseases: {diseases_df['disease_name'].duplicated().sum()}")
print(f"Duplicate symptoms: {unique_symptoms['symptom_name'].duplicated().sum()}")
print(f"Duplicate treatments: {unique_treatments['treatment_name'].duplicated().sum()}")

# Check for missing values
print(f"\nMissing values in diseases: {diseases_df.isnull().sum().sum()}")
print(f"Missing values in symptoms: {unique_symptoms.isnull().sum().sum()}")
print(f"Missing values in treatments: {unique_treatments.isnull().sum().sum()}")

# Check relationship integrity
print(f"\nRelationship integrity:")
print(f"All disease IDs in symptom relationships exist: {disease_symptom_df['disease_id'].isin(diseases_df['disease_id']).all()}")
print(f"All disease IDs in treatment relationships exist: {disease_treatment_df['disease_id'].isin(diseases_df['disease_id']).all()}")

# Display some statistics
print(f"\n=== SUMMARY STATISTICS ===")
print(f"Total nodes: {len(diseases_df) + len(unique_symptoms) + len(unique_treatments)}")
print(f"- Diseases: {len(diseases_df)}")
print(f"- Symptoms: {len(unique_symptoms)}")
print(f"- Treatments: {len(unique_treatments)}")
print(f"Total relationships: {len(disease_symptom_df) + len(disease_treatment_df)}")
print(f"- Disease-Symptom: {len(disease_symptom_df)}")
print(f"- Disease-Treatment: {len(disease_treatment_df)}")

# Average symptoms and treatments per disease
avg_symptoms = len(disease_symptom_df) / len(diseases_df)
avg_treatments = len(disease_treatment_df) / len(diseases_df)
print(f"\nAverage symptoms per disease: {avg_symptoms:.2f}")
print(f"Average treatments per disease: {avg_treatments:.2f}")

=== DATA QUALITY CHECKS ===
Duplicate diseases: 8
Duplicate symptoms: 0
Duplicate treatments: 0

Missing values in diseases: 0
Missing values in symptoms: 0
Missing values in treatments: 0

Relationship integrity:
All disease IDs in symptom relationships exist: True
All disease IDs in treatment relationships exist: True

=== SUMMARY STATISTICS ===
Total nodes: 2631
- Diseases: 400
- Symptoms: 939
- Treatments: 1292
Total relationships: 3682
- Disease-Symptom: 1682
- Disease-Treatment: 2000

Average symptoms per disease: 4.21
Average treatments per disease: 5.00


In [11]:
# Handle duplicate diseases by creating unique identifiers
print("Handling duplicate diseases...")

# Check which diseases are duplicated
duplicated_diseases = diseases_df[diseases_df['disease_name'].duplicated(keep=False)].sort_values('disease_name')
print("Duplicated diseases:")
print(duplicated_diseases)

# For Neo4j, we'll keep original disease_id as unique identifier but note the duplicates
# This preserves the original relationships while maintaining data integrity

Handling duplicate diseases...
Duplicated diseases:
     disease_id                           disease_name node_type
52           53  Complex Regional Pain Syndrome (Crps)   DISEASE
378         379  Complex Regional Pain Syndrome (Crps)   DISEASE
216         217         Dermatitis Due To Sun Exposure   DISEASE
389         390         Dermatitis Due To Sun Exposure   DISEASE
162         163                          Endometriosis   DISEASE
380         381                          Endometriosis   DISEASE
50           51                           Fibromyalgia   DISEASE
80           81                           Fibromyalgia   DISEASE
18           19                               Mucocele   DISEASE
27           28                               Mucocele   DISEASE
67           68                               Sciatica   DISEASE
204         205                               Sciatica   DISEASE
377         378                               Sciatica   DISEASE
61           62          Urinary Tract

In [12]:
# 6. Export to CSV files for Neo4j import
import os

# Create output directory
output_dir = "neo4j_data"
os.makedirs(output_dir, exist_ok=True)

# Prepare final datasets for Neo4j
# For nodes, we need: id, name, and node_type

# 1. Export Diseases nodes
diseases_final = diseases_df[['disease_id', 'disease_name', 'node_type']].copy()
diseases_final.columns = ['id', 'name', 'type']
diseases_final.to_csv(f"{output_dir}/diseases.csv", index=False)

# 2. Export Symptoms nodes
symptoms_final = unique_symptoms[['symptom_id', 'symptom_name', 'node_type']].copy()
symptoms_final.columns = ['id', 'name', 'type']
symptoms_final.to_csv(f"{output_dir}/symptoms.csv", index=False)

# 3. Export Treatments nodes
treatments_final = unique_treatments[['treatment_id', 'treatment_name', 'node_type']].copy()
treatments_final.columns = ['id', 'name', 'type']
treatments_final.to_csv(f"{output_dir}/treatments.csv", index=False)

# 4. Export Disease-Symptom relationships
disease_symptom_final = disease_symptom_df.copy()
disease_symptom_final.columns = ['from_id', 'to_id', 'relationship_type']
disease_symptom_final.to_csv(f"{output_dir}/disease_symptom_relationships.csv", index=False)

# 5. Export Disease-Treatment relationships  
disease_treatment_final = disease_treatment_df.copy()
disease_treatment_final.columns = ['from_id', 'to_id', 'relationship_type']
disease_treatment_final.to_csv(f"{output_dir}/disease_treatment_relationships.csv", index=False)

print("✅ CSV files exported successfully!")
print(f"Files created in '{output_dir}' directory:")
print("1. diseases.csv - Disease nodes")
print("2. symptoms.csv - Symptom nodes") 
print("3. treatments.csv - Treatment nodes")
print("4. disease_symptom_relationships.csv - Disease->Symptom relationships")
print("5. disease_treatment_relationships.csv - Disease->Treatment relationships")

# Display file sizes
for filename in ["diseases.csv", "symptoms.csv", "treatments.csv", 
                 "disease_symptom_relationships.csv", "disease_treatment_relationships.csv"]:
    filepath = os.path.join(output_dir, filename)
    if os.path.exists(filepath):
        size = os.path.getsize(filepath)
        print(f"{filename}: {size:,} bytes")

✅ CSV files exported successfully!
Files created in 'neo4j_data' directory:
1. diseases.csv - Disease nodes
2. symptoms.csv - Symptom nodes
3. treatments.csv - Treatment nodes
4. disease_symptom_relationships.csv - Disease->Symptom relationships
5. disease_treatment_relationships.csv - Disease->Treatment relationships
diseases.csv: 12,647 bytes
symptoms.csv: 36,168 bytes
treatments.csv: 65,028 bytes
disease_symptom_relationships.csv: 34,302 bytes
disease_treatment_relationships.csv: 39,314 bytes


In [13]:
# 7. Create Neo4j Cypher import scripts
cypher_script = '''
// Neo4j Knowledge Graph Import Script
// Run these commands in Neo4j Browser or Neo4j Desktop

// 1. Create constraints and indexes for better performance
CREATE CONSTRAINT disease_id IF NOT EXISTS FOR (d:Disease) REQUIRE d.id IS UNIQUE;
CREATE CONSTRAINT symptom_id IF NOT EXISTS FOR (s:Symptom) REQUIRE s.id IS UNIQUE;
CREATE CONSTRAINT treatment_id IF NOT EXISTS FOR (t:Treatment) REQUIRE t.id IS UNIQUE;

// 2. Load Disease nodes
LOAD CSV WITH HEADERS FROM "file:///diseases.csv" AS row
CREATE (d:Disease {
    id: toInteger(row.id),
    name: row.name,
    type: row.type
});

// 3. Load Symptom nodes  
LOAD CSV WITH HEADERS FROM "file:///symptoms.csv" AS row
CREATE (s:Symptom {
    id: toInteger(row.id),
    name: row.name,
    type: row.type
});

// 4. Load Treatment nodes
LOAD CSV WITH HEADERS FROM "file:///treatments.csv" AS row
CREATE (t:Treatment {
    id: toInteger(row.id),
    name: row.name,
    type: row.type
});

// 5. Create Disease-Symptom relationships
LOAD CSV WITH HEADERS FROM "file:///disease_symptom_relationships.csv" AS row
MATCH (d:Disease {id: toInteger(row.from_id)})
MATCH (s:Symptom {id: toInteger(row.to_id)})
CREATE (d)-[:HAS_SYMPTOM]->(s);

// 6. Create Disease-Treatment relationships
LOAD CSV WITH HEADERS FROM "file:///disease_treatment_relationships.csv" AS row
MATCH (d:Disease {id: toInteger(row.from_id)})
MATCH (t:Treatment {id: toInteger(row.to_id)})
CREATE (d)-[:TREATED_BY]->(t);

// 7. Verify the import
MATCH (n) RETURN labels(n) AS NodeType, count(n) AS Count;
MATCH ()-[r]->() RETURN type(r) AS RelationshipType, count(r) AS Count;
'''

# Save Cypher script
with open(f"{output_dir}/import_script.cypher", "w") as f:
    f.write(cypher_script)

print("✅ Neo4j Cypher import script created: neo4j_data/import_script.cypher")
print("\nTo import into Neo4j:")
print("1. Copy the CSV files to Neo4j's import directory")
print("2. Run the Cypher commands in the import_script.cypher file")
print("3. Or use the Neo4j Admin import tool for large datasets")

✅ Neo4j Cypher import script created: neo4j_data/import_script.cypher

To import into Neo4j:
1. Copy the CSV files to Neo4j's import directory
2. Run the Cypher commands in the import_script.cypher file
3. Or use the Neo4j Admin import tool for large datasets


In [14]:
# 8. Sample visualization and queries for validation
print("=== SAMPLE DATA FOR VALIDATION ===")

# Show a complete example: disease with its symptoms and treatments
sample_disease_id = 1
sample_disease = diseases_df[diseases_df['disease_id'] == sample_disease_id]
sample_symptoms = disease_symptom_df[disease_symptom_df['disease_id'] == sample_disease_id]
sample_treatments = disease_treatment_df[disease_treatment_df['disease_id'] == sample_disease_id]

print(f"\nSample Disease: {sample_disease['disease_name'].iloc[0]}")
print(f"Symptoms ({len(sample_symptoms)}):")
for _, rel in sample_symptoms.iterrows():
    symptom_name = unique_symptoms[unique_symptoms['symptom_id'] == rel['symptom_id']]['symptom_name'].iloc[0]
    print(f"  - {symptom_name}")

print(f"\nTreatments ({len(sample_treatments)}):")
for _, rel in sample_treatments.iterrows():
    treatment_name = unique_treatments[unique_treatments['treatment_id'] == rel['treatment_id']]['treatment_name'].iloc[0]
    print(f"  - {treatment_name}")

print(f"\n=== NEO4J QUERY EXAMPLES ===")
print("""
// Find all symptoms for a specific disease
MATCH (d:Disease {name: "Panic Disorder"})-[:HAS_SYMPTOM]->(s:Symptom)
RETURN d.name, collect(s.name) AS symptoms;

// Find all treatments for a specific disease  
MATCH (d:Disease {name: "Panic Disorder"})-[:TREATED_BY]->(t:Treatment)
RETURN d.name, collect(t.name) AS treatments;

// Find diseases that share symptoms with a given disease
MATCH (d1:Disease {name: "Panic Disorder"})-[:HAS_SYMPTOM]->(s:Symptom)<-[:HAS_SYMPTOM]-(d2:Disease)
WHERE d1 <> d2
RETURN d2.name, count(s) AS shared_symptoms
ORDER BY shared_symptoms DESC;

// Find the most common symptoms across all diseases
MATCH (d:Disease)-[:HAS_SYMPTOM]->(s:Symptom)
RETURN s.name, count(d) AS disease_count
ORDER BY disease_count DESC
LIMIT 10;

// Find diseases that can be treated with similar treatments
MATCH (d1:Disease)-[:TREATED_BY]->(t:Treatment)<-[:TREATED_BY]-(d2:Disease)
WHERE d1 <> d2
RETURN d1.name, d2.name, count(t) AS shared_treatments
ORDER BY shared_treatments DESC
LIMIT 10;
""")

=== SAMPLE DATA FOR VALIDATION ===

Sample Disease: Panic Disorder
Symptoms (6):
  - Palpitations
  - Sweating
  - Trembling
  - Shortness Of Breath
  - Fear Of Losing Control
  - Dizziness

Treatments (3):
  - Antidepressant Medications
  - Cognitive Behavioral Therapy
  - Relaxation Techniques

=== NEO4J QUERY EXAMPLES ===

// Find all symptoms for a specific disease
MATCH (d:Disease {name: "Panic Disorder"})-[:HAS_SYMPTOM]->(s:Symptom)
RETURN d.name, collect(s.name) AS symptoms;

// Find all treatments for a specific disease  
MATCH (d:Disease {name: "Panic Disorder"})-[:TREATED_BY]->(t:Treatment)
RETURN d.name, collect(t.name) AS treatments;

// Find diseases that share symptoms with a given disease
MATCH (d1:Disease {name: "Panic Disorder"})-[:HAS_SYMPTOM]->(s:Symptom)<-[:HAS_SYMPTOM]-(d2:Disease)
WHERE d1 <> d2
RETURN d2.name, count(s) AS shared_symptoms
ORDER BY shared_symptoms DESC;

// Find the most common symptoms across all diseases
MATCH (d:Disease)-[:HAS_SYMPTOM]->(s:Sympt