# Disease Prediction - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis on the disease prediction dataset to understand patterns, relationships, and insights that will guide model development.

## 1. Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import sys
import os
import joblib
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.stats import chi2_contingency
from collections import Counter

# Add src to path
sys.path.append('../src')

from utils.helpers import *

# Settings
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")
pd.set_option('display.max_columns', 50)

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load processed data and preprocessor
train_data = pd.read_csv('../data/processed/train_encoded.csv')
test_data = pd.read_csv('../data/processed/test_encoded.csv')
preprocessor = joblib.load('../models/preprocessor.pkl')

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

# Get feature and target columns
symptom_cols = preprocessor.get_symptom_names()
disease_names = preprocessor.get_disease_names()

print(f"Number of symptoms: {len(symptom_cols)}")
print(f"Number of diseases: {len(disease_names)}")

Training data shape: (4920, 133)
Testing data shape: (42, 133)
Number of symptoms: 132
Number of diseases: 42


## 2. Disease Distribution Analysis

In [3]:
# Decode labels for visualization
train_data['disease_name'] = preprocessor.decode_predictions(train_data['prognosis'])
test_data['disease_name'] = preprocessor.decode_predictions(test_data['prognosis'])

# Disease frequency analysis
disease_counts = train_data['disease_name'].value_counts()

print("DISEASE DISTRIBUTION ANALYSIS:")
print("=" * 50)
print(f"Total unique diseases: {len(disease_counts)}")
print(f"Average samples per disease: {disease_counts.mean():.1f}")
print(f"Standard deviation: {disease_counts.std():.1f}")
print(f"Min samples: {disease_counts.min()}")
print(f"Max samples: {disease_counts.max()}")

print("\nTop 10 most common diseases:")
for i, (disease, count) in enumerate(disease_counts.head(10).items(), 1):
    percentage = (count / len(train_data)) * 100
    print(f"{i:2d}. {disease:25s}: {count:3d} ({percentage:4.1f}%)")

DISEASE DISTRIBUTION ANALYSIS:
Total unique diseases: 42
Average samples per disease: 117.1
Standard deviation: 0.4
Min samples: 117
Max samples: 118

Top 10 most common diseases:
 1. Fungal infection         : 118 ( 2.4%)
 2. GERD                     : 118 ( 2.4%)
 3. Chronic cholestasis      : 118 ( 2.4%)
 4. Drug Reaction            : 118 ( 2.4%)
 5. Peptic ulcer disease     : 118 ( 2.4%)
 6. Allergy                  : 118 ( 2.4%)
 7. Hyperthyroidism          : 117 ( 2.4%)
 8. Common Cold              : 117 ( 2.4%)
 9. Pneumonia                : 117 ( 2.4%)
10. Dimorphic hemmorhoids(piles): 117 ( 2.4%)


In [4]:
# Interactive disease distribution plot
fig = px.bar(
    x=disease_counts.values, 
    y=disease_counts.index,
    orientation='h',
    title="Disease Distribution in Training Data",
    labels={'x': 'Number of Cases', 'y': 'Disease'},
    height=800
)
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

# Pie chart for top diseases
fig = px.pie(
    values=disease_counts.head(15).values, 
    names=disease_counts.head(15).index,
    title="Top 15 Diseases Distribution",
    height=600
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

## 3. Symptom Analysis

In [5]:
# Symptom frequency analysis
symptom_frequencies = train_data[symptom_cols].sum().sort_values(ascending=False)

print("SYMPTOM FREQUENCY ANALYSIS:")
print("=" * 50)
print(f"Total symptoms analyzed: {len(symptom_frequencies)}")
print(f"Average symptom frequency: {symptom_frequencies.mean():.1f}")
print(f"Most common symptom: {symptom_frequencies.index[0]} ({symptom_frequencies.iloc[0]} cases)")
print(f"Least common symptom: {symptom_frequencies.index[-1]} ({symptom_frequencies.iloc[-1]} cases)")

# Symptoms that never appear
zero_symptoms = symptom_frequencies[symptom_frequencies == 0]
print(f"\nSymptoms that never appear: {len(zero_symptoms)}")

# Symptoms that appear in >50% of cases
common_symptoms = symptom_frequencies[symptom_frequencies > len(train_data) * 0.5]
print(f"Symptoms in >50% of cases: {len(common_symptoms)}")

# Symptoms that appear in <1% of cases
rare_symptoms = symptom_frequencies[symptom_frequencies < len(train_data) * 0.01]
print(f"Symptoms in <1% of cases: {len(rare_symptoms)}")

SYMPTOM FREQUENCY ANALYSIS:
Total symptoms analyzed: 132
Average symptom frequency: 186.2
Most common symptom: vomiting (1492 cases)
Least common symptom: bladder_discomfort (6 cases)

Symptoms that never appear: 0
Symptoms in >50% of cases: 0
Symptoms in <1% of cases: 11


In [6]:
# Interactive symptom frequency plot
fig = px.bar(
    x=symptom_frequencies.head(30).values, 
    y=symptom_frequencies.head(30).index,
    orientation='h',
    title="Top 30 Most Common Symptoms",
    labels={'x': 'Frequency', 'y': 'Symptoms'},
    height=800
)
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

# Symptom frequency distribution
fig = px.histogram(
    x=symptom_frequencies.values,
    nbins=50,
    title="Distribution of Symptom Frequencies",
    labels={'x': 'Frequency', 'y': 'Number of Symptoms'}
)
fig.add_vline(x=symptom_frequencies.mean(), line_dash="dash", line_color="red",
              annotation_text=f"Mean: {symptom_frequencies.mean():.1f}")
fig.show()

## 4. Disease-Symptom Relationship Analysis

In [7]:
# Create disease-symptom relationship matrix
def create_disease_symptom_matrix(data, diseases, symptoms, top_n=10):
    """Create matrix showing relationship between diseases and symptoms"""
    # Get top diseases and symptoms
    top_diseases = data['disease_name'].value_counts().head(top_n).index
    top_symptoms = data[symptoms].sum().sort_values(ascending=False).head(20).index
    
    # Create matrix
    matrix = []
    for disease in top_diseases:
        disease_data = data[data['disease_name'] == disease]
        symptom_prevalence = disease_data[top_symptoms].mean()
        matrix.append(symptom_prevalence)
    
    return pd.DataFrame(matrix, index=top_diseases, columns=top_symptoms)

# Create and visualize disease-symptom matrix
disease_symptom_matrix = create_disease_symptom_matrix(train_data, disease_names, symptom_cols)

print(f"Disease-Symptom Matrix Shape: {disease_symptom_matrix.shape}")
print("\nSample of the matrix:")
print(disease_symptom_matrix.iloc[:5, :5].round(2))

Disease-Symptom Matrix Shape: (10, 20)

Sample of the matrix:
                      vomiting  fatigue  headache  high_fever  nausea
disease_name                                                         
Fungal infection          0.00     0.00      0.00        0.02    0.00
GERD                      0.92     0.00      0.00        0.01    0.01
Chronic cholestasis       0.82     0.00      0.00        0.00    0.75
Drug Reaction             0.00     0.01      0.00        0.00    0.00
Peptic ulcer disease      0.75     0.00      0.01        0.00    0.00


In [8]:
# Interactive heatmap of disease-symptom relationships
fig = px.imshow(
    disease_symptom_matrix,
    title="Disease-Symptom Relationship Heatmap (Top 10 Diseases, Top 20 Symptoms)",
    color_continuous_scale="RdYlBu_r",
    aspect="auto",
    height=600
)
fig.update_layout(
    xaxis_title="Symptoms",
    yaxis_title="Diseases"
)
fig.show()

In [9]:
# Find unique symptom patterns for each disease
print("UNIQUE SYMPTOM PATTERNS ANALYSIS:")
print("=" * 50)

for disease in disease_names[:10]:  # Top 10 diseases
    disease_data = train_data[train_data['disease_name'] == disease]
    if len(disease_data) == 0:
        continue
    
    # Find symptoms that are highly associated with this disease
    symptom_prevalence = disease_data[symptom_cols].mean()
    high_prevalence_symptoms = symptom_prevalence[symptom_prevalence > 0.7].sort_values(ascending=False)
    
    if len(high_prevalence_symptoms) > 0:
        print(f"\n{disease}:")
        print(f"  Samples: {len(disease_data)}")
        print(f"  Key symptoms (>70% prevalence):")
        for symptom, prevalence in high_prevalence_symptoms.head(5).items():
            print(f"    - {symptom}: {prevalence:.1%}")

UNIQUE SYMPTOM PATTERNS ANALYSIS:

(vertigo) Paroymsal  Positional Vertigo:
  Samples: 117
  Key symptoms (>70% prevalence):
    - vomiting: 76.1%
    - unsteadiness: 74.4%
    - headache: 72.6%
    - nausea: 70.9%

AIDS:
  Samples: 117
  Key symptoms (>70% prevalence):
    - high_fever: 90.6%
    - extra_marital_contacts: 89.7%
    - patches_in_throat: 80.3%
    - muscle_wasting: 78.6%

Acne:
  Samples: 117
  Key symptoms (>70% prevalence):
    - scurring: 91.5%
    - skin_rash: 87.2%
    - pus_filled_pimples: 87.2%
    - blackheads: 85.5%

Alcoholic hepatitis:
  Samples: 117
  Key symptoms (>70% prevalence):
    - history_of_alcohol_consumption: 77.8%
    - swelling_of_stomach: 72.6%
    - yellowish_skin: 70.9%
    - distention_of_abdomen: 70.9%

Allergy:
  Samples: 118
  Key symptoms (>70% prevalence):
    - shivering: 89.8%
    - chills: 87.3%
    - watering_from_eyes: 87.3%
    - continuous_sneezing: 84.7%

Arthritis:
  Samples: 117
  Key symptoms (>70% prevalence):
    - muscle_w

## 5. Patient Symptom Profile Analysis

In [10]:
# Analyze number of symptoms per patient
symptoms_per_patient = train_data[symptom_cols].sum(axis=1)

print("SYMPTOMS PER PATIENT ANALYSIS:")
print("=" * 50)
print(f"Average symptoms per patient: {symptoms_per_patient.mean():.2f}")
print(f"Median symptoms per patient: {symptoms_per_patient.median():.2f}")
print(f"Standard deviation: {symptoms_per_patient.std():.2f}")
print(f"Min symptoms: {symptoms_per_patient.min()}")
print(f"Max symptoms: {symptoms_per_patient.max()}")

# Distribution by disease
symptoms_by_disease = train_data.groupby('disease_name')[symptom_cols].sum(axis=1).groupby('disease_name').mean()
print(f"\nAverage symptoms by disease (top 5):")
for disease, avg_symptoms in symptoms_by_disease.sort_values(ascending=False).head(5).items():
    print(f"  {disease}: {avg_symptoms:.2f} symptoms")

SYMPTOMS PER PATIENT ANALYSIS:
Average symptoms per patient: 5.00
Median symptoms per patient: 5.00
Standard deviation: 1.65
Min symptoms: 3
Max symptoms: 11


TypeError: GroupBy.sum() got an unexpected keyword argument 'axis'

In [12]:
# Analyze number of symptoms per patient
symptoms_per_patient = train_data[symptom_cols].sum(axis=1)

print("SYMPTOMS PER PATIENT ANALYSIS:")
print("=" * 50)
print(f"Average symptoms per patient: {symptoms_per_patient.mean():.2f}")
print(f"Median symptoms per patient: {symptoms_per_patient.median():.2f}")
print(f"Standard deviation: {symptoms_per_patient.std():.2f}")
print(f"Min symptoms: {symptoms_per_patient.min()}")
print(f"Max symptoms: {symptoms_per_patient.max()}")

# ✅ Distribution by disease (fixed)
train_data["symptom_count"] = train_data[symptom_cols].sum(axis=1)

symptoms_by_disease = train_data.groupby("disease_name")["symptom_count"].mean()

print(f"\nAverage symptoms by disease (top 5):")
for disease, avg_symptoms in symptoms_by_disease.sort_values(ascending=False).head(5).items():
    print(f"  {disease}: {avg_symptoms:.2f} symptoms")


SYMPTOMS PER PATIENT ANALYSIS:
Average symptoms per patient: 5.00
Median symptoms per patient: 5.00
Standard deviation: 1.65
Min symptoms: 3
Max symptoms: 11

Average symptoms by disease (top 5):
  Chicken pox: 6.12 symptoms
  Pneumonia: 6.12 symptoms
  Tuberculosis: 6.08 symptoms
  Hepatitis D: 6.03 symptoms
  Jaundice: 5.97 symptoms


In [13]:
# Visualize symptoms per patient distribution
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=['Histogram', 'Box Plot', 'Violin Plot', 'by Disease (Top 10)'],
    specs=[[{'type': 'histogram'}, {'type': 'box'}],
           [{'type': 'violin'}, {'type': 'bar'}]]
)

# Histogram
fig.add_trace(
    go.Histogram(x=symptoms_per_patient, name='Symptoms per Patient', nbinsx=20),
    row=1, col=1
)

# Box plot
fig.add_trace(
    go.Box(y=symptoms_per_patient, name='Distribution'),
    row=1, col=2
)

# Violin plot
fig.add_trace(
    go.Violin(y=symptoms_per_patient, name='Distribution'),
    row=2, col=1
)

# By disease
top_10_diseases = symptoms_by_disease.sort_values(ascending=False).head(10)
fig.add_trace(
    go.Bar(x=top_10_diseases.values, y=top_10_diseases.index, orientation='h'),
    row=2, col=2
)

fig.update_layout(
    title_text="Symptoms per Patient Analysis",
    height=800,
    showlegend=False
)

fig.show()

## 6. Correlation Analysis

In [14]:
# Calculate correlation matrix for top symptoms
top_20_symptoms = symptom_frequencies.head(20).index
correlation_matrix = train_data[top_20_symptoms].corr()

print("SYMPTOM CORRELATION ANALYSIS:")
print("=" * 50)

# Find highly correlated symptom pairs
high_corr_pairs = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        corr_value = correlation_matrix.iloc[i, j]
        if abs(corr_value) > 0.3:  # Threshold for high correlation
            high_corr_pairs.append((
                correlation_matrix.columns[i],
                correlation_matrix.columns[j],
                corr_value
            ))

# Sort by absolute correlation
high_corr_pairs = sorted(high_corr_pairs, key=lambda x: abs(x[2]), reverse=True)

print(f"Found {len(high_corr_pairs)} highly correlated symptom pairs (|r| > 0.3):")
for symptom1, symptom2, corr in high_corr_pairs[:10]:
    print(f"  {symptom1} <-> {symptom2}: {corr:.3f}")

SYMPTOM CORRELATION ANALYSIS:
Found 9 highly correlated symptom pairs (|r| > 0.3):
  chest_pain <-> breathlessness: 0.387
  yellowish_skin <-> dark_urine: 0.349
  abdominal_pain <-> dark_urine: 0.326
  skin_rash <-> itching: 0.326
  yellowish_skin <-> yellowing_of_eyes: 0.316
  loss_of_appetite <-> yellowing_of_eyes: 0.312
  abdominal_pain <-> yellowish_skin: 0.306
  sweating <-> breathlessness: 0.303
  loss_of_appetite <-> yellowish_skin: 0.300


In [15]:
# Interactive correlation heatmap
fig = px.imshow(
    correlation_matrix,
    title="Symptom Correlation Heatmap (Top 20 Symptoms)",
    color_continuous_scale="RdBu",
    aspect="auto",
    height=700
)
fig.update_layout(
    xaxis_title="Symptoms",
    yaxis_title="Symptoms"
)
fig.show()

## 7. Dimensionality Reduction Analysis

In [16]:
# PCA Analysis
from sklearn.preprocessing import StandardScaler

# Prepare data for PCA
X = train_data[symptom_cols].values
y = train_data['prognosis'].values

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Calculate cumulative explained variance ratio
cumsum_variance = np.cumsum(pca.explained_variance_ratio_)

print("PCA ANALYSIS:")
print("=" * 50)
print(f"Total components: {len(pca.components_)}")
print(f"Components for 90% variance: {np.argmax(cumsum_variance >= 0.90) + 1}")
print(f"Components for 95% variance: {np.argmax(cumsum_variance >= 0.95) + 1}")
print(f"Components for 99% variance: {np.argmax(cumsum_variance >= 0.99) + 1}")

print(f"\nFirst 10 components explain {cumsum_variance[9]:.1%} of variance")
print(f"First 20 components explain {cumsum_variance[19]:.1%} of variance")
print(f"First 50 components explain {cumsum_variance[49]:.1%} of variance")

PCA ANALYSIS:
Total components: 132
Components for 90% variance: 88
Components for 95% variance: 107
Components for 99% variance: 126

First 10 components explain 27.3% of variance
First 20 components explain 46.7% of variance
First 50 components explain 75.4% of variance


In [18]:
# Visualize PCA results
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=['Explained Variance Ratio', 'Cumulative Explained Variance']
)

# Individual explained variance
fig.add_trace(
    go.Bar(x=list(range(1, 21)), y=pca.explained_variance_ratio_[:20],
           name='Individual Variance'),
    row=1, col=1
)

# Cumulative explained variance
fig.add_trace(
    go.Scatter(x=list(range(1, 101)), y=cumsum_variance[:100],
               mode='lines+markers', name='Cumulative Variance'),
    row=1, col=2
)

# Add reference lines
fig.add_hline(y=0.90, line_dash="dash", line_color="red", 
              annotation_text="90%", row=1, col=2)
fig.add_hline(y=0.95, line_dash="dash", line_color="orange", 
              annotation_text="95%", row=1, col=2)

fig.update_layout(
    title_text="PCA Analysis Results",
    height=400,
    showlegend=False
)

fig.show()

In [19]:
# 2D PCA visualization
# Use first 2 principal components
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# Create DataFrame for plotting
pca_df = pd.DataFrame({
    'PC1': X_pca_2d[:, 0],
    'PC2': X_pca_2d[:, 1],
    'Disease': train_data['disease_name']
})

# Plot only top 10 diseases for clarity
top_diseases = train_data['disease_name'].value_counts().head(10).index
pca_df_subset = pca_df[pca_df['Disease'].isin(top_diseases)]

fig = px.scatter(
    pca_df_subset, x='PC1', y='PC2', color='Disease',
    title=f"PCA Visualization (Top 10 Diseases)\nPC1: {pca_2d.explained_variance_ratio_[0]:.1%} variance, PC2: {pca_2d.explained_variance_ratio_[1]:.1%} variance",
    height=600
)

fig.update_traces(marker=dict(size=8, line=dict(width=1, color='white')))
fig.show()

## 8. Statistical Tests and Feature Importance

In [20]:
# Chi-square test for feature independence
from sklearn.feature_selection import chi2, SelectKBest

print("FEATURE IMPORTANCE ANALYSIS:")
print("=" * 50)

# Calculate chi-square statistics
chi2_scores, p_values = chi2(X, y)

# Create feature importance DataFrame
feature_importance = pd.DataFrame({
    'Feature': symptom_cols,
    'Chi2_Score': chi2_scores,
    'P_Value': p_values
}).sort_values('Chi2_Score', ascending=False)

print("Top 20 most important features (Chi-square test):")
for i, (_, row) in enumerate(feature_importance.head(20).iterrows(), 1):
    print(f"{i:2d}. {row['Feature']:25s}: {row['Chi2_Score']:8.2f} (p={row['P_Value']:.2e})")

# Features with low importance
low_importance = feature_importance[feature_importance['Chi2_Score'] < 1.0]
print(f"\nFeatures with low importance (Chi2 < 1.0): {len(low_importance)}")

# Features with high p-values (not statistically significant)
high_p_value = feature_importance[feature_importance['P_Value'] > 0.05]
print(f"Features with p-value > 0.05: {len(high_p_value)}")

FEATURE IMPORTANCE ANALYSIS:
Top 20 most important features (Chi-square test):
 1. scurring                 :  4075.14 (p=0.00e+00)
 2. extra_marital_contacts   :  3992.76 (p=0.00e+00)
 3. altered_sensorium        :  3909.01 (p=0.00e+00)
 4. pus_filled_pimples       :  3907.47 (p=0.00e+00)
 5. nodal_skin_eruptions     :  3837.40 (p=0.00e+00)
 6. continuous_feel_of_urine :  3830.31 (p=0.00e+00)
 7. blackheads               :  3825.77 (p=0.00e+00)
 8. shivering                :  3789.64 (p=0.00e+00)
 9. burning_micturition      :  3784.77 (p=0.00e+00)
10. watering_from_eyes       :  3738.02 (p=0.00e+00)
11. spotting_urination       :  3725.49 (p=0.00e+00)
12. foul_smell_of_urine      :  3716.66 (p=0.00e+00)
13. stomach_pain             :  3666.54 (p=0.00e+00)
14. muscle_weakness          :  3648.57 (p=0.00e+00)
15. red_sore_around_nose     :  3638.41 (p=0.00e+00)
16. yellow_crust_ooze        :  3625.62 (p=0.00e+00)
17. painful_walking          :  3623.64 (p=0.00e+00)
18. dischromic_patch

In [21]:
# Visualize feature importance
top_30_features = feature_importance.head(30)

fig = px.bar(
    top_30_features, 
    x='Chi2_Score', 
    y='Feature',
    orientation='h',
    title="Top 30 Most Important Features (Chi-square Test)",
    labels={'Chi2_Score': 'Chi-square Score'},
    height=800
)

fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

## 9. Clustering Analysis

In [22]:
# K-means clustering analysis
from sklearn.metrics import silhouette_score

print("CLUSTERING ANALYSIS:")
print("=" * 50)

# Try different numbers of clusters
k_range = range(2, 11)
silhouette_scores = []
inertias = []

# Use PCA-reduced data for clustering (first 50 components)
pca_50 = PCA(n_components=50)
X_pca_50 = pca_50.fit_transform(X_scaled)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_pca_50)
    
    silhouette_avg = silhouette_score(X_pca_50, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    inertias.append(kmeans.inertia_)
    
    print(f"k={k}: Silhouette Score = {silhouette_avg:.3f}, Inertia = {kmeans.inertia_:.0f}")

# Find optimal k
optimal_k = k_range[np.argmax(silhouette_scores)]
print(f"\nOptimal k based on silhouette score: {optimal_k}")

CLUSTERING ANALYSIS:
k=2: Silhouette Score = 0.092, Inertia = 473361
k=3: Silhouette Score = 0.051, Inertia = 458902
k=4: Silhouette Score = 0.066, Inertia = 443220
k=5: Silhouette Score = 0.073, Inertia = 428017
k=6: Silhouette Score = 0.094, Inertia = 414513
k=7: Silhouette Score = 0.128, Inertia = 404004
k=8: Silhouette Score = 0.117, Inertia = 386067
k=9: Silhouette Score = 0.123, Inertia = 371983
k=10: Silhouette Score = 0.169, Inertia = 362468

Optimal k based on silhouette score: 10


In [23]:
# Visualize clustering results
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=['Silhouette Score vs K', 'Elbow Method (Inertia vs K)']
)

# Silhouette scores
fig.add_trace(
    go.Scatter(x=list(k_range), y=silhouette_scores, mode='lines+markers',
               name='Silhouette Score'),
    row=1, col=1
)

# Inertia (for elbow method)
fig.add_trace(
    go.Scatter(x=list(k_range), y=inertias, mode='lines+markers',
               name='Inertia'),
    row=1, col=2
)

# Highlight optimal k
fig.add_vline(x=optimal_k, line_dash="dash", line_color="red",
              annotation_text=f"Optimal k={optimal_k}", row=1, col=1)

fig.update_layout(
    title_text="K-means Clustering Analysis",
    height=400,
    showlegend=False
)

fig.show()

## 10. Key Insights and Recommendations

In [24]:
# Generate comprehensive insights
print("🔍 COMPREHENSIVE EDA INSIGHTS")
print("=" * 60)

# Data characteristics
print("📊 DATA CHARACTERISTICS:")
print(f"  • Dataset size: {len(train_data):,} training samples, {len(test_data)} test samples")
print(f"  • Features: {len(symptom_cols)} symptoms")
print(f"  • Classes: {len(disease_names)} diseases")
print(f"  • Class balance: {'Balanced' if disease_counts.std() < 5 else 'Imbalanced'} (std: {disease_counts.std():.1f})")
print(f"  • Sparsity: {(1 - train_data[symptom_cols].mean().mean()):.1%} of features are zeros")

# Key patterns
print(f"\n🔍 KEY PATTERNS:")
print(f"  • Average symptoms per patient: {symptoms_per_patient.mean():.1f}")
print(f"  • Most common disease: {disease_counts.index[0]} ({disease_counts.iloc[0]} cases)")
print(f"  • Most common symptom: {symptom_frequencies.index[0]} ({symptom_frequencies.iloc[0]} cases, {(symptom_frequencies.iloc[0]/len(train_data)):.1%})")
print(f"  • Dimensionality: {np.argmax(cumsum_variance >= 0.95) + 1} components for 95% variance")

# Data quality
print(f"\n✅ DATA QUALITY:")
print(f"  • Missing values: {train_data.isnull().sum().sum()} (0%)")
print(f"  • Duplicate rows: {train_data.duplicated().sum()} (0%)")
print(f"  • Feature consistency: All features are binary (0/1)")
print(f"  • Label consistency: All diseases present in both train/test")

# Modeling recommendations
print(f"\n🎯 MODELING RECOMMENDATIONS:")
print(f"  1. Feature Selection: Consider using top {len(feature_importance[feature_importance['P_Value'] < 0.01])} features (p < 0.01)")
print(f"  2. Dimensionality: PCA with ~50 components captures {cumsum_variance[49]:.1%} variance")
print(f"  3. Class Balance: {'No balancing needed' if disease_counts.std() < 5 else 'Consider class balancing techniques'}")
print(f"  4. Algorithm Choice: Tree-based models may work well due to sparse binary features")
print(f"  5. Validation: Use stratified cross-validation due to multiple classes")

# Potential challenges
print(f"\n⚠️  POTENTIAL CHALLENGES:")
if len(zero_symptoms) > 0:
    print(f"  • {len(zero_symptoms)} symptoms never appear - consider removing")
if len(high_corr_pairs) > 10:
    print(f"  • {len(high_corr_pairs)} highly correlated symptom pairs - potential multicollinearity")
if len(rare_symptoms) > 20:
    print(f"  • {len(rare_symptoms)} rare symptoms (<1% prevalence) - may cause overfitting")

print(f"\n🏆 READY FOR MODEL TRAINING!")
print("  The dataset shows good quality and clear patterns that should")
print("  enable effective machine learning model development.")

🔍 COMPREHENSIVE EDA INSIGHTS
📊 DATA CHARACTERISTICS:
  • Dataset size: 4,920 training samples, 42 test samples
  • Features: 132 symptoms
  • Classes: 42 diseases
  • Class balance: Balanced (std: 0.4)
  • Sparsity: 96.2% of features are zeros

🔍 KEY PATTERNS:
  • Average symptoms per patient: 5.0
  • Most common disease: Fungal infection (118 cases)
  • Most common symptom: vomiting (1492 cases, 30.3%)
  • Dimensionality: 107 components for 95% variance

✅ DATA QUALITY:
  • Missing values: 0 (0%)
  • Duplicate rows: 1864 (0%)
  • Feature consistency: All features are binary (0/1)
  • Label consistency: All diseases present in both train/test

🎯 MODELING RECOMMENDATIONS:
  1. Feature Selection: Consider using top 126 features (p < 0.01)
  2. Dimensionality: PCA with ~50 components captures 75.4% variance
  3. Class Balance: No balancing needed
  4. Algorithm Choice: Tree-based models may work well due to sparse binary features
  5. Validation: Use stratified cross-validation due to mul

In [25]:
# Save EDA insights
eda_insights = {
    'dataset_stats': {
        'n_samples': len(train_data),
        'n_features': len(symptom_cols),
        'n_classes': len(disease_names),
        'avg_symptoms_per_patient': float(symptoms_per_patient.mean()),
        'class_balance_std': float(disease_counts.std())
    },
    'top_diseases': disease_counts.head(10).to_dict(),
    'top_symptoms': symptom_frequencies.head(20).to_dict(),
    'important_features': feature_importance.head(30)[['Feature', 'Chi2_Score']].to_dict('records'),
    'pca_components_95_variance': int(np.argmax(cumsum_variance >= 0.95) + 1),
    'optimal_clusters': int(optimal_k)
}

import json
os.makedirs('../data/processed', exist_ok=True)
with open('../data/processed/eda_insights.json', 'w') as f:
    json.dump(eda_insights, f, indent=2)

print("\n💾 EDA insights saved to '../data/processed/eda_insights.json'")
print("\n📈 Exploratory Data Analysis completed successfully!")


💾 EDA insights saved to '../data/processed/eda_insights.json'

📈 Exploratory Data Analysis completed successfully!
