In [None]:
# Phase 1: Problem Understanding and Data Exploration
## SW485 - Machine Learning Project
### Medical Diagnosis System

**Group Members:**
- Shahad Alsabui - ID: 444
- Sarah Alomran - ID: 444200911
- Reem Al Mutlaq - ID: 444200533
- Shadn Alsaif - ID: 443201150
- Mashael Albgomi - ID: 44202218

**Date:** October 2025

In [None]:
---
## Section 1: Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide display settings.
# NOTE(review): this blanket filter silences every warning category, including
# pandas/seaborn deprecation notices — consider narrowing it.
warnings.simplefilter('ignore')
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))  # default size for figures that do not set their own
print("Libraries imported successfully")

In [None]:
# Read the raw dataset into the notebook-wide frame `df`.
# TODO(review): 'your_filename.csv' looks like a template placeholder — point
# this at the actual CSV file before running.
dataset_path = 'Dataset/your_filename.csv'
df = pd.read_csv(dataset_path)

print("Dataset loaded successfully")
loaded_shape = df.shape  # (rows, columns)
print(f"Shape: {loaded_shape}")

In [None]:
---
## Section 2: Dataset Goal & Source

### Dataset Information

**Goal:**
This dataset is used to build a disease prediction system that can:
1. Identify potential diseases based on patient symptoms
2. Recommend appropriate medical specialists
3. Provide preliminary diagnostic guidance

**Source:**
[Kaggle - Diseases and Symptoms Dataset](https://www.kaggle.com/datasets/dhivyeshrk/diseases-and-symptoms-dataset)

**Application Domain:** Healthcare - Medical Diagnosis

**Purpose:**
- Early disease detection
- Reducing diagnostic time
- Improving patient outcomes
- Optimizing healthcare resource allocation

In [None]:
---
## Section 3: General Information

In [None]:
# Banner followed by the raw frame's dimensions.
section_rule = "=" * 70
print(section_rule)
print("DATASET OVERVIEW")
print(section_rule)

# Basic information
n_rows, n_cols = df.shape
print(f"\n1. Dataset Dimensions:")
print(f"   - Number of observations (rows): {n_rows}")
print(f"   - Number of features (columns): {n_cols}")

In [None]:
# Numbered (1-based) listing of every column in the raw frame.
print(f"\n2. Column Names:")
for position, column_name in enumerate(df.columns):
    print(f"   {position + 1}. {column_name}")

In [None]:
# Per-column dtypes of the raw frame.
print(f"\n3. Data Types:")
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Peek at both ends of the raw frame.
PREVIEW_ROWS = 10

print(f"\n4. First 10 rows:")
first_rows = df.head(PREVIEW_ROWS)
display(first_rows)

print(f"\n5. Last 10 rows:")
last_rows = df.tail(PREVIEW_ROWS)
display(last_rows)

In [None]:
# Locate the label (disease) column and store its name in `target_col`.
#
# Candidate names are tried in priority order. The first three are the names
# this cell has always checked; 'Disease_Name' was listed as a common name but
# never actually tested, and the plural 'diseases' is added as a further variant.
# NOTE(review): the Kaggle dataset cited in Section 2 appears to use 'diseases'
# as its label column — confirm against the actual CSV.
TARGET_CANDIDATES = ('Disease', 'disease', 'prognosis', 'Disease_Name', 'diseases')

# 1) Exact match, in priority order.
target_col = next((name for name in TARGET_CANDIDATES if name in df.columns), None)

# 2) Case- and whitespace-insensitive match (e.g. 'DISEASE', ' Prognosis ').
if target_col is None:
    normalized_columns = {str(col).strip().lower(): col for col in df.columns}
    target_col = next(
        (
            normalized_columns[name.lower()]
            for name in TARGET_CANDIDATES
            if name.lower() in normalized_columns
        ),
        None,
    )

# 3) Last resort: fall back to the final column and say so loudly, because a
#    wrong guess here turns a symptom column into the label for every later cell.
if target_col is None:
    print("Available columns:")
    print(df.columns.tolist())
    target_col = df.columns[-1]  # Assume last column
    print(f"\nAssuming target column is: {target_col}")

print(f"\n6. Target Variable: '{target_col}'")
print(f"   - Number of unique diseases: {df[target_col].nunique()}")
print(f"\n   - Disease distribution:\n")
print(df[target_col].value_counts())

In [None]:
# Summary statistics for every column (include='all' covers non-numeric ones too).
print(f"\n7. Basic Statistics:")
summary_stats = df.describe(include='all')
display(summary_stats)

In [None]:
# Print pandas' structural summary of the raw frame.
# df.info() writes its report directly to stdout and returns None, so it is
# called as a statement rather than wrapped in print()/display().
print("\n8. Dataset Information:")
df.info()

In [None]:
---
## Section 4: Summary & Visualization

### 4.1 Missing Values Analysis

In [None]:
section_rule = "=" * 70
print("\n" + section_rule)
print("MISSING VALUES ANALYSIS")
print(section_rule)

# Null count per column and the percentage of rows it represents.
missing_values = df.isna().sum()
missing_percentage = (missing_values / len(df)) * 100

# One row per column; keep only columns that have gaps, largest count first.
missing_df = (
    pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Count': missing_values.values,
        'Missing_Percentage': missing_percentage.values
    })
    .loc[lambda frame: frame['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

print("\nMissing Values Summary:")
if missing_df.empty:
    print("✓ No missing values found!")
else:
    display(missing_df)

In [None]:
# Heatmap of the null mask of `df`: one cell per (row, column) entry.
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(df.isnull(), cbar=True, cmap='viridis', yticklabels=False, ax=ax)
ax.set_title('Missing Values Heatmap', fontsize=16, fontweight='bold')
ax.set_xlabel('Columns', fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
### 4.2 Class Distribution (Target Variable)

In [None]:
section_rule = "=" * 70
print("\n" + section_rule)
print("CLASS DISTRIBUTION")
print(section_rule)

# Number of rows per disease label; reused by the plotting and imbalance cells.
target_values = df[target_col]
disease_counts = target_values.value_counts()
print(f"\nDisease distribution:")
print(disease_counts)

In [None]:
# Bar chart of the per-disease sample counts (one bar per class).
fig, ax = plt.subplots(figsize=(14, 8))
disease_counts.plot.bar(ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Diseases in Dataset', fontsize=16, fontweight='bold')
ax.set_xlabel('Disease', fontsize=12)
ax.set_ylabel('Number of Samples', fontsize=12)
# Slant the class names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Compare the largest class with the smallest one.
print(f"\nClass Imbalance Analysis:")
largest_class, largest_size = disease_counts.idxmax(), disease_counts.max()
print(f"Most common disease: {largest_class} ({largest_size} samples)")
smallest_class, smallest_size = disease_counts.idxmin(), disease_counts.min()
print(f"Least common disease: {smallest_class} ({smallest_size} samples)")
imbalance_ratio = largest_size / smallest_size
print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")

# A max/min ratio above 3 is this notebook's threshold for "imbalanced".
is_imbalanced = imbalance_ratio > 3
if is_imbalanced:
    print("\n⚠ Warning: Significant class imbalance detected!")
    print("   This may require special handling in Phase 2")
else:
    print("\n✓ Classes are relatively balanced")

In [None]:
### 4.3 Statistical Summary

In [None]:
print("\n" + "="*70)
print("STATISTICAL SUMMARY")
print("="*70)

# Get symptom columns: everything except the label.
# 'total_symptoms' is a derived column that Section 4.5 adds to `df`; excluding
# it keeps this cell idempotent, so re-running it after 4.5 does not start
# counting the derived total as if it were a symptom.
DERIVED_COLS = {'total_symptoms'}
symptom_cols = [
    col for col in df.columns
    if col != target_col and col not in DERIVED_COLS
]

print(f"\nNumber of symptom features: {len(symptom_cols)}")
print(f"\nSample symptom columns (first 15):")
for col in symptom_cols[:15]:
    print(f"  - {col}")

In [None]:
# Check if symptoms are numerical.
# Every symptom column is inspected (not only the first), and the check uses
# pandas' dtype helper instead of positional `dtypes[0]`, which is deprecated
# on a label-indexed Series and raises when there are no symptom columns.
symptom_dtypes = df[symptom_cols].dtypes
symptoms_are_numeric = len(symptom_dtypes) > 0 and all(
    pd.api.types.is_numeric_dtype(dtype) for dtype in symptom_dtypes
)

if symptoms_are_numeric:
    print("\n\nStatistical Summary of Symptoms:")
    display(df[symptom_cols].describe())

    # Symptom frequency: column sums, largest first.
    print("\n\nMost Common Symptoms:")
    symptom_sums = df[symptom_cols].sum().sort_values(ascending=False)
    display(symptom_sums.head(15))
else:
    print("\nSymptoms appear to be non-numerical.")
    print("Data types:", symptom_dtypes.value_counts())

In [None]:
# Visualize the most frequent symptoms (only when the symptom columns are numeric).
# All symptom columns are checked with pandas' dtype helper; positional
# `dtypes[0]` is deprecated on a label-indexed Series and only saw one column.
symptom_dtypes = df[symptom_cols].dtypes
symptoms_are_numeric = len(symptom_dtypes) > 0 and all(
    pd.api.types.is_numeric_dtype(dtype) for dtype in symptom_dtypes
)

if symptoms_are_numeric:
    # Column sums, largest first.
    symptom_sums = df[symptom_cols].sum().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(12, 8))
    symptom_sums.head(20).plot.barh(ax=ax, color='coral', edgecolor='black')
    ax.set_title('Top 20 Most Frequent Symptoms', fontsize=16, fontweight='bold')
    ax.set_xlabel('Frequency', fontsize=12)
    ax.set_ylabel('Symptom', fontsize=12)
    ax.grid(axis='x', alpha=0.3)
    fig.tight_layout()
    plt.show()

In [None]:
### 4.4 Correlation Analysis

In [None]:
# Correlation analysis (only when the symptom columns are numeric).
# All symptom columns are checked with pandas' dtype helper; positional
# `dtypes[0]` is deprecated on a label-indexed Series and only saw one column.
symptom_dtypes = df[symptom_cols].dtypes
symptoms_are_numeric = len(symptom_dtypes) > 0 and all(
    pd.api.types.is_numeric_dtype(dtype) for dtype in symptom_dtypes
)

if symptoms_are_numeric:
    print("\n" + "="*70)
    print("CORRELATION ANALYSIS")
    print("="*70)

    # Restrict the matrix to the 15 symptoms with the largest column sums so
    # the annotated heatmap stays legible.
    symptom_sums = df[symptom_cols].sum().sort_values(ascending=False)
    top_symptoms = symptom_sums.head(15).index
    correlation_matrix = df[top_symptoms].corr()

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8},
                ax=ax)
    ax.set_title('Correlation Matrix - Top 15 Symptoms', fontsize=16, fontweight='bold')
    fig.tight_layout()
    plt.show()

In [None]:
### 4.5 Data Distribution Visualizations

In [None]:
print("\n" + "="*70)
print("DATA DISTRIBUTION VISUALIZATIONS")
print("="*70)

# Distribution of number of symptoms per case (only when symptoms are numeric).
# All symptom columns are checked with pandas' dtype helper; positional
# `dtypes[0]` is deprecated on a label-indexed Series and only saw one column.
symptom_dtypes = df[symptom_cols].dtypes
symptoms_are_numeric = len(symptom_dtypes) > 0 and all(
    pd.api.types.is_numeric_dtype(dtype) for dtype in symptom_dtypes
)

if symptoms_are_numeric:
    # NOTE: this adds a derived column to the raw frame `df`; the box-plot cell
    # that follows looks for it, and any later df.copy() carries it along.
    df['total_symptoms'] = df[symptom_cols].sum(axis=1)
    symptom_totals = df['total_symptoms']
    mean_symptoms = symptom_totals.mean()  # computed once, reused for the marker and the printout

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.hist(symptom_totals, bins=30, color='lightgreen', edgecolor='black', alpha=0.7)
    ax.set_title('Distribution of Number of Symptoms per Case', fontsize=16, fontweight='bold')
    ax.set_xlabel('Number of Symptoms', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.grid(axis='y', alpha=0.3)
    # Dashed vertical line marking the mean.
    ax.axvline(mean_symptoms, color='red', linestyle='--',
               linewidth=2, label=f'Mean: {mean_symptoms:.2f}')
    ax.legend()
    fig.tight_layout()
    plt.show()

    print(f"\nSymptom Count Statistics:")
    print(f"  - Average: {mean_symptoms:.2f}")
    print(f"  - Median: {symptom_totals.median():.2f}")
    print(f"  - Min: {symptom_totals.min():.0f}")
    print(f"  - Max: {symptom_totals.max():.0f}")
    print(f"  - Std: {symptom_totals.std():.2f}")

In [None]:
# Box plot of per-case symptom counts, split by disease.
# Runs only if an earlier cell has added 'total_symptoms' to `df`.
# NOTE(review): recent seaborn releases warn about `palette` without `hue`;
# left as-is because the replacement keywords depend on the seaborn version.
if 'total_symptoms' in df.columns:
    # Limit the plot to the 10 most frequent diseases.
    top_diseases = df[target_col].value_counts().iloc[:10].index
    in_top_diseases = df[target_col].isin(top_diseases)
    df_top = df[in_top_diseases]

    fig, ax = plt.subplots(figsize=(14, 6))
    sns.boxplot(data=df_top, x=target_col, y='total_symptoms', palette='Set2', ax=ax)
    ax.set_title('Symptom Count by Disease (Top 10)', fontsize=16, fontweight='bold')
    ax.set_xlabel('Disease', fontsize=12)
    ax.set_ylabel('Number of Symptoms', fontsize=12)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()

In [None]:
---
## Section 5: Preprocessing Techniques

In [None]:
# Preprocess a separate deep copy so `df` keeps the data as explored above.
df_processed = df.copy(deep=True)

section_rule = "=" * 70
print(section_rule)
print("PREPROCESSING STEPS")
print(section_rule)
print("\nCreating a copy of the dataset for preprocessing...")
original_shape = df.shape
print(f"Original shape: {original_shape}")

In [None]:
### 5.1 Handle Missing Values

In [None]:
print("\n1. HANDLING MISSING VALUES")
print("-"*70)

# Total number of null cells across the whole working copy; the final summary
# cell reads this value as well.
total_missing = df_processed.isnull().sum().sum()

if total_missing > 0:
    print(f"Missing values detected: {total_missing}")
    print("\nApplying preprocessing strategies...")

    # For symptom columns - fill with 0 (a missing flag is read as "not present").
    for col in symptom_cols:
        missing_count = df_processed[col].isnull().sum()
        if missing_count > 0:
            # Assign the filled column back. `df[col].fillna(..., inplace=True)`
            # acts on a temporary column object and leaves the frame unchanged
            # when pandas Copy-on-Write is active.
            df_processed[col] = df_processed[col].fillna(0)
            print(f"   ✓ Filled {missing_count} missing values in '{col}' with 0")

    # For target variable - remove rows, since an unlabeled row cannot be used
    # for supervised learning.
    if df_processed[target_col].isnull().sum() > 0:
        rows_before = len(df_processed)
        df_processed = df_processed.dropna(subset=[target_col])
        rows_after = len(df_processed)
        print(f"   ✓ Removed {rows_before - rows_after} rows with missing labels")

    print(f"\n Justification:")
    print(f"   • Symptom missing values → 0 (symptom not present)")
    print(f"   • Missing labels removed (need labeled data for ML)")
else:
    print("✓ No missing values found.")

print(f"\nShape after: {df_processed.shape}")

In [None]:
### 5.2 Remove Duplicates

In [None]:
print("\n2. REMOVING DUPLICATES")
print("-"*70)

# Count rows that exactly repeat an earlier row; the final summary cell reads
# `duplicates` as well.
duplicates = df_processed.duplicated().sum()
has_duplicates = duplicates > 0

if not has_duplicates:
    print("✓ No duplicates found.")
else:
    # Keep the first occurrence of each repeated row.
    df_processed = df_processed.drop_duplicates()
    print(f"   ✓ Removed {duplicates} duplicate rows")
    print(f"\n Justification:")
    print(f"   • Duplicates can bias the model")

shape_after = df_processed.shape
print(f"\nShape after: {shape_after}")

In [None]:
### 5.3 Data Type Conversion

In [None]:
print("\n3. DATA TYPE CONVERSION")
print("-"*70)

# Ensure symptom columns are numeric.
converted_cols = []
coerced_total = 0  # values that could not be parsed as numbers and became 0

for col in symptom_cols:
    # Skip columns that are already numeric. Testing "not numeric" rather than
    # `dtype == 'object'` also covers text columns stored under a dedicated
    # string dtype (presumably what newer pandas infers for text — confirm).
    if pd.api.types.is_numeric_dtype(df_processed[col]):
        continue

    nulls_before = df_processed[col].isna().sum()
    numeric_values = pd.to_numeric(df_processed[col], errors='coerce')
    coerced_total += int(numeric_values.isna().sum() - nulls_before)

    # Assign back instead of `fillna(..., inplace=True)` on a column selection,
    # which leaves the frame unchanged when pandas Copy-on-Write is active.
    df_processed[col] = numeric_values.fillna(0)
    converted_cols.append(col)

conversion_needed = bool(converted_cols)

if conversion_needed:
    print(f"   ✓ Converted {len(converted_cols)} columns to numeric")
    if coerced_total > 0:
        # Surface the data loss instead of zeroing unparseable values silently.
        print(f"   ⚠ {coerced_total} non-numeric values could not be parsed and were set to 0")
    print(f"\n Justification: ML algorithms require numerical input")
else:
    print("✓ All symptom columns are already numeric.")

print(f"\nData types: {df_processed.dtypes.value_counts().to_dict()}")

In [None]:
### 5.4 Feature Engineering

In [None]:
print("\n4. FEATURE ENGINEERING")
print("-"*70)

# Add the per-row symptom count unless the column is already present, e.g.
# carried over from `df` when the exploration cell in Section 4.5 created it.
if 'total_symptoms' in df_processed.columns:
    print("✓ 'total_symptoms' already exists")
else:
    df_processed['total_symptoms'] = df_processed[symptom_cols].sum(axis=1)
    print("   ✓ Created 'total_symptoms' feature")
    print(f"\n Justification:")
    print(f"   • Total symptom count may help classification")
    print(f"   • Provides aggregate information")

    symptom_totals = df_processed['total_symptoms']
    print(f"\n   Statistics:")
    print(f"   - Mean: {symptom_totals.mean():.2f}")
    print(f"   - Min: {symptom_totals.min():.0f}")
    print(f"   - Max: {symptom_totals.max():.0f}")

In [None]:
### 5.5 Encoding Target Variable

In [None]:
print("\n5. ENCODING TARGET VARIABLE")
print("-"*70)

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the disease names and store the integer codes in a new
# column; the original label column is kept alongside it.
le = LabelEncoder()
encoded_labels = le.fit_transform(df_processed[target_col])
df_processed['disease_encoded'] = encoded_labels

print(f"   ✓ Encoded disease labels to numeric values")
n_classes = len(le.classes_)
print(f"   - Number of classes: {n_classes}")
print(f"\nMapping (first 10):")
# zip with range(10) stops after ten entries, or earlier if there are fewer classes.
for code, disease_name in zip(range(10), le.classes_):
    print(f"   {code}: {disease_name}")

print(f"\n Justification: ML algorithms need numeric labels")

In [None]:
### 5.6 Class Imbalance Analysis

In [None]:
print("\n6. CLASS IMBALANCE ANALYSIS")
print("-"*70)

# Recompute the class counts on the cleaned frame; the final summary cell reads
# both `class_distribution` and `imbalance_ratio`.
class_distribution = df_processed[target_col].value_counts()
largest_size = class_distribution.max()
smallest_size = class_distribution.min()
imbalance_ratio = largest_size / smallest_size

# Same max/min > 3 threshold as the exploratory check in Section 4.2.
is_imbalanced = imbalance_ratio > 3
if is_imbalanced:
    print(f"   ⚠ Class imbalance detected (ratio: {imbalance_ratio:.2f}:1)")
    print(f"\n   Strategies for Phase 2:")
    for strategy_line in (
        "   • SMOTE (Synthetic Minority Over-sampling)",
        "   • Class weighting",
        "   • Undersampling majority class",
        "   • Ensemble methods",
    ):
        print(strategy_line)
else:
    print(f"   ✓ Classes relatively balanced (ratio: {imbalance_ratio:.2f}:1)")

In [None]:
### 5.7 Preprocessing Summary

In [None]:
section_rule = "=" * 70
print("\n" + section_rule)
print("PREPROCESSING SUMMARY")
print(section_rule)

# Before/after comparison of the raw frame and the processed copy.
rows_original = df.shape[0]
rows_processed, cols_processed = df_processed.shape
print(f"\nOriginal dataset: {df.shape}")
print(f"Processed dataset: {df_processed.shape}")
print(f"Rows removed: {rows_original - rows_processed}")
# This count is every column of the processed frame, label columns included.
print(f"Features: {cols_processed}")

# Save processed data (without the index) next to the raw dataset.
df_processed.to_csv('Dataset/diseases_symptoms_processed.csv', index=False)
print(f"\n✓ Processed data saved to 'Dataset/diseases_symptoms_processed.csv'")

print(f"\nProcessed Data Preview:")
preview = df_processed.head()
display(preview)

In [None]:
---
## Section 6: Final Summary and Key Insights

In [None]:
# Final report for Phase 1.
#
# This cell only prints; it computes nothing new. It reads values left in the
# kernel by earlier cells, so it works only after those cells have run:
#   - df_processed, target_col     -> processed frame and label column name
#   - total_missing                -> set in 5.1 (Handle Missing Values)
#   - duplicates                   -> set in 5.2 (Remove Duplicates)
#   - class_distribution,
#     imbalance_ratio              -> set in 5.6 (Class Imbalance Analysis)
# It also expects df_processed to contain a 'total_symptoms' column.
#
# NOTE(review): the "Data types" line and everything under sections 4-6 of the
# report are fixed text printed unconditionally; they are not derived from the
# data, so they should be checked against the actual outputs above.
print("="*70)
print("KEY INSIGHTS FROM DATA EXPLORATION")
print("="*70)

print(f"""
1. DATASET CHARACTERISTICS:
   - Total samples: {df_processed.shape[0]}
   - Number of features: {df_processed.shape[1]}
   - Number of diseases: {df_processed[target_col].nunique()}
   - Average symptoms per case: {df_processed['total_symptoms'].mean():.2f}

2. DATA QUALITY:
   - Missing values: {'Handled' if total_missing > 0 else 'None'}
   - Duplicates: {'Removed' if duplicates > 0 else 'None'}
   - Data types: Consistent and numeric

3. CLASS DISTRIBUTION:
   - Most common disease: {class_distribution.idxmax()}
   - Least common disease: {class_distribution.idxmin()}
   - Imbalance ratio: {imbalance_ratio:.2f}:1

4. PREPROCESSING APPLIED:
   ✓ Missing value imputation
   ✓ Duplicate removal
   ✓ Feature encoding
   ✓ Feature engineering (total_symptoms)

5. READINESS FOR PHASE 2:
   ✓ Dataset is clean and preprocessed
   ✓ Features are numeric
   ✓ Class imbalance identified
   ✓ Data saved and ready for ML

6. NEXT STEPS (Phase 2):
   • Apply supervised learning algorithms
   • Compare model performance
   • Handle class imbalance if needed
   • Fine-tune hyperparameters
""")

print("="*70)
print("Phase 1 Complete!")
print("="*70)