# Disease Prediction - Data Preparation

This notebook handles data loading, cleaning, and initial preprocessing for the disease prediction hackathon project.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path
sys.path.append('../src')

from data.preprocess import DataPreprocessor
from utils.helpers import load_data, plot_class_distribution

# Plot appearance: start from matplotlib's defaults, then switch seaborn's
# colour cycle to the "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Keep DataFrame previews compact: at most 100 columns and 20 rows.
for option_name, option_value in (('display.max_columns', 100), ('display.max_rows', 20)):
    pd.set_option(option_name, option_value)

# Record the runtime environment so outputs can be tied to specific versions.
print(
    "Libraries imported successfully!",
    f"Python version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

## 2. Load Raw Data

In [None]:
# Read both raw CSV splits (training first, then testing) with the project helper.
train_data, test_data = (
    load_data(csv_path)
    for csv_path in ('../data/raw/Training.csv', '../data/raw/Testing.csv')
)

# Quick sanity check on the dimensions of each split.
print(
    f"Training data shape: {train_data.shape}",
    f"Testing data shape: {test_data.shape}",
    sep="\n",
)

## 3. Data Exploration

In [None]:
# Per-split overview: dimensions, dtype mix and in-memory footprint.
train_dtype_counts = train_data.dtypes.value_counts().to_dict()
test_dtype_counts = test_data.dtypes.value_counts().to_dict()

# memory_usage(deep=True) is summed over all columns and converted bytes -> MB.
train_memory_mb = train_data.memory_usage(deep=True).sum() / 1024**2
test_memory_mb = test_data.memory_usage(deep=True).sum() / 1024**2

print(
    "TRAINING DATA INFO:",
    "=" * 50,
    f"Shape: {train_data.shape}",
    f"Columns: {train_data.columns.tolist()[:10]}... (showing first 10)",
    f"Data types: {train_dtype_counts}",
    f"Memory usage: {train_memory_mb:.2f} MB",
    sep="\n",
)

# The testing summary omits the column preview shown for the training split.
print(
    "\nTESTING DATA INFO:",
    "=" * 50,
    f"Shape: {test_data.shape}",
    f"Data types: {test_dtype_counts}",
    f"Memory usage: {test_memory_mb:.2f} MB",
    sep="\n",
)

In [None]:
# Count null cells per column in both splits and report the totals.
print("MISSING VALUES ANALYSIS:", "=" * 50, sep="\n")

train_missing = train_data.isnull().sum()
test_missing = test_data.isnull().sum()

train_missing_total = train_missing.sum()
test_missing_total = test_missing.sum()

print(f"Training data missing values: {train_missing_total}")
print(f"Testing data missing values: {test_missing_total}")

# Only list individual columns when a split actually contains gaps.
if train_missing_total > 0:
    print("\nColumns with missing values in training data:")
    print(train_missing.loc[train_missing > 0])

if test_missing_total > 0:
    print("\nColumns with missing values in testing data:")
    print(test_missing.loc[test_missing > 0])

In [None]:
# Inspect the label column ('prognosis') in both splits.
print("TARGET VARIABLE ANALYSIS:", "=" * 50, sep="\n")

train_prognosis = train_data['prognosis']
test_prognosis = test_data['prognosis']

print(f"Number of unique diseases in training data: {train_prognosis.nunique()}")
print(f"Number of unique diseases in testing data: {test_prognosis.nunique()}")

# Show the ten most frequent labels of each split.
print("\nDiseases in training data:")
print(train_prognosis.value_counts().head(10))

print("\nDiseases in testing data:")
print(test_prognosis.value_counts().head(10))

In [None]:
# Compare the label vocabularies of the two splits in both directions.
train_diseases = set(train_data['prognosis'].unique())
test_diseases = set(test_data['prognosis'].unique())

# Labels that appear in exactly one of the splits.
missing_in_train = test_diseases.difference(train_diseases)
missing_in_test = train_diseases.difference(test_diseases)

print(f"Diseases in test but not in train: {len(missing_in_train)}")
if len(missing_in_train) > 0:
    print(f"Missing diseases: {list(missing_in_train)}")

print(f"\nDiseases in train but not in test: {len(missing_in_test)}")
if len(missing_in_test) > 0:
    # Cap the listing at ten entries to keep the output short.
    print(f"Missing diseases: {list(missing_in_test)[:10]}... (showing first 10)")

## 4. Symptom Analysis

In [None]:
# Every column other than the 'prognosis' label is treated as a symptom feature.
symptom_cols = list(filter(lambda name: name != 'prognosis', train_data.columns))
print(
    f"Number of symptom features: {len(symptom_cols)}",
    f"First 10 symptoms: {symptom_cols[:10]}",
    sep="\n",
)

# Peek at the raw value counts of the first five symptom columns.
print("\nSymptom value distribution in training data:")
for col in symptom_cols[:5]:
    value_tally = train_data[col].value_counts().to_dict()
    print(f"{col}: {value_tally}")

In [None]:
# Column-wise totals of the symptom columns, most frequent first.
symptom_frequencies = train_data[symptom_cols].sum().sort_values(ascending=False)

# Denominator for the percentage column; guarded so an empty frame cannot
# produce a division-by-zero warning / NaN in the report.
n_train_rows = len(train_data)

print("TOP 20 MOST COMMON SYMPTOMS:")
print("=" * 50)
for i, (symptom, freq) in enumerate(symptom_frequencies.head(20).items(), 1):
    percentage = (freq / n_train_rows) * 100 if n_train_rows else 0.0
    # The 'd' presentation type only accepts integers: the column sums come back
    # as floats whenever any summed column is float-typed, so cast explicitly.
    # str() keeps the 's' format working for non-string column labels.
    print(f"{i:2d}. {str(symptom):30s}: {int(freq):4d} ({percentage:5.1f}%)")

In [None]:
# Two stacked panels: the most frequent symptoms on top, the overall
# distribution of per-symptom frequencies underneath.
fig, (ax_top, ax_hist) = plt.subplots(2, 1, figsize=(15, 10))

# Panel 1: horizontal bars for the 30 most frequent symptoms.
top_30_symptoms = symptom_frequencies.head(30)
top_30_symptoms.plot(kind='barh', ax=ax_top)
ax_top.set_title('Top 30 Most Common Symptoms')
ax_top.set_xlabel('Frequency')
ax_top.set_ylabel('Symptoms')

# Panel 2: histogram over all symptom frequencies, with the mean marked.
mean_frequency = symptom_frequencies.mean()
ax_hist.hist(symptom_frequencies.values, bins=50, alpha=0.7, color='skyblue')
ax_hist.set_title('Distribution of Symptom Frequencies')
ax_hist.set_xlabel('Frequency')
ax_hist.set_ylabel('Number of Symptoms')
ax_hist.axvline(mean_frequency, color='red', linestyle='--', label=f'Mean: {mean_frequency:.1f}')
ax_hist.legend()

fig.tight_layout()
plt.show()

## 5. Data Quality Checks

In [None]:
# Count fully duplicated rows in each split.
print("DUPLICATE ANALYSIS:", "=" * 50, sep="\n")

train_duplicates = train_data.duplicated().sum()
test_duplicates = test_data.duplicated().sum()

print(f"Duplicate rows in training data: {train_duplicates}")
print(f"Duplicate rows in testing data: {test_duplicates}")

# Report the share of duplicates only for splits that contain any.
if train_duplicates > 0:
    train_duplicate_pct = (train_duplicates / len(train_data)) * 100
    print(f"Percentage of duplicates in training data: {train_duplicate_pct:.2f}%")

if test_duplicates > 0:
    test_duplicate_pct = (test_duplicates / len(test_data)) * 100
    print(f"Percentage of duplicates in testing data: {test_duplicate_pct:.2f}%")

In [None]:
# Profile how many symptoms each training row (patient) reports.
print("SYMPTOM PATTERNS ANALYSIS:", "=" * 50, sep="\n")

# Row-wise total across the symptom columns.
symptoms_per_patient = train_data[symptom_cols].sum(axis=1)
mean_symptoms = symptoms_per_patient.mean()

print(f"Average symptoms per patient: {mean_symptoms:.2f}")
print(f"Min symptoms per patient: {symptoms_per_patient.min()}")
print(f"Max symptoms per patient: {symptoms_per_patient.max()}")

# Side-by-side views of the same distribution: histogram (left), boxplot (right).
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 6))

ax_hist.hist(symptoms_per_patient, bins=20, alpha=0.7, color='lightgreen')
ax_hist.set_title('Distribution of Symptoms per Patient')
ax_hist.set_xlabel('Number of Symptoms')
ax_hist.set_ylabel('Number of Patients')
ax_hist.axvline(mean_symptoms, color='red', linestyle='--', label=f'Mean: {mean_symptoms:.1f}')
ax_hist.legend()

ax_box.boxplot(symptoms_per_patient)
ax_box.set_title('Boxplot of Symptoms per Patient')
ax_box.set_ylabel('Number of Symptoms')

fig.tight_layout()
plt.show()

## 6. Initialize Data Preprocessor

In [None]:
# Build the project's preprocessor and have it read the same two raw CSVs
# again; it returns its own train/test frames.
preprocessor = DataPreprocessor()
train_df, test_df = preprocessor.load_data(
    '../data/raw/Training.csv',
    '../data/raw/Testing.csv',
)

print("\nData loaded through preprocessor successfully!")

In [None]:
# Split each frame into a feature matrix and an encoded target.
# The training call passes no frame — presumably the preprocessor falls back to
# the training data it loaded itself; verify against DataPreprocessor.
X_train, y_train = preprocessor.prepare_features_and_target()
X_test, y_test = preprocessor.prepare_features_and_target(test_df)

print(
    f"Training features shape: {X_train.shape}",
    f"Training target shape: {y_train.shape}",
    f"Testing features shape: {X_test.shape}",
    f"Testing target shape: {y_test.shape}",
    sep="\n",
)

# Preview the encoded label values and the disease names known to the preprocessor.
encoded_labels = np.unique(y_train)
disease_name_preview = preprocessor.get_disease_names()[:5]
print(f"\nUnique encoded labels: {encoded_labels[:10]}... (showing first 10)")
print(f"Disease names: {disease_name_preview}... (showing first 5)")

## 7. Check Class Balance

In [None]:
# Summarise how many training samples fall into each encoded class.
from collections import Counter

class_distribution = Counter(y_train)

print("CLASS DISTRIBUTION ANALYSIS:", "=" * 50, sep="\n")
print(f"Number of classes: {len(class_distribution)}")

# Per-class sample counts, reused for all the statistics below.
class_counts = list(class_distribution.values())
smallest_class = min(class_counts)
largest_class = max(class_counts)

print(f"Min samples per class: {smallest_class}")
print(f"Max samples per class: {largest_class}")
print(f"Average samples per class: {np.mean(class_counts):.1f}")

# "Balanced" here means the largest and smallest class differ by at most one sample.
is_balanced = (largest_class - smallest_class) <= 1
print(f"\nClasses are balanced: {is_balanced}")

if not is_balanced:
    print("Recommendation: Consider using class balancing techniques")

In [None]:
# Bar chart of training samples per class, indexed by position in the
# preprocessor's disease-name list.
fig, ax = plt.subplots(figsize=(15, 8))

disease_names = preprocessor.get_disease_names()
class_positions = range(len(disease_names))
# Counter lookups return 0 for any index without samples.
disease_counts = [class_distribution[i] for i in class_positions]

ax.bar(class_positions, disease_counts)
ax.set_title('Disease Class Distribution in Training Data')
ax.set_xlabel('Disease Index')
ax.set_ylabel('Number of Samples')
ax.set_xticks(range(0, len(disease_names), 5))  # label every 5th class only

# Dashed reference line at the mean class size.
avg_count = np.mean(disease_counts)
ax.axhline(y=avg_count, color='r', linestyle='--', label=f'Average: {avg_count:.1f}')
ax.legend()

fig.tight_layout()
plt.show()

## 8. Data Validation and Summary

In [None]:
# Final validation: run a fixed list of sanity checks on the prepared arrays
# and print a PASS/FAIL line for each, followed by an overall verdict.
print("DATA VALIDATION SUMMARY:")
print("=" * 50)

# Each entry is a (label, passed) pair.
validations = []

# Check 1: Correct number of features
expected_features = 132
actual_features = X_train.shape[1]
validations.append((f"Feature count ({expected_features})", actual_features == expected_features))

# Check 2: Correct number of classes
expected_classes = 42
actual_classes = len(np.unique(y_train))
validations.append((f"Class count ({expected_classes})", actual_classes == expected_classes))

# Check 3: No missing values
# pd.isna works on integer, float and object data alike, and np.asarray means
# a DataFrame input reduces to a single boolean instead of a per-column Series
# (np.isnan raises on object arrays; `not Series` is ambiguous).
no_missing = not bool(pd.isna(np.asarray(X_train)).any())
validations.append(("No missing values", no_missing))

# Check 4: Binary features (0 or 1)
binary_features = bool(np.all(np.isin(X_train, [0, 1])))
validations.append(("Binary features (0/1)", binary_features))

# Check 5: Consistent test set (same number of feature columns as training)
consistent_test = X_test.shape[1] == X_train.shape[1]
validations.append(("Consistent test set features", consistent_test))

# Print validation results
for validation, passed in validations:
    status = "✅ PASS" if passed else "❌ FAIL"
    print(f"{status} - {validation}")

all_passed = all(check_passed for _, check_passed in validations)
print(f"\nOverall validation: {'✅ ALL CHECKS PASSED' if all_passed else '❌ SOME CHECKS FAILED'}")

if all_passed:
    print("\n🎉 Data preparation completed successfully!")
    print("Ready for exploratory data analysis and model training.")

## 9. Save Prepared Data

In [None]:
# Persist the fitted preprocessor and the encoded train/test tables.
import joblib


def _encoded_frame(features, labels):
    """Wrap a feature matrix in a DataFrame and append its 'prognosis' labels."""
    frame = pd.DataFrame(features, columns=preprocessor.get_symptom_names())
    frame['prognosis'] = labels
    return frame


# 1) Preprocessor object -> ../models (directory is created on demand).
os.makedirs('../models', exist_ok=True)
joblib.dump(preprocessor, '../models/preprocessor.pkl')
print("Preprocessor saved to '../models/preprocessor.pkl'")

# 2) Encoded features plus target -> ../data/processed as CSV, without the index.
processed_train = _encoded_frame(X_train, y_train)
processed_test = _encoded_frame(X_test, y_test)

os.makedirs('../data/processed', exist_ok=True)
processed_train.to_csv('../data/processed/train_encoded.csv', index=False)
processed_test.to_csv('../data/processed/test_encoded.csv', index=False)

print("Processed data saved to '../data/processed/'")
print("\n📊 Data preparation notebook completed successfully!")