# Phase 0: Data Understanding & Preparation

This notebook demonstrates the complete data preprocessing pipeline for the AMR (antimicrobial resistance) Pattern Recognition project.

## Pipeline Overview
1. Load and Explore Raw Data
2. Data Cleaning
3. Feature Encoding
4. Target Variable Creation (MAR Index & Species)
5. Data Splitting
6. Save Processed Data

## 1. Load and Explore Raw Data

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path

# Custom modules
import sys
sys.path.append('..')
from src.data.preprocessing import (
    load_raw_data,
    identify_antibiotic_columns,
    clean_interpretation_values,
    handle_missing_values,
    encode_resistance,
    standardize_species_labels,
    calculate_mar_index,
    create_mar_target,
    prepare_species_target
)
from src.data.splitting import stratified_split
from src.features.build_features import extract_resistance_features, create_feature_matrix

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set style for plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

In [None]:
# Read the raw CSV through the project loader
data_path = '../data/raw/rawdata.csv'
df = load_raw_data(data_path)

# Report the dimensions of the loaded frame
n_samples, n_columns = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of samples: {n_samples}")
print(f"Number of columns: {n_columns}")

In [None]:
# List every column name present in the loaded frame
print("Column names:", df.columns.tolist(), sep="\n")

In [None]:
# Show the dtype pandas assigned to each column
print("Data types:", df.dtypes, sep="\n")

In [None]:
# Preview the first five rows (the bare last expression renders as the cell output)
print("First 5 rows:")
df.head(5)

In [None]:
# Collect the antibiotic interpretation columns reported by the project helper
antibiotic_int_cols = identify_antibiotic_columns(df)
n_antibiotic_cols = len(antibiotic_int_cols)
print(f"Number of antibiotic interpretation columns: {n_antibiotic_cols}")
print("\nAntibiotic interpretation columns:")
# Numbered listing, starting at 1
for position, column in enumerate(antibiotic_int_cols, start=1):
    print(f"{position}. {column}")

## 2. Data Cleaning

In [None]:
# Inspect raw interpretation values to spot special characters and spelling variations
print("Unique interpretation values before cleaning:")
preview_cols = antibiotic_int_cols[:5]  # first five columns only, as an example
for column in preview_cols:
    print(f"\n{column}: {df[column].unique()}")

In [None]:
# Run the project cleaner over every interpretation column
df = clean_interpretation_values(df, antibiotic_int_cols)

print("\nUnique interpretation values after cleaning:")
# First five columns only, as an example
for column in antibiotic_int_cols[:5]:
    print(f"{column}: {df[column].unique()}")

In [None]:
# Summarise missing interpretation values before any handling is applied
missing_mask = df[antibiotic_int_cols].isnull()
missing_before = missing_mask.sum()  # per-column missing counts

print("Missing values in interpretation columns (before):")
print(missing_before[missing_before > 0])  # only columns that have gaps
print(f"\nTotal missing values: {missing_mask.sum().sum()}")
print(f"Rows with all antibiotics missing: {missing_mask.all(axis=1).sum()}")

In [None]:
# Apply the 'drop' strategy (intended to remove rows where ALL antibiotics are missing)
df = handle_missing_values(df, antibiotic_int_cols, strategy='drop')

# Re-check how many fully-missing rows remain after handling
rows_all_missing_after = df[antibiotic_int_cols].isnull().all(axis=1).sum()
print(f"\nDataset shape after handling missing values: {df.shape}")
print(f"Rows with all antibiotics missing (after): {rows_all_missing_after}")

In [None]:
# Compare species label counts before and after standardization
species_before = df['bacterial_species'].value_counts()
print("Species before standardization:", species_before, sep="\n")

df = standardize_species_labels(df)

species_after = df['bacterial_species'].value_counts()
print("\nSpecies after standardization:", species_after, sep="\n")

## 3. Feature Encoding

In [None]:
# Ordinal-encode the interpretation columns (s=0, i=1, r=2)
df = encode_resistance(df, antibiotic_int_cols, method='ordinal')

# The encoded columns are recognised by their '_encoded' suffix
encoded_cols = [name for name in df.columns if name.endswith('_encoded')]
print(f"Number of encoded columns: {len(encoded_cols)}")
print("\nFirst 5 encoded columns:")
for name in encoded_cols[:5]:
    print(f"  {name}")

In [None]:
# Print value counts (NaN included) for the first three encoded columns
print("Encoding distribution (s=0, i=1, r=2):")
for column in encoded_cols[:3]:
    code_counts = df[column].value_counts(dropna=False).sort_index()
    print(f"\n{column}:")
    print(code_counts)

In [None]:
# Visualize encoding distribution for the first three encoded columns
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for i, col in enumerate(encoded_cols[:3]):
    # value_counts() only returns the codes present in this column. Reindex
    # onto all three codes so an absent category becomes a zero-height bar;
    # otherwise the three fixed tick labels below would be attached to the
    # wrong bars (or fail outright on a tick/label count mismatch).
    code_counts = df[col].value_counts().reindex([0, 1, 2], fill_value=0)
    code_counts.plot(kind='bar', ax=axes[i])
    axes[i].set_title(f'Distribution: {col}')
    axes[i].set_xlabel('Encoded Value')
    axes[i].set_ylabel('Count')
    axes[i].set_xticklabels(['Susceptible (0)', 'Intermediate (1)', 'Resistant (2)'], rotation=45)
plt.tight_layout()
plt.show()

## 4. Target Variable Creation

### 4.1 MAR Index Calculation

In [None]:
# Compute the MAR index from the encoded resistance columns and store it on the frame
df['MAR_index'] = calculate_mar_index(df, encoded_cols)

mar_index = df['MAR_index']
print("MAR Index Statistics:")
print(mar_index.describe())
print(f"\nMissing MAR values: {mar_index.isnull().sum()}")

In [None]:
# Plot the MAR index as a histogram and a box plot, each marked with the 0.2 threshold
mar_values = df['MAR_index'].dropna()
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: histogram with a vertical threshold line
hist_ax.hist(mar_values, bins=30, edgecolor='black', alpha=0.7)
hist_ax.axvline(0.2, color='red', linestyle='--', linewidth=2, label='Threshold (0.2)')
hist_ax.set_xlabel('MAR Index')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('MAR Index Distribution')

# Right panel: box plot with a horizontal threshold line
box_ax.boxplot(mar_values)
box_ax.axhline(0.2, color='red', linestyle='--', linewidth=2, label='Threshold (0.2)')
box_ax.set_ylabel('MAR Index')
box_ax.set_title('MAR Index Box Plot')

# Shared finishing touches for both panels
for panel in (hist_ax, box_ax):
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Derive the binary High_MAR label using a 0.2 threshold
df['High_MAR'] = create_mar_target(df, threshold=0.2)

high_mar = df['High_MAR']
print("High_MAR class distribution:")
print(high_mar.value_counts())
print("\nClass proportions:")
print(high_mar.value_counts(normalize=True))

In [None]:
# Visualize High_MAR distribution
# value_counts() orders classes by frequency, not by class value. Sort by
# class and derive the colors and tick labels from the index so that a bar
# is never labelled "Low" when it actually shows the High MAR class (which
# happened whenever class 1 was the majority).
high_mar_labels = {0: 'Low MAR (0)', 1: 'High MAR (1)'}
high_mar_colors = {0: 'green', 1: 'red'}
high_mar_counts = df['High_MAR'].value_counts().sort_index()
present_classes = list(high_mar_counts.index)

plt.figure(figsize=(8, 5))
high_mar_counts.plot(
    kind='bar',
    color=[high_mar_colors.get(cls, 'gray') for cls in present_classes],
)
plt.title('High MAR Class Distribution')
plt.xlabel('High_MAR (0=Low, 1=High)')
plt.ylabel('Count')
plt.xticks(
    range(len(present_classes)),
    [high_mar_labels.get(cls, str(cls)) for cls in present_classes],
    rotation=0,
)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

### 4.2 Species Classification Target

In [None]:
# Tabulate species frequencies before rare species are merged
species_counts = df['bacterial_species'].value_counts()
n_unique_species = df['bacterial_species'].nunique()

print("Species distribution (before merging rare species):")
print(species_counts)
print(f"\nTotal unique species: {n_unique_species}")

In [None]:
# Build the species target; species with fewer than 10 samples are meant to be merged into 'Other'
df = prepare_species_target(df, min_samples=10)

species_target = df['species_target']
print("Species distribution (after merging rare species):")
print(species_target.value_counts())
print(f"\nTotal unique species targets: {species_target.nunique()}")

In [None]:
# Bar chart of the species target classes, most frequent first
species_counts = df['species_target'].value_counts()

plt.figure(figsize=(12, 6))
species_ax = species_counts.plot(kind='bar', color='steelblue')
species_ax.set_title('Species Target Distribution (min_samples=10)')
species_ax.set_xlabel('Species')
species_ax.set_ylabel('Count')
plt.xticks(rotation=45, ha='right')  # slanted labels so long species names stay readable
species_ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## 5. Data Splitting

Split the data into 70% training, 20% validation, and 10% test sets. A separate stratified split is made for each target (`High_MAR` and `species_target`).

In [None]:
# Build the model feature matrix from the processed frame
X = create_feature_matrix(df)

n_features = X.shape[1]
print(f"Feature matrix shape: {X.shape}")
print(f"Number of features: {n_features}")
print("\nFeature names:")
print(X.columns.tolist())

In [None]:
# Take independent copies of both target columns
y_mar = df['High_MAR'].copy()            # MAR target
y_species = df['species_target'].copy()  # species target

print(f"MAR target shape: {y_mar.shape}")
print(f"Species target shape: {y_species.shape}")

# Count NaN cells remaining anywhere in the feature matrix
n_missing_cells = X.isnull().sum().sum()
print(f"\nFeature matrix has {n_missing_cells} missing values")

In [None]:
# 70/20/10 train/validation/test split, stratified on the MAR target
(X_train_mar, X_val_mar, X_test_mar,
 y_train_mar, y_val_mar, y_test_mar) = stratified_split(
    X, y_mar,
    train_size=0.7, val_size=0.2, test_size=0.1,
    random_state=42,
)

# Report each split's size and its share of the full feature matrix
print("MAR Target - Split sizes:")
total_rows = len(X)
for split_label, split_X in (
    ("Training", X_train_mar),
    ("Validation", X_val_mar),
    ("Test", X_test_mar),
):
    split_rows = split_X.shape[0]
    print(f"{split_label} set: {split_rows} samples ({split_rows/total_rows*100:.1f}%)")

In [None]:
# Print absolute and relative class counts for every MAR split
print("\nMAR Target - Class distribution in splits:")
for split_label, split_y in (
    ("Training", y_train_mar),
    ("Validation", y_val_mar),
    ("Test", y_test_mar),
):
    print(f"\n{split_label} set:")
    print(split_y.value_counts())
    print(split_y.value_counts(normalize=True))

In [None]:
# 70/20/10 train/validation/test split, stratified on the species target
(X_train_sp, X_val_sp, X_test_sp,
 y_train_sp, y_val_sp, y_test_sp) = stratified_split(
    X, y_species,
    train_size=0.7, val_size=0.2, test_size=0.1,
    random_state=42,
)

# Report each split's size and its share of the full feature matrix
print("Species Target - Split sizes:")
total_rows = len(X)
for split_label, split_X in (
    ("Training", X_train_sp),
    ("Validation", X_val_sp),
    ("Test", X_test_sp),
):
    split_rows = split_X.shape[0]
    print(f"{split_label} set: {split_rows} samples ({split_rows/total_rows*100:.1f}%)")

In [None]:
# Visualize split distributions
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# MAR target distribution.
# value_counts() orders classes by frequency, so sort the rows by class value
# and build the tick labels from the index; hard-coding ['Low', 'High'] would
# mislabel the bars whenever class 1 is the majority. fillna(0) covers a class
# that is absent from one of the splits.
mar_class_labels = {0: 'Low (0)', 1: 'High (1)'}
mar_data = pd.DataFrame({
    'Train': y_train_mar.value_counts(),
    'Validation': y_val_mar.value_counts(),
    'Test': y_test_mar.value_counts()
}).sort_index().fillna(0)
mar_data.plot(kind='bar', ax=axes[0])
axes[0].set_title('MAR Target Distribution Across Splits')
axes[0].set_xlabel('High_MAR Class')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(
    [mar_class_labels.get(cls, str(cls)) for cls in mar_data.index],
    rotation=0,
)
axes[0].legend()
axes[0].grid(True, alpha=0.3, axis='y')

# Species target distribution (top 5).
# Pick the top five species once, from the training split, and look the same
# species up in the other splits. Taking head(5) independently per split can
# select different species in each, which yields NaN bars and a re-sorted axis.
top_species = y_train_sp.value_counts().head(5).index
sp_data = pd.DataFrame({
    'Train': y_train_sp.value_counts().reindex(top_species, fill_value=0),
    'Validation': y_val_sp.value_counts().reindex(top_species, fill_value=0),
    'Test': y_test_sp.value_counts().reindex(top_species, fill_value=0)
})
sp_data.plot(kind='bar', ax=axes[1])
axes[1].set_title('Species Target Distribution (Top 5)')
axes[1].set_xlabel('Species')
axes[1].set_ylabel('Count')
axes[1].set_xticklabels(sp_data.index, rotation=45, ha='right')
axes[1].legend()
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 6. Save Processed Data

In [None]:
# Make sure the output directory exists (parents included; no error if already present)
processed_dir = Path('..') / 'data' / 'processed'
processed_dir.mkdir(exist_ok=True, parents=True)

print(f"Saving processed data to {processed_dir}...")

In [None]:
# Write the full processed frame to CSV without the index column
cleaned_path = processed_dir / 'cleaned_data.csv'
df.to_csv(cleaned_path, index=False)
print("✓ Saved cleaned_data.csv")

In [None]:
# Write the feature matrices from the MAR-target split (X_train.csv, X_val.csv, X_test.csv)
for split_name, split_X in (
    ('train', X_train_mar),
    ('val', X_val_mar),
    ('test', X_test_mar),
):
    split_X.to_csv(processed_dir / f'X_{split_name}.csv', index=False)
print("✓ Saved feature matrices (X_train, X_val, X_test)")

In [None]:
# Write the MAR targets, one file per split, each with a single 'High_MAR' column
for split_name, split_y in (
    ('train', y_train_mar),
    ('val', y_val_mar),
    ('test', y_test_mar),
):
    split_y.to_csv(processed_dir / f'y_{split_name}_mar.csv', index=False, header=['High_MAR'])
print("✓ Saved MAR targets (y_train_mar, y_val_mar, y_test_mar)")

In [None]:
# Save Species targets
y_train_sp.to_csv(processed_dir / 'y_train_species.csv', index=False, header=['species_target'])
y_val_sp.to_csv(processed_dir / 'y_val_species.csv', index=False, header=['species_target'])
y_test_sp.to_csv(processed_dir / 'y_test_species.csv', index=False, header=['species_target'])
print("✓ Saved species targets (y_train_species, y_val_species, y_test_species)")

# The species targets come from their own stratified_split call, so their rows
# are not guaranteed to line up with X_train/X_val/X_test (which are written
# from the MAR-target split). Also save the feature matrices that belong to the
# species split so the species targets can be paired with the correct rows.
X_train_sp.to_csv(processed_dir / 'X_train_species.csv', index=False)
X_val_sp.to_csv(processed_dir / 'X_val_species.csv', index=False)
X_test_sp.to_csv(processed_dir / 'X_test_species.csv', index=False)
print("✓ Saved species feature matrices (X_train_species, X_val_species, X_test_species)")

In [None]:
# Persist the feature column order as a JSON list
feature_names = X.columns.tolist()
feature_names_path = processed_dir / 'feature_names.json'
feature_names_path.write_text(json.dumps(feature_names, indent=2))
print("✓ Saved feature_names.json")

In [None]:
# Record the encoding scheme and the preprocessing parameters used in this notebook
resistance_encoding = {'s': 0, 'i': 1, 'r': 2}
resistance_encoding['description'] = 'Ordinal encoding for antibiotic resistance interpretations'

encoding_info = dict(
    resistance_encoding=resistance_encoding,
    mar_threshold=0.2,
    species_min_samples=10,
    num_features=len(feature_names),
    num_antibiotics=len(antibiotic_int_cols),
    train_size=0.7,
    val_size=0.2,
    test_size=0.1,
    random_state=42,
)

# Serialise to JSON with a two-space indent
mappings_path = processed_dir / 'encoding_mappings.json'
mappings_path.write_text(json.dumps(encoding_info, indent=2))
print("✓ Saved encoding_mappings.json")

In [None]:
# Create summary
# `df` has been overwritten by every preprocessing step above, so its shape
# describes the cleaned frame (including encoded and target columns), not the
# raw file. The section is labelled accordingly.
n_feature_rows = len(X)
n_high_mar = (y_mar == 1).sum()
n_low_mar = (y_mar == 0).sum()

# Actual share of each split, computed rather than assumed to be exactly 70/20/10
train_pct = X_train_mar.shape[0] / n_feature_rows * 100
val_pct = X_val_mar.shape[0] / n_feature_rows * 100
test_pct = X_test_mar.shape[0] / n_feature_rows * 100

summary = f"""
Data Preprocessing Summary
==========================

Cleaned dataset (after preprocessing):
  - Samples: {df.shape[0]}
  - Total columns (incl. derived): {df.shape[1]}
  - Antibiotic columns: {len(antibiotic_int_cols)}

Feature matrix:
  - Features: {X.shape[1]}
  - Samples: {X.shape[0]}

MAR Target:
  - High MAR samples: {n_high_mar} ({n_high_mar/len(y_mar)*100:.1f}%)
  - Low MAR samples: {n_low_mar} ({n_low_mar/len(y_mar)*100:.1f}%)

Species Target:
  - Unique species: {y_species.nunique()}

Data Splits:
  - Training: {X_train_mar.shape[0]} samples ({train_pct:.1f}%)
  - Validation: {X_val_mar.shape[0]} samples ({val_pct:.1f}%)
  - Test: {X_test_mar.shape[0]} samples ({test_pct:.1f}%)

Saved Files:
  - cleaned_data.csv
  - X_train.csv, X_val.csv, X_test.csv
  - y_train_mar.csv, y_val_mar.csv, y_test_mar.csv
  - y_train_species.csv, y_val_species.csv, y_test_species.csv
  - feature_names.json
  - encoding_mappings.json
"""

print(summary)

In [None]:
# Closing banner plus the absolute location of the output directory
banner = "=" * 50
print("\n" + banner)
print("DATA PREPROCESSING COMPLETE!")
print(banner)
print(f"\nAll processed files saved to: {processed_dir.absolute()}")