In [14]:
# Core libraries
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from matplotlib.ticker import PercentFormatter
from typing import Dict, Tuple, List
from scipy import stats

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.utils.class_weight import compute_class_weight
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.utils import check_X_y

try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
    _HAS_IMB = True
except Exception:
    _HAS_IMB = False

#import dataset

from ucimlrepo import fetch_ucirepo 
  
# Download the Cardiotocography dataset from the UCI ML Repository.
CTG_UCI_ID = 193
cardiotocography = fetch_ucirepo(id=CTG_UCI_ID)

# Feature table and target table (as pandas dataframes).
X = cardiotocography.data.features
y = cardiotocography.data.targets

# Dataset-level metadata first, then the per-variable information.
print(cardiotocography.metadata)
print(cardiotocography.variables)



{'uci_id': 193, 'name': 'Cardiotocography', 'repository_url': 'https://archive.ics.uci.edu/dataset/193/cardiotocography', 'data_url': 'https://archive.ics.uci.edu/static/public/193/data.csv', 'abstract': 'The dataset consists of measurements of fetal heart rate (FHR) and uterine contraction (UC) features on cardiotocograms classified by expert obstetricians.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 2126, 'num_features': 21, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['CLASS', 'NSP'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2000, 'last_updated': 'Fri Mar 15 2024', 'dataset_doi': '10.24432/C51S4N', 'creators': ['D. Campos', 'J. Bernardes'], 'intro_paper': None, 'additional_info': {'summary': '2126 fetal cardiotocograms (CTGs) were automatically processed and the respective diagnostic features measured. The CTGs were also classified

In [15]:
# Data cleaning and preprocessing

# Put features and targets side by side; both indexes are reset first so the
# rows line up by position rather than by whatever index they arrived with.
df = pd.concat(
    (X.reset_index(drop=True), y.reset_index(drop=True)),
    axis="columns",
)

# Total count of missing cells over every column of the combined frame.
missing_total = df.isna().sum().sum()
print("Number of missing values:", missing_total)

#preprocess function
def preprocess_ctg(
    df: pd.DataFrame,
    target_col: "str | List[str]" = "NSP",
    drop_cols: List[str] = ["DR"],  # drop DR, p=1 (default is never mutated)
    test_size: float = 0.2,
    random_state: int = 3407,
    use_smote: bool = True
) -> Dict[str, object]:
    """Split, scale and optionally rebalance a CTG table for classification.

    Steps: pick the feature columns, encode the labels as 0-based integers,
    make a stratified train/test split, standardise the features (scaler is
    fitted on the training part only) and, if requested, oversample the
    training part with SMOTE.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the feature columns and the label column.
    target_col : str or one-element list of str
        Name of the label column. A one-element list is accepted for
        backward compatibility with the original ``List[str]`` annotation.
    drop_cols : list of str
        Columns to leave out of the feature matrix. Names that are not
        present in ``df`` are ignored.
    test_size : float
        Passed to ``train_test_split``.
    random_state : int
        Seed used for both the split and SMOTE.
    use_smote : bool
        When True, oversample the training set with SMOTE (requires
        imbalanced-learn).

    Returns
    -------
    dict
        Keys ``"X_train"``, ``"y_train"``, ``"X_test"``, ``"y_test"``,
        ``"scaler"`` (the fitted ``StandardScaler``) and ``"feature_names"``
        (list of the columns used as features, in ``df`` order).

    Raises
    ------
    ValueError
        If ``target_col`` does not name exactly one column.
    KeyError
        If the label column is not present in ``df``.
    ImportError
        If ``use_smote`` is True but imbalanced-learn could not be imported.

    Notes
    -----
    NOTE(review): every column that is neither the label nor listed in
    ``drop_cols`` is used as a feature. If ``df`` still carries another label
    column (the UCI metadata lists both ``CLASS`` and ``NSP`` as targets),
    add it to ``drop_cols`` to keep it out of the feature matrix.
    """
    # Accept either a bare column name or a one-element list of names. The
    # original code did ``drop_cols + target_col``, which raised TypeError for
    # the default string value.
    if isinstance(target_col, str):
        target_names = [target_col]
    else:
        target_names = list(target_col)
    if len(target_names) != 1:
        raise ValueError(
            f"target_col must name exactly one label column, got {target_names!r}"
        )
    label_col = target_names[0]
    if label_col not in df.columns:
        raise KeyError(f"target column {label_col!r} not found in df")

    # Fail early with a clear message instead of a NameError on ``SMOTE``.
    if use_smote and not _HAS_IMB:
        raise ImportError(
            "use_smote=True requires imbalanced-learn; install it or pass use_smote=False"
        )

    # 1. select features
    if drop_cols is None:
        dropped = []
    elif isinstance(drop_cols, str):
        dropped = [drop_cols]
    else:
        dropped = list(drop_cols)
    excluded = set(dropped) | {label_col}
    feature_cols = [c for c in df.columns if c not in excluded]
    X = df[feature_cols].to_numpy()

    # 2. encode labels (for scikit-learn classifier); selecting by a single
    #    name keeps the label array 1-D
    y_raw = df[label_col].to_numpy()
    if np.array_equal(np.unique(y_raw), np.array([1, 2, 3])):
        y = y_raw.astype(int) - 1
    else:
        _, y = np.unique(y_raw, return_inverse=True)

    # 3. stratified split
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # 4. scale (train-fit, test-transform)
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)
    X_te = scaler.transform(X_te)

    # 5. SMOTE (only on training set)
    # We decide to use SMOTE since the number of type "Normal" (1655) is much
    # bigger than "Suspect" (295) and "Pathologic" (176), but S and P are
    # equally or even more important in real medical analysis.
    if use_smote:
        sm = SMOTE(random_state=random_state)
        X_tr, y_tr = sm.fit_resample(X_tr, y_tr)

    return {
        "X_train": X_tr,
        "y_train": y_tr,
        "X_test": X_te,
        "y_test": y_te,
        "scaler": scaler,
        "feature_names": feature_cols,
    }


Number of missing values: 0


In [10]:
#EDA
