In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
support2 = fetch_ucirepo(id=880)

# data (as pandas dataframes)
X = support2.data.features
y = support2.data.targets

# metadata
print(support2.metadata)

# variable information
print(support2.variables)


{'uci_id': 880, 'name': 'SUPPORT2', 'repository_url': 'https://archive.ics.uci.edu/dataset/880/support2', 'data_url': 'https://archive.ics.uci.edu/static/public/880/data.csv', 'abstract': "This dataset comprises 9105 individual critically ill patients across 5 United States medical centers, accessioned throughout 1989-1991 and 1992-1994.\nEach row concerns hospitalized patient records who met the inclusion and exclusion criteria for nine disease categories: acute respiratory failure, chronic obstructive pulmonary disease, congestive heart failure, liver disease, coma, colon cancer, lung cancer, multiple organ system failure with malignancy, and multiple organ system failure with sepsis. The goal is to determine these patients' 2- and 6-month survival rates based on several physiologic, demographics, and disease severity information. \nIt is an important problem because it addresses the growing national concern over patients' loss of control near the end of life. It enables earlier deci

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.concat([X,y],axis=1)
data

Unnamed: 0,age,sex,dzgroup,dzclass,num.co,edu,income,scoma,charges,totcst,...,ph,glucose,bun,urine,adlp,adls,adlsc,death,hospdead,sfdm2
0,62.84998,male,Lung Cancer,Cancer,0,11.0,$11-$25k,0.0,9715.0,,...,7.459961,,,,7.0,7.0,7.000000,0,0,
1,60.33899,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,$11-$25k,44.0,34496.0,,...,7.250000,,,,,1.0,1.000000,1,1,<2 mo. follow-up
2,52.74698,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,under $11k,0.0,41094.0,,...,7.459961,,,,1.0,0.0,0.000000,1,0,<2 mo. follow-up
3,42.38498,female,Lung Cancer,Cancer,2,11.0,under $11k,0.0,3075.0,,...,,,,,0.0,0.0,0.000000,1,0,no(M2 and SIP pres)
4,79.88495,female,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,26.0,50127.0,,...,7.509766,,,,,2.0,2.000000,0,0,no(M2 and SIP pres)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,66.07300,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,8.0,,0.0,52870.0,34329.3125,...,7.459961,188.0,21.0,,,0.0,0.000000,0,0,
9101,55.15399,female,Coma,Coma,1,11.0,,41.0,35377.0,23558.5000,...,7.289062,190.0,49.0,0.0,,0.0,0.000000,0,0,
9102,70.38196,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,0.0,46564.0,31409.0156,...,7.379883,189.0,60.0,3900.0,,,2.525391,0,0,
9103,47.01999,male,MOSF w/Malig,ARF/MOSF,1,13.0,,0.0,58439.0,,...,7.469727,246.0,55.0,,,0.0,0.000000,1,1,<2 mo. follow-up


# Data Preprocessing

## Data Cleaning

In [4]:
missing_value_table = data.isnull().sum()
missing_value_proportion = missing_value_table[missing_value_table>0].sort_values(ascending=False) / len(data)
for i in missing_value_proportion.index:
    print("{}: {:.2f}%".format(i,missing_value_proportion[i]*100), f'dtype={data[i].dtype}')

adlp: 61.95% dtype=float64
urine: 53.40% dtype=float64
glucose: 49.42% dtype=float64
bun: 47.80% dtype=float64
totmcst: 38.17% dtype=float64
alb: 37.03% dtype=float64
income: 32.75% dtype=object
adls: 31.49% dtype=float64
bili: 28.57% dtype=float64
pafi: 25.54% dtype=float64
ph: 25.09% dtype=float64
prg2m: 18.11% dtype=float64
edu: 17.95% dtype=float64
prg6m: 17.94% dtype=float64
sfdm2: 15.38% dtype=object
totcst: 9.75% dtype=float64
wblc: 2.33% dtype=float64
charges: 1.89% dtype=float64
avtisst: 0.90% dtype=float64
crea: 0.74% dtype=float64
race: 0.46% dtype=object
dnrday: 0.33% dtype=float64
dnr: 0.33% dtype=object
scoma: 0.01% dtype=float64
sps: 0.01% dtype=float64
meanbp: 0.01% dtype=float64
aps: 0.01% dtype=float64
surv2m: 0.01% dtype=float64
surv6m: 0.01% dtype=float64
resp: 0.01% dtype=float64
hrt: 0.01% dtype=float64
temp: 0.01% dtype=float64
sod: 0.01% dtype=float64


In [5]:
X_pd = pd.DataFrame(X)
y_pd = pd.DataFrame(y)
X_pd.isnull().sum(), y_pd.isnull().sum()

(age            0
 sex            0
 dzgroup        0
 dzclass        0
 num.co         0
 edu         1634
 income      2982
 scoma          1
 charges      172
 totcst       888
 totmcst     3475
 avtisst       82
 race          42
 sps            1
 aps            1
 surv2m         1
 surv6m         1
 hday           0
 diabetes       0
 dementia       0
 ca             0
 prg2m       1649
 prg6m       1633
 dnr           30
 dnrday        30
 meanbp         1
 wblc         212
 hrt            1
 resp           1
 temp           1
 pafi        2325
 alb         3372
 bili        2601
 crea          67
 sod            1
 ph          2284
 glucose     4500
 bun         4352
 urine       4862
 adlp        5641
 adls        2867
 adlsc          0
 dtype: int64,
 death          0
 hospdead       0
 sfdm2       1400
 dtype: int64)

In [6]:
for i in data.columns:
  print(f'{i}:{data[i].dtype}')

age:float64
sex:object
dzgroup:object
dzclass:object
num.co:int64
edu:float64
income:object
scoma:float64
charges:float64
totcst:float64
totmcst:float64
avtisst:float64
race:object
sps:float64
aps:float64
surv2m:float64
surv6m:float64
hday:int64
diabetes:int64
dementia:int64
ca:object
prg2m:float64
prg6m:float64
dnr:object
dnrday:float64
meanbp:float64
wblc:float64
hrt:float64
resp:float64
temp:float64
pafi:float64
alb:float64
bili:float64
crea:float64
sod:float64
ph:float64
glucose:float64
bun:float64
urine:float64
adlp:float64
adls:float64
adlsc:float64
death:int64
hospdead:int64
sfdm2:object


In [7]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder
# OrdinalEncoder is used for encoding categorical features


def fill_missing_values(data: pd.DataFrame, missing_col: str) -> pd.DataFrame:
    """
    Fill missing values in a specified column of a DataFrame.
    For categorical columns (dtype 'object'), use DecisionTreeClassifier.
    For numerical columns, use Polynomial Regression (degree = 2).

    Parameters:
    -----------
    data : pd.DataFrame
        The input DataFrame containing missing values.
    missing_col : str
        The name of the column with missing values to be filled.

    Returns:
    --------
    pd.DataFrame
        The DataFrame with missing values in the specified column filled.
    """
    # Create a boolean mask to identify rows with missing values in the target column
    missing_mask = data[missing_col].isnull()

    # If there are no missing values, return the original data
    if not missing_mask.any():
        return data

    # Separate the feature matrix (X) and the target column (y)
    # X_full: DataFrame with the missing column dropped
    # y_full: The target column with missing values
    X_full = data.drop(missing_col, axis = 1)
    y_full = data[missing_col]

    # Split the data into training set (non - missing values) and prediction set (missing values)
    # X_train: Features for training (rows without missing values in the target column)
    # y_train: Target values for training (non - missing values in the target column)
    # X_missing: Features for prediction (rows with missing values in the target column)
    X_train = X_full[~missing_mask]
    y_train = y_full[~missing_mask]
    X_missing = X_full[missing_mask]

    # Handle categorical columns (dtype 'object')
    if data[missing_col].dtype == 'object' or 'O':
        # Use OrdinalEncoder to encode categorical features.
        # handle_unknown='use_encoded_value' and unknown_value=-1: Deal with unknown categories in the prediction set
        enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

        # Encode the training features
        X_train_enc = enc.fit_transform(X_train)
        # Encode the prediction features if there are missing values to predict
        X_missing_enc = enc.transform(X_missing) if not X_missing.empty else None

        # Encode the training target (convert categorical target to numerical)
        y_train_enc = enc.fit_transform(y_train.values.reshape(-1, 1)).ravel()

        # Create and train a DecisionTreeClassifier
        model = DecisionTreeClassifier(random_state = 42)
        model.fit(X_train_enc, y_train_enc)

        # Predict the missing values if there are rows to predict
        if not X_missing.empty:
            y_pred_enc = model.predict(X_missing_enc)
            # Decode the predicted values back to the original categorical values
            y_pred = enc.inverse_transform(y_pred_enc.reshape(-1, 1)).ravel()
            # Fill the missing values in the original DataFrame
            data.loc[missing_mask, missing_col] = y_pred

    # Handle numerical columns
    else:
        # Create polynomial features (degree = 2)
        poly = PolynomialFeatures(degree = 2)
        # Transform the training features to polynomial features
        X_train_poly = poly.fit_transform(X_train)

        # Create and train a LinearRegression model
        model = LinearRegression()
        model.fit(X_train_poly, y_train)

        # Predict the missing values if there are rows to predict
        if not X_missing.empty:
            # Transform the prediction features to polynomial features
            X_missing_poly = poly.transform(X_missing)
            y_pred = model.predict(X_missing_poly)
            # Fill the missing values in the original DataFrame
            data.loc[missing_mask, missing_col] = y_pred

    return data[missing_col]


for i in missing_value_proportion.index:
  data[i] = fill_missing_values(data, i)

  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
 15566.1562 2677.668 13825.3984 328378.5 21992.5469 328378.5 2402.2305
 4942.6133 14153.4141 28589.9844 2610.9629 3553.2793 4699.6172 4865.1836
 1732.7705 34996.3438 16626.1875 328378.5 29580.2344 38120.125 113269.062
 23774.0 328378.5 3473.5801 1871.2832 328378.5 10006.6484 328378.5
 10915.9531 0.0 4241.5977 328378.5 2026.7148 83700.5 269131.25 15566.1562
 10652.4844 328378.5 94576.687

In [8]:
data.isnull().sum()

Unnamed: 0,0
age,0
sex,0
dzgroup,0
dzclass,0
num.co,0
edu,0
income,0
scoma,0
charges,0
totcst,0


In [9]:
data

Unnamed: 0,age,sex,dzgroup,dzclass,num.co,edu,income,scoma,charges,totcst,...,ph,glucose,bun,urine,adlp,adls,adlsc,death,hospdead,sfdm2
0,62.84998,male,Lung Cancer,Cancer,0,11.0,$11-$25k,0.0,9715.0,3718.8301,...,7.459961,152.0,16.0,48.0,7.0,7.0,7.000000,0,0,no(M2 and SIP pres)
1,60.33899,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,$11-$25k,44.0,34496.0,328378.5,...,7.25,89.0,57.0,1220.0,0.0,1.0,1.000000,1,1,<2 mo. follow-up
2,52.74698,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,under $11k,0.0,41094.0,7537.4258,...,7.459961,417.0,45.0,2780.0,1.0,0.0,0.000000,1,0,<2 mo. follow-up
3,42.38498,female,Lung Cancer,Cancer,2,11.0,under $11k,0.0,3075.0,8907.6328,...,7.449219,85.0,14.0,2390.0,0.0,0.0,0.000000,1,0,no(M2 and SIP pres)
4,79.88495,female,ARF/MOSF w/Sepsis,ARF/MOSF,1,12.0,under $11k,26.0,50127.0,4920.8438,...,7.509766,417.0,7.0,3770.0,5.0,2.0,2.000000,0,0,no(M2 and SIP pres)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,66.07300,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,8.0,$25-$50k,0.0,52870.0,34329.3125,...,7.459961,188.0,21.0,2900.0,0.0,0.0,0.000000,0,0,SIP>=30
9101,55.15399,female,Coma,Coma,1,11.0,under $11k,41.0,35377.0,23558.5,...,7.289062,190.0,49.0,0.0,1.0,0.0,0.000000,0,0,SIP>=30
9102,70.38196,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,12.0,under $11k,0.0,46564.0,31409.0156,...,7.379883,189.0,60.0,3900.0,2.0,0.0,2.525391,0,0,no(M2 and SIP pres)
9103,47.01999,male,MOSF w/Malig,ARF/MOSF,1,13.0,under $11k,0.0,58439.0,25119.9688,...,7.469727,246.0,55.0,400.0,1.0,0.0,0.000000,1,1,<2 mo. follow-up


### Outlier Detection

In [11]:
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import LabelEncoder

def outlier_detection(data):
    results = data.copy()

    # ------------------------------------------------------
    # Method 1: z-score (normal distribution)
    # ------------------------------------------------------
    def z_score_detector(col, threshold=3):
        mean = col.mean()
        std = col.std()
        z_score = (col - mean) / std
        return (abs(z_score) > threshold).astype(int)  # 1=outlier

    # apply to numeric columns
    num_cols = results.columns.index
    for col in num_cols:
        results[f'zscore_{col}'] = z_score_detector(results[col])

    # ------------------------------------------------------
    # Method 2: IQR (skewed distribution)
    # ------------------------------------------------------
    def iqr_detector(col, threshold=1.5):
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr
        return ((col < lower_bound) | (col > upper_bound)).astype(int)  # 1=异常

    # apply to features
    # results['iqr_feature3'] = iqr_detector(results['feature3'])

    # ------------------------------------------------------
    # Method 3: Isolation Forest
    # ------------------------------------------------------

    X_num = results[num_cols]
    iso_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
    results['iso_forest'] = iso_forest.fit_predict(X_num)  # -1=outlier, 1=normal

    # ------------------------------------------------------
    # Method 4: One-class SVM (high-dimensional data)
    # ------------------------------------------------------
    ocsvm = OneClassSVM(nu=0.1, kernel='rbf', gamma=0.1)  # nu≈outlier proportion
    results['oneclass_svm'] = ocsvm.fit_predict(X_num)    # -1=outlier, 1=normal

    # ------------------------------------------------------
    # Method 5: Outlier based on categories
    # ------------------------------------------------------
    def category_outlier_detector(col, threshold=0.05):
        # calculate frequency
        freq = col.value_counts(normalize=True)
        # outlier as the frequency is lower than an interval
        rare_categories = freq[freq < threshold].index
        return col.isin(rare_categories).astype(int)  # 1=outlier

    results['category_outlier'] = category_outlier_detector(results['category'])

    return results

# Data Visualization

# Clustering Analysis

# Prediction: Training and Testing

# Evaluation and Choice of Prediction Model

# Open-Ended Exploration