In [2]:
pip install ucimlrepo numpy pandas matplotlib scikit-learn seaborn autograd torch

Note: you may need to restart the kernel to use updated packages.


In [3]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
support2 = fetch_ucirepo(id=880)

# data (as pandas dataframes)
X = support2.data.features
y = support2.data.targets

# metadata
print(support2.metadata)

# variable information
print(support2.variables)


{'uci_id': 880, 'name': 'SUPPORT2', 'repository_url': 'https://archive.ics.uci.edu/dataset/880/support2', 'data_url': 'https://archive.ics.uci.edu/static/public/880/data.csv', 'abstract': "This dataset comprises 9105 individual critically ill patients across 5 United States medical centers, accessioned throughout 1989-1991 and 1992-1994.\nEach row concerns hospitalized patient records who met the inclusion and exclusion criteria for nine disease categories: acute respiratory failure, chronic obstructive pulmonary disease, congestive heart failure, liver disease, coma, colon cancer, lung cancer, multiple organ system failure with malignancy, and multiple organ system failure with sepsis. The goal is to determine these patients' 2- and 6-month survival rates based on several physiologic, demographics, and disease severity information. \nIt is an important problem because it addresses the growing national concern over patients' loss of control near the end of life. It enables earlier deci

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.concat([X,y],axis=1)
data

Unnamed: 0,age,sex,dzgroup,dzclass,num.co,edu,income,scoma,charges,totcst,...,ph,glucose,bun,urine,adlp,adls,adlsc,death,hospdead,sfdm2
0,62.84998,male,Lung Cancer,Cancer,0,11.0,$11-$25k,0.0,9715.0,,...,7.459961,,,,7.0,7.0,7.000000,0,0,
1,60.33899,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,$11-$25k,44.0,34496.0,,...,7.250000,,,,,1.0,1.000000,1,1,<2 mo. follow-up
2,52.74698,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,under $11k,0.0,41094.0,,...,7.459961,,,,1.0,0.0,0.000000,1,0,<2 mo. follow-up
3,42.38498,female,Lung Cancer,Cancer,2,11.0,under $11k,0.0,3075.0,,...,,,,,0.0,0.0,0.000000,1,0,no(M2 and SIP pres)
4,79.88495,female,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,26.0,50127.0,,...,7.509766,,,,,2.0,2.000000,0,0,no(M2 and SIP pres)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,66.07300,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,8.0,,0.0,52870.0,34329.3125,...,7.459961,188.0,21.0,,,0.0,0.000000,0,0,
9101,55.15399,female,Coma,Coma,1,11.0,,41.0,35377.0,23558.5000,...,7.289062,190.0,49.0,0.0,,0.0,0.000000,0,0,
9102,70.38196,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,0.0,46564.0,31409.0156,...,7.379883,189.0,60.0,3900.0,,,2.525391,0,0,
9103,47.01999,male,MOSF w/Malig,ARF/MOSF,1,13.0,,0.0,58439.0,,...,7.469727,246.0,55.0,,,0.0,0.000000,1,1,<2 mo. follow-up


# Data Preprocessing

## Data Cleaning

### Missing Values

In [5]:
missing_value_table = data.isnull().sum()
missing_value_proportion = missing_value_table[missing_value_table>0].sort_values(ascending=False) / len(data)
for i in missing_value_proportion.index:
    print("{}: {:.2f}%".format(i,missing_value_proportion[i]*100), f'dtype={data[i].dtype}')

adlp: 61.95% dtype=float64
urine: 53.40% dtype=float64
glucose: 49.42% dtype=float64
bun: 47.80% dtype=float64
totmcst: 38.17% dtype=float64
alb: 37.03% dtype=float64
income: 32.75% dtype=object
adls: 31.49% dtype=float64
bili: 28.57% dtype=float64
pafi: 25.54% dtype=float64
ph: 25.09% dtype=float64
prg2m: 18.11% dtype=float64
edu: 17.95% dtype=float64
prg6m: 17.94% dtype=float64
sfdm2: 15.38% dtype=object
totcst: 9.75% dtype=float64
wblc: 2.33% dtype=float64
charges: 1.89% dtype=float64
avtisst: 0.90% dtype=float64
crea: 0.74% dtype=float64
race: 0.46% dtype=object
dnrday: 0.33% dtype=float64
dnr: 0.33% dtype=object
scoma: 0.01% dtype=float64
sps: 0.01% dtype=float64
meanbp: 0.01% dtype=float64
aps: 0.01% dtype=float64
surv2m: 0.01% dtype=float64
surv6m: 0.01% dtype=float64
resp: 0.01% dtype=float64
hrt: 0.01% dtype=float64
temp: 0.01% dtype=float64
sod: 0.01% dtype=float64


In [6]:
X_pd = pd.DataFrame(X)
y_pd = pd.DataFrame(y)
X_pd.isnull().sum(), y_pd.isnull().sum()

(age            0
 sex            0
 dzgroup        0
 dzclass        0
 num.co         0
 edu         1634
 income      2982
 scoma          1
 charges      172
 totcst       888
 totmcst     3475
 avtisst       82
 race          42
 sps            1
 aps            1
 surv2m         1
 surv6m         1
 hday           0
 diabetes       0
 dementia       0
 ca             0
 prg2m       1649
 prg6m       1633
 dnr           30
 dnrday        30
 meanbp         1
 wblc         212
 hrt            1
 resp           1
 temp           1
 pafi        2325
 alb         3372
 bili        2601
 crea          67
 sod            1
 ph          2284
 glucose     4500
 bun         4352
 urine       4862
 adlp        5641
 adls        2867
 adlsc          0
 dtype: int64,
 death          0
 hospdead       0
 sfdm2       1400
 dtype: int64)

In [7]:
for i in data.columns:
  print(f'{i}:{data[i].dtype}')

age:float64
sex:object
dzgroup:object
dzclass:object
num.co:int64
edu:float64
income:object
scoma:float64
charges:float64
totcst:float64
totmcst:float64
avtisst:float64
race:object
sps:float64
aps:float64
surv2m:float64
surv6m:float64
hday:int64
diabetes:int64
dementia:int64
ca:object
prg2m:float64
prg6m:float64
dnr:object
dnrday:float64
meanbp:float64
wblc:float64
hrt:float64
resp:float64
temp:float64
pafi:float64
alb:float64
bili:float64
crea:float64
sod:float64
ph:float64
glucose:float64
bun:float64
urine:float64
adlp:float64
adls:float64
adlsc:float64
death:int64
hospdead:int64
sfdm2:object


In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder
# OrdinalEncoder is used for encoding categorical features


def fill_missing_values(data: pd.DataFrame, missing_col: str) -> pd.DataFrame:
    """
    Fill missing values in a specified column of a DataFrame.
    For categorical columns (dtype 'object'), use DecisionTreeClassifier.
    For numerical columns, use Polynomial Regression (degree = 2).

    Parameters:
    -----------
    data : pd.DataFrame
        The input DataFrame containing missing values.
    missing_col : str
        The name of the column with missing values to be filled.

    Returns:
    --------
    pd.DataFrame
        The DataFrame with missing values in the specified column filled.
    """
    # Create a boolean mask to identify rows with missing values in the target column
    missing_mask = data[missing_col].isnull()

    # If there are no missing values, return the original data
    if not missing_mask.any():
        return data

    # Separate the feature matrix (X) and the target column (y)
    # X_full: DataFrame with the missing column dropped
    # y_full: The target column with missing values
    X_full = data.drop(missing_col, axis = 1)
    y_full = data[missing_col]

    # Split the data into training set (non - missing values) and prediction set (missing values)
    # X_train: Features for training (rows without missing values in the target column)
    # y_train: Target values for training (non - missing values in the target column)
    # X_missing: Features for prediction (rows with missing values in the target column)
    X_train = X_full[~missing_mask]
    y_train = y_full[~missing_mask]
    X_missing = X_full[missing_mask]

    # Handle categorical columns (dtype 'object')
    if data[missing_col].dtype == 'object' or 'O':
        # Use OrdinalEncoder to encode categorical features.
        # handle_unknown='use_encoded_value' and unknown_value=-1: Deal with unknown categories in the prediction set
        enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

        # Encode the training features
        X_train_enc = enc.fit_transform(X_train)
        # Encode the prediction features if there are missing values to predict
        X_missing_enc = enc.transform(X_missing) if not X_missing.empty else None

        # Encode the training target (convert categorical target to numerical)
        y_train_enc = enc.fit_transform(y_train.values.reshape(-1, 1)).ravel()

        # Create and train a DecisionTreeClassifier
        model = DecisionTreeClassifier(random_state = 42)
        model.fit(X_train_enc, y_train_enc)

        # Predict the missing values if there are rows to predict
        if not X_missing.empty:
            y_pred_enc = model.predict(X_missing_enc)
            # Decode the predicted values back to the original categorical values
            y_pred = enc.inverse_transform(y_pred_enc.reshape(-1, 1)).ravel()
            # Fill the missing values in the original DataFrame
            data.loc[missing_mask, missing_col] = y_pred

    # Handle numerical columns
    else:
        # Create polynomial features (degree = 2)
        poly = PolynomialFeatures(degree = 2)
        # Transform the training features to polynomial features
        X_train_poly = poly.fit_transform(X_train)

        # Create and train a LinearRegression model
        model = LinearRegression()
        model.fit(X_train_poly, y_train)

        # Predict the missing values if there are rows to predict
        if not X_missing.empty:
            # Transform the prediction features to polynomial features
            X_missing_poly = poly.transform(X_missing)
            y_pred = model.predict(X_missing_poly)
            # Fill the missing values in the original DataFrame
            data.loc[missing_mask, missing_col] = y_pred

    return data[missing_col]


for i in missing_value_proportion.index:
  data[i] = fill_missing_values(data, i)

  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  y_type = type_of_target(y, input_name="y")
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  data.loc[missing_mask, missing_col] = y_pred
  y_type = type_of_target(y, input_name="y")
 15566.1562 2677.668 13825.3984 328378.5 21992.5469 328378.5 2402.2305
 4942.6133 14153.4141 28589.9844 2610.9629 3553.2793 4699.6172 4865.1836
 1732.7705 34996.3438 16626.1875 328378.5 29580.2344 38120.125 113269.062
 23774.0 328378.5 3473.5801 1871.2832 328378.5 10006.6484 328378.5
 10915.9531 0.

In [9]:
data.isnull().sum()

age         0
sex         0
dzgroup     0
dzclass     0
num.co      0
edu         0
income      0
scoma       0
charges     0
totcst      0
totmcst     0
avtisst     0
race        0
sps         0
aps         0
surv2m      0
surv6m      0
hday        0
diabetes    0
dementia    0
ca          0
prg2m       0
prg6m       0
dnr         0
dnrday      0
meanbp      0
wblc        0
hrt         0
resp        0
temp        0
pafi        0
alb         0
bili        0
crea        0
sod         0
ph          0
glucose     0
bun         0
urine       0
adlp        0
adls        0
adlsc       0
death       0
hospdead    0
sfdm2       0
dtype: int64

In [10]:
data

Unnamed: 0,age,sex,dzgroup,dzclass,num.co,edu,income,scoma,charges,totcst,...,ph,glucose,bun,urine,adlp,adls,adlsc,death,hospdead,sfdm2
0,62.84998,male,Lung Cancer,Cancer,0,11.0,$11-$25k,0.0,9715.0,3718.8301,...,7.459961,152.0,16.0,48.0,7.0,7.0,7.000000,0,0,no(M2 and SIP pres)
1,60.33899,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,$11-$25k,44.0,34496.0,328378.5,...,7.25,89.0,57.0,1220.0,0.0,1.0,1.000000,1,1,<2 mo. follow-up
2,52.74698,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,under $11k,0.0,41094.0,7537.4258,...,7.459961,417.0,45.0,2780.0,1.0,0.0,0.000000,1,0,<2 mo. follow-up
3,42.38498,female,Lung Cancer,Cancer,2,11.0,under $11k,0.0,3075.0,8907.6328,...,7.449219,85.0,14.0,2390.0,0.0,0.0,0.000000,1,0,no(M2 and SIP pres)
4,79.88495,female,ARF/MOSF w/Sepsis,ARF/MOSF,1,12.0,under $11k,26.0,50127.0,4920.8438,...,7.509766,417.0,7.0,3770.0,5.0,2.0,2.000000,0,0,no(M2 and SIP pres)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,66.07300,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,8.0,$25-$50k,0.0,52870.0,34329.3125,...,7.459961,188.0,21.0,2900.0,0.0,0.0,0.000000,0,0,SIP>=30
9101,55.15399,female,Coma,Coma,1,11.0,under $11k,41.0,35377.0,23558.5,...,7.289062,190.0,49.0,0.0,1.0,0.0,0.000000,0,0,SIP>=30
9102,70.38196,male,ARF/MOSF w/Sepsis,ARF/MOSF,1,12.0,under $11k,0.0,46564.0,31409.0156,...,7.379883,189.0,60.0,3900.0,2.0,0.0,2.525391,0,0,no(M2 and SIP pres)
9103,47.01999,male,MOSF w/Malig,ARF/MOSF,1,13.0,under $11k,0.0,58439.0,25119.9688,...,7.469727,246.0,55.0,400.0,1.0,0.0,0.000000,1,1,<2 mo. follow-up


### Outlier Detection

In [11]:
def outlier_detection(data):
    """
    Detect outliers for numerical and categorical features.
    Returns a DataFrame with outlier flags (1=outlier, 0=normal) for each method.
    """
    results = data.copy()
    outlier_flags = pd.DataFrame(index=results.index)

    # Numerical: IQR method (threshold=3, typical for moderate outlier frequency)
    num_cols = results.select_dtypes(include=["number"]).columns
    def iqr_detector(col, threshold=3):
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr
        return ((col < lower_bound) | (col > upper_bound)).astype(int)
    for col in num_cols:
        outlier_flags[f'iqr_{col}'] = iqr_detector(results[col])

    # Categorical: rare category (threshold=0.0005, i.e., <.1% frequency)
    cat_cols = results.select_dtypes(include=["object", "category"]).columns
    def category_outlier_detector(col, threshold=0.0005):
        freq = col.value_counts(normalize=True)
        rare_categories = freq[freq < threshold].index
        return col.isin(rare_categories).astype(int)
    for col in cat_cols:
        outlier_flags[f'cat_outlier_{col}'] = category_outlier_detector(results[col])

    return outlier_flags

In [12]:
demo = outlier_detection(data.copy())
demo.describe()

Unnamed: 0,iqr_age,iqr_num.co,iqr_hday,iqr_diabetes,iqr_dementia,iqr_adlsc,iqr_death,iqr_hospdead,cat_outlier_sex,cat_outlier_dzgroup,...,cat_outlier_bili,cat_outlier_crea,cat_outlier_sod,cat_outlier_ph,cat_outlier_glucose,cat_outlier_bun,cat_outlier_urine,cat_outlier_adlp,cat_outlier_adls,cat_outlier_sfdm2
count,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0,...,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0,9105.0
mean,0.0,0.0,0.12279,0.195277,0.03251,0.0,0.0,0.0,0.0,0.0,...,0.036573,0.009226,0.003075,0.003954,0.041955,0.009226,0.214607,0.0,0.0,0.0
std,0.0,0.0,0.328214,0.396436,0.177359,0.0,0.0,0.0,0.0,0.0,...,0.187722,0.095612,0.055372,0.062759,0.200497,0.095612,0.410572,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0


In [13]:
count = 0
for row in demo.index:
    if demo.loc[row].sum() > 3.5:
        # print(f"Row {row} is an outlier in the following methods: {demo.loc[row][demo.loc[row] == 1].index.tolist()}")
        pass
    else:
        print(f"Row {row} is not an outlier in any method.")
        count += 1
count

Row 0 is not an outlier in any method.
Row 2 is not an outlier in any method.
Row 3 is not an outlier in any method.
Row 4 is not an outlier in any method.
Row 5 is not an outlier in any method.
Row 6 is not an outlier in any method.
Row 7 is not an outlier in any method.
Row 8 is not an outlier in any method.
Row 10 is not an outlier in any method.
Row 11 is not an outlier in any method.
Row 12 is not an outlier in any method.
Row 13 is not an outlier in any method.
Row 15 is not an outlier in any method.
Row 16 is not an outlier in any method.
Row 18 is not an outlier in any method.
Row 19 is not an outlier in any method.
Row 20 is not an outlier in any method.
Row 21 is not an outlier in any method.
Row 23 is not an outlier in any method.
Row 24 is not an outlier in any method.
Row 25 is not an outlier in any method.
Row 26 is not an outlier in any method.
Row 28 is not an outlier in any method.
Row 29 is not an outlier in any method.
Row 30 is not an outlier in any method.
Row 31 i

4308

In [14]:
# delete outliers
def remove_outliers(data):
    """
    Remove outliers from the DataFrame.
    Returns a DataFrame with outliers removed.
    """
    crit = outlier_detection(data)
    for row in crit.index:
        if crit.loc[row].sum() > 3.5:
            data = data.drop(row)
    return data

data_no_outliers = remove_outliers(data.copy())

In [15]:
# recode the index
data_no_outliers.reset_index(drop=True, inplace=True)
data_no_outliers

Unnamed: 0,age,sex,dzgroup,dzclass,num.co,edu,income,scoma,charges,totcst,...,ph,glucose,bun,urine,adlp,adls,adlsc,death,hospdead,sfdm2
0,62.84998,male,Lung Cancer,Cancer,0,11.0,$11-$25k,0.0,9715.0,3718.8301,...,7.459961,152.0,16.0,48.0,7.0,7.0,7.000000,0,0,no(M2 and SIP pres)
1,52.74698,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,under $11k,0.0,41094.0,7537.4258,...,7.459961,417.0,45.0,2780.0,1.0,0.0,0.000000,1,0,<2 mo. follow-up
2,42.38498,female,Lung Cancer,Cancer,2,11.0,under $11k,0.0,3075.0,8907.6328,...,7.449219,85.0,14.0,2390.0,0.0,0.0,0.000000,1,0,no(M2 and SIP pres)
3,79.88495,female,ARF/MOSF w/Sepsis,ARF/MOSF,1,12.0,under $11k,26.0,50127.0,4920.8438,...,7.509766,417.0,7.0,3770.0,5.0,2.0,2.000000,0,0,no(M2 and SIP pres)
4,93.01599,male,Coma,Coma,1,14.0,$25-$50k,55.0,6884.0,328378.5,...,7.65918,417.0,7.0,3976.0,5.0,1.0,1.000000,1,1,<2 mo. follow-up
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4303,51.45200,female,Lung Cancer,Cancer,1,12.0,$11-$25k,0.0,8023.0,4859.3516,...,7.359375,154.0,59.0,1450.0,0.0,0.0,1.922607,1,0,no(M2 and SIP pres)
4304,79.17596,male,COPD,COPD/CHF/Cirrhosis,2,4.0,under $11k,9.0,20070.0,13088.1406,...,7.449219,149.0,11.0,1150.0,0.0,0.0,0.000000,0,0,no(M2 and SIP pres)
4305,18.04199,female,MOSF w/Malig,ARF/MOSF,0,12.0,under $11k,0.0,32141.0,0.0,...,7.419922,159.0,18.0,3060.0,0.0,0.0,2.147461,0,0,no(M2 and SIP pres)
4306,68.61597,female,COPD,COPD/CHF/Cirrhosis,2,12.0,under $11k,0.0,2758.0,1847.376,...,7.489258,142.0,17.0,950.0,0.0,0.0,0.494751,0,0,no(M2 and SIP pres)


## Feature Engineering

In [16]:
def encoding(data):
    '''
    Encode categorical features using OneHotEncoder for multi-category variables,
    and boolean indicator (1/0) for dichotomous variables.
    '''
    results = data.copy()
    cat_cols = results.select_dtypes(include=["object", "category"]).columns.tolist()
    for col in cat_cols:
        n_unique = results[col].nunique(dropna=False)
        if n_unique == 2:
            # Binary indicator: map the two categories to 1 and 0
            categories = results[col].dropna().unique()
            mapping = {categories[0]: 1, categories[1]: 0}
            results[f'binary_{col}'] = results[col].map(mapping)
            results = results.drop(col, axis=1)
        elif n_unique > 2:
            # One-hot encode multi-category columns
            dummies = pd.get_dummies(results[col], prefix=col, drop_first=True, dtype=float)
            results = pd.concat([results.drop(col, axis=1), dummies], axis=1)
    return results

data_encoded = encoding(data_no_outliers)
data_encoded

Unnamed: 0,age,num.co,hday,diabetes,dementia,adlsc,death,hospdead,binary_sex,dzgroup_CHF,...,adls_2.0,adls_3.0,adls_4.0,adls_5.0,adls_6.0,adls_7.0,sfdm2_Coma or Intub,sfdm2_SIP>=30,sfdm2_adl>=4 (>=5 if sur),sfdm2_no(M2 and SIP pres)
0,62.84998,0,1,0,0,7.000000,0,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,52.74698,2,4,0,0,0.000000,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42.38498,2,1,0,0,0.000000,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,79.88495,1,3,0,0,2.000000,0,0,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,93.01599,1,1,0,0,1.000000,1,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4303,51.45200,1,1,0,0,1.922607,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4304,79.17596,2,1,0,0,0.000000,0,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4305,18.04199,0,4,0,0,2.147461,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4306,68.61597,2,1,0,0,0.494751,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Data Aggregation

# Data Visualization

# Clustering Analysis

# Prediction: Training and Testing

# Evaluation and Choice of Prediction Model

# Open-Ended Exploration