In [1]:
import pandas as pd
import numpy as np
from importlib import reload
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, mean_squared_error,r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
import argparse
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time
from sklearn.pipeline import  Pipeline, make_pipeline

from transformation import RemoveSkewnessKurtosis, Standardize

# Import Data
# path = '/home/daisy/FDA_Dataset/inpatient_all_final_1.csv'
# df1 = pd.read_csv(path).iloc[:,1:]
# df1.drop(columns = ['Veteran flag','Event date','Marital status', 'Marital status encoded',
#                     'State','Ruca category'], inplace=True, axis=1)
# X_admission1 = df1.drop(columns = ['Readmission', 'Died'])
# Y_admission1 = df1[['Readmission']]

# Split Train and Test
# X_train_ad1, X_test_ad1, y_train_ad1, y_test_ad1 = train_test_split(X_admission1, Y_admission1, test_size=0.20, random_state=42)

# 'Internalpatientid' is
# Transform Data
targets = ['readmission within 300 days', 'died_within_900days']

cat_cols = ['AO', 'CVD','Ethnicity', 'Gender', 'Races', 'Ethnicity_0', 'Ethnicity_1', 
            'Ethnicity_2', 'Races_0', 'Races_1', 'Races_2', 'Races_3', 
            'Ruca category encoded']

cat_cols = ['AO', 'CVD', 'Ruca category encoded', 'Ethnicity', 
            'Gender', 'Races', 'Ethnicity_0',
            'Ethnicity_1', 'Ethnicity_2', 'Races_0', 
            'Races_1', 'Races_2', 'Races_3','DOMICILIARY', 
            'MEDICINE', 'NHCU', 'NON-COUNT', 'OTHERS', 'PSYCHIATRY']

numeric_cols = ['num_stays', 'stay_length', 'num_unique_units',
       'num_transfers', 'num_cvd_readmission', 'unique_admitting_specialty', 
       'unique_discharging_specialty','Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100',
       'Age 100-120', 'age_mean', 'age_std', 'age_min', 'age_max', 'stay_min',
       'stay_max', 'stay_mean', 'stay_std', 'freq', 'total_procedure',
       'num_surgery_pro', 'num_immunization', 'Num med per admission mean',
       'Num med per admission min', 'Num med per admission max',
       'Total medications', 'mean age at specailty', 'period mean', 
       'specialty medical count', 'specialty support count',
       'period std','specialty count', 'Age 20-40 hypotension',
       'Age 40-60 hypotension', 'Age 60-80 hypotension',
       'Age 80-100 hypotension', 'Age 100-120 hypotension',
       'Age 20-40 hypertension', 'Age 40-60 hypertension',
       'Age 60-80 hypertension', 'Age 80-100 hypertension',
       'Age 100-120 hypertension', 'Age 20-40 healthy', 'Age 40-60 healthy',
       'Age 60-80 healthy', 'Age 80-100 healthy', 'Age 100-120 healthy',
       'lab_count', 'lab_freq', 'lab_age_mean', 'lab_age_std']

log_numeric_cols = ['num_stays_log', 'stay_length_log',
       'num_transfers_log', 'num_cvd_readmission_log',
       'unique_admitting_specialty_log', 'Age 20-40_log', 'Age 40-60_log',
       'Age 60-80_log', 'Age 80-100_log', 'Age 100-120_log', 'stay_min_log',
       'stay_max_log', 'stay_mean_log', 'stay_std_log', 'freq_log',
       'total_procedure_log', 'num_surgery_pro_log',
       'Num med per admission mean_log', 'Num med per admission min_log',
       'Num med per admission max_log', 'Total medications_log',
       'period mean_log', 'specialty medical count_log',
       'specialty support count_log', 'period std_log', 'specialty count_log',
       'Age 20-40 hypotension_log', 'Age 40-60 hypotension_log',
       'Age 60-80 hypotension_log', 'Age 80-100 hypotension_log',
       'Age 100-120 hypotension_log', 'Age 20-40 hypertension_log',
       'Age 40-60 hypertension_log', 'Age 60-80 hypertension_log',
       'Age 80-100 hypertension_log', 'Age 100-120 hypertension_log',
       'Age 20-40 healthy_log', 'Age 40-60 healthy_log',
       'Age 60-80 healthy_log', 'Age 80-100 healthy_log',
       'Age 100-120 healthy_log', 'lab_count_log', 'lab_freq_log']

# Import Data
path = '/home/daisy/FDA_Dataset/inpatient_all_final_1.csv'
df1 = pd.read_csv(path).iloc[:,1:]
X_admission1 = df1.drop(columns = ['readmission within 300 days', 'died_within_900days'])
Y_admission1 = df1[['readmission within 300 days']]

X_mortality1 = df1.drop(columns = ['died_within_900days'])
Y_mortality1 = df1[['died_within_900days']]

# Split Train and Test
X_train_ad1, X_test_ad1, y_train_ad1, y_test_ad1 = train_test_split(X_admission1, Y_admission1, test_size=0.20, random_state=42)


transform_steps = [("imputer", SimpleImputer(strategy="mean")),
                   ('RemoveSkewnessKurtosis', RemoveSkewnessKurtosis(targets, cat_cols, numeric_cols, log_numeric_cols)),
                   ('StandardizeStandardScaler', Standardize(log_numeric_cols, RobustScaler()))]
transform_pipeline = Pipeline(transform_steps)

data_prepared = transform_pipeline.fit_transform(X_train_ad1)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [16]:
df1.columns

Index(['Internalpatientid', 'num_stays', 'stay_length', 'num_unique_units',
       'num_transfers', 'num_cvd_readmission', 'AO', 'CVD',
       'unique_admitting_specialty', 'unique_discharging_specialty',
       'DOMICILIARY', 'MEDICINE', 'NHCU', 'NON-COUNT', 'OTHERS', 'PSYCHIATRY',
       'SURGERY', 'Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100',
       'Age 100-120', 'age_mean', 'age_std', 'age_min', 'age_max', 'stay_min',
       'stay_max', 'stay_mean', 'stay_std', 'freq', 'Medical', 'Mental',
       'Others_Specialty', 'Rehab', 'Gerontology',
       'readmission within 300 days', 'Age at death', 'died_within_900days',
       'total_procedure', 'num_surgery_pro', 'Ethnicity', 'Gender', 'Races',
       'Ethnicity_0', 'Ethnicity_1', 'Ethnicity_2', 'Races_0', 'Races_1',
       'Races_2', 'Races_3', 'num_immunization', 'Num med per admission mean',
       'Num med per admission min', 'Num med per admission max',
       'Total medications', 'mean age at specailty', 'period mean',
  

In [5]:
# Feature Selection


Unnamed: 0,Internalpatientid,num_stays_log_rob_scaled,stay_length_log_rob_scaled,num_transfers_log_rob_scaled,num_cvd_readmission_log_rob_scaled,unique_admitting_specialty_log_rob_scaled,Age 20-40_log_rob_scaled,Age 40-60_log_rob_scaled,Age 60-80_log_rob_scaled,Age 80-100_log_rob_scaled,...,Races_0,Races_1,Races_2,Races_3,DOMICILIARY,MEDICINE,NHCU,NON-COUNT,OTHERS,PSYCHIATRY
58864,117504,-0.203114,0.662921,0.693147,0.000000,0.000000,0.0,0.0,0.160558,0.000000,...,1,0,0,0,0,1,0,1,0,1
79350,158601,-0.834044,-1.191133,0.000000,0.000000,-0.756471,0.0,0.0,-0.226294,0.000000,...,1,0,0,0,0,0,0,0,0,0
16980,33850,-0.834044,0.080737,0.000000,0.000000,-0.756471,0.0,0.0,-0.613147,1.000000,...,0,1,0,0,0,1,0,0,0,0
66650,133029,-0.203114,-0.481701,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1.000000,...,0,0,1,0,0,2,0,0,0,0
59734,119286,-0.834044,-0.027878,0.693147,0.000000,-0.756471,0.0,0.0,-0.226294,0.000000,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,12293,-0.464974,0.013631,0.693147,0.000000,-0.313964,0.0,0.0,-0.613147,1.584963,...,0,1,0,0,0,0,1,0,0,0
54886,109616,1.113928,0.838865,1.791759,2.321928,1.367211,0.0,0.0,0.773706,2.584963,...,0,1,0,0,0,13,0,1,2,0
76820,153543,0.869744,1.182991,0.693147,0.000000,0.610740,0.0,0.0,0.818378,0.000000,...,1,0,0,0,0,9,1,0,0,0
860,1629,-0.203114,0.440412,0.693147,1.000000,0.000000,0.0,0.0,0.160558,0.000000,...,0,1,0,0,0,2,1,0,0,0


In [8]:
# Grid Search CV
import numpy as np
import pandas as pd
import scipy as sp
import copy,os,sys,psutil
import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import dump_svmlight_file
from sklearn.linear_model import LogisticRegression
 
from sklearn import metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
 
def print_best_score(gsearch,param_test):
     # 输出best score
    print("Best score: %0.3f" % gsearch.best_score_)
    print("Best parameters set:")
    # 输出最佳的分类器到底使用了怎样的参数
    best_parameters = gsearch.best_estimator_.get_params()
    for param_name in sorted(param_test.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
 
LogisticRegression_param = { 
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
    }

def LogisticRegression_Grid_CV(X_train,y_train, LogisticRegression_param):
    estimator = LogisticRegression()
    gsearch = GridSearchCV(estimator , param_grid = LogisticRegression_param, scoring='roc_auc', cv=5 )
    gsearch.fit(X_train, y_train)
    gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_
    print_best_score(gsearch,LogisticRegression_param)


LogisticRegression_Grid_CV(X_train_ad1, y_train_ad1, LogisticRegression_param)

ValueError: 
All the 210 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------------------------------------------------------------
140 fits failed with the following error:
Traceback (most recent call last):
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1196, in fit
    X, y = self._validate_data(
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/home/hassan/.conda/envs/mla/lib/python3.10/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [6]:
df1.columns

Index(['Internalpatientid', 'num_stays', 'stay_length', 'num_unique_units',
       'num_transfers', 'num_cvd_readmission', 'AO', 'CVD',
       'unique_admitting_specialty', 'unique_discharging_specialty',
       'DOMICILIARY', 'MEDICINE', 'NHCU', 'NON-COUNT', 'OTHERS', 'PSYCHIATRY',
       'SURGERY', 'Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100',
       'Age 100-120', 'age_mean', 'age_std', 'age_min', 'age_max', 'stay_min',
       'stay_max', 'stay_mean', 'stay_std', 'freq', 'Medical', 'Mental',
       'Others_Specialty', 'Rehab', 'Gerontology',
       'readmission within 300 days', 'Age at death', 'died_within_900days',
       'total_procedure', 'num_surgery_pro', 'Ethnicity', 'Gender', 'Races',
       'Ethnicity_0', 'Ethnicity_1', 'Ethnicity_2', 'Races_0', 'Races_1',
       'Races_2', 'Races_3', 'num_immunization', 'Num med per admission mean',
       'Num med per admission min', 'Num med per admission max',
       'Total medications', 'mean age at specailty', 'period mean',
  

In [3]:
df1

Unnamed: 0,Internalpatientid,num_stays,stay_length,num_unique_units,num_transfers,num_cvd_readmission,AO,CVD,unique_admitting_specialty,unique_discharging_specialty,...,Age 100-120 healthy,Age 20-40 hypertension,Age 40-60 hypertension,Age 60-80 hypertension,Age 80-100 hypertension,Age 100-120 hypertension,lab_count,lab_freq,lab_age_mean,lab_age_std
0,1,3,14.16,2,0,0,0,0,3,2,...,0.0,0.0,4.0,177.0,0.0,0.0,159.0,10.60,68.340586,3.105130
1,2,21,71.17,5,2,9,0,1,9,8,...,0.0,0.0,52.0,232.0,0.0,0.0,497.0,23.67,64.917227,3.982301
2,3,1,1.83,1,0,0,0,1,1,1,...,0.0,0.0,0.0,9.0,178.0,0.0,10.0,10.00,78.595827,0.234229
3,4,1,7.15,1,0,0,0,1,1,1,...,0.0,0.0,0.0,3.0,38.0,0.0,98.0,7.00,82.637824,2.862040
4,5,1,1.04,1,0,0,0,1,1,1,...,0.0,0.0,0.0,23.0,0.0,0.0,23.0,7.67,75.673279,0.783771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84531,169055,1,6.47,1,0,0,0,1,1,1,...,0.0,0.0,3.0,25.0,0.0,0.0,35.0,17.50,58.629968,0.503409
84532,169057,28,94.63,3,2,12,0,1,11,9,...,0.0,0.0,0.0,247.0,218.0,0.0,587.0,26.68,80.195538,4.555468
84533,169060,7,41.68,2,0,4,0,1,5,2,...,0.0,0.0,43.0,45.0,0.0,0.0,359.0,23.93,62.483905,5.565441
84534,169062,11,135.67,4,0,0,1,0,6,4,...,0.0,0.0,0.0,84.0,0.0,0.0,75.0,18.75,72.170051,1.402000


In [None]:
# Train Classifiers
