# Install Packages

In [1]:
!pip install tpot mljar-supervised

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tpot
  Downloading TPOT-0.11.7-py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 2.7 MB/s 
[?25hCollecting mljar-supervised
  Downloading mljar-supervised-0.11.3.tar.gz (112 kB)
[K     |████████████████████████████████| 112 kB 19.4 MB/s 
Collecting update-checker>=0.16
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting xgboost>=1.1.0
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[K     |████████████████████████████████| 255.9 MB 34 kB/s 
[?25hCollecting deap>=1.2
  Downloading deap-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (139 kB)
[K     |████████████████████████████████| 139 kB 53.6 MB/s 
Collecting lightgbm>=3.0.0
  Downloading lightgbm-3.3.3-py3-none-manylinux1_x86_64.whl

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML


# Options Available

- mode — the package ships with four built-in modes.
  - The Explain mode is ideal for explaining and understanding the data. It results in visualizations of feature importance as well as tree visualizations.
  - The Perform mode is used when building ML models for production.
  - The Compete mode is meant to build models used in machine learning competitions.
  - The Optuna mode is used to search for highly-tuned ML models.
- algorithms — specifies the algorithms you would like to use. They are usually passed in as a list.
- results_path — the path where the results will be stored
- total_time_limit — the total time in seconds for training the model
- train_ensemble — dictates if an ensemble will be created at the end of the training process
- stack_models — determines if a stack of models will be created
- eval_metric — the metric that will be optimized. If set to "auto", logloss is used for classification problems and rmse is used for regression problems

In [None]:
# Template of the AutoML constructor options described above.
# Left commented out on purpose: uncomment and fill in the placeholders to use it.
#automl = AutoML(
    # mode="Explain",
    # algorithms="",
    # results_path="AutoML_22",
    # total_time_limit=30 * 60,
    # train_ensemble=True,
    # stack_models="",
    # eval_metric=""
#)

# Healthcare Dataset - SPARCS

## Load in dataset

In [3]:
import pandas as pd
# Read the SPARCS sample CSV directly from the course GitHub repository (needs network access).
sparcs = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_sparcs.csv')
# Bare last expression: show the loaded frame as the cell output.
sparcs

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [4]:
# List every column name to see which fields are available as features or targets.
sparcs.columns

Index(['Health Service Area', 'Hospital County',
       'Operating Certificate Number', 'Facility Id', 'Facility Name',
       'Age Group', 'Zip Code - 3 digits', 'Gender', 'Race', 'Ethnicity',
       'Length of Stay', 'Type of Admission', 'Patient Disposition',
       'Discharge Year', 'CCS Diagnosis Code', 'CCS Diagnosis Description',
       'CCS Procedure Code', 'CCS Procedure Description', 'APR DRG Code',
       'APR DRG Description', 'APR MDC Code', 'APR MDC Description',
       'APR Severity of Illness Code', 'APR Severity of Illness Description',
       'APR Risk of Mortality', 'APR Medical Surgical Description',
       'Payment Typology 1', 'Payment Typology 2', 'Payment Typology 3',
       'Birth Weight', 'Abortion Edit Indicator',
       'Emergency Department Indicator', 'Total Charges', 'Total Costs'],
      dtype='object')

## Potential variables of interest

- APR Risk of Mortality (categorical) 
- Total costs (continuous) 
- Length of Stay

In [5]:
# Summarise the categorical Ethnicity column (count, number of unique values, most frequent value).
sparcs['Ethnicity'].describe()

count                 23583
unique                    4
top       Not Span/Hispanic
freq                  19641
Name: Ethnicity, dtype: object

In [6]:
# Summarise the numeric Total Costs column (mean, spread, quartiles, extremes).
sparcs['Total Costs'].describe()

count    2.358300e+04
mean     1.472282e+04
std      2.718098e+04
min      6.700000e-01
25%      4.471700e+03
50%      8.320120e+03
75%      1.590874e+04
max      1.591541e+06
Name: Total Costs, dtype: float64

In [7]:
# Count how many records fall under each admission type.
sparcs['Type of Admission'].value_counts()

Emergency        14968
Elective          4508
Newborn           2285
Urgent            1743
Trauma              63
Not Available       16
Name: Type of Admission, dtype: int64

## Create some simplified binary versions

In [None]:
# Binarise length of stay: more than 3 days is 'long', everything else is 'short'.
# The guard keeps the cell re-runnable once 'Length of Stay' has already been dropped.
if 'Length of Stay' in sparcs.columns:
    # Strip a '+' marker before parsing so a top-coded value such as "120 +" is read as 120
    # instead of being coerced to NaN and then mislabelled 'short'.
    # NOTE(review): presumably SPARCS encodes stays of 120+ days this way — confirm
    # against the data dictionary.
    los_days = pd.to_numeric(
        sparcs['Length of Stay'].astype(str).str.replace('+', '', regex=False).str.strip(),
        errors='coerce',
    )
    # Values that still cannot be parsed are NaN; NaN > 3 is False, so they stay 'short',
    # matching the original behaviour for genuinely missing values.
    sparcs['sparcs_los'] = los_days.gt(3).map({True: 'long', False: 'short'})
    # Drop the raw column so the derived label cannot leak through it as a feature.
    sparcs = sparcs.drop(columns=['Length of Stay'])
sparcs['sparcs_los'].value_counts()


# MLJar Examples

## Multiclass Classifier Example 1 - SPARCS

### **Create new model**

In [8]:
# Feature matrix: every column except the prediction target.
X = sparcs.drop('Age Group', axis=1)

In [9]:
# Target vector: the age-group label of each record.
y = sparcs.loc[:, "Age Group"]

In [10]:
# Inspect the feature frame ('Age Group' removed).
X

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Zip Code - 3 digits,Gender,Race,Ethnicity,Length of Stay,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,147,M,White,Not Span/Hispanic,3,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,148,F,White,Not Span/Hispanic,2,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,147,M,White,Not Span/Hispanic,3,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,148,F,White,Not Span/Hispanic,1,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,148,M,White,Not Span/Hispanic,14,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,117,F,White,Not Span/Hispanic,6,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,117,F,Other Race,Spanish/Hispanic,3,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,117,M,White,Not Span/Hispanic,12,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,117,M,White,Not Span/Hispanic,5,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [11]:
# Inspect the target series.
y

0           30 to 49
1           18 to 29
2            0 to 17
3        70 or Older
4           50 to 69
            ...     
23578    70 or Older
23579        0 to 17
23580       30 to 49
23581    70 or Older
23582    70 or Older
Name: Age Group, Length: 23583, dtype: object

In [12]:
# Hold out 25% of the rows for testing, stratified on the target so each age group keeps
# its share in both splits. random_state pins the shuffle so the split, and every metric
# computed from it, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)

In [13]:
# Inspect the held-out feature rows.
X_test

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Zip Code - 3 digits,Gender,Race,Ethnicity,Length of Stay,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
2651,Finger Lakes,Monroe,2701005.0,413.0,Strong Memorial Hospital,144,F,White,Not Span/Hispanic,3,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,,0,N,Y,6295.78,3957.08
3592,Central NY,Jefferson,2201000.0,367.0,Samaritan Medical Center,136,M,White,Not Span/Hispanic,7,...,Extreme,Medical,Medicare,Medicare,Self-Pay,0,N,Y,45827.45,12032.77
17261,New York City,Manhattan,7002032.0,1466.0,Mount Sinai Roosevelt,112,M,Black/African American,Not Span/Hispanic,4,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,,3900,N,N,16402.65,8931.00
1347,Western NY,Genesee,1801000.0,339.0,United Memorial Medical Center North Street Ca...,140,F,White,Not Span/Hispanic,2,...,Minor,Medical,Medicaid,Medicaid,,0,N,N,6609.92,2027.64
21757,Long Island,Nassau,2951001.0,541.0,North Shore University Hospital,115,M,White,Spanish/Hispanic,5,...,Minor,Medical,Medicare,,,0,N,Y,69397.30,13520.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14192,New York City,Manhattan,7002002.0,1439.0,Mount Sinai Beth Israel,104,F,Black/African American,Spanish/Hispanic,2,...,Minor,Surgical,Private Health Insurance,Self-Pay,,0,N,N,45290.30,12783.82
5096,Central NY,Tompkins,5401001.0,977.0,Cayuga Medical Center at Ithaca,148,F,White,Unknown,2,...,Minor,Medical,Private Health Insurance,Self-Pay,Self-Pay,0,N,N,2824.50,4864.31
21554,Long Island,Nassau,2951001.0,541.0,North Shore University Hospital,111,M,White,Not Span/Hispanic,2,...,Minor,Medical,Blue Cross/Blue Shield,,,3100,N,N,9216.52,1855.86
5053,Central NY,St Lawrence,4429000.0,815.0,Canton-Potsdam Hospital,136,F,White,Unknown,10,...,Moderate,Surgical,Medicare,Private Health Insurance,Self-Pay,0,N,Y,37142.46,21557.47


In [45]:
# Configure an AutoML run in "Explain" mode; its artifacts go to the "Age_Group" directory.
automl = AutoML(results_path="Age_Group", mode="Explain")

In [46]:
# Train on the training split only. No task type is passed; the log below reports that
# AutoML treated this as multiclass classification optimised on logloss.
automl.fit(X_train, y_train)

Linear algorithm was disabled.
AutoML directory: Age_Group
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 1.547487 trained in 1.16 seconds
2_DecisionTree logloss 1.006226 trained in 18.17 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost logloss 0.817724 trained in 60.92 seconds
4_Default_NeuralNetwork logloss 0.94393 trained in 8.09 seconds
5_Default_RandomForest logloss 0.947229 trained in 33.73 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.817724 trained in 0.64 seconds
AutoML fit time: 133.31 seconds
AutoML best model: 3_Default_Xgboost


AutoML(results_path='Age_Group')

In [47]:
# Predict age groups for the held-out rows and score them against the true labels.
# Without this, the test split is predicted but never evaluated; the metrics shown by
# automl.report() appear to come from AutoML's own validation split (its support does
# not match the size of X_test).
pred = automl.predict(X_test)
print(f"Held-out accuracy: {accuracy_score(y_test, pred):.4f}")
pred

array(['18 to 29', '70 or Older', '0 to 17', ..., '0 to 17',
       '70 or Older', '70 or Older'], dtype=object)

In [48]:
# Render the leaderboard followed by per-model metrics and confusion matrices.
automl.report()

Best model,name,model_type,metric_type,metric_value,train_time
,1_Baseline,Baseline,logloss,1.54749,1.88
,2_DecisionTree,Decision Tree,logloss,1.00623,19.38
the best,3_Default_Xgboost,Xgboost,logloss,0.817724,62.19
,4_Default_NeuralNetwork,Neural Network,logloss,0.94393,9.13
,5_Default_RandomForest,Random Forest,logloss,0.947229,35.05
,Ensemble,Ensemble,logloss,0.817724,0.64

Model,Weight
3_Default_Xgboost,3

Unnamed: 0,0 to 17,18 to 29,30 to 49,50 to 69,70 or Older,accuracy,macro avg,weighted avg,logloss
precision,0.890017,0.506623,0.49328,0.517715,0.734665,0.631389,0.62846,0.627035,0.817724
recall,0.799392,0.334061,0.428238,0.582114,0.844955,0.631389,0.597752,0.631389,0.817724
f1-score,0.842274,0.402632,0.458463,0.548029,0.78596,0.631389,0.607472,0.624985,0.817724
support,658.0,458.0,857.0,1230.0,1219.0,0.631389,4422.0,4422.0,0.817724

Unnamed: 0,Predicted as 0 to 17,Predicted as 18 to 29,Predicted as 30 to 49,Predicted as 50 to 69,Predicted as 70 or Older
Labeled as 0 to 17,526,12,50,68,2
Labeled as 18 to 29,20,153,184,96,5
Labeled as 30 to 49,26,115,367,320,29
Labeled as 50 to 69,18,21,139,716,336
Labeled as 70 or Older,1,1,4,183,1030

Unnamed: 0,0 to 17,18 to 29,30 to 49,50 to 69,70 or Older,accuracy,macro avg,weighted avg,logloss
precision,0.873857,0.452308,0.369674,0.497189,0.696085,0.585256,0.577823,0.578705,0.94393
recall,0.726444,0.320961,0.344224,0.503252,0.860541,0.585256,0.551084,0.585256,0.94393
f1-score,0.793361,0.375479,0.356495,0.500202,0.769626,0.585256,0.559033,0.577327,0.94393
support,658.0,458.0,857.0,1230.0,1219.0,0.585256,4422.0,4422.0,0.94393

Unnamed: 0,Predicted as 0 to 17,Predicted as 18 to 29,Predicted as 30 to 49,Predicted as 50 to 69,Predicted as 70 or Older
Labeled as 0 to 17,478,5,109,62,4
Labeled as 18 to 29,16,147,187,101,7
Labeled as 30 to 49,22,173,295,308,59
Labeled as 50 to 69,29,0,194,619,388
Labeled as 70 or Older,2,0,13,155,1049

Unnamed: 0,0 to 17,18 to 29,30 to 49,50 to 69,70 or Older,accuracy,macro avg,weighted avg,logloss
precision,0,0,0,0.278155,0,0.278155,0.0556309,0.07737,1.54749
recall,0,0,0,1.0,0,0.278155,0.2,0.278155,1.54749
f1-score,0,0,0,0.435244,0,0.278155,0.0870488,0.121065,1.54749
support,658,458,857,1230.0,1219,0.278155,4422.0,4422.0,1.54749

Unnamed: 0,Predicted as 0 to 17,Predicted as 18 to 29,Predicted as 30 to 49,Predicted as 50 to 69,Predicted as 70 or Older
Labeled as 0 to 17,0,0,0,658,0
Labeled as 18 to 29,0,0,0,458,0
Labeled as 30 to 49,0,0,0,857,0
Labeled as 50 to 69,0,0,0,1230,0
Labeled as 70 or Older,0,0,0,1219,0

Unnamed: 0,0 to 17,18 to 29,30 to 49,50 to 69,70 or Older,accuracy,macro avg,weighted avg,logloss
precision,0.890017,0.506623,0.49328,0.517715,0.734665,0.631389,0.62846,0.627035,0.817724
recall,0.799392,0.334061,0.428238,0.582114,0.844955,0.631389,0.597752,0.631389,0.817724
f1-score,0.842274,0.402632,0.458463,0.548029,0.78596,0.631389,0.607472,0.624985,0.817724
support,658.0,458.0,857.0,1230.0,1219.0,0.631389,4422.0,4422.0,0.817724

Unnamed: 0,Predicted as 0 to 17,Predicted as 18 to 29,Predicted as 30 to 49,Predicted as 50 to 69,Predicted as 70 or Older
Labeled as 0 to 17,526,12,50,68,2
Labeled as 18 to 29,20,153,184,96,5
Labeled as 30 to 49,26,115,367,320,29
Labeled as 50 to 69,18,21,139,716,336
Labeled as 70 or Older,1,1,4,183,1030

Unnamed: 0,0 to 17,18 to 29,30 to 49,50 to 69,70 or Older,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.547085,0.484404,0.434917,0.690106,0.593849,0.631302,0.610558,0.947229
recall,0.647416,0.266376,0.308051,0.573171,0.909762,0.593849,0.540955,0.593849,0.947229
f1-score,0.785978,0.358297,0.376605,0.494563,0.784855,0.593849,0.56006,0.580976,0.947229
support,658.0,458.0,857.0,1230.0,1219.0,0.593849,4422.0,4422.0,0.947229

Unnamed: 0,Predicted as 0 to 17,Predicted as 18 to 29,Predicted as 30 to 49,Predicted as 50 to 69,Predicted as 70 or Older
Labeled as 0 to 17,426,4,33,194,1
Labeled as 18 to 29,0,122,151,178,7
Labeled as 30 to 49,0,97,264,439,57
Labeled as 50 to 69,0,0,92,705,433
Labeled as 70 or Older,0,0,5,105,1109

Unnamed: 0,0 to 17,18 to 29,30 to 49,50 to 69,70 or Older,accuracy,macro avg,weighted avg,logloss
precision,1.0,0.460177,0,0.38477,0.67068,0.557214,0.503125,0.488373,1.00623
recall,0.647416,0.340611,0,0.62439,0.913864,0.557214,0.505256,0.557214,1.00623
f1-score,0.785978,0.391468,0,0.476131,0.773611,0.557214,0.485438,0.503198,1.00623
support,658.0,458.0,857,1230.0,1219.0,0.557214,4422.0,4422.0,1.00623

Unnamed: 0,Predicted as 0 to 17,Predicted as 18 to 29,Predicted as 30 to 49,Predicted as 50 to 69,Predicted as 70 or Older
Labeled as 0 to 17,426,9,0,222,1
Labeled as 18 to 29,0,156,0,290,12
Labeled as 30 to 49,0,166,0,612,79
Labeled as 50 to 69,0,7,0,768,455
Labeled as 70 or Older,0,1,0,104,1114


### **Test new (not really) data**

In [18]:
# Load the previously trained model from disk.
# results_path must be the directory the model was trained into ("Age_Group");
# "Age Group" (with a space) is a different path from the one used at training time.
automl_sparcs_los = AutoML(results_path="Age_Group")

In [19]:
# Create a small dataset that follows the same structure as the training set.
# NOTE: the rows are drawn from the full `sparcs` frame, so some were probably seen
# during training — treat the comparison below as a smoke test, not an evaluation.
# random_state makes the 25-row sample repeatable across runs.
X_withlos = sparcs.sample(25, random_state=42)
# Same rows with the target removed, i.e. what the model is given to predict from.
# (The "los" in these names is historical; the two frames differ by 'Age Group'.)
X_withoutlos = X_withlos.drop(columns=['Age Group'])

In [20]:
# Sampled rows, still including the true 'Age Group' column.
X_withlos

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
10503,New York City,Bronx,7000014.0,1176.0,SBH Health System,30 to 49,104.0,M,Other Race,Not Span/Hispanic,...,Minor,Medical,Medicaid,Medicaid,,0,N,Y,48538.61,12841.87
5366,Capital/Adirond,Albany,101000.0,1.0,Albany Medical Center Hospital,18 to 29,120.0,M,Other Race,Unknown,...,Minor,Surgical,Self-Pay,,,0,N,Y,63037.55,19952.44
9314,New York City,Bronx,7000002.0,1165.0,Jacobi Medical Center,50 to 69,104.0,M,Black/African American,Not Span/Hispanic,...,Major,Medical,Medicaid,,,0,N,Y,53147.51,30723.25
3924,Central NY,Oneida,3202003.0,599.0,Faxton-St Lukes Healthcare St Lukes Division,30 to 49,133.0,F,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,N,7610.0,4282.29
23116,Long Island,Suffolk,5154000.0,924.0,Southside Hospital,0 to 17,117.0,M,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,Medicaid,,3200,N,N,10944.0,2775.1
7026,Hudson Valley,Dutchess,1302001.0,181.0,Vassar Brothers Medical Center,30 to 49,125.0,F,Other Race,Unknown,...,Minor,Medical,Blue Cross/Blue Shield,,,0,N,N,19156.63,6796.34
15820,New York City,Manhattan,7002024.0,1456.0,Mount Sinai Hospital,50 to 69,112.0,M,White,Not Span/Hispanic,...,Minor,Surgical,Private Health Insurance,Self-Pay,,0,N,Y,19386.09,7708.71
5897,Capital/Adirond,Columbia,1001000.0,146.0,Columbia Memorial Hospital,70 or Older,120.0,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Blue Cross/Blue Shield,,0,N,Y,8941.62,3349.33
2799,Finger Lakes,Monroe,2754001.0,471.0,The Unity Hospital of Rochester,70 or Older,146.0,F,White,Not Span/Hispanic,...,Major,Medical,Medicare,Medicare,,0,N,Y,13420.78,5505.49
23002,Long Island,Suffolk,5154000.0,924.0,Southside Hospital,18 to 29,117.0,M,Black/African American,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,Y,36423.0,6992.44


In [21]:
# The same sampled rows with 'Age Group' removed (the model input).
X_withoutlos

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Zip Code - 3 digits,Gender,Race,Ethnicity,Length of Stay,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
10503,New York City,Bronx,7000014.0,1176.0,SBH Health System,104.0,M,Other Race,Not Span/Hispanic,15,...,Minor,Medical,Medicaid,Medicaid,,0,N,Y,48538.61,12841.87
5366,Capital/Adirond,Albany,101000.0,1.0,Albany Medical Center Hospital,120.0,M,Other Race,Unknown,3,...,Minor,Surgical,Self-Pay,,,0,N,Y,63037.55,19952.44
9314,New York City,Bronx,7000002.0,1165.0,Jacobi Medical Center,104.0,M,Black/African American,Not Span/Hispanic,8,...,Major,Medical,Medicaid,,,0,N,Y,53147.51,30723.25
3924,Central NY,Oneida,3202003.0,599.0,Faxton-St Lukes Healthcare St Lukes Division,133.0,F,White,Not Span/Hispanic,3,...,Minor,Medical,Medicaid,,,0,N,N,7610.0,4282.29
23116,Long Island,Suffolk,5154000.0,924.0,Southside Hospital,117.0,M,White,Not Span/Hispanic,2,...,Minor,Medical,Medicaid,Medicaid,,3200,N,N,10944.0,2775.1
7026,Hudson Valley,Dutchess,1302001.0,181.0,Vassar Brothers Medical Center,125.0,F,Other Race,Unknown,3,...,Minor,Medical,Blue Cross/Blue Shield,,,0,N,N,19156.63,6796.34
15820,New York City,Manhattan,7002024.0,1456.0,Mount Sinai Hospital,112.0,M,White,Not Span/Hispanic,1,...,Minor,Surgical,Private Health Insurance,Self-Pay,,0,N,Y,19386.09,7708.71
5897,Capital/Adirond,Columbia,1001000.0,146.0,Columbia Memorial Hospital,120.0,F,White,Not Span/Hispanic,3,...,Moderate,Medical,Medicare,Blue Cross/Blue Shield,,0,N,Y,8941.62,3349.33
2799,Finger Lakes,Monroe,2754001.0,471.0,The Unity Hospital of Rochester,146.0,F,White,Not Span/Hispanic,6,...,Major,Medical,Medicare,Medicare,,0,N,Y,13420.78,5505.49
23002,Long Island,Suffolk,5154000.0,924.0,Southside Hospital,117.0,M,Black/African American,Not Span/Hispanic,6,...,Minor,Medical,Medicaid,,,0,N,Y,36423.0,6992.44


In [22]:
# Predict with the in-memory `automl` object fitted earlier in this section
# (note: not the `automl_sparcs_los` instance created above).
predict = automl.predict(X_withoutlos)
predict

array(['30 to 49', '50 to 69', '50 to 69', '18 to 29', '0 to 17',
       '18 to 29', '30 to 49', '70 or Older', '70 or Older', '30 to 49',
       '30 to 49', '30 to 49', '70 or Older', '30 to 49', '50 to 69',
       '50 to 69', '70 or Older', '30 to 49', '0 to 17', '50 to 69',
       '70 or Older', '30 to 49', '70 or Older', '18 to 29',
       '70 or Older'], dtype=object)

In [24]:
# Put the true age groups (taken from X_withlos) next to the model's predictions, row by row.
values_actual = list(X_withlos['Age Group'])
values_predicted = list(predict)
output = pd.DataFrame(dict(actual=values_actual, predicted=values_predicted))
output

Unnamed: 0,actual,predicted
0,30 to 49,30 to 49
1,18 to 29,50 to 69
2,50 to 69,50 to 69
3,30 to 49,18 to 29
4,0 to 17,0 to 17
5,30 to 49,18 to 29
6,50 to 69,30 to 49
7,70 or Older,70 or Older
8,70 or Older,70 or Older
9,18 to 29,30 to 49


## Regression - Example - GENERIC

> Indented block



In [25]:
import numpy as np
import pandas as pd
from supervised.automl import AutoML

# Load a fresh copy of the SPARCS CSV for the regression example.
df = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/HHA-507-2022/main/autoML/datasets/data_sparcs.csv')
# Target is Total Charges; every remaining column is used as a feature.
# Note: X and y are rebound here, replacing the classification features/target.
y = df["Total Charges"]
x_cols = df.columns.drop("Total Charges").tolist()
X = df[x_cols]

In [26]:
# Inspect the frame loaded for the regression example.
df

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Medical,Private Health Insurance,,,0,N,Y,4757.01,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,5090.25,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,4948.50,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,4719.75,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,50384.75,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,50833.00,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Medical,Private Health Insurance,,,3200,N,N,10948.00,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Minor,Medical,Medicaid,,,0,N,N,46421.00,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Medical,Medicare,Medicare,,0,N,Y,46122.00,7951.26


In [27]:
# Feature column names: every column except 'Total Charges'.
x_cols

['Health Service Area',
 'Hospital County',
 'Operating Certificate Number',
 'Facility Id',
 'Facility Name',
 'Age Group',
 'Zip Code - 3 digits',
 'Gender',
 'Race',
 'Ethnicity',
 'Length of Stay',
 'Type of Admission',
 'Patient Disposition',
 'Discharge Year',
 'CCS Diagnosis Code',
 'CCS Diagnosis Description',
 'CCS Procedure Code',
 'CCS Procedure Description',
 'APR DRG Code',
 'APR DRG Description',
 'APR MDC Code',
 'APR MDC Description',
 'APR Severity of Illness Code',
 'APR Severity of Illness Description',
 'APR Risk of Mortality',
 'APR Medical Surgical Description',
 'Payment Typology 1',
 'Payment Typology 2',
 'Payment Typology 3',
 'Birth Weight',
 'Abortion Edit Indicator',
 'Emergency Department Indicator',
 'Total Costs']

In [33]:
# Inspect the regression feature frame.
X

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Severity of Illness Description,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,30 to 49,147,M,White,Not Span/Hispanic,...,Minor,Minor,Medical,Private Health Insurance,,,0,N,Y,4747.83
1,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,18 to 29,148,F,White,Not Span/Hispanic,...,Minor,Minor,Medical,Blue Cross/Blue Shield,Self-Pay,Self-Pay,0,N,N,2985.64
2,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,0 to 17,147,M,White,Not Span/Hispanic,...,Minor,Minor,Medical,Self-Pay,Self-Pay,Self-Pay,2900,N,N,2129.67
3,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,70 or Older,148,F,White,Not Span/Hispanic,...,Moderate,Moderate,Medical,Medicare,Medicare,Self-Pay,0,N,Y,8454.41
4,Western NY,Allegany,228000.0,39.0,Memorial Hosp of Wm F & Gertrude F Jones A/K/A...,50 to 69,148,M,White,Not Span/Hispanic,...,Extreme,Major,Medical,Blue Cross/Blue Shield,Medicare,Self-Pay,0,N,Y,34565.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23578,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,F,White,Not Span/Hispanic,...,Moderate,Moderate,Medical,Medicare,Private Health Insurance,,0,N,Y,8961.40
23579,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,0 to 17,117,F,Other Race,Spanish/Hispanic,...,Minor,Minor,Medical,Private Health Insurance,,,3200,N,N,2214.06
23580,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,30 to 49,117,M,White,Not Span/Hispanic,...,Moderate,Minor,Medical,Medicaid,,,0,N,N,11083.24
23581,Long Island,Suffolk,5157003.0,943.0,St Catherine of Siena Hospital,70 or Older,117,M,White,Not Span/Hispanic,...,Major,Major,Medical,Medicare,Medicare,,0,N,Y,7951.26


In [29]:
# Inspect the regression target (Total Charges).
y

0         4757.01
1         5090.25
2         4948.50
3         4719.75
4        50384.75
           ...   
23578    50833.00
23579    10948.00
23580    46421.00
23581    46122.00
23582    32225.00
Name: Total Charges, Length: 23583, dtype: float64

In [34]:
# Configure a second AutoML run in "Explain" mode with artifacts in the "regression" directory.
# Note: this rebinds `automl`, replacing the classifier object from the earlier section.
automl = AutoML(results_path="regression", mode="Explain")
# Fit on the full X / y; no train/test split is made in this section.
automl.fit(X,y)

Linear algorithm was disabled.
AutoML directory: regression
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline rmse 72381.653229 trained in 1.83 seconds
2_DecisionTree rmse 33806.950052 trained in 12.18 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost rmse 19944.100875 trained in 148.61 seconds
4_Default_NeuralNetwork rmse 36631.607976 trained in 6.95 seconds
5_Default_RandomForest rmse 36887.793213 trained in 18.01 seconds
* Step ensemble will try to check up to 1 model
Ensemble rmse 18837.918818 trained in 0.29 seconds
AutoML fit time: 196.59 seconds
AutoML best model: Ensemble


AutoML(results_path='regression')

In [35]:
# In-sample predictions: X is the same data the model was just fitted on.
df["Predictions"] = automl.predict(X)

In [36]:
# Show the first five actual charges beside their predicted values.
print("Predictions")
print(df[["Total Charges", "Predictions"]].head())

Predictions
   Total Charges   Predictions
0        4757.01   7439.961726
1        5090.25   7661.853523
2        4948.50   7600.419929
3        4719.75   7525.814851
4       50384.75  58561.460695


In [37]:
# Render the regression leaderboard, ensemble weights and per-model error metrics.
automl.report()

Best model,name,model_type,metric_type,metric_value,train_time
,1_Baseline,Baseline,rmse,72381.7,2.23
,2_DecisionTree,Decision Tree,rmse,33807.0,13.1
,3_Default_Xgboost,Xgboost,rmse,19944.1,149.56
,4_Default_NeuralNetwork,Neural Network,rmse,36631.6,7.64
,5_Default_RandomForest,Random Forest,rmse,36887.8,18.99
the best,Ensemble,Ensemble,rmse,18837.9,0.29

Model,Weight
2_DecisionTree,1
3_Default_Xgboost,4

Metric,Score
MAE,6589.05
MSE,354867000.0
RMSE,18837.9
R2,0.932259
MAPE,1.00164

Metric,Score
MAE,10860.5
MSE,1341870000.0
RMSE,36631.6
R2,0.743848
MAPE,0.265641

Metric,Score
MAE,35357.2
MSE,5239100000.0
RMSE,72381.7
R2,-9.90929e-05
MAPE,9.60904

Metric,Score
MAE,5556.59
MSE,397767000.0
RMSE,19944.1
R2,0.92407
MAPE,0.118868

Metric,Score
MAE,13057.2
MSE,1360710000.0
RMSE,36887.8
R2,0.740252
MAPE,0.350196

Metric,Score
MAE,16317.3
MSE,1142910000.0
RMSE,33807.0
R2,0.781828
MAPE,3.85216


# Download outputs

## Classification model (Age_Group)

In [38]:
# Get the current working directory; the AutoML result folders were given as relative
# paths, so this is where to look for them.
import os
os.getcwd()

'/content'

In [56]:
# Find the age-group classifier's artifacts among the entries of the working directory.
folders = os.listdir()
foldersML = list(filter(lambda entry: entry.startswith('Age_Group'), folders))
print(foldersML)

['Age_Group.zip', 'Age_Group']


In [49]:
# Recursively zip the classifier's results folder so it can be downloaded as a single file.
!zip -r /content/Age_Group.zip /content/Age_Group

  adding: content/Age_Group/ (stored 0%)
  adding: content/Age_Group/README.html (deflated 30%)
  adding: content/Age_Group/README.md (deflated 68%)
  adding: content/Age_Group/split_validation_indices.npy (deflated 66%)
  adding: content/Age_Group/Ensemble/ (stored 0%)
  adding: content/Age_Group/Ensemble/README.md (deflated 71%)
  adding: content/Age_Group/Ensemble/predictions_ensemble.csv (deflated 58%)
  adding: content/Age_Group/Ensemble/confusion_matrix_normalized.png (deflated 10%)
  adding: content/Age_Group/Ensemble/learning_curves.png (deflated 11%)
  adding: content/Age_Group/Ensemble/roc_curve.png (deflated 6%)
  adding: content/Age_Group/Ensemble/ensemble.json (deflated 44%)
  adding: content/Age_Group/Ensemble/status.txt (stored 0%)
  adding: content/Age_Group/Ensemble/confusion_matrix.png (deflated 11%)
  adding: content/Age_Group/Ensemble/precision_recall_curve.png (deflated 5%)
  adding: content/Age_Group/ldb_performance_boxplot.png (deflated 21%)
  adding: content/Age

## Regression

In [50]:
# Confirm the working directory before looking for the regression results folder.
os.getcwd()

'/content'

In [51]:
# Find the regression model's artifacts among the entries of the working directory.
folders = os.listdir()
foldersML = list(filter(lambda entry: entry.startswith('regression'), folders))
print(foldersML)

['regression']


In [55]:
# Recursively zip the regression results folder so it can be downloaded as a single file.
!zip -r /content/regression.zip /content/regression

updating: content/regression/ (stored 0%)
updating: content/regression/README.html (deflated 29%)
updating: content/regression/README.md (deflated 68%)
updating: content/regression/split_validation_indices.npy (deflated 66%)
updating: content/regression/Ensemble/ (stored 0%)
updating: content/regression/Ensemble/README.md (deflated 52%)
updating: content/regression/Ensemble/predictions_ensemble.csv (deflated 60%)
updating: content/regression/Ensemble/true_vs_predicted.png (deflated 11%)
updating: content/regression/Ensemble/predicted_vs_residuals.png (deflated 12%)
updating: content/regression/Ensemble/learning_curves.png (deflated 12%)
updating: content/regression/Ensemble/ensemble.json (deflated 50%)
updating: content/regression/Ensemble/status.txt (stored 0%)
updating: content/regression/ldb_performance_boxplot.png (deflated 22%)
updating: content/regression/.zip (stored 0%)
updating: content/regression/split_train_indices.npy (deflated 66%)
updating: content/regression/progress.jso