In [1]:
## Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor

In [3]:
## Load data

In [4]:
# header=None: the CSV is read without a header row, so pandas labels the
# columns with integers 0..n-1. Later cells select columns by these integer
# labels.
df = pd.read_csv("../data/datasets/2021-11-30 21:10:43.416992.csv", header=None)

In [5]:
# Replace every missing value with 0, modifying df in place.
# NOTE(review): this runs before features and targets are separated, so a
# missing target measurement becomes a literal 0 — confirm this is intended.
df.fillna(0, inplace=True)

In [6]:
df.shape

(400, 2100)

In [9]:
## Extract data

In [10]:
# Names of the 20 antibiotics. The list is reversed and handed to get_y_index
# to select the target columns.
ANTIBIOTIC_LIST = [
    'Amikacin',
    'Ampicillin',
    'Ampicillin/Sulbactam',
    'Aztreonam',
    'Cefazolin',
    'Cefepime',
    'Cefoxitin',
    'Ceftazidime',
    'Ceftriaxone',
    'Cefuroxime sodium',
    'Ciprofloxacin',
    'Gentamicin',
    'Imipenem',
    'Levofloxacin',
    'Meropenem',
    'Nitrofurantoin',
    'Piperacillin/Tazobactam',
    'Tetracycline',
    'Tobramycin',
    'Trimethoprim/Sulfamethoxazole',
]


In [31]:
# get_y_index is defined in the "Utils" section further down; that cell must
# be executed before this one.
# The helper pairs position i of the list it receives with column
# (n_columns - 1 - i). Passing the reversed list therefore pairs the last
# antibiotic in ANTIBIOTIC_LIST with the last column of df.
y, index_names = get_y_index(ANTIBIOTIC_LIST[::-1], df)

In [72]:
df[index_names].describe()

Unnamed: 0,2099,2098,2097,2096,2095,2094,2093,2092,2091,2090,2089,2088,2087,2086,2085,2084,2083,2082,2081,2080
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,3.15875,9.2825,9.6575,91.30125,61.66125,6.7225,6.9125,6.2625,9.36,4.15875,30.5775,57.785,30.06,21.3425,18.0375,31.3525,27.59,30.64,31.6625,17.0625
std,1.483552,6.210807,6.085868,51.829317,61.846206,15.105991,2.780138,13.706687,12.592332,6.398655,6.021182,16.952204,10.555921,12.674455,14.171562,5.949094,10.425143,4.934701,3.071184,16.539573
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.5,0.0,0.0,0.0,0.0,2.0,0.0,1.0
25%,3.5,2.0,4.0,32.0,0.0,1.0,8.0,1.0,2.0,4.0,32.0,64.0,32.0,8.0,1.0,32.0,32.0,32.0,32.0,8.0
50%,4.0,8.0,8.0,128.0,64.0,1.0,8.0,1.0,4.0,4.0,32.0,64.0,32.0,32.0,16.0,32.0,32.0,32.0,32.0,8.0
75%,4.0,16.0,16.0,128.0,128.0,16.0,8.0,8.0,16.0,4.0,32.0,64.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,16.0
max,4.0,16.0,16.0,128.0,128.0,128.0,16.0,128.0,128.0,128.0,32.0,64.0,128.0,64.0,64.0,64.0,64.0,32.0,32.0,64.0


In [27]:
x = df.drop(index_names, axis=1)

In [33]:
# shuffle=False keeps the row order: the leading 70% of rows become the
# training set and the trailing 30% the test set. random_state has no effect
# when shuffle=False.
# NOTE(review): if the rows of the CSV are ordered in any meaningful way, this
# split is not representative of the whole dataset — confirm it is intended.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42, shuffle=False)

In [8]:
## Training model

In [7]:
# Fixed LightGBM hyper-parameters, unpacked into lgb.LGBMRegressor(**params)
# when the model is built.
params = dict(
    colsample_bytree=0.7,
    max_depth=15,
    min_split_gain=0.4,
    n_estimators=400,
    num_leaves=50,
    reg_alpha=1.3,
    reg_lambda=1.1,
    subsample=0.9,
    subsample_freq=20,
)

In [19]:
multi_regressor = MultiOutputRegressor(lgb.LGBMRegressor(**params))

In [35]:
multi_regressor.fit(x_train, y_train)

MultiOutputRegressor(estimator=LGBMRegressor(colsample_bytree=0.7, max_depth=15,
                                             min_split_gain=0.4,
                                             n_estimators=400, num_leaves=50,
                                             reg_alpha=1.3, reg_lambda=1.1,
                                             subsample=0.9, subsample_freq=20))

In [36]:
multi_regressor.score(x_test, y_test)

-0.1748435943601686

In [47]:
y_pred = multi_regressor.predict(x_test)

In [45]:
mse, mae, r2 = get_metrics(y_test, y_pred)

MSE:  465.1994537234406
MAE:  9.968363863916919
R2:  -0.1748435943601686


In [14]:
## Utils

In [16]:
def get_y_index(antibiotic_list: list, data):
    """Collect the trailing columns of ``data`` into a target matrix.

    Position ``i`` of ``antibiotic_list`` is paired with the column labelled
    ``len(data.columns) - 1 - i``, so the list is matched against the last
    columns of the frame, walking backwards from the final column. Columns are
    looked up by integer label, which is what ``pd.read_csv(..., header=None)``
    produces.

    Parameters
    ----------
    antibiotic_list : list
        One entry per target column; only its length and ordering are used.
    data : pandas.DataFrame
        Frame whose last ``len(antibiotic_list)`` columns hold the targets.

    Returns
    -------
    y_list : numpy.ndarray
        Float array of shape ``(len(data), len(antibiotic_list))``; column
        ``i`` holds the values of ``data[index_names[i]]``. An empty
        ``antibiotic_list`` yields an array with zero columns.
    index_names : list of int
        Column labels that were extracted, in the same order as the columns
        of ``y_list``.
    """
    # Read from the `data` argument rather than a notebook-level global, so the
    # function works on whichever frame the caller passes in.
    last_column = len(data.columns) - 1
    index_names = [last_column - offset for offset in range(len(antibiotic_list))]

    # Allocate the full matrix once instead of lazily inside the loop.
    y_list = np.zeros((len(data), len(antibiotic_list)))
    for position, index_name in enumerate(index_names):
        y_list[:, position] = data[index_name]

    return y_list, index_names


In [43]:
def get_metrics(expected, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    expected : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, same shape as ``expected``.

    Returns
    -------
    tuple
        ``(mse, mae, r2)``: mean squared error, mean absolute error and
        R^2 score, each computed with scikit-learn's default settings.
    """
    # The `squared` keyword is not passed: squared=True was already the default
    # (plain MSE), and the keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, where passing it raises a TypeError.
    mse = mean_squared_error(expected, predicted)
    mae = mean_absolute_error(expected, predicted)
    r2 = r2_score(expected, predicted)

    return mse, mae, r2

In [48]:
def algorithm_pipeline(
    X_train_data,
    X_test_data,
    y_train_data,
    y_test_data,
    model,
    param_grid,
    cv=10,
    scoring_fit='neg_mean_squared_error',
    do_probabilities=False,
):
    """Run a grid search on the training data and predict on the test data.

    Parameters
    ----------
    X_train_data, y_train_data
        Data the grid search is fitted on.
    X_test_data
        Data the fitted search object predicts on.
    y_test_data
        Accepted but not used inside this function.
    model
        Estimator handed to ``GridSearchCV``.
    param_grid
        Parameter grid handed to ``GridSearchCV``.
    cv
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring_fit
        Scoring setting forwarded to ``GridSearchCV``.
    do_probabilities
        When true, ``predict_proba`` is called instead of ``predict``.

    Returns
    -------
    tuple
        ``(fitted_model, pred)``: the object returned by ``GridSearchCV.fit``
        and its predictions for ``X_test_data``.
    """
    search = GridSearchCV(
        estimator=model, param_grid=param_grid, cv=cv,
        n_jobs=-1, scoring=scoring_fit, verbose=2,
    )
    fitted_model = search.fit(X_train_data, y_train_data)

    # Pick the prediction method first, then apply it to the test features.
    predict = fitted_model.predict_proba if do_probabilities else fitted_model.predict
    return fitted_model, predict(X_test_data)

In [64]:
# Rebinds multi_regressor to a wrapper around an LGBMRegressor built with no
# arguments, replacing the earlier model built from `params`. This is the
# estimator handed to the grid search below.
multi_regressor = MultiOutputRegressor(lgb.LGBMRegressor())

In [70]:
# param_grid is defined in a later cell of this notebook; that cell must be
# executed before this one.
# The recorded run below (972 candidates x 5 folds = 4860 fits) was
# interrupted with a KeyboardInterrupt before it finished.
gs, pred = algorithm_pipeline(x_train, x_test, y_train, y_test, multi_regressor, 
                                 param_grid, cv=5)


Fitting 5 folds for each of 972 candidates, totalling 4860 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed: 45.9min


KeyboardInterrupt: 

In [66]:
# Search space for GridSearchCV over a MultiOutputRegressor.
# Every key carries the `estimator__` prefix because the hyper-parameters
# belong to the wrapped LGBMRegressor, not to MultiOutputRegressor itself.
# An unprefixed key such as 'n_estimators' is rejected by set_params with
# "Invalid parameter".
param_grid = {
    'estimator__n_estimators': [400],
    'estimator__colsample_bytree': [0.7, 0.8],
    'estimator__max_depth': [15,20,25],
    'estimator__num_leaves': [50, 100, 200],
    'estimator__reg_alpha': [1.1, 1.2, 1.3],
    'estimator__reg_lambda': [1.1, 1.2, 1.3],
    'estimator__min_split_gain': [0.3, 0.4],
    'estimator__subsample': [0.7, 0.8, 0.9],
    'estimator__subsample_freq': [20]
}


In [60]:
multi_regressor.fit(x_train, y_train)

MultiOutputRegressor(estimator=LGBMRegressor())

In [61]:
multi_regressor.score(x_test, y_test)

-0.03587396903172638

In [62]:
y_pred = multi_regressor.predict(x_test)

In [63]:
mse, mae, r2 = get_metrics(y_test, y_pred)

MSE:  471.18200546599655
MAE:  10.081788376460638
R2:  -0.03587396903172638
