In [12]:
from typing import Mapping, Any, Union, List, Optional, Callable
from itertools import combinations, product
from tqdm.notebook import tqdm
from copy import deepcopy
from itertools import chain

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt

%matplotlib inline

sns.set_style()

## Data

In [2]:
# Load the dataset; its `target` column is used as the label in the model cells below.
df = pd.read_csv('../data/heart.csv')
# Precomputed CV splits, iterated downstream as (train_idx, val_idx) pairs.
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load this
# file when it comes from a trusted source.
folds = np.load('../data/5folds_bytarget.npy', allow_pickle=True)

In [3]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,63,1,0,140,187,0,0,144,1,4.0,2,2,3,0
293,63,0,0,124,197,0,1,136,1,0.0,1,0,2,0
294,59,1,0,164,176,1,0,90,0,1.0,1,2,1,0
295,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0


## Utils

In [9]:
def do_cross_val(
    X: pd.DataFrame,
    y: pd.Series,
    folds: np.ndarray,
    score_func: Callable,
    estimator: object
):
    """Cross-validate ``estimator`` over precomputed folds.

    For every ``(train_idx, val_idx)`` pair in ``folds`` the estimator is
    refitted on the training rows and ``predict_proba`` is called on the
    validation rows; only column 1 of the returned matrix is kept.

    Args:
        X: Feature frame, addressed positionally through ``.iloc``.
        y: Target series, addressed positionally through ``.iloc``.
        folds: Iterable of ``(train_idx, val_idx)`` positional index pairs.
        score_func: Called as ``score_func(y_val, pred)`` once per fold.
        estimator: Object whose ``fit(X, y)`` returns something exposing
            ``predict_proba(X)``.

    Returns:
        Dict with ``"scores"`` (one entry per fold) plus ``"preds"`` and
        ``"targets"``, both concatenated across folds in fold order.
    """
    fold_scores, all_preds, all_targets = [], [], []

    for fit_rows, holdout_rows in folds:
        y_holdout = y.iloc[holdout_rows]

        # Refit on this fold's training rows, then predict the held-out rows
        fitted = estimator.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        holdout_pred = fitted.predict_proba(X.iloc[holdout_rows])[:, 1]

        # Accumulate the fold score and the out-of-fold predictions/targets
        fold_scores.append(score_func(y_holdout, holdout_pred))
        all_preds.extend(holdout_pred)
        all_targets.extend(y_holdout)

    return {
        "scores": fold_scores,
        "preds": all_preds,
        "targets": all_targets
    }

def do_oof(
    X: pd.DataFrame,
    y: pd.Series,
    folds: np.ndarray,
    estimator: object
):
    """Collect out-of-fold ``predict_proba`` rows over precomputed folds.

    Unlike ``do_cross_val`` no score is computed and the full output of
    ``predict_proba`` is kept (one entry per validation row).

    Args:
        X: Feature frame, addressed positionally through ``.iloc``.
        y: Target series, addressed positionally through ``.iloc``.
        folds: Iterable of ``(train_idx, val_idx)`` positional index pairs.
        estimator: Object whose ``fit(X, y)`` returns something exposing
            ``predict_proba(X)``.

    Returns:
        Dict with ``"preds"`` and ``"targets"``, both concatenated across
        folds in fold order.
    """
    all_preds, all_targets = [], []

    for fit_rows, holdout_rows in folds:
        # Refit on this fold's training rows, then predict the held-out rows
        fitted = estimator.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        holdout_proba = fitted.predict_proba(X.iloc[holdout_rows])

        # Accumulate one prediction entry and one target per held-out row
        all_preds.extend(holdout_proba)
        all_targets.extend(y.iloc[holdout_rows])

    return {
        "preds": all_preds,
        "targets": all_targets
    }

def plot_conf_matrix(y_true, y_pred):
    """Show an annotated heatmap of the confusion matrix of ``y_true`` vs ``y_pred``.

    Args:
        y_true: True labels, passed as the first argument to ``confusion_matrix``.
        y_pred: Predicted labels, passed as the second argument.
    """
    matrix = confusion_matrix(y_true, y_pred)

    axes = plt.subplot()
    sns.heatmap(matrix, annot=True, ax=axes)
    axes.set(
        xlabel='Predicted labels',
        ylabel='True labels',
        title='Confusion Matrix',
    )
    plt.show()

def fit_validate(
    # Data specific config
    input_df: pd.DataFrame,
    target_col: str,
    # Model specific config
    ml_model,
    # Validation specific config
    folds: np.ndarray,
    score_func: Callable = roc_auc_score,
    verbose: bool = True
):
    """Cross-validate one model on ``input_df`` and summarise the fold scores.

    Args:
        input_df: Frame holding the features plus the ``target_col`` column.
        target_col: Name of the label column; every other column is a feature.
        ml_model: Estimator instance forwarded to ``do_cross_val``.
        folds: Iterable of ``(train_idx, val_idx)`` positional index pairs.
        score_func: Metric called as ``score_func(y_val, pred)`` per fold.
        verbose: When True, print mean/std of the fold scores and plot a
            confusion matrix of the out-of-fold predictions.

    Returns:
        Dict with keys ``"folds_result"``, ``"folds_targets"``, ``"cv_preds"``
        (each a one-element list wrapping the per-run list, so callers can
        flatten with ``chain(*...)``), ``"mean_score"`` and ``"std_score"``.
    """
    X_train = input_df.drop(columns=target_col)
    y_train = input_df[target_col]

    # Make cross validation
    one_cv_result = do_cross_val(
        estimator=ml_model,
        X=X_train,
        y=y_train,
        folds=folds,
        score_func=score_func
    )

    # Store results; the one-element list wrappers are part of the return shape
    cv_result = [one_cv_result['scores']]
    cv_targets = [one_cv_result['targets']]
    cv_preds = [one_cv_result['preds']]

    result = {
        "folds_result": cv_result,
        "folds_targets": cv_targets,
        "cv_preds": cv_preds,
        "mean_score": np.mean(cv_result),
        "std_score": np.std(cv_result)
    }

    if verbose:
        print(f"Mean score: {result['mean_score']}")
        print(f"Score std: {result['std_score']}")
        # `cv_preds` holds probabilities taken from predict_proba[:, 1], while
        # confusion_matrix needs discrete labels, so threshold them first.
        # Assumes a binary target encoded as 0/1 - TODO confirm for other data.
        hard_labels = (np.asarray(result['cv_preds'][0], dtype=float) > 0.5).astype(int)
        plot_conf_matrix(result['folds_targets'][0], hard_labels)

    return result

def grid_search(
    input_df: pd.DataFrame,
    target_col: str,
    use_feature_search: bool,
    # Model specific config
    ml_model,
    search_params_grid: Optional[Mapping[str, Any]],
    # Validation specific config
    folds: np.ndarray,
    score_func: Callable = roc_auc_score,
    maximize_score: bool = True,
    verbose: bool = True
):
    """Exhaustively search model parameters (and optionally feature subsets).

    Every combination from ``search_params_grid`` is cross-validated with
    ``fit_validate`` and the one with the best mean score is kept. A
    ``'columns'`` key in the grid is treated as a list of feature subsets
    rather than as a model parameter.

    Args:
        input_df: Frame holding the features plus the ``target_col`` column.
        target_col: Name of the label column.
        use_feature_search: When True, every non-empty combination of feature
            columns is added to the grid under the ``'columns'`` key.
        ml_model: Estimator class (not instance); called as
            ``ml_model(**params)`` for each combination.
        search_params_grid: Mapping from parameter name to candidate values,
            or None when only the feature search is wanted. The mapping is
            not modified.
        folds: Iterable of ``(train_idx, val_idx)`` positional index pairs.
        score_func: Metric forwarded to ``fit_validate``.
        maximize_score: True if a larger mean score is better.
        verbose: When True, print the best parameters and their score.

    Returns:
        The ``fit_validate`` result dict of the best combination, or None if
        no combination produced a score that beat the initial sentinel.

    Raises:
        RuntimeError: If there is no grid and feature search is disabled.
    """
    # Check whether we have any params for grid search
    if search_params_grid is None and not use_feature_search:
        raise RuntimeError("Empty Grid Space")

    # Work on a shallow copy so the caller's mapping never gains a 'columns' key
    search_params_grid = dict(search_params_grid) if search_params_grid is not None else {}

    if use_feature_search:
        # Get only features
        all_features = input_df.drop(columns=target_col).columns.tolist()
        # Compute all possible non-empty feature combinations
        all_possible_features_comb = []
        for subset_size in range(1, len(all_features) + 1):
            all_possible_features_comb.extend(combinations(all_features, subset_size))
        search_params_grid['columns'] = all_possible_features_comb

    # Create all combinations of all params, including columns
    search_pg_keys = list(search_params_grid)
    all_combs = product(*(search_params_grid[key] for key in search_pg_keys))

    # Put initial `best_score`
    best_score = -np.inf if maximize_score else np.inf
    best_comb = None
    best_result = None
    for temp_comb in tqdm(all_combs):
        # Transform the params tuple into a dict for loading into the model
        temp_params = dict(zip(search_pg_keys, temp_comb))
        # Keep an untouched copy for reporting; 'columns' is popped below
        temp_comb = deepcopy(temp_params)
        if 'columns' in temp_params:
            f_comb = list(temp_params.pop('columns')) + [target_col]
        else:
            # Use all columns of the frame that was passed in, not a global one
            f_comb = input_df.columns
        # Make one fit validate
        temp_result = fit_validate(
            input_df=input_df[f_comb],
            target_col=target_col,
            ml_model=ml_model(**temp_params),
            folds=folds,
            score_func=score_func,
            verbose=False
        )
        # Keep this combination only if it strictly improves on the best so far
        mean_score = temp_result['mean_score']
        if maximize_score:
            is_better = mean_score > best_score
        else:
            is_better = mean_score < best_score
        if is_better:
            best_score = mean_score
            best_comb = temp_comb
            best_result = temp_result

    if best_result is None:
        # Nothing was evaluated (e.g. a parameter with no candidates) or no
        # score was comparable; avoid subscripting None in the report below.
        if verbose:
            print("No parameter combination produced a comparable score")
        return None

    if verbose:
        print(f"Best params: {best_comb}")
        print(f"Mean score: {best_result['mean_score']}")
        print(f"Score std: {best_result['std_score']}")
    return best_result

## SVC

In [10]:
# Grid-search SVC over C and degree using the precomputed folds.
# probability=True is kept fixed because do_cross_val calls predict_proba.
# NOTE(review): no `kernel` is searched, so SVC runs with its default kernel;
# scikit-learn documents `degree` as used only by the 'poly' kernel - confirm
# whether kernel='poly' was intended, otherwise the degree values repeat the same fit.
svc_result = grid_search(
    input_df=df,
    target_col='target',
    use_feature_search=False,
    ml_model=SVC,
    search_params_grid={
        "C":[1000.0, 1500.0, 2000.0, 2500.0],
        "degree": [1,2,3,4,5],
        "probability": [True]
    },
    folds=folds
);

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Best params: {'C': 1500.0, 'degree': 1, 'probability': True}
Mean score: 0.8900252525252526
Score std: 0.036693204775323


In [15]:
# Flatten the one-element wrapper list into a flat list of out-of-fold targets.
y_true_oof = list(chain.from_iterable(svc_result['folds_targets']))

In [16]:
# Flat list of the best SVC run's out-of-fold predictions, in fold order.
svc_result_pred = list(chain.from_iterable(svc_result['cv_preds']))

In [18]:
# ROC AUC pooled over all out-of-fold predictions; this is a different number
# from the per-fold mean score that grid_search prints.
roc_auc_score(y_true_oof, svc_result_pred)

0.8790123456790123

## Random Forest

In [19]:
# Grid-search RandomForestClassifier over split criterion, depth, split/leaf
# sizes and the number of features considered per split.
# NOTE(review): no random_state is part of the grid, so reruns may not
# reproduce the recorded scores - consider adding "random_state": [<seed>].
rf_result = grid_search(
    input_df=df,
    target_col='target',
    use_feature_search=False,
    ml_model=RandomForestClassifier,
    search_params_grid={
        "criterion":['gini', 'entropy'],
        "max_depth": [1,5,11,21],
        "min_samples_split":[2, 4, 6, 8],
        "min_samples_leaf":[1, 2, 3, 4],
        "max_features":["sqrt", "log2", 10, 5]
    },
    folds=folds
);

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Best params: {'criterion': 'gini', 'max_depth': 1, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'sqrt'}
Mean score: 0.9183291245791245
Score std: 0.03999131461434162


In [20]:
# Flat list of the best random-forest run's out-of-fold predictions, in fold order.
rf_result_pred = list(chain.from_iterable(rf_result['cv_preds']))

In [21]:
# ROC AUC pooled over all out-of-fold random-forest predictions.
# y_true_oof comes from the SVC run; every model is validated on the same `folds`.
roc_auc_score(y_true_oof, rf_result_pred)

0.9168724279835391

## Logistic Regression

In [22]:
# Grid-search LogisticRegression with an L2 penalty over C and max_iter.
# NOTE(review): the recorded run emits "TOTAL NO. of ITERATIONS REACHED LIMIT"
# warnings for some combinations; the warning text itself suggests scaling the
# data or raising max_iter.
log_reg_result = grid_search(
    input_df=df,
    target_col='target',
    use_feature_search=False,
    ml_model=LogisticRegression,
    search_params_grid={
        "penalty":['l2'],
        "C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
        "max_iter": [50, 100, 1_000, 10_000]
    },
    folds=folds
);

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist


Best params: {'penalty': 'l2', 'C': 1.0, 'max_iter': 100}
Mean score: 0.9058291245791248
Score std: 0.037308402973721295


In [23]:
# Flat list of the best logistic-regression run's out-of-fold predictions, in fold order.
log_reg_result_pred = list(chain.from_iterable(log_reg_result['cv_preds']))

In [24]:
# ROC AUC pooled over all out-of-fold logistic-regression predictions.
roc_auc_score(y_true_oof, log_reg_result_pred)

0.9047553726566072

## Gradient Boosting (LightGBM `LGBMClassifier`)

In [26]:
# Grid-search LightGBM's LGBMClassifier (not sklearn's GradientBoostingClassifier)
# over learning rate, tree depth and number of boosting rounds.
gbc_result = grid_search(
    input_df=df,
    target_col='target',
    use_feature_search=False,
    ml_model=LGBMClassifier,
    search_params_grid={
        "learning_rate":[0.01, 0.1],
        "max_depth": [1, 3, 4],
        "n_estimators": [10_000, 15_000, 5_000]
    },
    folds=folds
);

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Best params: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 5000}
Mean score: 0.9019570707070708
Score std: 0.0504083494837172


In [27]:
# Flat list of the best LGBMClassifier run's out-of-fold predictions, in fold order.
gbc_result_pred = list(chain.from_iterable(gbc_result['cv_preds']))

In [28]:
# ROC AUC pooled over all out-of-fold LGBMClassifier predictions.
roc_auc_score(y_true_oof, gbc_result_pred)

0.9001828989483311

## Blend

In [32]:
# Validation row positions of every fold, concatenated in fold order - the
# same order in which do_cross_val appends its out-of-fold predictions.
val_indices = np.array(list(chain.from_iterable(val_idx for _, val_idx in folds)))
val_indices

array([  5,  10,  12,  14,  15,  19,  21,  22,  23,  24,  34,  40,  47,
        51,  54,  55,  60,  62,  66,  69,  75,  78,  79, 100, 110, 120,
       122, 125, 126, 140, 146, 156, 159, 162, 163, 164, 165, 167, 173,
       176, 177, 178, 193, 203, 204, 209, 215, 220, 225, 228, 229, 232,
       242, 251, 253, 263, 269, 274, 279, 280,   3,   7,  27,  32,  36,
        37,  45,  52,  59,  72,  73,  76,  82,  87,  89,  91,  95,  99,
       104, 105, 108, 109, 111, 123, 127, 132, 133, 135, 137, 138, 141,
       144, 147, 168, 170, 172, 174, 175, 180, 181, 190, 195, 200, 202,
       208, 211, 212, 213, 222, 231, 240, 244, 252, 257, 260, 265, 267,
       290, 292, 295,   4,  11,  13,  20,  26,  31,  38,  39,  42,  43,
        46,  49,  53,  56,  65,  67,  74,  81,  83,  90, 106, 107, 112,
       115, 116, 119, 128, 139, 152, 153, 157, 160, 166, 179, 183, 184,
       186, 188, 192, 196, 198, 206, 217, 218, 219, 223, 224, 234, 235,
       248, 249, 255, 261, 262, 271, 272, 276, 288, 289,   1,   

In [34]:
# Attach each model's out-of-fold predictions to `df` as a new column.
oof_predictions = {
    'svc_pred': svc_result_pred,
    'rf_pred': rf_result_pred,
    'logreg_pred': log_reg_result_pred,
    'gbc_pred': gbc_result_pred,
}
for pred_col, oof_pred in oof_predictions.items():
    df[pred_col] = None
    # Address the column by name instead of position -1: when this cell is
    # re-run the columns already exist, and -1 would always hit the last one.
    df.iloc[val_indices, df.columns.get_loc(pred_col)] = oof_pred

In [43]:
# ROC AUC of a weighted blend: random forest weighted 2, logistic regression
# weighted 1, divided by the weight sum 3.
roc_auc_score(
    df['target'],
    (df['rf_pred'] * 2.0 + df['logreg_pred'] * 1.0 ) / 3
)

0.9168267032464563

In [44]:
# Write `df` (with the *_pred columns added above) to CSV without the index.
df.to_csv('../data/ml_df.csv', index=False)